网页信息抽取的模板匹配方法

roki 2009-06-15
package com.rayeen.spider;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;

import com.rayeen.constant.ConfConstant;
import com.rayeen.util.DomUtils;


/**
 * Applies an extraction template to page content and returns, for every
 * match, a map from template variable name to the captured text.
 *
 * <p>Templates are compiled by {@link RegularExpUtils}; the per-call options
 * (match mode, maximum number of matches, excluded match indexes, optional
 * group mode) come from a {@code SaveParam}.
 */
public class SpiderVM {

	protected static final Logger LOG = Logger.getLogger(SpiderVM.class);

	/**
	 * Looks up the template registered under {@code regkey} and applies it to
	 * {@code content} with a default {@code SaveParam}.
	 *
	 * @param content text to extract from
	 * @param regkey  key into {@code RegConfUtils.regexpMap}
	 * @return one map per match; an empty list when the key is unknown
	 */
	public static List<Map<String, String>> getParseResult(String content, String regkey) {
		if (!RegConfUtils.regexpMap.containsKey(regkey)) {
			LOG.error("unexist regkey:" + regkey);
			return new ArrayList<Map<String, String>>();
		}
		return SpiderVM.getResult(content, RegConfUtils.regexpMap.get(regkey),
				new SaveParam());
	}

	/**
	 * Same as {@link #getParseResult(String, String)}, but first passes
	 * {@code content} through {@code DomUtils.getContent(content, "gbk")}.
	 *
	 * @param content text handed to {@code DomUtils.getContent}
	 * @param regkey  key into {@code RegConfUtils.regexpMap}
	 * @return one map per match; an empty list when the key is unknown
	 * @throws Exception declared by the original signature (presumably raised
	 *                   by {@code DomUtils.getContent} - confirm)
	 */
	public static List<Map<String, String>> getNeedTransParseResult(String content, String regkey) throws Exception {
		if (!RegConfUtils.regexpMap.containsKey(regkey)) {
			LOG.error("unexist regkey:" + regkey);
			return new ArrayList<Map<String, String>>();
		}
		return SpiderVM.getResult(DomUtils.getContent(content, "gbk"),
				RegConfUtils.regexpMap.get(regkey), new SaveParam());
	}

	/**
	 * Matches {@code regexp} (a template, or a raw regular expression when the
	 * match mode equals {@code ConfConstant.REGULAR}) repeatedly against
	 * {@code content}.
	 *
	 * <p>Before matching, the entities {@code &lt;}, {@code &gt;},
	 * {@code &quot;} and {@code &amp;} are decoded. Matches whose zero-based
	 * index is in {@code saveParam.getExclude()} are skipped and do not count
	 * towards {@code saveParam.getMaxMatch()}.
	 *
	 * <p>When {@code saveParam.getGroupMode()} is non-null, the group template
	 * is matched in lock-step with the main template: each accepted main match
	 * advances the group matcher once, and every group variable is stored as a
	 * single string built from the group mode's name/field/line separators.
	 *
	 * @param content   text to extract from; {@code null} yields an empty list
	 * @param regexp    template or regular expression
	 * @param saveParam matching options
	 * @return one map (variable name to captured text) per accepted match
	 */
	public static List<Map<String, String>> getResult(String content, String regexp, SaveParam saveParam) {

		List<Map<String, String>> resultMapList = new ArrayList<Map<String, String>>();

		if (content == null) {
			LOG.warn("null content, nothing to match:" + regexp);
			return resultMapList;
		}

		// Group mode: compile the group template separately, then turn every
		// group into an anonymous placeholder for the main template.
		VarAssign grpVarReg = null;
		if (saveParam.getGroupMode() != null) {
			grpVarReg = RegularExpUtils.parseGroupMatchRegexp(regexp);
			regexp = RegularExpUtils.GROUP_VAR_PATTERN.matcher(regexp)
					.replaceAll("[\\$]");
		}

		VarAssign varReg;
		if (saveParam.getMatchMod().equals(ConfConstant.REGULAR)) {
			varReg = RegularExpUtils.parseMatchRegexpAdv(regexp, saveParam
					.getRegmodStr());
		} else {
			varReg = RegularExpUtils.parseMatchRegexp(regexp);
		}

		// Decode "&amp;" last: decoding it first would turn "&amp;lt;" into
		// "&lt;" and then into "<", i.e. decode the text twice.
		content = content.replace("&lt;", "<").replace("&gt;", ">")
				.replace("&quot;", "\"").replace("&amp;", "&");

		Matcher m = varReg.getRegexp().matcher(content);
		// The group matcher was never created before, which silently disabled
		// group mode; it runs over the same decoded content as the main one.
		Matcher grpMatch = (grpVarReg == null) ? null : grpVarReg.getRegexp()
				.matcher(content);

		boolean matched = false;
		int last = 0;
		int itemCnt = 0;
		int matchCnt = 0;
		int grpLast = 0;

		while (last <= content.length() && m.find(last)
				&& matchCnt < saveParam.getMaxMatch()) {

			matched = true;
			last = nextSearchStart(m);

			if (saveParam.getExclude().contains(Integer.valueOf(itemCnt++))) {
				continue;
			}

			// Excluded matches are not counted towards the maximum.
			matchCnt++;

			Map<String, String> resultMap = new HashMap<String, String>();
			// Group 0 is the whole match; groups 1..n map onto the variables
			// in declaration order.
			for (int i = 1; i <= m.groupCount(); i++) {
				resultMap.put(varReg.getVarlist().get(i - 1).getVar(), m
						.group(i));
			}

			// The main template advanced once, so advance the group template once.
			if (grpMatch != null && grpLast <= content.length()
					&& grpMatch.find(grpLast)) {
				grpLast = nextSearchStart(grpMatch);
				collectGroupValues(grpMatch, grpVarReg, saveParam, resultMap);
			}

			resultMapList.add(resultMap);

			if (last >= content.length()) {
				break;
			}
		}

		if (!matched) {
			// Not required, but reporting the failed match here is more
			// useful than an error later when the result is saved.
			LOG.warn("match failed:" + regexp);
		}

		return resultMapList;
	}

	/**
	 * Returns the index at which the next {@code find(int)} should start.
	 * After a zero-length match this is one past the match, because searching
	 * again from the same index would return the same empty match forever.
	 */
	private static int nextSearchStart(Matcher matcher) {
		return matcher.end() > matcher.start() ? matcher.end()
				: matcher.end() + 1;
	}

	/**
	 * Expands every group captured by the current {@code grpMatch} match with
	 * its sub-template and stores the joined text in {@code resultMap} under
	 * the group's name.
	 */
	private static void collectGroupValues(Matcher grpMatch, VarAssign grpVarReg,
			SaveParam saveParam, Map<String, String> resultMap) {

		for (int i = 1; i <= grpMatch.groupCount(); i++) {
			// Groups with an empty body get no variable entry, so the regex
			// can have more capture groups than there are variables.
			if (i - 1 >= grpVarReg.getVarlist().size()) {
				break;
			}

			String target = grpMatch.group(i);
			String grpName = grpVarReg.getVarlist().get(i - 1).getVar();
			VarAssign va = grpVarReg.getVarlist().get(i - 1).getSubVarAssign();
			if (target == null || va == null) {
				continue;
			}

			Matcher tmpM = va.getRegexp().matcher(target);
			StringBuilder result = new StringBuilder();
			int tmpLast = 0;

			while (tmpLast <= target.length() && tmpM.find(tmpLast)) {
				tmpLast = nextSearchStart(tmpM);

				List<String> fields = new ArrayList<String>();
				for (int t = 1; t <= tmpM.groupCount(); t++) {
					if (saveParam.getGroupMode().isShowName()) {
						fields.add(va.getVarlist().get(t - 1).getVar()
								+ saveParam.getGroupMode().getNameSeparator()
								+ tmpM.group(t));
					} else {
						fields.add(tmpM.group(t));
					}
				}

				result.append(StringUtils.join(fields.iterator(), saveParam
						.getGroupMode().getFieldSeparator()));
				result.append(saveParam.getGroupMode().getLineSeparator());
			}

			resultMap.put(grpName, result.toString());
		}
	}

}
 
roki 2009-06-15
帖子怎么修改啊   :
roki 2009-06-15
package com.rayeen.spider;

import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/**
 * Compiles extraction templates into regular expressions.
 *
 * <p>Template syntax handled here:
 * <ul>
 * <li>{@code [$name]} - named variable, lazy match</li>
 * <li>{@code [#name]} - named variable, greedy match</li>
 * <li>{@code [$name(needle)]} / {@code [#name(needle)]} - named variable
 * whose content is {@code needle} repeated (lazily / greedily)</li>
 * <li>{@code [$]} / {@code [#]} - anonymous lazy / greedy filler</li>
 * <li>{@code [%group=body%]} - group variable; {@code body} is itself a
 * template that is matched repeatedly inside the captured text</li>
 * </ul>
 */
public class RegularExpUtils {

	/** Maps a one-letter modifier to the corresponding {@link Pattern} flag. */
	final static HashMap<String, Integer> modifierMap = new HashMap<String, Integer>();
	static {
		modifierMap.put("i", Pattern.CASE_INSENSITIVE);
		modifierMap.put("m", Pattern.MULTILINE);
		modifierMap.put("s", Pattern.DOTALL);
	}

	/**
	 * A group variable {@code [%name=body%]}. A backslash directly in front of
	 * the opening bracket prevents the match.
	 */
	public static final Pattern GROUP_VAR_PATTERN = Pattern.compile(
			"((?<!\\\\)\\[%(\\w+)=(.*?)%\\])", Pattern.CASE_INSENSITIVE
					| Pattern.MULTILINE | Pattern.DOTALL);

	/** The literal text that a lazy capture group becomes once it has been escaped. */
	public static final Pattern UNGREED_PLACEHOLDER_PATTERN = Pattern.compile(
			"\\(\\.\\*\\?\\)", Pattern.LITERAL);

	/** An escaped lazy named variable, i.e. {@code [$name]} after escaping. */
	public static final Pattern UNGREED_VAR_PLACEHOLDER_PATTERN = Pattern
			.compile("\\\\\\[\\\\\\$\\w+?\\\\\\]");

	/**
	 * Any escaped named variable (lazy or greedy, with or without a needle),
	 * i.e. what {@link #VAR_PATTERN} stands for once the template has been
	 * passed through {@link #escapeRegular(String)}.
	 */
	private static final Pattern ESCAPED_VAR_PATTERN = Pattern
			.compile("\\\\\\[(?:\\\\\\$|#)\\w+?(?:\\\\\\(.*?\\\\\\))?\\\\\\]");

	/** The regular-expression metacharacters that get a backslash prefix. */
	static final Pattern REG_PATTERN = Pattern.compile(
			"\\.|\\?|\\*|\\(|\\)|\\[|\\]|\\^|\\+|\\||\\$|\\\\|\\{",
			Pattern.CASE_INSENSITIVE);

	/** The anonymous lazy placeholder, matched literally. */
	static final Pattern BLANK_PLACEHOLDER_PATTERN = Pattern.compile("[$]",
			Pattern.LITERAL);

	/** The anonymous greedy placeholder, matched literally. */
	static final Pattern GREED_BLANK_PLACEHOLDER_PATTERN = Pattern.compile(
			"[#]", Pattern.LITERAL);

	/**
	 * A named variable in an unescaped template. Group 2 is the name, group 4
	 * the optional needle.
	 */
	public static final Pattern VAR_PATTERN = Pattern.compile(
			"\\[(\\$|#)(.*?)(\\((.*?)\\))?\\]", Pattern.CASE_INSENSITIVE);

	/** A template that starts with a lazy variable and ends with a closing bracket. */
	static final Pattern FULL_PATTERN = Pattern.compile("^\\[\\$(.*?)\\]$",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Compiles a template whose surrounding text is already a regular
	 * expression ("advanced" mode): only the placeholders are substituted,
	 * nothing is escaped.
	 *
	 * <p>Capture groups written by the user in {@code exp} are not accounted
	 * for, so the recorded variable indexes assume the variables are the only
	 * capture groups.
	 *
	 * @param exp    regular expression containing template placeholders
	 * @param modStr modifier letters ({@code i}, {@code m}, {@code s}); unknown
	 *               letters are ignored and {@code null} means no flags
	 * @return the compiled pattern together with its variable list
	 */
	public static VarAssign parseMatchRegexpAdv(String exp, String modStr) {
		VarAssign varExp = new VarAssign();

		// Anonymous placeholders become plain fillers.
		exp = BLANK_PLACEHOLDER_PATTERN.matcher(exp).replaceAll(".*?");
		exp = GREED_BLANK_PLACEHOLDER_PATTERN.matcher(exp).replaceAll(".*");

		// The matcher keeps scanning the text as it is at this point while
		// "exp" is rewritten below.
		Matcher matcher = VAR_PATTERN.matcher(exp);
		int itemCnt = 0;
		while (matcher.find()) {
			String var = matcher.group(2);
			if (StringUtils.isEmpty(var)) {
				continue;
			}

			String needle = matcher.group(4);
			if (StringUtils.isEmpty(needle)) {
				exp = replaceLiteral(exp, "[$" + var + "]", "(.*?)");
				exp = replaceLiteral(exp, "[#" + var + "]", "(.*)");
			} else {
				// Escaping keeps '\' and '$' in the needle from being read as
				// replacement syntax; replaceAll strips the added backslashes
				// again, so the needle ends up in the regex as written.
				String rep = escapeRegular(needle);
				exp = replaceLiteral(exp, "[$" + var + "(" + needle + ")]",
						"(" + rep + "*?)");
				exp = replaceLiteral(exp, "[#" + var + "(" + needle + ")]",
						"(" + rep + "*)");
			}

			itemCnt++;
			varExp.getVarlist().add(new VarAssign.Var(var, itemCnt));
		}

		int mod = 0;
		if (modStr != null) {
			for (int i = 0; i < modStr.length(); i++) {
				Integer flag = modifierMap.get(String.valueOf(modStr.charAt(i)));
				if (flag != null) {
					mod |= flag.intValue();
				}
			}
		}

		varExp.setRegexp(Pattern.compile(exp, mod));
		varExp.setRegexpStr(exp);
		return varExp;
	}

	/**
	 * Compiles a plain-text template: all regular-expression metacharacters in
	 * the template are escaped, runs of whitespace match {@code \s+}, and the
	 * placeholders become capture groups.
	 *
	 * <p>A template that consists of exactly one lazy variable is compiled to
	 * {@code (.*)}, i.e. the variable receives the whole input.
	 *
	 * @param exp the template
	 * @return the compiled pattern (MULTILINE, DOTALL, CASE_INSENSITIVE)
	 *         together with its variable list
	 */
	public static VarAssign parseMatchRegexp(String exp) {

		VarAssign varExp = new VarAssign();

		Matcher fullMatch = FULL_PATTERN.matcher(exp);
		if (fullMatch.find()) {
			// FULL_PATTERN alone also accepts templates with several variables
			// (its lazy group simply grows), so additionally require that the
			// first variable found spans the very same range.
			Matcher single = VAR_PATTERN.matcher(exp);
			if (single.find() && single.start() == fullMatch.start()
					&& single.end() == fullMatch.end()) {
				String var = fullMatch.group(1);
				varExp.getVarlist().add(new VarAssign.Var(var, 1));
				String regexp = "(.*)";
				varExp.setRegexpStr(regexp);
				varExp.setRegexp(Pattern.compile(regexp, Pattern.MULTILINE
						| Pattern.DOTALL));
				return varExp;
			}
		}

		// Variables are located in the original, unescaped template.
		Matcher matcher = VAR_PATTERN.matcher(exp);

		// Escape regular-expression metacharacters.
		exp = escapeRegular(exp);

		// Anonymous placeholders (now in their escaped form) become fillers.
		exp = replaceLiteral(exp, "\\[\\$\\]", ".*?");
		exp = replaceLiteral(exp, "\\[#\\]", ".*");

		exp = exp.replaceAll("\\s+", "\\\\s+");

		// Replace each named variable by a capture group; flat structure only.
		int itemCnt = 0;
		while (matcher.find()) {
			String var = matcher.group(2);
			if (StringUtils.isEmpty(var)) {
				continue;
			}

			// The template is escaped by now, so look for the escaped name.
			String escVar = escapeRegular(var);

			String needle = matcher.group(4);
			if (StringUtils.isEmpty(needle)) {
				exp = replaceLiteral(exp, "\\[\\$" + escVar + "\\]", "(.*?)");
				exp = replaceLiteral(exp, "\\[#" + escVar + "\\]", "(.*)");
			} else {
				needle = escapeRegular(needle);

				exp = replaceLiteral(exp, "\\[\\$" + escVar + "\\(" + needle
						+ "\\)\\]", "(" + needle + "*?)");
				// The parentheses are escaped in the template as well; the
				// greedy form used bare parentheses here and never matched.
				exp = replaceLiteral(exp, "\\[#" + escVar + "\\(" + needle
						+ "\\)\\]", "(" + needle + "*)");
			}

			itemCnt++;
			varExp.getVarlist().add(new VarAssign.Var(var, itemCnt));
		}

		varExp.setRegexp(Pattern.compile(exp, Pattern.MULTILINE
				| Pattern.DOTALL | Pattern.CASE_INSENSITIVE));
		varExp.setRegexpStr(exp);
		return varExp;
	}

	/**
	 * Compiles the group part of a template. Group mode has to be processed
	 * separately from the ordinary variables.
	 *
	 * <p>Example: {@code [%film=<a href="[$acturl]">[$actor]</a>%]}. In the
	 * returned outer pattern every group is a lazy capture group and every
	 * ordinary variable is an uncaptured filler. For each group with a
	 * non-empty body the variable list holds an entry carrying the group name
	 * and a nested {@code VarAssign} compiled from the body; the body is not
	 * escaped, so it is interpreted as a regular expression apart from its
	 * placeholders. Needles inside a group body are not supported.
	 *
	 * @param exp the template
	 * @return the outer pattern (MULTILINE, DOTALL, CASE_INSENSITIVE) and one
	 *         variable per non-empty group
	 */
	public static VarAssign parseGroupMatchRegexp(String exp) {

		VarAssign varExp = new VarAssign();

		// Every group becomes a lazy capture group ...
		String expStr = GROUP_VAR_PATTERN.matcher(exp).replaceAll("(.*?)");

		// ... which has to be restored after escaping the metacharacters.
		expStr = escapeRegular(expStr);
		expStr = UNGREED_PLACEHOLDER_PATTERN.matcher(expStr)
				.replaceAll("(.*?)");

		expStr = expStr.replaceAll("\\s+", "\\\\s+");

		// Anonymous placeholders (escaped form) become fillers.
		expStr = replaceLiteral(expStr, "\\[\\$\\]", ".*?");
		expStr = replaceLiteral(expStr, "\\[#\\]", ".*");

		Matcher groupMatcher = GROUP_VAR_PATTERN.matcher(exp);
		int itemCnt = 0;
		while (groupMatcher.find()) {
			// e.g. "film"
			String gpName = groupMatcher.group(2);
			// e.g. <a href="[$acturl]">[$actor]</a>
			String gpStr = groupMatcher.group(3);
			if (StringUtils.isEmpty(gpStr)) {
				continue;
			}

			VarAssign grpVarExp = new VarAssign();
			// Scans the original body while "gpStr" is rewritten below.
			Matcher varMatcher = VAR_PATTERN.matcher(gpStr);
			int varItemCnt = 0;
			while (varMatcher.find()) {
				String var = varMatcher.group(2);
				if (StringUtils.isEmpty(var)) {
					continue;
				}

				gpStr = replaceLiteral(gpStr, "[$" + var + "]", "(.*?)");
				gpStr = replaceLiteral(gpStr, "[#" + var + "]", "(.*)");

				varItemCnt++;
				grpVarExp.getVarlist().add(
						new VarAssign.Var(var, varItemCnt));
			}

			// Anonymous placeholders are replaced even when the body has no
			// named variable at all.
			gpStr = replaceLiteral(gpStr, "[$]", ".*?");
			gpStr = replaceLiteral(gpStr, "[#]", ".*");

			grpVarExp.setRegexpStr(gpStr);
			grpVarExp.setRegexp(Pattern.compile(gpStr, Pattern.MULTILINE
					| Pattern.DOTALL | Pattern.CASE_INSENSITIVE));

			varExp.getVarlist().add(
					new VarAssign.Var(gpName, itemCnt, grpVarExp));
			itemCnt++;
		}

		// Ordinary variables are only fillers in the outer group pattern. The
		// text is escaped at this point, so the escaped form has to be matched;
		// VAR_PATTERN would miss the lazy form and mangle the greedy one.
		expStr = ESCAPED_VAR_PATTERN.matcher(expStr).replaceAll(".*?");
		expStr = UNGREED_VAR_PLACEHOLDER_PATTERN.matcher(expStr).replaceAll(
				".*?");

		varExp.setRegexp(Pattern.compile(expStr, Pattern.MULTILINE
				| Pattern.DOTALL | Pattern.CASE_INSENSITIVE));
		varExp.setRegexpStr(expStr);
		return varExp;
	}

	/**
	 * Prefixes every regular-expression metacharacter listed in
	 * {@link #REG_PATTERN} with a backslash.
	 *
	 * @param str text to escape
	 * @return the escaped text
	 */
	public static String escapeRegular(String str) {
		Matcher m = REG_PATTERN.matcher(str);
		StringBuffer expBuf = new StringBuffer();

		while (m.find()) {
			// Replacement syntax: an escaped backslash followed by the match.
			m.appendReplacement(expBuf, "\\\\$0");
		}
		m.appendTail(expBuf);

		return expBuf.toString();
	}

	/**
	 * Replaces every literal occurrence of {@code literal} in {@code text}.
	 * {@code replacement} follows {@link Matcher#replaceAll(String)} syntax,
	 * so a backslash in it escapes the following character.
	 */
	private static String replaceLiteral(String text, String literal,
			String replacement) {
		return Pattern.compile(literal, Pattern.LITERAL).matcher(text)
				.replaceAll(replacement);
	}
}

wukele 2009-10-06
好乱
gstarwd 2010-04-30
直接用 htmlparser 不好吗?
johnbinwang 2010-04-30
项目中我是用HtmlCleaner过一遍,然后直接JDOM+XPath的。
kqy929 2010-07-08
gstarwd 写道
直接用 htmlparser 不好吗?

如果要做到通用,还是得靠正则,而且省时省力,不需要去关注页面的层次结构。
楼主的方法挺不错的。
但觉得还可以更灵活,什么样的网页都有可能会碰到。
Global site tag (gtag.js) - Google Analytics