网页信息抽取的模板匹配方法(原理)
roki
2009-06-15
先看一段模板配置 <?xml version="1.0" encoding="GBK" ?> <configuration> <item name="getGidStr"><![CDATA[<select name=gid >[$gidstr]</select>]]> </item> <item name="getGidStr_group"><![CDATA[<option value="[$gid]"[$]>[$groupname]</option>]]> </item> </configuration>
需要把 <option value="[$gid]"[$]>[$groupname]</option> 转化为 Java 正则表达式,并记录 $gid 和 $groupname 这两个变量在正则中对应的位置(即捕获组的序号)。转化的第一步是转义模板中的正则元字符:
static Pattern REG_PATTERN = Pattern.compile(
Matcher regm = REG_PATTERN.matcher(expStr);
然后是去匹配模板中的变量
public static Pattern VAR_PATTERN = Pattern.compile(
Matcher matcher = VAR_PATTERN.matcher(exp);
整个处理函数返回的是下面这样一个结构:
/**
 * Result of converting one template expression into a regular expression.
 *
 * <p>Holds the compiled pattern, the pattern's source string, and the list of
 * template variables together with the group index recorded for each of them.
 */
public class VarAssign {
    /** Compiled regular expression; {@code null} until set. */
    Pattern regexp;
    /** Regular expression source string; {@code null} until set. */
    String regexpStr;
    /** Template variables recorded for this expression. */
    ArrayList<Var> varlist = new ArrayList<>();

    /** A single template variable and the group index associated with it. */
    public static class Var {
        /** Variable name. */
        String var;
        /** Group index. */
        int ids;
        /**
         * When the variable is a group pattern, carries the information of the
         * sub-pattern inside that group; otherwise {@code null}.
         */
        VarAssign subVarAssign;

        /**
         * Creates a variable that carries a nested sub-pattern.
         *
         * @param var       variable name
         * @param ids       group index
         * @param varAssign sub-pattern information for a group-mode variable
         */
        public Var(String var, int ids, VarAssign varAssign) {
            this.var = var;
            this.ids = ids;
            this.subVarAssign = varAssign;
        }

        /**
         * Creates a variable without a nested sub-pattern.
         *
         * @param var variable name
         * @param ids group index
         */
        public Var(String var, int ids) {
            this.var = var;
            this.ids = ids;
        }

        /** @return the group index */
        public int getIds() {
            return ids;
        }

        /** @param ids the group index */
        public void setIds(int ids) {
            this.ids = ids;
        }

        /** @return the variable name */
        public String getVar() {
            return var;
        }

        /** @param var the variable name */
        public void setVar(String var) {
            this.var = var;
        }

        /** @return the nested sub-pattern information, or {@code null} if none was set */
        public VarAssign getSubVarAssign() {
            return subVarAssign;
        }

        /** @param subVarAssign the nested sub-pattern information */
        public void setSubVarAssign(VarAssign subVarAssign) {
            this.subVarAssign = subVarAssign;
        }
    }

    /** Creates an empty instance with no pattern and an empty variable list. */
    public VarAssign() {
    }

    /**
     * Renders the pattern on a first line ({@code regexp:<pattern>}) followed by
     * every variable as {@code name:index}. Entries are concatenated without a
     * separator; this output format is kept as-is for compatibility.
     */
    @Override
    public String toString() {
        // Local, single-threaded accumulation: StringBuilder is sufficient.
        StringBuilder sb = new StringBuilder();
        sb.append("regexp:").append(regexp).append("\n");
        for (Var v : varlist) {
            sb.append(v.getVar()).append(":").append(v.getIds());
        }
        return sb.toString();
    }

    /** @return the compiled regular expression, or {@code null} if not set */
    public Pattern getRegexp() {
        return regexp;
    }

    /** @param regexp the compiled regular expression */
    public void setRegexp(Pattern regexp) {
        this.regexp = regexp;
    }

    /** @return the live (not copied) list of recorded variables */
    public ArrayList<Var> getVarlist() {
        return varlist;
    }

    /** @param varlist the list of recorded variables; stored by reference */
    public void setVarlist(ArrayList<Var> varlist) {
        this.varlist = varlist;
    }

    /** @return the regular expression source string, or {@code null} if not set */
    public String getRegexpStr() {
        return regexpStr;
    }

    /** @param regexpStr the regular expression source string */
    public void setRegexpStr(String regexpStr) {
        this.regexpStr = regexpStr;
    }
}
|