《搜索引擎零距离》IRVM的语义识别demo代码
roki
2009-06-19
执行入口程序
package com.rayeen.spider.vertical.recognize;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;

import org.apache.lucene.analysis.Token;

// NOTE(review): the classes below are declared in package
// com.rayeen.spider.vertical.recognize.product in this listing, but are imported from
// com.uucun...; confirm which package name is the real one.
import com.uucun.spider.vertical.recognize.product.ArProductLexer;
import com.uucun.spider.vertical.recognize.product.RoleTagProduct;

/**
 * Command-line entry point of the demo: runs {@link ArProductLexer} over a sample HTML/WML
 * fragment (or over a file) and prints every token it produced.
 */
public class TestLexer {

    /** Push-back capacity handed to the reader; the lexer un-reads several characters at once. */
    private static final int PUSHBACK_SIZE = 1024;

    /**
     * Tokenizes the input and prints one token per line to standard output.
     *
     * @param args optional; when {@code args[0]} is present it is the path of a GBK-encoded
     *             file to tokenize, otherwise the built-in sample string is used
     * @throws ARLexerException declared for the lexer API
     * @throws IOException if the input file cannot be opened or read
     */
    public static void main(String[] args) throws ARLexerException, IOException {
        // Alternative sample input:
        // String str="1<img src=asdfdasf/>end<a href=asd>adf</a>sa文件fd下载a下面sf下页魅力11秒27分<img src=asdfdasf/>end";
        String str="当 价: <input name=head_biao2 " +
            "报价:10元 价格<em>56.00</em>元 as ¥500 fff $444 kkk £666 <script>sadfsaf safdasfd<dsa>script></script>"+
            " <li > 价格:<em>56.00</em>元</li> asdf价格<b>1000元</b><!--"+
            "i < currentPostageItems.length//--> 报价<b>1100元</b> 11 k kiss base<card title=\"打开文章\" id=\"hucn\">"+
            "<p><< <a href=\"downview.asp?ID=138870&add=ok&page=1&url=&gg=2136347\">>></a><br/>"+
            "一眼万年<br/>----------"+
            "<br/><a href=\"http://wap.hucn.net/picc/79/261440454.wav\">免费下载</a>"+
            "<br/>大小:510KB kB Kb kb 11 k 23 K 22b 23 b"+
            "<br/>添加时间:9.26 14:41<br/><a href=\"revewlist.asp?ID=138871&TP=2&page=1&url=&gg=2136347\">发表评论 ";

        // Open failures now propagate to the caller instead of being printed and then
        // surfacing as a NullPointerException on the (never assigned) lexer.
        PushbackReader reader;
        if (args != null && args.length > 0) {
            InputStreamReader isr = new InputStreamReader(new FileInputStream(args[0]), "GBK");
            // The default PushbackReader buffer holds a single char; the lexer pushes back
            // more than that, so give the file branch the same capacity as the string branch.
            reader = new PushbackReader(new BufferedReader(isr), PUSHBACK_SIZE);
        } else {
            reader = new PushbackReader(new StringReader(str), PUSHBACK_SIZE);
        }

        try {
            ARLexer l = new ArProductLexer(reader);
            RoleTagProduct roletag = new RoleTagProduct();
            RoleTagProduct.initilize();
            l.setRoleTag(roletag);
            l.parserToken();

            for (Token tk = l.next(); tk != null; tk = l.next()) {
                System.out.println(tk);
            }
        } finally {
            reader.close();
        }
    }
}
|
roki
2009-06-19
输出结果:
(当 ,0,0,type=TXT) (当 ,0,0,type=CONTENT) (价,0,0,type=PRICE) (价,0,0,type=CONTENT) (: ,0,0,type=TXT) (: ,0,0,type=CONTENT) (56.00,0,0,type=NUM) (56.00,0,0,type=CONTENT) (,0,0,type=ENGTAG) (,0,0,type=CONTENT) (元 as ,0,0,type=TXT) (元 as ,0,0,type=CONTENT) (¥,0,0,type=PRICE_PREFIX) (¥,0,0,type=CONTENT) (500,0,0,type=NUM) (500,0,0,type=CONTENT) ( fff ,0,0,type=TXT) ( fff ,0,0,type=CONTENT) ($,0,0,type=PRICE_PREFIX) ($,0,0,type=CONTENT) (444,0,0,type=NUM) (444,0,0,type=CONTENT) (k,0,0,type=MEASURE) (k,0,0,type=CONTENT) (kk,0,0,type=TXT) (kk,0,0,type=CONTENT) (£,0,0,type=PRICE_PREFIX) (£,0,0,type=CONTENT) (666,0,0,type=NUM) (666,0,0,type=CONTENT) (价,0,0,type=PRICE) (价,0,0,type=CONTENT) (格:,0,0,type=TXT) (格:,0,0,type=CONTENT) (56.00,0,0,type=NUM) (56.00,0,0,type=CONTENT) (,0,0,type=ENGTAG) (,0,0,type=CONTENT) (元,0,0,type=TXT) (元,0,0,type=CONTENT) (,0,0,type=ENGTAG) (,0,0,type=CONTENT) ( asdf,0,0,type=TXT) ( asdf,0,0,type=CONTENT) (价,0,0,type=PRICE) (价,0,0,type=CONTENT) (格,0,0,type=TXT) (格,0,0,type=CONTENT) (1000,0,0,type=NUM) (1000,0,0,type=CONTENT) (元,0,0,type=PRICE_MEASURE) (元,0,0,type=CONTENT) (,0,0,type=ENGTAG) (,0,0,type=CONTENT) ( 报<-,0,0,type=TXT) ( 报<-,0,0,type=CONTENT) (价,0,0,type=PRICE) (价,0,0,type=CONTENT) (1100,0,0,type=NUM) (1100,0,0,type=CONTENT) (元,0,0,type=PRICE_MEASURE) (元,0,0,type=CONTENT) (,0,0,type=ENGTAG) (,0,0,type=CONTENT) (11,0,0,type=NUM) (11,0,0,type=CONTENT) (k,0,0,type=MEASURE) (k,0,0,type=CONTENT) (kiss base,0,0,type=TXT) (kiss base,0,0,type=CONTENT) (<card title="打开文章" id="hucn">,0,0,type=TITLE) (<card title="打开文章" id="hucn">,0,0,type=CONTENT) (<<,0,0,type=TXT) (<<,0,0,type=CONTENT) (<a href="downview.asp?ID=138870&add=ok&page=1&url=&gg=2136347">,0,0,type=HREF) (<a href="downview.asp?ID=138870&add=ok&page=1&url=&gg=2136347">,0,0,type=CONTENT) (>>,0,0,type=TXT) (>>,0,0,type=CONTENT) (,0,0,type=ENGTAG) (,0,0,type=CONTENT) (一眼万年----------,0,0,type=TXT) (一眼万年----------,0,0,type=CONTENT) (<a 
href="http://wap.hucn.net/picc/79/261440454.wav">,0,0,type=HREF) (<a href="http://wap.hucn.net/picc/79/261440454.wav">,0,0,type=CONTENT) (免费下载,0,0,type=TXT) (免费下载,0,0,type=CONTENT) (,0,0,type=ENGTAG) (,0,0,type=CONTENT) (大小,0,0,type=SIZE) (大小,0,0,type=CONTENT) (:,0,0,type=TXT) (:,0,0,type=CONTENT) (510,0,0,type=NUM) (510,0,0,type=CONTENT) (K,0,0,type=MEASURE) (K,0,0,type=CONTENT) (B kB Kb kb ,0,0,type=TXT) (B kB Kb kb ,0,0,type=CONTENT) (11,0,0,type=NUM) (11,0,0,type=CONTENT) (k,0,0,type=MEASURE) (k,0,0,type=CONTENT) (23,0,0,type=NUM) (23,0,0,type=CONTENT) (K,0,0,type=MEASURE) (K,0,0,type=CONTENT) (22,0,0,type=NUM) (22,0,0,type=CONTENT) (b ,0,0,type=TXT) (b ,0,0,type=CONTENT) (23,0,0,type=NUM) (23,0,0,type=CONTENT) ( b添加时间:,0,0,type=TXT) ( b添加时间:,0,0,type=CONTENT) (9.26,0,0,type=NUM) (9.26,0,0,type=CONTENT) (14,0,0,type=NUM) (14,0,0,type=CONTENT) (:,0,0,type=TXT) (:,0,0,type=CONTENT) (41,0,0,type=NUM) (41,0,0,type=CONTENT) (<a href="revewlist.asp?ID=138871&TP=2&page=1&url=&gg=2136347">,0,0,type=HREF) (<a href="revewlist.asp?ID=138871&TP=2&page=1&url=&gg=2136347">,0,0,type=CONTENT) (发表评论 ,0,0,type=TXT) (发表评论 ,0,0,type=CONTENT) |
|
roki
2009-06-19
/* This file was generated by SableCC (http://www.sablecc.org/). */
package com.rayeen.spider.vertical.recognize.product;

import java.io.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;

// NOTE(review): this listing declares RoleTag in com.rayeen.spider.vertical.recognize, while
// the imports below use com.uucun...; confirm which package name is the real one.
import com.uucun.spider.vertical.recognize.ARLexer;
import com.uucun.spider.vertical.recognize.ARToken;
import com.uucun.spider.vertical.recognize.DfaState;
import com.uucun.spider.vertical.recognize.RoleTag;
import com.uucun.spider.vertical.util.LexUtils;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Hand-written lexer that splits an HTML/WML fragment into role-tagged tokens (numbers,
 * anchors, images, card titles, price/size keywords, measures and plain text).
 *
 * <p>{@link #parserToken()} consumes the whole input and fills {@link #tokenStack};
 * {@link #next()} then hands the tokens out one by one. Tokens are always stored in pairs:
 * the typed token at an even index, followed by a {@link RoleTag#CONTENT} token carrying the
 * same text at the next (odd) index.
 */
@SuppressWarnings("nls")
public class ArProductLexer extends Tokenizer implements ARLexer {

    /** Value returned by the character readers once the input is exhausted. */
    private static final int EOF = -1;

    protected ARToken token;
    private PushbackReader in;
    protected int line;
    protected int pos;
    protected boolean cr;
    private boolean eof;
    /** Characters read so far that have not yet been turned into a token. */
    protected final StringBuffer text = new StringBuffer();
    /** Token pairs produced by {@link #parserToken()}. */
    protected ArrayList<Token> tokenStack = new ArrayList<Token>();
    /** Current {@link DfaState} value of the lexer. */
    protected int stat = 0;
    /** Index of the token that {@link #next()} returns next. */
    private int tokenPos = 0;

    RoleTag roleTag;

    /**
     * @param in source of characters; the lexer un-reads several characters at a time, so the
     *           reader needs a push-back buffer larger than the default single character
     */
    public ArProductLexer(@SuppressWarnings("hiding") PushbackReader in) {
        this.in = in;
    }

    /*
     * (non-Javadoc)
     *
     * @see com.uucun.spider.vertical.recognize.ARLexer#getContent(int)
     */
    /**
     * Returns the text of the CONTENT token of the {@code i}-th token pair.
     *
     * @param i zero-based pair index
     * @return the content text, or {@code ""} when {@code i} is negative or out of range
     */
    public String getContent(int i) {
        if (i < 0) {
            return "";
        }
        int contentIdx = i * 2 + 1;
        if (contentIdx < tokenStack.size()) {
            return tokenStack.get(contentIdx).termText();
        }
        return "";
    }

    /*
     * (non-Javadoc)
     *
     * @see com.uucun.spider.vertical.recognize.ARLexer#next()
     */
    /**
     * Returns the next token collected by {@link #parserToken()}.
     *
     * @return the next token, or {@code null} when all tokens have been returned
     */
    public Token next() throws IOException {
        if (tokenPos < tokenStack.size()) {
            return tokenStack.get(tokenPos++);
        }
        return null;
    }

    /*
     * (non-Javadoc)
     *
     * @see com.uucun.spider.vertical.recognize.ARLexer#parserToken()
     */
    /**
     * Reads the whole input and fills {@link #tokenStack}. At end of input the mis-tagged
     * role tags are corrected and neighbouring TXT tokens are merged.
     *
     * @throws IOException if reading from the underlying reader fails
     */
    public void parserToken() throws IOException {
        this.text.setLength(0);
        while (true) {
            int c = getChar();
            if (c == EOF) {
                // End of input: fix up role tags, then merge adjacent text tokens.
                judgeFakeRoletag();
                combineTxt();
                return;
            }

            switch (c) {
            case 10:
                if (this.cr) {
                    this.cr = false;
                } else {
                    this.line++;
                    this.pos = 0;
                }
                break;
            case 13:
                this.line++;
                this.pos = 0;
                this.cr = true;
                break;
            default:
                this.cr = false;
                break;
            }
            this.text.append((char) c);

            // A tag is only recognised when '<' is the first pending character.
            if (this.text.toString().equals("<")) {
                processHtmltag();
                continue;
            }

            // Numbers.
            if (LexUtils.isNumber(this.text.toString())) {
                stat = processNum();
                continue;
            }

            // The character may be the first character of a role-tag word.
            String firstChar = "" + (char) c;
            if (RoleTagProduct.tagFirstCharSet.containsKey(firstChar)) {
                processRoletag(firstChar);
                continue;
            }

            // Plain text; only flushed when the previous construct has been completed.
            if (DfaState.END_ROLE_TAG == stat || DfaState.END_NUM == stat
                    || DfaState.END_TIME == stat || DfaState.END_TXT == stat
                    || DfaState.END_HREF == stat || DfaState.END_IMG == stat) {
                stat = processTxt(false);
            }
        }
    }

    /**
     * Appends a typed token followed by its CONTENT twin to {@link #tokenStack}.
     */
    private void addTokenPair(String termText, int start, int end, String type) {
        tokenStack.add(new Token(termText, start, end, type));
        tokenStack.add(new Token(termText, start, end, RoleTag.CONTENT));
    }

    /**
     * Adds a measure token, typed PRICE_PREFIX, PRICE_MEASURE or plain MEASURE depending on
     * which tag set of the configured {@link RoleTag} contains the text.
     *
     * @param newtokenStack list receiving the token
     * @param txt text of the measure
     */
    private void addMeasureToken(List<Token> newtokenStack, String txt) {
        if (getRoleTag().getTagSet().get(RoleTag.PRICE_PREFIX).contains(txt)) {
            newtokenStack.add(new Token(txt, 0, 0, RoleTag.PRICE_PREFIX));
        } else if (getRoleTag().getTagSet().get(RoleTag.PRICE_POSTFIX)
                .contains(txt)) {
            newtokenStack.add(new Token(txt, 0, 0, RoleTag.PRICE_MEASURE));
        } else {
            newtokenStack.add(new Token(txt, 0, 0, RoleTag.MEASURE));
        }
    }

    /**
     * Re-emits the token pair at {@code idx} either as a measure (when {@code neighbour} is a
     * NUM token) or as plain TXT (otherwise).
     */
    private void addMeasureOrText(List<Token> out, int idx, Token neighbour) {
        if (neighbour != null && RoleTag.NUM.equals(neighbour.type())) {
            String txt = tokenStack.get(idx + 1).termText();
            addMeasureToken(out, txt);
            out.add(new Token(txt, 0, 0, RoleTag.CONTENT));
        } else {
            // Not a real measure: downgrade to text.
            out.add(new Token(tokenStack.get(idx).termText(), 0, 0, RoleTag.TXT));
            out.add(new Token(tokenStack.get(idx + 1).termText(), 0, 0, RoleTag.CONTENT));
        }
    }

    /**
     * Corrects role tags that were assigned without looking at their neighbours, using the
     * ordering constraints between tags. The rebuilt tokens all carry offsets 0,0.
     *
     * @return always 0
     */
    private int judgeFakeRoletag() throws IOException {
        ArrayList<Token> corrected = new ArrayList<Token>();
        int idx = 0;
        while (idx < tokenStack.size()) {
            String type = tokenStack.get(idx).type();

            if (RoleTag.MEASURE_PREFIX.equals(type)) {
                // Guard the look-ahead: a measure prefix may be the very last token pair.
                boolean followedByPostfix = idx + 3 < tokenStack.size()
                        && RoleTag.PRICE_POSTFIX.equals(tokenStack.get(idx + 2).type());
                if (followedByPostfix) {
                    // MEASURE_PREFIX + PRICE_POSTFIX collapse into a single measure.
                    String txt = tokenStack.get(idx + 1).termText()
                            + tokenStack.get(idx + 3).termText();
                    addMeasureToken(corrected, txt);
                    corrected.add(new Token(txt, 0, 0, RoleTag.CONTENT));
                    idx += 4;
                } else {
                    // A lone prefix is a measure only when it follows a number.
                    addMeasureOrText(corrected, idx, getLastMeaningfulTokenIndex(idx));
                    idx += 2;
                }
            } else if (RoleTag.PRICE_PREFIX.equals(type)) {
                // A currency prefix must be followed by a number.
                addMeasureOrText(corrected, idx, getNextMeaningfulTokenIndex(idx));
                idx += 2;
            } else if (RoleTag.PRICE_POSTFIX.equals(type)) {
                // A currency postfix must follow a number.
                addMeasureOrText(corrected, idx, getLastMeaningfulTokenIndex(idx));
                idx += 2;
            } else {
                corrected.add(new Token(tokenStack.get(idx).termText(), 0, 0, type));
                corrected.add(new Token(tokenStack.get(idx + 1).termText(), 0, 0,
                        RoleTag.CONTENT));
                idx += 2;
            }
        }
        tokenStack = corrected;
        return 0;
    }

    /**
     * Merges every run of consecutive TXT token pairs into a single TXT/CONTENT pair.
     *
     * @return always 0
     */
    private int combineTxt() throws IOException {
        ArrayList<Token> merged = new ArrayList<Token>();
        int idx = 0;
        while (idx < tokenStack.size()) {
            if (!RoleTag.TXT.equals(tokenStack.get(idx).type())) {
                merged.add(tokenStack.get(idx));
                merged.add(tokenStack.get(idx + 1));
                idx = idx + 2;
                continue;
            }

            int start = tokenStack.get(idx).startOffset();
            StringBuffer sb = new StringBuffer();
            sb.append(tokenStack.get(idx).termText());
            idx = idx + 2;
            while (idx < tokenStack.size()
                    && RoleTag.TXT.equals(tokenStack.get(idx).type())) {
                sb.append(tokenStack.get(idx).termText());
                idx += 2;
            }
            int end = tokenStack.get(idx - 2).endOffset();
            merged.add(new Token(sb.toString(), start, end, RoleTag.TXT));
            merged.add(new Token(sb.toString(), start, end, RoleTag.CONTENT));
        }
        tokenStack = merged;
        return 0;
    }

    /** Reads up to {@code count} characters, stopping early at end of input. */
    private String readUpTo(int count) throws IOException {
        StringBuffer sb = new StringBuffer(count);
        for (int i = 0; i < count; i++) {
            int ch = getChar();
            if (ch == EOF) {
                break;
            }
            sb.append((char) ch);
        }
        return sb.toString();
    }

    /** Appends input to {@link #text} up to and including the next '>' (or end of input). */
    private void appendThroughTagEnd() throws IOException {
        int ch;
        do {
            ch = getChar();
            if (ch != EOF) {
                this.text.append((char) ch);
            }
        } while (ch != '>' && ch != EOF);
    }

    /** Discards input until {@code c} is '>' or the input ends; {@code c} is the last char read. */
    private void skipThroughTagEnd(int c) throws IOException {
        while (c != '>' && c != EOF) {
            c = getChar();
        }
    }

    /** Discards input up to and including the comment terminator (dash dash '>'), or to EOF. */
    private void skipComment() throws IOException {
        int dashes = 0;
        while (true) {
            int ch = getChar();
            if (ch == EOF) {
                return;
            }
            if (ch == '-') {
                dashes++;
            } else if (ch == '>' && dashes >= 2) {
                return;
            } else {
                dashes = 0;
            }
        }
    }

    /**
     * Discards input up to and including the closing tag of {@code name} (case-insensitive),
     * or to end of input. Characters that do not occur in the closing tag are ignored and do
     * not reset the match; characters that occur in it but arrive out of order do.
     *
     * @param name lower-case tag name whose letters are all distinct ("script" or "style")
     */
    private void skipPastClosingTag(String name) throws IOException {
        String closing = "</" + name + ">";
        int matched = 0;
        int ch;
        while ((ch = getChar()) != EOF) {
            if (ch >= 'A' && ch <= 'Z') {
                ch += 'a' - 'A';
            }
            if (ch == '<') {
                matched = 1;
            } else if (ch == closing.charAt(matched)) {
                matched++;
                if (matched == closing.length()) {
                    return;
                }
            } else if (closing.indexOf(ch) >= 0) {
                matched = 0;
            }
        }
    }

    /**
     * Handles the input following a '&lt;' that started a new token. Anchors, images and
     * {@code card} elements become HREF, IMG and TITLE tokens; closing tags become ENDTAG
     * tokens; comments, script/style elements and all other tags are skipped; a '&lt;' that
     * does not start a tag is emitted as text.
     *
     * <p>NOTE(review): any tag whose name starts with a/A is treated as an anchor, and
     * whitespace between '&lt;' and the next character is skipped before classification.
     *
     * @return the lexer state after the tag
     */
    private int processHtmltag() throws IOException {
        int curpos = this.pos;
        int c = getNotNullChar();

        if ('/' == c) {
            // Closing tag: consume it and record an empty ENDTAG token.
            skipThroughTagEnd(c);
            addTokenPair("", curpos - 1, this.pos, RoleTag.ENDTAG);
            this.text.setLength(0);
            return stat;
        }

        if ('!' == c) {
            String lookahead = readUpTo(2);
            if ("--".equals(lookahead)) {
                // Comment: skip it entirely, then flush the text that follows so that the
                // next '<' is again seen as the first pending character.
                skipComment();
                this.text.setLength(0);
                return processTxt(false);
            }
            // Not a comment (e.g. a declaration): handled by the non-tag branch below.
            pushBack(lookahead, 0);
        }

        if ('a' == c || 'A' == c) {
            stat = DfaState.BEGIN_HREF;
            this.text.append((char) c);
            appendThroughTagEnd();
            stat = DfaState.END_HREF;
            addTokenPair(this.text.toString(), curpos - 1, this.pos, RoleTag.HREF);
            this.text.setLength(0);
            return stat;
        }

        // 'i' may start <img ...> but also e.g. <input ...>.
        if ('i' == c || 'I' == c) {
            String rest = readUpTo(2);
            if (rest.equalsIgnoreCase("mg")) {
                stat = DfaState.BEGIN_IMG;
                this.text.append((char) c);
                this.text.append(rest);
                appendThroughTagEnd();
                stat = DfaState.END_IMG;
                addTokenPair(this.text.toString(), curpos - 1, this.pos, RoleTag.IMG);
            } else {
                // Give the look-ahead back first: it may already contain the closing '>'.
                pushBack(rest, 0);
                skipThroughTagEnd(c);
            }
            this.text.setLength(0);
            return stat;
        }

        // <card ...>: the whole tag, including its title attribute, becomes a TITLE token.
        if ('c' == c || 'C' == c) {
            String rest = readUpTo(3);
            if (rest.equalsIgnoreCase("ard")) {
                stat = DfaState.BEGIN_CARD;
                this.text.append((char) c);
                this.text.append(rest);
                appendThroughTagEnd();
                stat = DfaState.END_CARD;
                addTokenPair(this.text.toString(), curpos - 1, this.pos, RoleTag.TITLE);
                this.text.setLength(0);
                return stat;
            }
            // Some other tag starting with 'c': fall through to the generic handling.
            pushBack(rest, 0);
        }

        if (StringUtils.isAlpha("" + (char) c)) {
            // A tag we are not interested in (script, strong, ...): read its name ...
            StringBuffer tagBuf = new StringBuffer();
            tagBuf.append((char) c);
            while (true) {
                c = getChar();
                if (c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '>' || c == EOF) {
                    break;
                }
                tagBuf.append((char) c);
            }
            String tag = tagBuf.toString();

            // ... and skip it. The content of script and style elements is skipped as well.
            if ("script".equalsIgnoreCase(tag)) {
                skipPastClosingTag("script");
            } else if ("style".equalsIgnoreCase(tag)) {
                skipPastClosingTag("style");
            } else {
                skipThroughTagEnd(c);
            }
        } else {
            // Definitely not a tag, e.g. "a < 1": emit the '<' as a one-character text token.
            stat = DfaState.SURE_NOT_HTMLTAG;
            pushBack(c == EOF ? "<" : "<" + (char) c, 0);
            this.text.setLength(0);
            processTxt(true);
        }
        this.text.setLength(0);
        return stat;
    }

    /**
     * Reads the remaining digits and dots of a number whose first character is already in
     * {@link #text} and emits a NUM token pair.
     *
     * @return {@code DfaState.END_NUM}
     */
    private int processNum() throws IOException {
        stat = DfaState.BEGIN_NUM;
        int curpos = this.pos;
        while (true) {
            int tc = getChar();
            if (tc == EOF) {
                break;
            }
            String chr = "" + (char) tc;
            if (NumberUtils.isDigits(chr) || chr.equals(".")) {
                this.text.append(chr);
            } else {
                pushBack(chr, 0);
                break;
            }
        }
        addTokenPair(this.text.toString(), curpos - 1, this.pos, RoleTag.NUM);
        this.text.setLength(0);
        stat = DfaState.END_NUM;
        return stat;
    }

    private void error(String err) {
        System.out.println(err);
    }

    /**
     * Emits the pending text (if not blank) followed by the role-tag token pair for
     * {@code word}, and marks the role tag as found.
     */
    private void emitRoleTag(String formalTxt, String word, int curpos) {
        if (StringUtils.isNotBlank(formalTxt)) {
            addTokenPair(formalTxt, 0, 0, RoleTag.TXT);
        }
        Collection roleCollection = (Collection) getRoleTag()
                .getTagRoleMultiMap().get(word);
        String role = (String) roleCollection.iterator().next();
        addTokenPair(word, curpos - 1, this.pos, role);
        stat = DfaState.FOUND_ROLE_TAG;
    }

    /**
     * Tries to recognise a role-tag word starting with {@code firstChar}, which is the last
     * character of {@link #text}. Uses shortest match, so no two role-tag words may share a
     * prefix. When no word matches, everything read is pushed back and handled as text.
     *
     * @param firstChar the candidate first character of a role-tag word
     * @return {@code DfaState.END_ROLE_TAG}
     */
    private int processRoletag(String firstChar) throws IOException {
        // Text in front of the candidate, e.g. "as " in "as $".
        String formalTxt = this.text.toString().substring(0,
                this.text.length() - 1);
        int curpos = this.pos;
        stat = DfaState.END_ROLE_TAG;

        // Largest number of characters to look ahead for this first character.
        Set<Integer> skipSet = RoleTagProduct.maxTagLength.get(firstChar);
        int sz = 0;
        if (skipSet == null || skipSet.isEmpty()) {
            error("blank skipSet for:" + firstChar);
        } else {
            for (Integer len : skipSet) {
                if (len > sz) {
                    sz = len;
                }
            }
        }

        // The first character is a complete role-tag word on its own.
        if (0 == sz && getRoleTag().getTagRoleMultiMap().containsKey(firstChar)) {
            emitRoleTag(formalTxt, firstChar, curpos);
        }

        String curLine = firstChar;
        for (int szIdx = 1; szIdx <= sz; szIdx++) {
            int ch = getNotNoiseChar();
            if (ch == EOF) {
                break;
            }
            curLine += ((char) ch);
            // Only lengths registered for this first character can complete a word.
            if (skipSet.contains(szIdx)
                    && getRoleTag().getTagRoleMultiMap().containsKey(curLine)) {
                emitRoleTag(formalTxt, curLine, curpos);
                break;
            }
        }

        if (DfaState.FOUND_ROLE_TAG != stat) {
            // No role-tag word matched: give back what was read and treat it as text.
            stat = DfaState.NOT_FOUND_ROLE_TAG;
            pushBack(formalTxt + curLine, 0);
            this.text.setLength(0);
            processTxt(true);
        }
        this.text.setLength(0);
        stat = DfaState.END_ROLE_TAG;
        return stat;
    }

    /**
     * Reads plain text into {@link #text} and emits it as a TXT token pair unless it is blank.
     * Reading stops in front of a '&lt;', a number or a role-tag first character. Callers may
     * have placed the first character of the text into {@link #text} already.
     *
     * @param skipRoletagFirstChar {@code true} when called after a failed role-tag match (or
     *        for a '&lt;' that is not a tag): the first character read is then not treated as
     *        the start of a role tag
     * @return {@code DfaState.END_TXT}
     * @throws IOException if reading fails
     */
    private int processTxt(boolean skipRoletagFirstChar) throws IOException {
        int c;
        int curpos = this.pos;
        if (skipRoletagFirstChar || DfaState.NOT_FOUND_ROLE_TAG == stat) {
            curpos++;
        }
        while (true) {
            c = getChar();
            if (EOF == c) {
                break;
            }
            boolean mayNotTxt = (!skipRoletagFirstChar && RoleTagProduct.tagFirstCharSet
                    .containsKey("" + (char) c))
                    || c == '<' || LexUtils.isNumber(c);
            // Something other than text may start here: give it back and end this text.
            if (DfaState.SURE_NOT_HTMLTAG != stat && mayNotTxt) {
                pushBack("" + (char) c, 0);
                break;
            }
            skipRoletagFirstChar = false;
            this.text.append((char) c);
            // A character forced to be text forms a token of its own.
            if (DfaState.SURE_NOT_HTMLTAG == stat) {
                break;
            }
        }
        // Blank text is dropped.
        if (!StringUtils.isBlank(this.text.toString())) {
            addTokenPair(this.text.toString(), curpos - 1, this.pos, RoleTag.TXT);
        }
        this.text.setLength(0);
        stat = DfaState.END_TXT;
        return stat;
    }

    /** Reads the next non-whitespace character, or returns -1 at end of input. */
    private int readSkippingWhitespace() throws IOException {
        if (this.eof) {
            return EOF;
        }
        while (true) {
            int result = this.in.read();
            if (result == EOF) {
                this.eof = true;
                return result;
            }
            this.pos++;
            if (!StringUtils.isWhitespace("" + (char) result)) {
                return result;
            }
        }
    }

    /**
     * Reads the next non-whitespace character; used while reading role-tag words.
     *
     * @return the character, or -1 at end of input
     */
    protected int getNotNoiseChar() throws IOException {
        return readSkippingWhitespace();
    }

    /**
     * Reads the next non-whitespace character.
     *
     * @return the character, or -1 at end of input
     */
    protected int getNotNullChar() throws IOException {
        return readSkippingWhitespace();
    }

    /**
     * Reads the next character and advances {@link #pos}.
     *
     * @return the character, or -1 at end of input
     */
    protected int getChar() throws IOException {
        if (this.eof) {
            return EOF;
        }
        int result = this.in.read();
        if (result == EOF) {
            this.eof = true;
            return result;
        }
        this.pos++;
        return result;
    }

    /**
     * Un-reads the characters of {@code cur} from index {@code acceptLength} to the end, so
     * that they are read again in their original order, and moves {@link #pos} back.
     *
     * @param cur characters previously read
     * @param acceptLength number of leading characters of {@code cur} to keep consumed
     */
    protected void pushBack(String cur, int acceptLength) throws IOException {
        int length = cur.length();
        for (int i = length - 1; i >= acceptLength; i--) {
            this.eof = false;
            this.in.unread(cur.charAt(i));
            this.pos--;
        }
    }

    /**
     * Finds the closest token before {@code curIdx} that is neither a CONTENT token nor a
     * blank TXT token.
     *
     * @return that token, or {@code null} if there is none
     */
    protected Token getLastMeaningfulTokenIndex(int curIdx) {
        for (int i = curIdx - 1; i >= 0; i--) {
            if (tokenStack.get(i).type().equals(RoleTag.CONTENT)) {
                continue;
            }
            if (tokenStack.get(i).type().equals(RoleTag.TXT)) {
                if (StringUtils.isBlank(tokenStack.get(i).termText())) {
                    continue;
                }
            }
            return tokenStack.get(i);
        }
        return null;
    }

    /**
     * Finds the closest token after {@code curIdx} that is neither a CONTENT token nor a
     * blank TXT token.
     *
     * @return that token, or {@code null} if there is none
     */
    protected Token getNextMeaningfulTokenIndex(int curIdx) {
        for (int i = curIdx + 1; i < tokenStack.size(); i++) {
            if (tokenStack.get(i).type().equals(RoleTag.CONTENT)) {
                continue;
            }
            if (tokenStack.get(i).type().equals(RoleTag.TXT)) {
                if (StringUtils.isBlank(tokenStack.get(i).termText())) {
                    continue;
                }
            }
            return tokenStack.get(i);
        }
        return null;
    }

    public RoleTag getRoleTag() {
        return roleTag;
    }

    public void setRoleTag(RoleTag roleTag) {
        this.roleTag = roleTag;
    }

    /** Same as {@link #next()}. */
    public Token nextClean() throws IOException {
        return next();
    }
}
|
roki
2009-06-19
package com.rayeen.spider.vertical.recognize;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.collections.map.MultiValueMap;
import org.apache.commons.collections.MultiMap;

/**
 * Role tags (token types) assigned by the recognizing lexer, plus the lookup tables an
 * implementation has to provide.
 *
 * <p>The constants are used as token type strings. All fields of an interface are implicitly
 * {@code public static final}, so the modifiers are omitted.
 *
 * <p>A commented-out {@code initilize()} that used to live here filled the tag tables; the
 * live version of that code is {@code RoleTagProduct.initilize()}.
 */
public interface RoleTag {

    HashMap<String, Integer> RoleTagMap = new HashMap<String, Integer>();

    String TIME = "TIME";
    String DOWNLOAD = "DOWNLOAD";
    String NEXT = "NEXT";
    String NUM = "NUM";
    String HREF = "HREF";
    String IMG = "IMG";
    String TXT = "TXT";
    String TITLE = "TITLE";
    String SIZE = "SIZE";
    String PRICE = "PRICE";
    // NOTE(review): the value reads "ENGTAG", not "ENDTAG". It is what the lexer prints as
    // the token type, so it is left untouched; confirm before renaming.
    String ENDTAG = "ENGTAG";
    String CONTENT = "CONTENT";
    String TAG = "TAG";
    String MEASURE = "MEASURE";
    String PRICE_MEASURE = "PRICE_MEASURE";
    String PRICE_PREFIX = "PRICE_PREFIX";
    String MEASURE_PREFIX = "MEASURE_PREFIX";
    String MEASURE_POSTFIX = "MEASURE_POSTFIX";
    String PRICE_POSTFIX = "PRICE_POSTFIX";

    /** @return map from a role tag to the set of words registered for it */
    Map<String, Set<String>> getTagSet();

    /** @return multi-map from a word to the role tag(s) registered for it */
    MultiMap getTagRoleMultiMap();

    /** @return map from a first character to the role tags of the words starting with it */
    Map<String, Set<String>> getTagFirstCharSet();

    /**
     * Map from the first character of a role-tag word to a set of integers.
     * NOTE(review): in the implementation shown next to this interface the integers are the
     * numbers of look-ahead characters needed to complete a word; confirm for this field.
     */
    Map<String, Set<Integer>> maxTagLength = new TreeMap<String, Set<Integer>>();

    /*
     * Reduction rules noted by the original author:
     *   TXT TIME+     -> TIME
     *   URL DOWNLOAD  -> DOWNLOAD_URL
     *   URL NEXT      -> NEXT_URL
     *
     * Symbols: TXT TIME URL DOWNLOAD DOWNLOAD_URL NEXT NEXT_URL
     *
     * Example from the original notes: when the current character is "下" and the next
     * non-noise character is "载", a role tag is produced. Role tags may be represented
     * directly as strings, skipping a conversion to integers.
     */
}
|
roki
2009-06-19
package com.rayeen.spider.vertical.recognize.product; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import org.apache.commons.collections.map.MultiValueMap; import org.apache.commons.collections.MultiMap; import com.rayeen.spider.vertical.recognize.RoleTag; public class RoleTagProduct implements RoleTag{ //{'prefix'=>[$,¥。。。]} private final static Map<String, Set<String>> tagSet = new HashMap(); final static Map<String, Set<String>> tagFirstCharSet = new HashMap(); final static Map<String, Set<Integer>> maxTagLength = new TreeMap(); private final static MultiMap tagRoleMultiMap = new MultiValueMap(); public static Set<String> noisePriceSet = new HashSet(); static { noisePriceSet.add("5元"); noisePriceSet.add("5.0元"); noisePriceSet.add("5.00元"); noisePriceSet.add("8.0元"); noisePriceSet.add("10.0元"); noisePriceSet.add("12.0元"); noisePriceSet.add("15.0元"); noisePriceSet.add("25.0元"); initilize(); } /** * TXT TIME+ ->TIME 规约 URL DOWNLOAD ->DOWNLOAD_URL规约 URL NEXT ->NEXT_URL规约 * * * TXT TIME URL DOWNLOAD DOWNLOAD_URL NEXT NEXT_URL */ /** * 如果当前char是“下”,并且下第一个非噪音char是“载“的话,产生RoleTag=0 也可以直接用字符串表示,跳过整数转换 */ static public void initilize() { HashSet sizeSet = new HashSet(); sizeSet.add("大小"); tagSet.put(RoleTag.SIZE, sizeSet); tagRoleMultiMap.put("大小", RoleTag.SIZE); // begin 价格标记 HashSet priceSet = new HashSet(); priceSet.add("价"); tagSet.put(RoleTag.PRICE, priceSet); tagRoleMultiMap.put("价", RoleTag.PRICE); // end价格标记 HashSet measurePrefixSet = new HashSet(); measurePrefixSet.add("k"); measurePrefixSet.add("K"); measurePrefixSet.add("十"); measurePrefixSet.add("百"); measurePrefixSet.add("千"); measurePrefixSet.add("万"); measurePrefixSet.add("亿"); measurePrefixSet.add("兆"); tagSet.put(RoleTag.MEASURE_PREFIX, measurePrefixSet); tagRoleMultiMap.put("k",RoleTag. 
MEASURE_PREFIX); tagRoleMultiMap.put("K", RoleTag.MEASURE_PREFIX); tagRoleMultiMap.put("十", RoleTag.MEASURE_PREFIX); tagRoleMultiMap.put("百",RoleTag.MEASURE_PREFIX); tagRoleMultiMap.put("千",RoleTag. MEASURE_PREFIX); tagRoleMultiMap.put("万", RoleTag.MEASURE_PREFIX); tagRoleMultiMap.put("亿", RoleTag.MEASURE_PREFIX); tagRoleMultiMap.put("兆", RoleTag.MEASURE_PREFIX); // begin 后缀货币单位 HashSet pricePostfixSet = new HashSet(); pricePostfixSet.add("元"); pricePostfixSet.add("USD"); tagSet.put(RoleTag.PRICE_POSTFIX, pricePostfixSet); tagRoleMultiMap.put("元", RoleTag.PRICE_POSTFIX); tagRoleMultiMap.put("USD", RoleTag.PRICE_POSTFIX); // end 后缀货币单位 // begin 前缀货币单位 HashSet pricePrefixSet = new HashSet(); pricePrefixSet.add("¥"); pricePrefixSet.add("$"); pricePrefixSet.add("£"); tagSet.put(RoleTag.PRICE_PREFIX, pricePrefixSet); tagRoleMultiMap.put("¥", RoleTag.PRICE_PREFIX); tagRoleMultiMap.put("$", RoleTag.PRICE_PREFIX); tagRoleMultiMap.put("£",RoleTag.PRICE_PREFIX); // end 前缀货币单位 /** * 先对char1用 tagCharSet.get(char1)拿到DOWNLOAD和NEXT两个字符串 * 然后用tagSet.get(DOWNLOAD),和tagSet.get(NEXT) 当char1符合的时候,直接break出循环(), * 否则往前lookahead一个字符,看char1+char2是否在tagSet.get(DOWNLOAD)或tagSet.get(NEXT)中 * 否则,所以此次匹配失败 * * 以上是词法分析需要的数据 * * 以下是语法分析需要的数据 */ for (String tag : tagSet.keySet()) { Set<String> words = tagSet.get(tag); for (String wd : words) { if (!tagFirstCharSet.containsKey(wd)) { tagFirstCharSet.put(wd.substring(0, 1), new TreeSet()); maxTagLength.put(wd.substring(0, 1), new TreeSet()); } tagFirstCharSet.get(wd.substring(0, 1)).add(tag); } } // 以"大" 开头的tag的可能长度是2,因此只要lookahead 1个字符就可以了 maxTagLength.get("大").add(1); maxTagLength.get("k").add(0); maxTagLength.get("K").add(0); maxTagLength.get("十").add(0); maxTagLength.get("百").add(0); maxTagLength.get("千").add(0); maxTagLength.get("万").add(0); maxTagLength.get("亿").add(0); maxTagLength.get("兆").add(0); maxTagLength.get("价").add(0); maxTagLength.get("¥").add(0); maxTagLength.get("$").add(0); maxTagLength.get("£").add(0); 
maxTagLength.get("元").add(0); maxTagLength.get("U").add(2); } /** * @param args */ public static void main(String[] args) { } public Map<String, Set<String>> getTagSet() { return tagSet; } public MultiMap getTagRoleMultiMap() { return tagRoleMultiMap; } public Map<String, Set<String>> getTagFirstCharSet() { return tagFirstCharSet; } } |
|
roki
2009-06-19
只是一个Demo,手工写了词法识别,一定程度上能实现自然语义识别(角色标注),没做什么规则引擎, 现有的几个规则都是java代码实现的,现有规则包括:
识别数字、价格、URL地址、货币 没有深入做下去。 下面这段代码是lucene来索引原文本中的词语角色: package com.rayeen.spider.vertical.recognize; import java.io.IOException; import java.io.PushbackReader; import java.io.StringReader; import java.util.ArrayList; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.KeywordAnalyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.store.RAMDirectory; import com.rayeen.spider.vertical.recognize.product.ArProductLexer; import com.rayeen.spider.vertical.recognize.product.RoleTagProduct; public class TestIndex { /** * @param args */ public static void main(String[] args) { final RAMDirectory ramDirTag = new RAMDirectory(); final RAMDirectory ramDirContent = new RAMDirectory(); FSDirectory fsDir = null; try { fsDir = FSDirectory.getDirectory("tmp"); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } try { Analyzer analyzer = new KeywordAnalyzer(); IndexWriter writerTag = new IndexWriter(ramDirTag, analyzer); IndexWriter writerContent = new IndexWriter(ramDirContent, analyzer); String str = "1<img src=asdfdasf/><a href=1>end<a href=2>adf</a>sa文件fd下载a<a href=3>下面<a href=4>sf下页魅力11秒27分<img src=asdfdasf/>end"; Document docTag = new Document(); Document docContent = new Document(); // doc.add(new Field("tk.type()", "tk.termText()", // Field.Store.YES,Field.Index.UN_TOKENIZED, // Field.TermVector.WITH_POSITIONS_OFFSETS)); ArProductLexer lexer = new ArProductLexer(new PushbackReader(new StringReader(str), 1024)); RoleTagProduct roletag=new RoleTagProduct(); 
RoleTagProduct.initilize(); lexer.parserToken(); Token tk = lexer.next(); while (null != tk) { System.out.println(tk); if (tk.type().equals(RoleTag.CONTENT)) { docContent.add(new Field(RoleTag.CONTENT, tk.termText(), Field.Store.YES, Field.Index.UN_TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); } else { docTag.add(new Field(RoleTag.TAG, tk.type(), Field.Store.YES, Field.Index.UN_TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); } tk = lexer.next(); } writerTag.addDocument(docTag); writerContent.addDocument(docContent); // } // writer.optimize(); writerTag.close(); writerContent.close(); IndexReader readerTag = IndexReader.open(ramDirTag); IndexReader readerContent = IndexReader.open(ramDirContent); IndexSearcher searcherTag = new IndexSearcher(readerTag); IndexSearcher searcherContent = new IndexSearcher(readerContent); // readerTag.document(0); // Document d=searcherTag.doc(0); // 先到tag索引中去找tag位置 TermPositionVector tpv = (TermPositionVector) readerTag .getTermFreqVector(0, RoleTag.TAG); String[] terms = tpv.getTerms(); ArrayList<Integer> downloadPosSet = new ArrayList(); // 获得指定role出现的位置,这里的例子中download在第七位,然后直接到Lexer中的第七个位置向上下两边搜索 for (int i = 0; i < tpv.size(); i++) { if (terms[i].equals(RoleTag.DOWNLOAD)) { int[] posArray = tpv.getTermPositions(i); for (int p : posArray) { downloadPosSet.add(p); } break; } } ArrayList<Integer> urlPosSet = new ArrayList(); for (int i = 0; i < tpv.size(); i++) { if (terms[i].equals(RoleTag.HREF)) { int[] posArray = tpv.getTermPositions(i); for (int p : posArray) { urlPosSet.add(p); } break; } } for (int downloadPos : downloadPosSet) { int leftNear = -1, rightNear = Integer.MAX_VALUE; ArrayList<String> urls = new ArrayList(); for (Integer i : urlPosSet) { if (i < downloadPos) { if (leftNear < i) { leftNear = i; } } if (i > downloadPos) { if (rightNear > i) { rightNear = i; } } } if (leftNear > 0) { urls.add(lexer.getContent(leftNear)); } if (rightNear < Integer.MAX_VALUE) { urls.add(lexer.getContent(rightNear)); } // 
找到离“下载”两个字最近的url!!!!! // 类似,可以找到离“时间”RoleTag最近的Number System.out.println(urls); } } catch (CorruptIndexException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (LockObtainFailedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } |
|
roki
2009-06-19
输出结果:
<a href=3> 也就是离“下载”这个语素最近的一个URL语素 |
|
roki
2009-06-19
因为已经对原文中的词语以及它的角色做了索引, 需要自动识别各个词素的时候,就简单了,比如, 需要提取出原文中靠近 “下载”这个词的一个URL,作为下载地址用,那么,
只要先读出“下载”的位置,然后向后或者向前搜索最靠近的一个URL Token,就解决问题了。 这段代码里充分使用了lucene做索引的时候写入的 termVector信息(而且做索引的时候还必须是 Field.TermVector.WITH_POSITIONS_OFFSETS 的,这样索引里的termVector才会包含Token的位置偏移量)。 这个算法涉及到lucene索引的底层结构了, 可以算是一个对lucene比较特别的应用 |
|
wjm0729
2010-06-20
用正则不是更简单快捷吗?
|