《搜索引擎零距离》IRVM的语义识别demo代码

roki 2009-06-19
执行入口程序
package com.rayeen.spider.vertical.recognize;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;

import org.apache.lucene.analysis.Token;

import com.uucun.spider.vertical.recognize.product.ArProductLexer;
import com.uucun.spider.vertical.recognize.product.RoleTagProduct;

/**
 * Command-line demo driver: runs the product lexer over either a file given
 * as {@code args[0]} or a built-in HTML sample, then prints every token.
 */
public class TestLexer {

	/** Charset used to decode the optional input file. */
	private static final String FILE_ENCODING = "GBK";

	/**
	 * Pushback capacity. The lexer unreads several characters at once (look-ahead
	 * that did not match), so the default capacity of one character is too small.
	 */
	private static final int PUSHBACK_SIZE = 1024;

	/**
	 * @param args optional; {@code args[0]} is the path of a file to tokenize
	 * @throws ARLexerException declared for the lexer API
	 * @throws IOException if the input cannot be opened or read
	 */
	public static void main(String[] args) throws ARLexerException, IOException {
		
		//String str="1<img src=asdfdasf/>end<a href=asd>adf</a>sa文件fd下载a下面sf下页魅力11秒27分<img src=asdfdasf/>end";
		String str="当 价: <input name=head_biao2  " +
				"报价:10元  价格<em>56.00</em>元 as ¥500  fff  $444 kkk  £666 <script>sadfsaf safdasfd<dsa>script></script>"+
			
			
			"  <li > 价格:<em>56.00</em>元</li> asdf价格<b>1000元</b><!--"+
 "i < currentPostageItems.length//-->  报价<b>1100元</b> 11 k  kiss base<card title=\"打开文章\" id=\"hucn\">"+
"<p><< <a href=\"downview.asp?ID=138870&add=ok&page=1&url=&gg=2136347\">>></a><br/>"+
"一眼万年<br/>----------"+
"<br/><a href=\"http://wap.hucn.net/picc/79/261440454.wav\">免费下载</a>"+
"<br/>大小:510KB  kB Kb kb 11 k 23 K 22b 23 b"+
"<br/>添加时间:9.26 14:41<br/><a href=\"revewlist.asp?ID=138871&TP=2&page=1&url=&gg=2136347\">发表评论 ";
		
		PushbackReader reader;
		if (args != null && args.length > 0) {
			// Open failures propagate as IOException: continuing without a
			// lexer would only fail later with a NullPointerException.
			InputStreamReader isr = new InputStreamReader(new FileInputStream(args[0]), FILE_ENCODING);
			reader = new PushbackReader(new BufferedReader(isr), PUSHBACK_SIZE);
		} else {
			reader = new PushbackReader(new StringReader(str), PUSHBACK_SIZE);
		}

		try {
			ARLexer l = new ArProductLexer(reader);

			RoleTagProduct roletag = new RoleTagProduct();
			RoleTagProduct.initilize();

			l.setRoleTag(roletag);
			l.parserToken();

			Token tk = l.next();
			while (null != tk) {
				System.out.println(tk);
				tk = l.next();
			}
		} finally {
			// Releases the underlying file handle when reading from a file.
			reader.close();
		}
	}
}

roki 2009-06-19
输出结果:

(当 ,0,0,type=TXT)
(当 ,0,0,type=CONTENT)
(价,0,0,type=PRICE)
(价,0,0,type=CONTENT)
(: ,0,0,type=TXT)
(: ,0,0,type=CONTENT)
(56.00,0,0,type=NUM)
(56.00,0,0,type=CONTENT)
(,0,0,type=ENGTAG)
(,0,0,type=CONTENT)
(元 as ,0,0,type=TXT)
(元 as ,0,0,type=CONTENT)
(¥,0,0,type=PRICE_PREFIX)
(¥,0,0,type=CONTENT)
(500,0,0,type=NUM)
(500,0,0,type=CONTENT)
(  fff  ,0,0,type=TXT)
(  fff  ,0,0,type=CONTENT)
($,0,0,type=PRICE_PREFIX)
($,0,0,type=CONTENT)
(444,0,0,type=NUM)
(444,0,0,type=CONTENT)
(k,0,0,type=MEASURE)
(k,0,0,type=CONTENT)
(kk,0,0,type=TXT)
(kk,0,0,type=CONTENT)
(£,0,0,type=PRICE_PREFIX)
(£,0,0,type=CONTENT)
(666,0,0,type=NUM)
(666,0,0,type=CONTENT)
(价,0,0,type=PRICE)
(价,0,0,type=CONTENT)
(格:,0,0,type=TXT)
(格:,0,0,type=CONTENT)
(56.00,0,0,type=NUM)
(56.00,0,0,type=CONTENT)
(,0,0,type=ENGTAG)
(,0,0,type=CONTENT)
(元,0,0,type=TXT)
(元,0,0,type=CONTENT)
(,0,0,type=ENGTAG)
(,0,0,type=CONTENT)
( asdf,0,0,type=TXT)
( asdf,0,0,type=CONTENT)
(价,0,0,type=PRICE)
(价,0,0,type=CONTENT)
(格,0,0,type=TXT)
(格,0,0,type=CONTENT)
(1000,0,0,type=NUM)
(1000,0,0,type=CONTENT)
(元,0,0,type=PRICE_MEASURE)
(元,0,0,type=CONTENT)
(,0,0,type=ENGTAG)
(,0,0,type=CONTENT)
(  报<-,0,0,type=TXT)
(  报<-,0,0,type=CONTENT)
(价,0,0,type=PRICE)
(价,0,0,type=CONTENT)
(1100,0,0,type=NUM)
(1100,0,0,type=CONTENT)
(元,0,0,type=PRICE_MEASURE)
(元,0,0,type=CONTENT)
(,0,0,type=ENGTAG)
(,0,0,type=CONTENT)
(11,0,0,type=NUM)
(11,0,0,type=CONTENT)
(k,0,0,type=MEASURE)
(k,0,0,type=CONTENT)
(kiss base,0,0,type=TXT)
(kiss base,0,0,type=CONTENT)
(<card title="打开文章" id="hucn">,0,0,type=TITLE)
(<card title="打开文章" id="hucn">,0,0,type=CONTENT)
(<<,0,0,type=TXT)
(<<,0,0,type=CONTENT)
(<a href="downview.asp?ID=138870&add=ok&page=1&url=&gg=2136347">,0,0,type=HREF)
(<a href="downview.asp?ID=138870&add=ok&page=1&url=&gg=2136347">,0,0,type=CONTENT)
(>>,0,0,type=TXT)
(>>,0,0,type=CONTENT)
(,0,0,type=ENGTAG)
(,0,0,type=CONTENT)
(一眼万年----------,0,0,type=TXT)
(一眼万年----------,0,0,type=CONTENT)
(<a href="http://wap.hucn.net/picc/79/261440454.wav">,0,0,type=HREF)
(<a href="http://wap.hucn.net/picc/79/261440454.wav">,0,0,type=CONTENT)
(免费下载,0,0,type=TXT)
(免费下载,0,0,type=CONTENT)
(,0,0,type=ENGTAG)
(,0,0,type=CONTENT)
(大小,0,0,type=SIZE)
(大小,0,0,type=CONTENT)
(:,0,0,type=TXT)
(:,0,0,type=CONTENT)
(510,0,0,type=NUM)
(510,0,0,type=CONTENT)
(K,0,0,type=MEASURE)
(K,0,0,type=CONTENT)
(B  kB Kb kb ,0,0,type=TXT)
(B  kB Kb kb ,0,0,type=CONTENT)
(11,0,0,type=NUM)
(11,0,0,type=CONTENT)
(k,0,0,type=MEASURE)
(k,0,0,type=CONTENT)
(23,0,0,type=NUM)
(23,0,0,type=CONTENT)
(K,0,0,type=MEASURE)
(K,0,0,type=CONTENT)
(22,0,0,type=NUM)
(22,0,0,type=CONTENT)
(b ,0,0,type=TXT)
(b ,0,0,type=CONTENT)
(23,0,0,type=NUM)
(23,0,0,type=CONTENT)
( b添加时间:,0,0,type=TXT)
( b添加时间:,0,0,type=CONTENT)
(9.26,0,0,type=NUM)
(9.26,0,0,type=CONTENT)
(14,0,0,type=NUM)
(14,0,0,type=CONTENT)
(:,0,0,type=TXT)
(:,0,0,type=CONTENT)
(41,0,0,type=NUM)
(41,0,0,type=CONTENT)
(<a href="revewlist.asp?ID=138871&TP=2&page=1&url=&gg=2136347">,0,0,type=HREF)
(<a href="revewlist.asp?ID=138871&TP=2&page=1&url=&gg=2136347">,0,0,type=CONTENT)
(发表评论 ,0,0,type=TXT)
(发表评论 ,0,0,type=CONTENT)
roki 2009-06-19
/* This file was generated by SableCC (http://www.sablecc.org/). */

package com.rayeen.spider.vertical.recognize.product;

import java.io.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;

import com.uucun.spider.vertical.recognize.ARLexer;
import com.uucun.spider.vertical.recognize.ARToken;
import com.uucun.spider.vertical.recognize.DfaState;
import com.uucun.spider.vertical.recognize.RoleTag;
import com.uucun.spider.vertical.util.LexUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

@SuppressWarnings("nls")
public class ArProductLexer extends Tokenizer implements ARLexer {
	protected ARToken token;

	private PushbackReader in;

	protected int line;

	protected int pos;

	protected boolean cr;

	private boolean eof;

	protected final StringBuffer text = new StringBuffer();

	protected ArrayList<Token> tokenStack = new ArrayList();

	// private char[] content;

	protected int stat = 0;

	/**
	 * Creates a lexer reading from the given pushback reader.
	 * <p>
	 * The reader is also handed to the {@code Tokenizer} superclass so that the
	 * inherited {@code input} field is set rather than left {@code null}.
	 * The reader's pushback capacity must be large enough for the longest
	 * look-ahead this lexer unreads in one call.
	 *
	 * @param in source of characters to tokenize
	 */
	public ArProductLexer(@SuppressWarnings("hiding")
	PushbackReader in) {
		super(in);
		this.in = in;
	}

	private int arTokenPos = 0;

	private int tokenPos = 0;

	/**
	 * Returns the term text of the CONTENT token stored for the {@code i}-th
	 * token pair (slot {@code 2 * i + 1} of the token list).
	 *
	 * @param i zero-based pair index
	 * @return the stored text, or the empty string when {@code i} is negative
	 *         or past the end of the list
	 * @see com.uucun.spider.vertical.recognize.ARLexer#getContent(int)
	 */
	public String getContent(int i) {
		int contentSlot = i * 2 + 1;
		boolean available = i >= 0 && contentSlot < tokenStack.size();
		return available ? tokenStack.get(contentSlot).termText() : "";
	}

	/**
	 * Returns the next token from the already-parsed token list and advances
	 * the read cursor.
	 *
	 * @return the next token, or {@code null} once the list is exhausted
	 * @see com.uucun.spider.vertical.recognize.ARLexer#next()
	 */
	public Token next() throws IOException {
		if (tokenPos >= tokenStack.size()) {
			return null;
		}
		Token current = tokenStack.get(tokenPos);
		tokenPos++;
		return current;
	}

	/**
	 * Reads the whole input and fills {@code tokenStack}.
	 * <p>
	 * Each character read is appended to the {@code text} buffer and then
	 * dispatched, in this order: a buffer that is exactly "&lt;" goes to
	 * {@link #processHtmltag()}; a buffer that {@code LexUtils.isNumber}
	 * accepts goes to {@link #processNum()}; a character that is a key of
	 * {@code RoleTagProduct.tagFirstCharSet} goes to
	 * {@link #processRoletag(String)}; otherwise, if the previous state was one
	 * of the listed END_* states, the character starts a text run handled by
	 * {@link #processTxt(boolean)}. In any other state the character simply
	 * stays in the buffer. At end of input, {@link #judgeFakeRoletag()} and
	 * {@link #combineTxt()} post-process the token list.
	 *
	 * @throws IOException if reading from the underlying reader fails
	 * @see com.uucun.spider.vertical.recognize.ARLexer#parserToken()
	 */
	public void parserToken() throws IOException {

		this.text.setLength(0);
		while (true) {
			// boolean shouldLoop = false;

			int c = getChar();

			if (c != -1) {
				// Line bookkeeping: LF (10) directly after CR (13) does not
				// count as a second line break.
				switch (c) {
				case 10:
					if (this.cr) {
						this.cr = false;
					} else {
						this.line++;
						this.pos = 0;
					}
					break;
				case 13:
					this.line++;
					this.pos = 0;
					this.cr = true;
					break;
				default:
					// this.pos++;
					this.cr = false;
					break;
				}

				this.text.append((char) c);

				// if (DfaState.BEGIN == stat || DfaState.END_TXT == stat
				// || DfaState.END_ROLE_TAG == stat
				// || DfaState.END_NUM == stat || DfaState.END_IMG == stat
				// || DfaState.END_TAG == stat) {
				// Only a buffer holding nothing but '<' starts tag handling.
				if (this.text.toString().equals("<")) {
					processHtmltag();
					continue;// done with this iteration
				}

				// }

				// Numbers
				if (LexUtils.isNumber(this.text.toString())) {
					stat = processNum();
					continue;// done with this iteration
				}

				// The character may be the first character of a role tag
				String cur = this.text.toString();
				String firstChar = "" + (char) c;
				if (RoleTagProduct.tagFirstCharSet.containsKey(firstChar)) {
					processRoletag(firstChar);
					continue;// done with this iteration
				}

				// Plain text: only started right after one of these end states
				if (DfaState.END_ROLE_TAG == stat || DfaState.END_NUM == stat
						|| DfaState.END_TIME == stat
						|| DfaState.END_TXT == stat
						|| DfaState.END_HREF == stat
						|| DfaState.END_IMG == stat) {
					stat = processTxt(false);
				}

			} else {
				// End of input: fix mis-tagged role tags, then merge adjacent TXT tokens
				judgeFakeRoletag();
				combineTxt();
				return;
			}
		}

	}

	/**
	 * Appends one token for a confirmed measure word, choosing its type from
	 * the configured tag sets: PRICE_PREFIX when the text is in the
	 * PRICE_PREFIX set, PRICE_MEASURE when it is in the PRICE_POSTFIX set,
	 * and plain MEASURE otherwise.
	 * 
	 * @param newtokenStack list the token is appended to
	 * @param txt           term text of the measure word
	 */
	private void addMeasureToken(List<Token> newtokenStack, String txt) {
		String role;
		if (getRoleTag().getTagSet().get(RoleTag.PRICE_PREFIX).contains(txt)) {
			role = RoleTag.PRICE_PREFIX;
		} else if (getRoleTag().getTagSet().get(RoleTag.PRICE_POSTFIX)
				.contains(txt)) {
			role = RoleTag.PRICE_MEASURE;
		} else {
			role = RoleTag.MEASURE;
		}
		newtokenStack.add(new Token(txt, 0, 0, role));
	}

	/**
	 * Second pass over the token list that corrects provisional role tags
	 * using their neighbours, rebuilding {@code tokenStack} with zero offsets.
	 * The list is processed as (tag, CONTENT) pairs:
	 * <ul>
	 * <li>MEASURE_PREFIX immediately followed by a PRICE_POSTFIX pair: both
	 * pairs are merged into one measure token whose text is the
	 * concatenation of the two.</li>
	 * <li>MEASURE_PREFIX or PRICE_POSTFIX: kept as a measure only when the
	 * previous meaningful token is a NUM, otherwise downgraded to TXT.</li>
	 * <li>PRICE_PREFIX: kept only when the next meaningful token is a NUM,
	 * otherwise downgraded to TXT.</li>
	 * <li>Everything else is copied with its type unchanged.</li>
	 * </ul>
	 * Types are compared with {@code equals}, and the look-ahead for the merge
	 * case is bounds-checked so a MEASURE_PREFIX in the last pair no longer
	 * throws.
	 *
	 * @return always 0
	 */
	private int judgeFakeRoletag() throws IOException {

		ArrayList<Token> newtokenStack = new ArrayList<Token>();
		int idx = 0;
		while (idx < tokenStack.size()) {
			Token candidate = tokenStack.get(idx);
			String type = candidate.type();
			String content = tokenStack.get(idx + 1).termText();

			boolean measurePrefix = RoleTag.MEASURE_PREFIX.equals(type);

			// Merge "prefix + price postfix" into a single measure; this
			// consumes two pairs (four list slots).
			if (measurePrefix
					&& idx + 2 < tokenStack.size()
					&& RoleTag.PRICE_POSTFIX.equals(tokenStack.get(idx + 2)
							.type())) {
				String txt = content + tokenStack.get(idx + 3).termText();
				addMeasureToken(newtokenStack, txt);
				newtokenStack.add(new Token(txt, 0, 0, RoleTag.CONTENT));
				idx += 4;
				continue;
			}

			boolean needsNumBefore = measurePrefix
					|| RoleTag.PRICE_POSTFIX.equals(type);
			boolean needsNumAfter = RoleTag.PRICE_PREFIX.equals(type);

			if (needsNumBefore || needsNumAfter) {
				Token neighbour = needsNumAfter ? getNextMeaningfulTokenIndex(idx)
						: getLastMeaningfulTokenIndex(idx);
				if (neighbour != null && RoleTag.NUM.equals(neighbour.type())) {
					addMeasureToken(newtokenStack, content);
				} else {
					// Not a real measure: downgrade to plain text.
					newtokenStack.add(new Token(candidate.termText(), 0, 0,
							RoleTag.TXT));
				}
			} else {
				newtokenStack.add(new Token(candidate.termText(), 0, 0, type));
			}
			newtokenStack.add(new Token(content, 0, 0, RoleTag.CONTENT));
			idx += 2;// advance one (tag, CONTENT) pair
		}

		tokenStack = newtokenStack;
		return 0;
	}

	/**
	 * Merges every run of consecutive TXT pairs in {@code tokenStack} into a
	 * single (TXT, CONTENT) pair whose text is the concatenation of the run;
	 * the merged pair spans from the first token's start offset to the last
	 * token's end offset. Non-TXT pairs are copied unchanged.
	 * Types are compared with {@code equals} rather than by reference.
	 *
	 * @return always 0
	 */
	private int combineTxt() throws IOException {
		ArrayList<Token> newtokenStack = new ArrayList<Token>();
		int idx = 0;
		while (idx < tokenStack.size()) {
			Token head = tokenStack.get(idx);
			if (!RoleTag.TXT.equals(head.type())) {
				newtokenStack.add(head);
				newtokenStack.add(tokenStack.get(idx + 1));
				idx += 2;
				continue;
			}

			int start = head.startOffset();
			StringBuilder merged = new StringBuilder(head.termText());
			idx += 2;
			while (idx < tokenStack.size()
					&& RoleTag.TXT.equals(tokenStack.get(idx).type())) {
				merged.append(tokenStack.get(idx).termText());
				idx += 2;
			}
			// idx now points one pair past the run; idx - 2 is its last TXT token.
			int end = tokenStack.get(idx - 2).endOffset();

			newtokenStack.add(new Token(merged.toString(), start, end,
					RoleTag.TXT));
			newtokenStack.add(new Token(merged.toString(), start, end,
					RoleTag.CONTENT));
		}

		tokenStack = newtokenStack;
		return 0;
	}

	/**
	 * Handles input following a '&lt;' that {@code parserToken} has just put
	 * into the {@code text} buffer. Leading whitespace after the '&lt;' is
	 * skipped, then the first character decides the handling:
	 * <ul>
	 * <li>{@code /} - closing tag: skipped, an empty ENDTAG pair is added.</li>
	 * <li>{@code !--} - comment: skipped up to {@code -->}, then the following
	 * text run is read with {@code processTxt(false)}.</li>
	 * <li>{@code a}/{@code A} - the whole tag becomes an HREF pair.</li>
	 * <li>{@code img} - the whole tag becomes an IMG pair.</li>
	 * <li>{@code card} - the whole tag becomes a TITLE pair.</li>
	 * <li>any other letter - the tag is skipped; for {@code script} and
	 * {@code style} everything up to the matching closing tag is skipped.</li>
	 * <li>anything else - the '&lt;' is not a tag (as in {@code a < b}); it is
	 * pushed back and emitted as text.</li>
	 * </ul>
	 * Changes from the previous version: every skip loop stops at end of
	 * input; a finished comment returns instead of falling through and
	 * re-injecting "&lt;-" as text; comments containing a lone '-' are skipped
	 * correctly; {@code i...}/{@code c...} tags that are not img/card push
	 * their look-ahead back and are treated as ordinary tags; upper-case
	 * {@code CARD}, {@code SCRIPT} and {@code STYLE} are recognised; the
	 * closing-tag match for script/style is exact (optional whitespace before
	 * '&gt;'); any whitespace ends a tag name; and the EOF marker is never
	 * pushed back as a character.
	 *
	 * @return the lexer state after handling the tag
	 * @throws IOException if reading or unreading fails
	 */
	private int processHtmltag() throws IOException {
		int curpos = this.pos;

		int c = getNotNullChar();

		if ('/' == c) {
			skipToTagEnd(c);
			addTagTokenPair("", curpos - 1, RoleTag.ENDTAG);
			this.text.setLength(0);
			return stat;
		}

		if ('!' == c) {
			String opener = readAhead(2);
			if ("--".equals(opener)) {
				skipPastCommentEnd();
				this.text.setLength(0);
				return processTxt(false);
			}
			// Not a comment (e.g. a doctype): restore the look-ahead; the
			// "<!" is then handled as text by the final branch below.
			pushBack(opener, 0);
		}

		if ('a' == c || 'A' == c) {
			stat = DfaState.BEGIN_HREF;
			this.text.append((char) c);
			appendThroughTagEnd();
			stat = DfaState.END_HREF;
			addTagTokenPair(this.text.toString(), curpos - 1, RoleTag.HREF);
			this.text.setLength(0);
			return stat;
		}

		if ('i' == c || 'I' == c) {
			String ahead = readAhead(2);
			if (ahead.equalsIgnoreCase("mg")) {
				stat = DfaState.BEGIN_IMG;
				this.text.append((char) c);
				this.text.append(ahead);
				appendThroughTagEnd();
				stat = DfaState.END_IMG;
				addTagTokenPair(this.text.toString(), curpos - 1, RoleTag.IMG);
				this.text.setLength(0);
				return stat;
			}
			// e.g. <input ...> or <i>: an ordinary tag, handled below.
			pushBack(ahead, 0);
		} else if ('c' == c || 'C' == c) {
			String ahead = readAhead(3);
			if (ahead.equalsIgnoreCase("ard")) {
				stat = DfaState.BEGIN_CARD;
				this.text.append((char) c);
				this.text.append(ahead);
				appendThroughTagEnd();
				stat = DfaState.END_CARD;
				addTagTokenPair(this.text.toString(), curpos - 1, RoleTag.TITLE);
				this.text.setLength(0);
				return stat;
			}
			pushBack(ahead, 0);
		}

		if (StringUtils.isAlpha("" + (char) c)) {
			// A tag we do not emit (script, strong, ...): read its name,
			// then discard the tag.
			StringBuffer tagBuf = new StringBuffer();
			tagBuf.append((char) c);
			c = getChar();
			while (c != -1 && c != '>' && !Character.isWhitespace((char) c)) {
				tagBuf.append((char) c);
				c = getChar();
			}
			String tag = tagBuf.toString();

			if (tag.equalsIgnoreCase("script")) {
				skipRawTextElement("script");
			} else if (tag.equalsIgnoreCase("style")) {
				skipRawTextElement("style");
			} else {
				skipToTagEnd(c);
			}
		} else {
			// Definitely not a tag: emit the '<' as text.
			stat = DfaState.SURE_NOT_HTMLTAG;
			pushBack(c == -1 ? "<" : "<" + (char) c, 0);
			this.text.setLength(0);
			processTxt(true);
		}

		this.text.setLength(0);
		return stat;

	}

	// Appends a (type, CONTENT) token pair spanning from start to the current position.
	private void addTagTokenPair(String termText, int start, String type) {
		tokenStack.add(new Token(termText, start, this.pos, type));
		tokenStack.add(new Token(termText, start, this.pos, RoleTag.CONTENT));
	}

	// Reads up to count characters, stopping early at end of input.
	private String readAhead(int count) throws IOException {
		StringBuffer buf = new StringBuffer(count);
		for (int i = 0; i < count; i++) {
			int ch = getChar();
			if (ch == -1) {
				break;
			}
			buf.append((char) ch);
		}
		return buf.toString();
	}

	// Discards input until '>' has been consumed or input ends; c is the last character already read.
	private void skipToTagEnd(int c) throws IOException {
		while (c != '>' && c != -1) {
			c = getChar();
		}
	}

	// Appends input to the text buffer up to and including the next '>', or until input ends.
	private void appendThroughTagEnd() throws IOException {
		int ch;
		do {
			ch = getChar();
			if (ch == -1) {
				break;
			}
			this.text.append((char) ch);
		} while (ch != '>');
	}

	// Discards input through the comment terminator "-->" or until input ends.
	private void skipPastCommentEnd() throws IOException {
		int dashes = 0;
		int ch;
		while ((ch = getChar()) != -1) {
			if (ch == '-') {
				dashes++;
			} else if (ch == '>' && dashes >= 2) {
				return;
			} else {
				dashes = 0;
			}
		}
	}

	// Discards input through the closing tag of a raw-text element (lower-case name), case-insensitively, or until input ends.
	private void skipRawTextElement(String name) throws IOException {
		String closer = "</" + name;
		int matched = 0;
		int ch;
		while ((ch = getChar()) != -1) {
			if (matched == closer.length()) {
				if (ch == '>') {
					return;
				}
				if (Character.isWhitespace((char) ch)) {
					continue;
				}
				matched = 0;// e.g. "</scriptx": not the closing tag
			}
			if (Character.toLowerCase((char) ch) == closer.charAt(matched)) {
				matched++;
			} else {
				matched = (ch == '<') ? 1 : 0;
			}
		}
	}

	/**
	 * Extends the number already in the {@code text} buffer with following
	 * digits and '.' characters, then adds a (NUM, CONTENT) pair and clears
	 * the buffer. The first character that does not belong to the number is
	 * pushed back. At end of input nothing is pushed back: the EOF marker is
	 * not a character, and unreading it would clear the eof flag and feed a
	 * bogus U+FFFF character to the caller.
	 *
	 * @return {@code DfaState.END_NUM}
	 * @throws IOException if reading or unreading fails
	 */
	private int processNum() throws IOException {
		stat = DfaState.BEGIN_NUM;
		int curpos = this.pos;

		int tc = getChar();
		while (tc != -1) {
			String chr = "" + (char) tc;
			if (!NumberUtils.isDigits(chr) && !chr.equals(".")) {
				pushBack(chr, 0);
				break;
			}
			this.text.append(chr);
			tc = getChar();
		}

		tokenStack.add(new Token(this.text.toString(), curpos - 1, this.pos,
				RoleTag.NUM));
		tokenStack.add(new Token(this.text.toString(), curpos - 1, this.pos,
				RoleTag.CONTENT));
		this.text.setLength(0);

		stat = DfaState.END_NUM;
		return stat;
	}

	/**
	 * Reports a lexer problem by printing the message to standard output.
	 *
	 * @param err message to print
	 */
	private void error(String err) {
		final String message = err;
		System.out.println(message);
	}

	/**
	 * Tries to recognise a role-tag word starting at {@code firstChar}, which
	 * is the last character of the {@code text} buffer.
	 * <p>
	 * Shortest match wins, so the configuration must not contain two role tags
	 * where one is a prefix of the other (such as a one-character tag and a
	 * longer tag starting with the same character).
	 * <p>
	 * {@code RoleTagProduct.maxTagLength} gives, per first character, the
	 * look-ahead lengths to test; the largest decides how many non-whitespace
	 * characters are read. On a match, any non-blank text that preceded the
	 * tag in the buffer is emitted as a TXT pair, followed by the role pair.
	 * Otherwise everything is pushed back and re-read as plain text.
	 * <p>
	 * Changes from the previous version: a missing or empty length set is
	 * reported and treated as "no look-ahead" instead of crashing, and
	 * look-ahead stops at end of input instead of appending the EOF marker.
	 * NOTE(review): whitespace skipped during look-ahead is not restored on
	 * push-back.
	 *
	 * @param firstChar the single character that may start a role tag
	 * @return {@code DfaState.END_ROLE_TAG}
	 * @throws IOException if reading or unreading fails
	 */
	private int processRoletag(String firstChar) throws IOException {

		// Text that was buffered before the candidate, e.g. "as " in "as $".
		String formalTxt = this.text.toString().substring(0,
				this.text.length() - 1);

		int curpos = this.pos;

		stat = DfaState.END_ROLE_TAG;

		Set<Integer> skipSet = RoleTagProduct.maxTagLength.get(firstChar);
		int sz = 0;
		if (skipSet == null || skipSet.isEmpty()) {
			error("blank skipSet for:" + firstChar);
		} else {
			for (Integer len : skipSet) {
				if (len > sz) {
					sz = len;
				}
			}
		}

		String curLine = firstChar;
		if (0 == sz) {
			// firstChar on its own may be the role tag.
			emitRoleTagIfKnown(curLine, formalTxt, curpos);
		} else {
			for (int szIdx = 1; szIdx <= sz; szIdx++) {
				int ch = getNotNoiseChar();
				if (ch == -1) {
					break;
				}
				curLine += ((char) ch);
				// Only test lengths at which a tag can actually end.
				if (skipSet.contains(szIdx)
						&& emitRoleTagIfKnown(curLine, formalTxt, curpos)) {
					break;
				}
			}
		}

		if (DfaState.FOUND_ROLE_TAG != stat) {
			// No role tag: give back everything consumed and read it as text.
			stat = DfaState.NOT_FOUND_ROLE_TAG;
			pushBack(formalTxt + curLine, 0);
			this.text.setLength(0);
			processTxt(true);
		}

		this.text.setLength(0);
		stat = DfaState.END_ROLE_TAG;
		return stat;
	}

	// Emits the pending text and the role pair when word is a configured role tag; sets FOUND_ROLE_TAG and returns true on success.
	private boolean emitRoleTagIfKnown(String word, String formalTxt,
			int curpos) {
		if (!getRoleTag().getTagRoleMultiMap().containsKey(word)) {
			return false;
		}

		if (StringUtils.isNotBlank(formalTxt)) {
			tokenStack.add(new Token(formalTxt, 0, 0, RoleTag.TXT));
			tokenStack.add(new Token(formalTxt, 0, 0, RoleTag.CONTENT));
		}

		Collection roleCollection = (Collection) getRoleTag()
				.getTagRoleMultiMap().get(word);
		String role = (String) roleCollection.iterator().next();

		tokenStack.add(new Token(word, curpos - 1, this.pos, role));
		tokenStack.add(new Token(word, curpos - 1, this.pos, RoleTag.CONTENT));

		stat = DfaState.FOUND_ROLE_TAG;
		return true;
	}

	/**
	 * Reads a run of plain text into the {@code text} buffer and emits it as a
	 * (TXT, CONTENT) pair unless it is blank. The caller may already have put
	 * the first character of the run into the buffer.
	 * <p>
	 * The run ends at end of input or at a character that may start something
	 * else ('&lt;', a number, or a role-tag first character); that character
	 * is pushed back. While the state is {@code SURE_NOT_HTMLTAG} no such
	 * check is made and exactly one character is taken.
	 * 
	 * @param skipRoletagFirstChar {@code true} when re-reading text pushed
	 *            back after a failed match: the first character read is then
	 *            not tested against the role-tag first-character set
	 * @return {@code DfaState.END_TXT}
	 * @throws IOException if reading or unreading fails
	 */

	private int processTxt(boolean skipRoletagFirstChar) throws IOException {
		int curpos = this.pos;
		if (skipRoletagFirstChar || DfaState.NOT_FOUND_ROLE_TAG == stat) {
			curpos++;
		}

		for (int c = getChar(); -1 != c; c = getChar()) {
			boolean startsRoleTag = !skipRoletagFirstChar
					&& RoleTagProduct.tagFirstCharSet.containsKey("" + (char) c);
			boolean mayNotTxt = startsRoleTag || c == '<'
					|| LexUtils.isNumber(c);

			// A possible non-text token: give the character back and stop.
			if (DfaState.SURE_NOT_HTMLTAG != stat && mayNotTxt) {
				pushBack("" + (char) c, 0);
				break;
			}

			skipRoletagFirstChar = false;
			this.text.append((char) c);

			// Forced text: a single character forms the whole token.
			if (DfaState.SURE_NOT_HTMLTAG == stat) {
				break;
			}
		}

		String run = this.text.toString();
		if (!StringUtils.isBlank(run)) {
			tokenStack.add(new Token(run, curpos - 1, this.pos, RoleTag.TXT));
			tokenStack.add(new Token(run, curpos - 1, this.pos,
					RoleTag.CONTENT));
		}

		this.text.setLength(0);
		stat = DfaState.END_TXT;
		return stat;
	}

	/**
	 * Reads the next non-whitespace character; used while reading the
	 * characters of a role-tag word. Every character consumed, including
	 * skipped whitespace, advances {@code pos}.
	 *
	 * @return the character, or -1 at end of input (the eof flag is then set)
	 */
	protected int getNotNoiseChar() throws IOException {
		if (this.eof) {
			return -1;
		}

		for (;;) {
			int ch = this.in.read();
			if (ch == -1) {
				this.eof = true;
				return ch;
			}

			this.pos++;

			boolean noise = StringUtils.isWhitespace(String.valueOf((char) ch));
			if (!noise) {
				return ch;
			}
		}
	}

	/**
	 * Reads the next non-whitespace character. Every character consumed,
	 * including skipped whitespace, advances {@code pos}.
	 *
	 * @return the character, or -1 at end of input (the eof flag is then set)
	 */
	protected int getNotNullChar() throws IOException {
		if (this.eof) {
			return -1;
		}

		int ch;
		do {
			ch = this.in.read();
			if (ch == -1) {
				this.eof = true;
				return ch;
			}
			this.pos++;
		} while (StringUtils.isWhitespace("" + (char) ch));

		return ch;
	}

	/**
	 * Reads one character from the input and advances {@code pos}.
	 *
	 * @return the character, or -1 at end of input; once end of input has
	 *         been seen the eof flag short-circuits further reads
	 */
	protected int getChar() throws IOException {
		if (!this.eof) {
			int ch = this.in.read();
			if (ch != -1) {
				this.pos++;
				return ch;
			}
			this.eof = true;
		}
		return -1;
	}

	/**
	 * Unreads the characters of {@code cur} from index {@code acceptLength}
	 * to the end, last character first, so they are read again in their
	 * original order. Each unread character clears the eof flag and moves
	 * {@code pos} back by one.
	 *
	 * @param cur          text whose tail is returned to the input
	 * @param acceptLength number of leading characters to keep consumed
	 */
	protected void pushBack(String cur, int acceptLength) throws IOException {
		int i = cur.length();
		while (--i >= acceptLength) {
			this.eof = false;

			this.in.unread(cur.charAt(i));

			this.pos--;
		}
	}

	/**
	 * Unreads the tail of the {@code text} buffer, from index
	 * {@code acceptLength} to the end. Delegates to
	 * {@link #pushBack(String, int)} so that {@code pos} is moved back for
	 * each unread character, as it is for every other push-back.
	 *
	 * @param acceptLength number of leading buffer characters to keep consumed
	 */
	private void pushBack(int acceptLength) throws IOException {
		pushBack(this.text.toString(), acceptLength);
	}

	/**
	 * Returns the first {@code acceptLength} characters of the {@code text}
	 * buffer.
	 *
	 * @param acceptLength number of leading characters to copy
	 */
	private String getText(int acceptLength) {
		char[] head = new char[acceptLength];
		this.text.getChars(0, acceptLength, head, 0);
		return new String(head);
	}

	/**
	 * Searches backwards from just before {@code curIdx} for the nearest
	 * token that is neither a CONTENT token nor a blank TXT token.
	 *
	 * @param curIdx list index to search before
	 * @return that token (not its index, despite the method name), or
	 *         {@code null} when there is none
	 */
	protected Token getLastMeaningfulTokenIndex(int curIdx) {
		int i = curIdx - 1;
		while (i >= 0) {
			Token candidate = tokenStack.get(i);
			String type = candidate.type();

			boolean contentCopy = type.equals(RoleTag.CONTENT);
			boolean blankTxt = !contentCopy && type.equals(RoleTag.TXT)
					&& StringUtils.isBlank(candidate.termText());

			if (!contentCopy && !blankTxt) {
				return candidate;
			}
			i--;
		}

		return null;
	}

	/**
	 * Searches forwards from just after {@code curIdx} for the nearest token
	 * that is neither a CONTENT token nor a blank TXT token.
	 *
	 * @param curIdx list index to search after
	 * @return that token (not its index, despite the method name), or
	 *         {@code null} when there is none
	 */
	protected Token getNextMeaningfulTokenIndex(int curIdx) {
		int i = curIdx + 1;
		while (i < tokenStack.size()) {
			Token candidate = tokenStack.get(i);
			String type = candidate.type();

			boolean contentCopy = type.equals(RoleTag.CONTENT);
			boolean blankTxt = !contentCopy && type.equals(RoleTag.TXT)
					&& StringUtils.isBlank(candidate.termText());

			if (!contentCopy && !blankTxt) {
				return candidate;
			}
			i++;
		}

		return null;
	}

	/**
	 * Role-tag configuration consulted by this lexer (tag sets and the
	 * word-to-role map). It is not set by the constructor and must be supplied
	 * through {@link #setRoleTag(RoleTag)} before parsing.
	 */
	RoleTag roleTag;

	/** Returns the role-tag configuration, or {@code null} if none was set. */
	public RoleTag getRoleTag() {
		return roleTag;
	}

	/**
	 * Sets the role-tag configuration used by this lexer.
	 *
	 * @param roleTag configuration to use
	 */
	public void setRoleTag(RoleTag roleTag) {
		this.roleTag = roleTag;
	}

	/**
	 * Returns the next token; this implementation applies no extra cleaning
	 * and behaves exactly like {@link #next()}.
	 *
	 * @return the next token, or {@code null} when none are left
	 */
	public Token nextClean() throws IOException {
		Token following = next();
		return following;
	}

}
roki 2009-06-19
package com.rayeen.spider.vertical.recognize;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.collections.map.MultiValueMap;
import org.apache.commons.collections.MultiMap;

/**
 * Role tagging: the token type names used by the lexers, plus accessors for
 * a concrete role-tag vocabulary.
 * <p>
 * The constants double as Lucene token type strings and are printed as part
 * of each token, so their values are part of the program's output.
 * <p>
 * A commented-out legacy {@code initilize()} method that used to populate the
 * tag tables here has been removed; {@code RoleTagProduct} calls its own
 * {@code initilize()} from its static initializer.
 */
public interface RoleTag {

	HashMap<String, Integer> RoleTagMap = new HashMap<String, Integer>();

	String TIME = "TIME";

	String DOWNLOAD = "DOWNLOAD";

	String NEXT = "NEXT";

	String NUM = "NUM";

	String HREF = "HREF";

	String IMG = "IMG";

	String TXT = "TXT";

	String TITLE = "TITLE";

	String SIZE = "SIZE";

	String PRICE = "PRICE";

	// NOTE(review): the value is spelled "ENGTAG", probably a typo for
	// "ENDTAG". It is left unchanged because it appears in emitted tokens.
	String ENDTAG = "ENGTAG";

	String CONTENT = "CONTENT";

	String TAG = "TAG";

	String MEASURE = "MEASURE";

	String PRICE_MEASURE = "PRICE_MEASURE";

	String PRICE_PREFIX = "PRICE_PREFIX";

	String MEASURE_PREFIX = "MEASURE_PREFIX";

	String MEASURE_POSTFIX = "MEASURE_POSTFIX";

	String PRICE_POSTFIX = "PRICE_POSTFIX";

	/** Returns the map from role name to the set of words carrying that role. */
	Map<String, Set<String>> getTagSet();

	/** Returns the multi-map from a role-tag word to its role name(s). */
	MultiMap getTagRoleMultiMap();

	/** Returns the map keyed by the first character of each role-tag word. */
	Map<String, Set<String>> getTagFirstCharSet();

	/**
	 * Per first character, the look-ahead lengths at which a role-tag word
	 * starting with that character can end.
	 */
	Map<String, Set<Integer>> maxTagLength = new TreeMap<String, Set<Integer>>();

	/*
	 * Planned grammar reductions (from the original notes):
	 *   TXT TIME+    -> TIME
	 *   URL DOWNLOAD -> DOWNLOAD_URL
	 *   URL NEXT     -> NEXT_URL
	 * Symbols: TXT TIME URL DOWNLOAD DOWNLOAD_URL NEXT NEXT_URL
	 *
	 * Lexing idea (from the original notes): when the current character is
	 * the first character of a tag word and the next non-noise character
	 * completes the word, a role tag is produced. Roles are represented
	 * directly as strings, skipping any integer conversion.
	 */

}
roki 2009-06-19
package com.rayeen.spider.vertical.recognize.product;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.collections.map.MultiValueMap;
import org.apache.commons.collections.MultiMap;

import com.rayeen.spider.vertical.recognize.RoleTag;

/**
 * Static lookup tables that map surface words (such as "元", "USD", "大小")
 * to the role names declared in {@link RoleTag}, for the product lexer.
 *
 * <p>All tables are static and are populated by {@link #initilize()}, which
 * the static initializer below runs once when the class is loaded. Calling
 * {@link #initilize()} again rebuilds the tables from scratch.
 */
public class RoleTagProduct implements RoleTag{

	/** Role name -> every word that carries that role, e.g. {PRICE_PREFIX => [$, ¥, ...]}. */
	private final static Map<String, Set<String>> tagSet = new HashMap<String, Set<String>>();

	/** First character of a registered word -> the roles owning a word that starts with it. */
	 final static Map<String, Set<String>> tagFirstCharSet = new HashMap<String, Set<String>>();

	/**
	 * First character of a registered word -> the lookahead counts (word
	 * length minus one) of the words starting with that character.
	 */
	 final static Map<String, Set<Integer>> maxTagLength = new TreeMap<String, Set<Integer>>();

	/** Word -> role name(s); a multi-map so that one word may carry several roles. */
	private final static MultiMap tagRoleMultiMap = new MultiValueMap();

	/**
	 * Price strings collected as "noise".
	 * NOTE(review): not read anywhere in this class; presumably consulted by
	 * the lexer/parser to discard bogus prices - confirm against callers.
	 */
	public static Set<String> noisePriceSet = new HashSet<String>();
	static {
		noisePriceSet.add("5元");
		noisePriceSet.add("5.0元");
		noisePriceSet.add("5.00元");
		noisePriceSet.add("8.0元");
		noisePriceSet.add("10.0元");
		noisePriceSet.add("12.0元");
		noisePriceSet.add("15.0元");
		noisePriceSet.add("25.0元");
		initilize();
	}

	/*
	 * Reduction rules noted by the original author:
	 *   TXT TIME+    -> TIME
	 *   URL DOWNLOAD -> DOWNLOAD_URL
	 *   URL NEXT     -> NEXT_URL
	 *
	 * Symbols: TXT TIME URL DOWNLOAD DOWNLOAD_URL NEXT NEXT_URL
	 *
	 * Lexing idea: if the current char is "下" and the next non-noise char is
	 * "载", emit the corresponding role tag. Roles are represented directly
	 * as strings, skipping an integer conversion.
	 */

	/**
	 * (Re)builds every lookup table of this class.
	 *
	 * <p>The tables are cleared first, so repeated calls are idempotent; the
	 * multi-map in particular no longer accumulates duplicate role entries
	 * when a caller invokes this method after the static initializer has
	 * already run it.
	 */
	static public void initilize() {

		tagSet.clear();
		tagFirstCharSet.clear();
		maxTagLength.clear();
		tagRoleMultiMap.clear();

		register(RoleTag.SIZE, "大小");

		// price marker
		register(RoleTag.PRICE, "价");

		// magnitude / measure prefixes
		register(RoleTag.MEASURE_PREFIX, "k", "K", "十", "百", "千", "万", "亿", "兆");

		// currency units written after the amount
		register(RoleTag.PRICE_POSTFIX, "元", "USD");

		// currency symbols written before the amount
		register(RoleTag.PRICE_PREFIX, "¥", "$", "£");

		/*
		 * Lexer-side data. For a character c the lexer can look up
		 * tagFirstCharSet.get(c) to learn which roles may start at c, and
		 * maxTagLength.get(c) to learn how many extra characters it has to
		 * look ahead to complete a candidate word.
		 */
		for (Map.Entry<String, Set<String>> entry : tagSet.entrySet()) {
			String role = entry.getKey();
			for (String wd : entry.getValue()) {
				// Index by the FIRST character. Testing the whole word here
				// would replace an existing bucket for multi-character words
				// and drop the roles registered before.
				String first = wd.substring(0, 1);

				Set<String> roles = tagFirstCharSet.get(first);
				if (roles == null) {
					roles = new TreeSet<String>();
					tagFirstCharSet.put(first, roles);
				}
				roles.add(role);

				Set<Integer> lookaheads = maxTagLength.get(first);
				if (lookaheads == null) {
					lookaheads = new TreeSet<Integer>();
					maxTagLength.put(first, lookaheads);
				}
				// A word of length n needs n - 1 characters of lookahead,
				// e.g. "大小" -> 1, "USD" -> 2, single characters -> 0.
				lookaheads.add(wd.length() - 1);
			}
		}

	}

	/**
	 * Registers the given words under one role in both the role->words table
	 * and the word->role multi-map.
	 *
	 * @param role  role name, one of the {@link RoleTag} constants
	 * @param words words that carry this role
	 */
	private static void register(String role, String... words) {
		Set<String> wordSet = new HashSet<String>();
		for (String wd : words) {
			wordSet.add(wd);
			tagRoleMultiMap.put(wd, role);
		}
		tagSet.put(role, wordSet);
	}

	/**
	 * Empty entry point kept from the original source; does nothing.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {

	}

	/** @return the shared role -> words table (live, mutable) */
	public Map<String, Set<String>> getTagSet() {
		return tagSet;
	}

	/** @return the shared word -> role(s) multi-map (live, mutable) */
	public MultiMap getTagRoleMultiMap() {
		return tagRoleMultiMap;
	}

	/** @return the shared first-character -> roles table (live, mutable) */
	public Map<String, Set<String>> getTagFirstCharSet() {
		return tagFirstCharSet;
	}

}
roki 2009-06-19
只是一个Demo,手工写了词法识别,一定程度上能实现自然语义识别(角色标注),没做什么规则引擎, 现有的几个规则都是java代码实现的,现有规则包括:

识别数字、价格、URL地址、货币

没有深入做下去。

下面这段代码是用lucene来索引原文本中的词语角色:

package com.rayeen.spider.vertical.recognize;

import java.io.IOException;
import java.io.PushbackReader;
import java.io.StringReader;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;

import com.rayeen.spider.vertical.recognize.product.ArProductLexer;
import com.rayeen.spider.vertical.recognize.product.RoleTagProduct;

/**
 * Demo: indexes the role tags produced by {@link ArProductLexer} with Lucene
 * term vectors, then uses the stored term positions to find the HREF tokens
 * closest to each DOWNLOAD token.
 */
public class TestIndex {

	/**
	 * Lexes a hard-coded HTML snippet, writes content tokens and tag tokens
	 * into two separate in-memory indexes, and prints, for every DOWNLOAD
	 * position, the content of the nearest HREF token on each side.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {

		final RAMDirectory ramDirTag = new RAMDirectory();

		final RAMDirectory ramDirContent = new RAMDirectory();

		IndexReader readerTag = null;
		try {

			Analyzer analyzer = new KeywordAnalyzer();

			IndexWriter writerTag = new IndexWriter(ramDirTag, analyzer);
			IndexWriter writerContent = new IndexWriter(ramDirContent,
					analyzer);

			String str = "1<img src=asdfdasf/><a href=1>end<a href=2>adf</a>sa文件fd下载a<a href=3>下面<a href=4>sf下页魅力11秒27分<img src=asdfdasf/>end";

			Document docTag = new Document();
			Document docContent = new Document();

			ArProductLexer lexer = new ArProductLexer(new PushbackReader(new StringReader(str),
					1024));
			// Loading RoleTagProduct runs its static initializer, which
			// already populates the role tables; no explicit initilize()
			// call is needed here.
			RoleTagProduct roletag = new RoleTagProduct();

			// Hand the role tables to the lexer before parsing, as the
			// TestLexer entry point does.
			lexer.setRoleTag(roletag);
			lexer.parserToken();

			// Content tokens go to one document, every other token type
			// (the role tags) to another; term vectors with positions are
			// stored so the positions can be read back below.
			Token tk = lexer.next();
			while (null != tk) {
				System.out.println(tk);
				if (tk.type().equals(RoleTag.CONTENT)) {
					docContent.add(new Field(RoleTag.CONTENT, tk.termText(),
							Field.Store.YES, Field.Index.UN_TOKENIZED,
							Field.TermVector.WITH_POSITIONS_OFFSETS));

				} else {

					docTag.add(new Field(RoleTag.TAG, tk.type(),
							Field.Store.YES, Field.Index.UN_TOKENIZED,
							Field.TermVector.WITH_POSITIONS_OFFSETS));

				}
				tk = lexer.next();
			}

			writerTag.addDocument(docTag);
			writerContent.addDocument(docContent);
			writerTag.close();
			writerContent.close();

			readerTag = IndexReader.open(ramDirTag);

			// Look up the tag positions in the tag index first.
			TermPositionVector tpv = (TermPositionVector) readerTag
					.getTermFreqVector(0, RoleTag.TAG);
			if (tpv == null) {
				// No tag token was indexed, so there is nothing to locate.
				return;
			}

			// Positions at which each role occurs. (Original author's note:
			// in this example DOWNLOAD is at the seventh position; the lexer
			// is then searched in both directions from that position.)
			ArrayList<Integer> downloadPosSet = positionsOf(tpv, RoleTag.DOWNLOAD);
			ArrayList<Integer> urlPosSet = positionsOf(tpv, RoleTag.HREF);

			for (int downloadPos : downloadPosSet) {
				int leftNear = -1, rightNear = Integer.MAX_VALUE;
				ArrayList<String> urls = new ArrayList<String>();
				for (int urlPos : urlPosSet) {
					if (urlPos < downloadPos && urlPos > leftNear) {
						leftNear = urlPos;
					}
					if (urlPos > downloadPos && urlPos < rightNear) {
						rightNear = urlPos;
					}
				}
				// -1 is the "not found" sentinel, so position 0 is a valid hit.
				// NOTE(review): these positions count tokens of the tag-only
				// document; confirm that lexer.getContent() uses the same
				// numbering.
				if (leftNear >= 0) {
					urls.add(lexer.getContent(leftNear));
				}
				if (rightNear < Integer.MAX_VALUE) {
					urls.add(lexer.getContent(rightNear));
				}
				// The URLs closest to the "下载" (download) token. The same
				// approach could find the Number closest to a "time" role tag.
				System.out.println(urls);
			}

		} catch (IOException e) {
			// CorruptIndexException and LockObtainFailedException are
			// IOException subclasses and were handled identically.
			e.printStackTrace();
		} finally {
			if (readerTag != null) {
				try {
					readerTag.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

	}

	/**
	 * Collects the positions stored in a term vector for one term.
	 *
	 * @param tpv  term vector with positions; must not be null
	 * @param term the term (role name) to look for
	 * @return the positions of the term, empty when the term is absent or no
	 *         positions were stored for it
	 */
	private static ArrayList<Integer> positionsOf(TermPositionVector tpv, String term) {
		ArrayList<Integer> positions = new ArrayList<Integer>();
		String[] terms = tpv.getTerms();
		for (int i = 0; i < terms.length; i++) {
			if (terms[i].equals(term)) {
				int[] posArray = tpv.getTermPositions(i);
				if (posArray != null) {
					for (int p : posArray) {
						positions.add(p);
					}
				}
				break;
			}
		}
		return positions;
	}

}
roki 2009-06-19
输出结果:
<a href=3>

也就是离“下载”这个语素最近的一个URL语素
roki 2009-06-19
因为已经对原文中的词语以及它的角色做了索引, 需要自动识别各个词素的时候,就简单了,比如, 需要提取出原文中靠近 “下载”这个词的一个URL,作为下载地址用,那么,
只要先读出“下载”的位置,然后向后或者向前搜索最靠近的一个URL Token,就解决问题了。 这段代码里充分使用了lucene做索引的时候写入的 termVector信息(而且做索引的时候还必须是 Field.TermVector.WITH_POSITIONS_OFFSETS 的,这样索引里的termVector才会包含Token的位置偏移量)。

这个算法涉及到lucene索引的底层结构了, 可以算是一个对lucene比较特别的应用
wjm0729 2010-06-20
用正则不是更简单快捷吗???
Global site tag (gtag.js) - Google Analytics