《搜索引擎零距离》IRS脚本语言SableCC规格说明
roki
2009-06-19
// SableCC grammar for the IRS crawler script language.
//
// Sections, in order: Package, Helpers, States, Tokens, Ignored Tokens,
// Productions (concrete syntax with "{-> ...}" CST-to-AST transformations)
// and Abstract Syntax Tree.
//
// Layout note: "//" comments run to the end of the physical line, so every
// definition is kept on its own line.  Joining lines lets a comment swallow
// the definitions that follow it.

Package com.rayeen.spider.vertical.parser;

Helpers

    all = [0 .. 0xFFFF];
    digit = ['0' .. '9'];
    tab = 9;
    cr = 13;
    lf = 10;
    eol = cr lf | cr | lf;
    blank = (' ' | tab | eol)+;
    lowercase = ['a' .. 'z'];
    uppercase = ['A' .. 'Z'];
    letter = lowercase | uppercase ;
    not_eol = [all - [cr + lf]];
    not_star = [all - '*'];
    not_star_slash = [not_star - '/'];

    simple_escape_seq = '\' ''' | '\"' | '\?' | '\\' | '\a' | '\b' | '\f' | '\n' | '\r' | '\t' | '\v';
    s_char = [all - ['"' + ['\' + [cr + lf]]]] | simple_escape_seq;
    s_char_seq = s_char+;

    // Any run of characters other than " } { ] [ and the single quote.
    //inline_static_string = [ all - [']'+['['+''']]]*;
    inline_static_string = [ all -['"'+['}'+ ['{'+[']'+['['+''']]]]]]*;
    //inline_static_string = [ all - [pair+quote]]*;

    // Case-insensitive single letters.  No token in this file references them.
    a = ['a' + 'A'];
    b = ['b' + 'B'];
    c = ['c' + 'C'];
    d = ['d' + 'D'];
    e = ['e' + 'E'];
    f = ['f' + 'F'];
    g = ['g' + 'G'];
    h = ['h' + 'H'];
    i = ['i' + 'I'];
    j = ['j' + 'J'];
    k = ['k' + 'K'];
    l = ['l' + 'L'];
    m = ['m' + 'M'];
    n = ['n' + 'N'];
    o = ['o' + 'O'];
    p = ['p' + 'P'];
    q = ['q' + 'Q'];
    r = ['r' + 'R'];
    s = ['s' + 'S'];
    t = ['t' + 'T'];
    u = ['u' + 'U'];
    v = ['v' + 'V'];
    w = ['w' + 'W'];
    x = ['x' + 'X'];
    y = ['y' + 'Y'];
    z = ['z' + 'Z'];

States

    code, string,node,action,conf;

Tokens

    blank = blank;
    // digit=digit;
    // "letter" stays active: the {lr} alternative of entrance_data uses it.
    letter=letter;
    d_dot = '..';
    number=digit+;

    // Every character except whitespace may be part of an identifier.
    // The lexer has to be in the string state before it may extract runs of
    // characters other than [ ] { } and ".
    {string} inline_static_string = inline_static_string;

    {code} plus_eq = '+=';
    {code} minus_eq = '-=';
    {code} star_eq = '*=';
    {code} slash_eq = '/=';
    {code} dot_eq = '.=';
    {code} perc_eq = '%=';
    {code} caret_eq = '^=';
    {code} amp_eq = '&=';
    {code} bar_eq = '|=';
    {code} sh_l_eq = '<<=';
    {code} sh_r_eq = '>>=';
    {code} bop_sh_left = '<<';
    {code} bop_sh_right = '>>';
    {code} point_assoc = '=>';
    {code} point_elem = '->';
    {code} cop_eq = '==';
    {code} cop_leq = '===';
    {code} cop_nleq = '!==';
    {code} cop_lteq = '<=';
    {code} cop_gteq = '>=';
    {code} cop_lt = '<';
    {code} cop_gt = '>';
    {code} cop_neq = '!=';
    {code} cop_or = '||';
    {code} exclamation = '!';
    {code} ampersand = '&';
    {code} bar = '|';
    {code} caret = '^';
    {code} tilde = '~';
    {code} equal = '=';
    {code,node} star = '*';
    {code} at = '@';
    {code} div = '/';
    {code} mod = '%';
    {code} plus_plus = '++';
    {code} minus = '-';
    {code->node} l_par = '(';
    {node->code} r_par = ')';

    // The state transitions on braces and brackets are disabled, but the
    // tokens themselves stay defined because entrance_data references them.
    // {string->code}
    l_brace = '{';
    // {code->string}
    r_brace = '}';
    // {string->code}
    l_bracket = '[';
    // {code->string}
    r_bracket = ']';

    {code} semicolon = ';';
    {code} colon = ':';
    {code} coloncolon = '::';
    {code}dot = '.';
    {code} comma = ',';
    {code} dollar = '$';
    {code} quest = '?';

    // Single-quoted string; a backslash escapes the character after it.
    {code,node} static_string = ''' ([all - ['\' + ''']] | '\' all)* '''; /* '4vim */

    // The action state is not subdivided for now; a more fine-grained lexer
    // specification may be introduced later.
    {code->action}conf='配置';
    // Chinese sentence: after the '动作' keyword the lexer reads a Chinese
    // sentence.  Other keywords of this kind may be added.
    {code->action}action='动作';
    {code->action}crawler_conf='爬虫配置';
    // NOTE(review): both members of this set are the same ASCII ':'.  By
    // analogy with action_end the second one was presumably a full-width
    // colon - confirm against the original file.
    {action} action_begin= [':'+':'];
    {action->code} action_end= ['.'+'。'];
    // NOTE(review): same duplicated ':' as in action_begin - confirm.
    {action} cn_string = ([all - [':'+[':'+['.' + '。']]]])*;

    page_config='页面配置';
    system_config='系统配置';
    download = '下载';
    sitename = '站点名';
    pagename = '页面名';
    parent_pagename = '父页面名';
    entrance = '入口';
    inpage='页内';
    outpage='页外';
    next_entrance='次级';
    encode='编码';
    crawler_conf_name='爬虫配置名';
    extract= '提取';
    tag_name='标签名';
    node_path='节点';
    extract_rule='提取规则';
    entity_name='实体名';
    field_name='字段名';
    match='匹配';
    execute='执行';
    crawl='抓取';
    exclude='排除';
    save='保存';
    ruby='ruby'|'Ruby';
    auto_recognize='自动识别';

    {code->string} string_start = '"';
    {string->code} string_end = '"';
    // NOTE(review): the next two patterns are empty as published; the
    // literals were presumably lost - confirm against the original file.
    // No production references either token.
    {string} string_cvar_start =;
    {string, code->string} string_cvar_end =;

    comment = ('//' not_eol* eol) | ('/*' not_star* '*'+ (not_star_slash not_star* '*'+)* '/');

Ignored Tokens

    comment, blank;

Productions

    program =
        statment +;

    // With the newer syntax this should be
    //   {page} page_description{->page_description.statment}
    //   page_description{->statment}
    //   New statment.page(...)
    statment =
        {page} page_description |
        {crawler} crawler_block |
        {sysconf} T.system_config T.static_string
            {->New statment.sysconf(static_string)} |
        {execute} execute_block ;

    page_description =
        page_config [page]:page_statement+ semicolon
            {->New page_description([page])};

    crawler_block =
        crawler_conf action_begin cn_string action_end
            {->New crawler_block(cn_string)} ;

    execute_block =
        execute [exec]:execute_statment+ semicolon
            {->New execute_block([exec])} ;

    execute_statment =
        {crawler_site} crawl sitename static_string
            {->New execute_statment.site(static_string) } |
        {crawler_page} crawl pagename static_string
            {->New execute_statment.page(static_string) } ;

    page_statement =
        // run a script (ruby)
        {execute} ruby colon static_string
            {-> New page_statement.execute(static_string)} |
        // automatic recognition
        {auto_recognize} auto_recognize colon [name]:static_string [pattern]:static_string
            {-> New page_statement.recognize(name,pattern)} |
        // crawler configuration declaration
        {crawler_declare} crawler_conf_name static_string
            {-> New page_statement.crawler_conf_name(static_string)} |
        // site name declaration
        {site_declare} sitename static_string
            {-> New page_statement.site_declare(static_string)} |
        // page declaration
        {page_declare} pagename static_string
            {-> New page_statement.page_declare(static_string)} |
        // parent page declaration
        {parent_page_declare} parent_pagename static_string
            {-> New page_statement.parent_page_declare(static_string)} |
        // entrance declaration; encode and exclude (excluded links) are both optional
        {entr} entrance entrance_data+ conf_block?
            {->New page_statement.entr([entrance_data],conf_block)} |
        {extract} extract_declarations
            {->extract_declarations.page_statement} |
        // secondary entrance
        {next_entrance} P.next_entrance_declare
            {->next_entrance_declare.page_statement};

    next_entrance_declare {->page_statement} =
        // in-page entrance
        {in_page} T.next_entrance inpage [reg]:static_string [pagename]:static_string exclude_statment?
            {->New page_statement.inpage_nextentr(reg,pagename,exclude_statment) } |
        // out-of-page entrance (associated entrance)
        {out_page} T.next_entrance outpage [reg]:static_string [pagename]:static_string exclude_statment?
            {->New page_statement.outpage_nextentr(reg,pagename,exclude_statment) };
        // {execute} T.next_entrance execute [reg]:static_string [pagename]:static_string
        //     {->New page_statement.execute(reg,pagename,exclude_statment) };

    exclude_statment =
        exclude static_string
            {->New exclude_statment(static_string) };

    // Extraction (declaration) description, including the "tag name".
    extract_declarations {->page_statement} =
        // node declaration, e.g.  节点 1.2.3
        {node_declaration} node_tag P.node_chain_string [s2]:static_string [s3]:static_string action_block?
            {->New page_statement.node_declaration(node_tag,[P.node_chain_string.nc],s2,s3,P.action_block)} |
        {regmatch_declaration} regmatch_tag [s2]:static_string [s3]:static_string action_block?
            {->New page_statement.regmatch_declare(regmatch_tag,s2,s3,P.action_block)} |
        {save_declaration} T.save colon [name]:static_string [stat]:static_string
            {->New page_statement.save_declare(name,stat)} ;

    // For now the whole action string is stored in a single node to keep the
    // syntax tree simple; it is handled as a dynamic function call.
    action_block {->action_block?} =
        action action_begin cn_string action_end
            {->New action_block(cn_string)} ;

    conf_block {->conf_block?} =
        conf action_begin cn_string action_end
            {->New conf_block(cn_string)} ;

    // The tag may or may not carry a name.
    regmatch_tag =
        {noname} T.match
            {->New regmatch_tag.noname()} |
        {hasname} T.match colon [s1]:static_string
            {->New regmatch_tag.hasname(s1) };

    // The tag may or may not carry a name.
    node_tag =
        {noname} T.node_path
            {->New node_tag.noname()} |
        {hasname} T.node_path colon [s1]:static_string
            {->New node_tag.hasname(s1) };

    // node chain, e.g.  1.1.2.3.*
    node_chain_string {->[nc]:node_value*} =
        node_value [node_ids]:node_tail*
            {->[node_value,node_ids.node_value]};

    // a node is either a number or *
    node_value {->node_value} =
        {number} number
            {->New node_value.number(number) } |
        {star} star
            {->New node_value.star(star)};

    // chain tail, e.g.  .1
    node_tail {->node_value} =
        {number} dot number
            {->New node_value.number(number) } |
        {star} dot star
            {->New node_value.star( star)};

    entrance_data {->entrance_data?} =
        {string} static_string
            { ->New entrance_data.string(static_string) } |
        {pnr} l_brace [d1]:number d_dot [d2]:number r_brace
            { ->New entrance_data.pnr(d1, d2) } |
        {upnr} l_bracket [d1]:number d_dot [d2]:number r_bracket
            { ->New entrance_data.upnr(d1, d2) } |
        {lr} l_bracket [l1]:letter d_dot [l2]:letter r_bracket
            { ->New entrance_data.lr(l1, l2) } |
        {external} cop_lt static_string cop_gt
            { ->New entrance_data.external(static_string)} ;

    // Not referenced by any other production in this file.
    terminator {-> } =
        {semicolon} semicolon {-> };

Abstract Syntax Tree

    program =
        statment+;

    statment =
        {page} page_description |
        {crawler} crawler_block |
        {sysconf} [confstring]:static_string |
        {execute} execute_block;

    page_description =
        page_statement+ ;

    crawler_block =
        [statments]: cn_string;

    execute_block =
        execute_statment+ ;

    page_statement =
        {recognize} [name]:static_string [patten]:static_string |
        {execute} [script]:static_string |
        {crawler_conf_name} [crawler_conf_name]:static_string |
        {site_declare} [sitename]:static_string |
        {page_declare} [pagename]:static_string |
        {parent_page_declare} [pagename]:static_string |
        {entr} entrance_data+ [conf]:conf_block? |
        {extract} extract_declarations+ |
        {inpage_nextentr} [reg]:static_string [pagename]:static_string [exclude]:exclude_statment? |
        {outpage_nextentr} [reg]:static_string [pagename]:static_string [exclude]:exclude_statment? |
        {go_nextentr} [scriptname]:static_string [script]:static_string |
        {regmatch_declare} regmatch_tag [s1]:static_string [s2]:static_string [action]:action_block? |
        {save_declare} [name]:static_string [stat]:static_string |
        {node_declaration} node_tag node_value+ [rule]:static_string [fields]:static_string [action]:action_block?;

    action_block =
        [statments]: cn_string;

    conf_block =
        [statments]: cn_string;

    execute_statment =
        {site} static_string |
        {page} static_string;

    entrance_data =
        {string} [string]:static_string |
        {pnr} [d1]:number [d2]:number |
        {upnr} [d1]:number [d2]:number |
        {lr} [l1]:letter [l2]:letter |
        {external} [list]:static_string ;

    exclude_statment =
        static_string;

    extract_declarations =
        {node_value} node_value+;

    node_tag =
        {noname} |
        {hasname} [name]:static_string;

    node_value =
        {number} number |
        {star} star;

    regmatch_tag =
        {noname} |
        {hasname} [name]:static_string;
|
daizhenze
2009-08-08
问下运行上面代码得出:
-- Generating parser for irs.sable in E:\IRS\IRS org.sablecc.sablecc.parser.ParserException: [266,46] expecting: ';' 是什么原因造成的? |
|
roki
2009-08-12
用哪个版本的SableCC? 我用的3.0的
|
|
daizhenze
2009-08-12
我用的是sablecc-2.18.2,但每次都是到那里出错,我再试试
|
|
daizhenze
2009-08-12
还是不行,我现在用的是sablecc-3.2的,没找到你那个版本,能不能把你的源程序发一份给我,我自己慢慢找吧,看看有什么地方我做得和你不一样。谢谢!
我的邮箱daizhenze@126.com |