《搜索引擎零距离》IRS脚本语言SableCC规格说明

roki 2009-06-19

Package com.rayeen.spider.vertical.parser;

Helpers

    // ---- Basic character classes ------------------------------------------
    all   = [0 .. 0xFFFF];
    digit = ['0' .. '9'];
    tab   = 9;
    cr    = 13;
    lf    = 10;
    // A line ending is CR LF, a lone CR, or a lone LF.
    eol   = cr lf | cr | lf;

    // One or more spaces, tabs or line endings.
    blank = (' ' | tab | eol)+;

    lowercase = ['a' .. 'z'];
    uppercase = ['A' .. 'Z'];
    letter    = lowercase | uppercase;

    // ---- Building blocks for the `comment` token --------------------------
    not_eol        = [all - [cr + lf]];
    not_star       = [all - '*'];
    not_star_slash = [not_star - '/'];

    // ---- Escape sequences and double-quoted string characters -------------
    // s_char is any character except '"', '\', CR and LF, or one of the
    // listed backslash escapes. s_char_seq is not referenced by any token in
    // the Tokens section of this file.
    simple_escape_seq =
        '\' ''' | '\"' | '\?' | '\\' | '\a' | '\b' | '\f' | '\n' |
        '\r' | '\t' | '\v';
    s_char     = [all - ['"' + ['\' + [cr + lf]]]] | simple_escape_seq;
    s_char_seq = s_char+;

    // ---- Literal text inside the `string` state ---------------------------
    // Any run (possibly empty) of characters other than " } { ] [ and '.
    // The two commented-out lines are earlier/alternative definitions.
    //inline_static_string = [ all - [']'+['['+''']]]*;
    inline_static_string = [all - ['"' + ['}' + ['{' + [']' + ['[' + ''']]]]]]*;
    //inline_static_string = [ all - [pair+quote]]*;

    // ---- Case-insensitive single letters ----------------------------------
    // Each helper matches one letter in either case. None of them is used by
    // the Tokens section of this file.
    a = ['a' + 'A'];
    b = ['b' + 'B'];
    c = ['c' + 'C'];
    d = ['d' + 'D'];
    e = ['e' + 'E'];
    f = ['f' + 'F'];
    g = ['g' + 'G'];
    h = ['h' + 'H'];
    i = ['i' + 'I'];
    j = ['j' + 'J'];
    k = ['k' + 'K'];
    l = ['l' + 'L'];
    m = ['m' + 'M'];
    n = ['n' + 'N'];
    o = ['o' + 'O'];
    p = ['p' + 'P'];
    q = ['q' + 'Q'];
    r = ['r' + 'R'];
    s = ['s' + 'S'];
    t = ['t' + 'T'];
    u = ['u' + 'U'];
    v = ['v' + 'V'];
    w = ['w' + 'W'];
    x = ['x' + 'X'];
    y = ['y' + 'Y'];
    z = ['z' + 'Z'];

States
    // Lexer states referenced by the state prefixes in the Tokens section.
    // In SableCC the first state listed (`code`) is the lexer's start state.
    // `conf` is declared here but no token in this file is restricted to it.
    code, string, node, action, conf;


Tokens

    // Basic tokens declared without a state prefix.
    blank  = blank;
//  digit=digit;
    letter = letter;

    // Range separator, used by the entrance_data production ({1..9}, [a..z]).
    d_dot  = '..';

    number = digit+;







  

// Original remark: every character except whitespace may be part of an
// identifier. (No identifier token is defined in this section.)

// The lexer must be in the `string` state before it starts collecting runs of
// characters other than [ ] { } and ".
    {string} inline_static_string = inline_static_string;

    // ---- Compound assignment operators ------------------------------------
    {code} plus_eq  = '+=';
    {code} minus_eq = '-=';
    {code} star_eq  = '*=';
    {code} slash_eq = '/=';
    {code} dot_eq   = '.=';
    {code} perc_eq  = '%=';
    {code} caret_eq = '^=';
    {code} amp_eq   = '&=';
    {code} bar_eq   = '|=';
    {code} sh_l_eq  = '<<=';
    {code} sh_r_eq  = '>>=';

    // ---- Shift operators --------------------------------------------------
    {code} bop_sh_left  = '<<';
    {code} bop_sh_right = '>>';

    // ---- Arrows -----------------------------------------------------------
    {code} point_assoc = '=>';
    {code} point_elem  = '->';

    // ---- Comparison / logical operators -----------------------------------
    {code} cop_eq   = '==';
    {code} cop_leq  = '===';
    {code} cop_nleq = '!==';
    {code} cop_lteq = '<=';
    {code} cop_gteq = '>=';
    {code} cop_lt   = '<';
    {code} cop_gt   = '>';
    {code} cop_neq  = '!=';
    {code} cop_or   = '||';

    // ---- Single-character operators ---------------------------------------
    {code}       exclamation = '!';
    {code}       ampersand   = '&';
    {code}       bar         = '|';
    {code}       caret       = '^';
    {code}       tilde       = '~';
    {code}       equal       = '=';
    {code, node} star        = '*';
    {code}       at          = '@';
    {code}       div         = '/';
    {code}       mod         = '%';
    {code}       plus_plus   = '++';
    {code}       minus       = '-';
    // '(' switches the lexer from `code` to `node`; ')' switches it back.
    {code->node} l_par       = '(';
    {node->code} r_par       = ')';

    // ---- Braces and brackets ----------------------------------------------
    // The state transitions below are commented out in the original, so these
    // four tokens carry no state prefix.
    //  {string->code}
    l_brace   = '{';
    //  {code->string}
    r_brace   = '}';
    //  {string->code}
    l_bracket = '[';
    //  {code->string}
    r_bracket = ']';

    // ---- Punctuation ------------------------------------------------------
    {code} semicolon  = ';';
    {code} colon      = ':';
    {code} coloncolon = '::';

    {code} dot    = '.';
    {code} comma  = ',';
    {code} dollar = '$';
    {code} quest  = '?';

    // Single-quoted string: any characters other than '\' and ''', or a
    // backslash followed by any character, enclosed in single quotes.
    {code, node} static_string = ''' ([all - ['\' + ''']] | '\' all)*  '''; /* '4vim */

// The `action` state is not subdivided for now; a more detailed lexer
// specification may be introduced later.

    // Each of these keywords switches the lexer from `code` to `action`.
    {code->action} conf         = '配置';

    // Chinese sentences: after the keyword '动作' a Chinese sentence is lexed.
    // Other keywords of this kind may be added.
    {code->action} action       = '动作';
    {code->action} crawler_conf = '爬虫配置';

    // NOTE(review): action_begin unions ':' with itself, while action_end
    // pairs the ASCII '.' with the full-width '。'. The second member here
    // (and in cn_string) may originally have been a full-width colon that was
    // lost when the text was copied -- confirm against the original file.
    {action}       action_begin = [':' + ':'];
    {action->code} action_end   = ['.' + '。'];
    // Free text of an action/configuration body: everything up to the next
    // begin or end delimiter.
    {action}       cn_string    = ([all - [':' + [':' + ['.' + '。']]]])*;




    // ---- Keyword tokens (no state prefix) ---------------------------------
    // The trailing comment on each line is an English gloss of the literal.
    page_config       = '页面配置';      // page configuration

    system_config     = '系统配置';      // system configuration

    download          = '下载';          // download
    sitename          = '站点名';        // site name
    pagename          = '页面名';        // page name
    parent_pagename   = '父页面名';      // parent page name
    entrance          = '入口';          // entrance

    inpage            = '页内';          // in-page
    outpage           = '页外';          // out-of-page

    next_entrance     = '次级';          // secondary (next-level)

    encode            = '编码';          // encoding

    crawler_conf_name = '爬虫配置名';    // crawler configuration name
    extract           = '提取';          // extract
    tag_name          = '标签名';        // tag name
    node_path         = '节点';          // node
    extract_rule      = '提取规则';      // extraction rule
    entity_name       = '实体名';        // entity name
    field_name        = '字段名';        // field name
    match             = '匹配';          // match
    execute           = '执行';          // execute
    crawl             = '抓取';          // crawl
    exclude           = '排除';          // exclude
    save              = '保存';          // save
    ruby              = 'ruby' | 'Ruby';
    auto_recognize    = '自动识别';      // auto-recognize
  

  



  // A double quote switches the lexer between the `code` and `string` states.
  {code->string} string_start = '"';
  {string->code} string_end   = '"';

  // NOTE(review): both patterns below are empty in the source as shown. The
  // original delimiters may have been lost when the grammar was published --
  // confirm against the original file. Neither token is used by a production.
  {string}               string_cvar_start =;
  {string, code->string} string_cvar_end   =;

  // Line comments run from two slashes to the end of the line; block comments
  // are delimited C-style by slash-star and star-slash.
  comment =
      ('//' not_eol* eol) |
      ('/*' not_star* '*'+ (not_star_slash not_star* '*'+)* '/');




Ignored Tokens
    // Comments and whitespace are dropped before parsing.
    comment, blank;

Productions

// A program is a sequence of one or more top-level statements.
program = statment+;

// Author's note: with the newer notation this would be written as
//   {page} page_description{->page_description.statment}
//   page_description{->statment}
//   New statment.page(...)

// Top-level statement: a page description, a crawler configuration block,
// a system configuration string, or an execute block.
statment =
      {page}    page_description
    | {crawler} crawler_block
    | {sysconf} T.system_config T.static_string
                    {-> New statment.sysconf(static_string)}
    | {execute} execute_block
    ;



// page_config keyword, one or more page statements, then ';'.
page_description =
    page_config [page]:page_statement+ semicolon
        {-> New page_description([page])};

// crawler_conf keyword followed by a delimited free-text body; only the body
// text is kept in the AST.
crawler_block =
    crawler_conf action_begin cn_string action_end
        {-> New crawler_block(cn_string)};

// execute keyword, one or more execute statements, then ';'.
execute_block =
    execute [exec]:execute_statment+ semicolon
        {-> New execute_block([exec])};




// What to crawl: a site or a page, identified by a single-quoted string.
execute_statment =
      {crawler_site} crawl sitename static_string
                         {-> New execute_statment.site(static_string)}
    | {crawler_page} crawl pagename static_string
                         {-> New execute_statment.page(static_string)}
    ;
	

page_statement =

    // Run a script (ruby).
      {execute} ruby colon static_string
            {-> New page_statement.execute(static_string)}

    // Automatic recognition.
    | {auto_recognize} auto_recognize colon [name]:static_string [pattern]:static_string
            {-> New page_statement.recognize(name, pattern)}

    // Crawler configuration declaration.
    | {crawler_declare} crawler_conf_name static_string
            {-> New page_statement.crawler_conf_name(static_string)}

    // Site name declaration.
    | {site_declare} sitename static_string
            {-> New page_statement.site_declare(static_string)}

    // Page declaration.
    | {page_declare} pagename static_string
            {-> New page_statement.page_declare(static_string)}

    // Parent page declaration.
    | {parent_page_declare} parent_pagename static_string
            {-> New page_statement.parent_page_declare(static_string)}

    // Entrance declaration; encode (encoding) and exclude (excluded links)
    // are both optional.
    | {entr} entrance entrance_data+ conf_block?
            {-> New page_statement.entr([entrance_data], conf_block)}

    // Extraction declarations; the sub-production already yields a
    // page_statement node.
    | {extract} extract_declarations
            {-> extract_declarations.page_statement}

    // Secondary (next-level) entrance.
    | {next_entrance} P.next_entrance_declare
            {-> next_entrance_declare.page_statement}
    ;
	




// Secondary entrance declarations. Both alternatives take a regular-expression
// string, a page-name string and an optional exclude clause, and produce a
// page_statement node.
next_entrance_declare {-> page_statement} =

    // In-page entrance.
      {in_page} T.next_entrance inpage [reg]:static_string [pagename]:static_string exclude_statment?
            {-> New page_statement.inpage_nextentr(reg, pagename, exclude_statment)}

    // Out-of-page entrance (associated entrance).
    | {out_page} T.next_entrance outpage [reg]:static_string [pagename]:static_string exclude_statment?
            {-> New page_statement.outpage_nextentr(reg, pagename, exclude_statment)}
    ;


// Disabled alternative, kept from the original:
//	{execute}  T.next_entrance execute  [reg]:static_string [pagename]:static_string
//		{->New page_statement.execute(reg,pagename,exclude_statment) };




// Exclude clause: the exclude keyword followed by a single-quoted string.
exclude_statment =
    exclude static_string
        {-> New exclude_statment(static_string)};


// Extraction (declaration) descriptions. (The original remark also mentions
// the "tag name" and then breaks off.)
extract_declarations {-> page_statement} =

    // Node declaration, e.g. the node keyword followed by 1.2.3
      {node_declaration} node_tag P.node_chain_string [s2]:static_string [s3]:static_string action_block?
            {-> New page_statement.node_declaration(node_tag, [P.node_chain_string.nc], s2, s3, P.action_block)}

    // Regular-expression match declaration.
    | {regmatch_declaration} regmatch_tag [s2]:static_string [s3]:static_string action_block?
            {-> New page_statement.regmatch_declare(regmatch_tag, s2, s3, P.action_block)}

    // Save declaration.
    | {save_declaration} T.save colon [name]:static_string [stat]:static_string
            {-> New page_statement.save_declare(name, stat)}
    ;



// For now the whole action text is stored in a single node to keep the syntax
// tree simple; it is handled later in the form of dynamic function calls.
action_block {-> action_block?} =
    action action_begin cn_string action_end
        {-> New action_block(cn_string)};


// Configuration block: same shape as action_block, introduced by the conf
// keyword.
conf_block {-> conf_block?} =
    conf action_begin cn_string action_end
        {-> New conf_block(cn_string)};


// A match tag may or may not carry a name.
regmatch_tag =
      {noname}  T.match
                    {-> New regmatch_tag.noname()}
    | {hasname} T.match colon [s1]:static_string
                    {-> New regmatch_tag.hasname(s1)}
    ;


// A node tag may or may not carry a name.
node_tag =
      {noname}  T.node_path
                    {-> New node_tag.noname()}
    | {hasname} T.node_path colon [s1]:static_string
                    {-> New node_tag.hasname(s1)}
    ;




// Node chain, e.g. 1.1.2.3.*
// The head value and the values of all tails are flattened into one list.
node_chain_string {-> [nc]:node_value*} =
    node_value [node_ids]:node_tail*
        {-> [node_value, node_ids.node_value]};


// A single node value: a number or '*'.
node_value {-> node_value} =
      {number} number
                   {-> New node_value.number(number)}
    | {star}   star
                   {-> New node_value.star(star)}
    ;

// A chain tail such as ".1": a dot followed by a value. The dot is dropped
// and only the value is kept.
node_tail {-> node_value} =
      {number} dot number
                   {-> New node_value.number(number)}
    | {star}   dot star
                   {-> New node_value.star(star)}
    ;






// One item of entrance data:
//   string   - a single-quoted string
//   pnr      - a number range in braces,   {d1..d2}
//   upnr     - a number range in brackets, [d1..d2]
//   lr       - a letter range in brackets, [l1..l2]
//   external - a single-quoted string between '<' and '>'
entrance_data {-> entrance_data?} =
      {string}   static_string
                     {-> New entrance_data.string(static_string)}
    | {pnr}      l_brace [d1]:number d_dot [d2]:number r_brace
                     {-> New entrance_data.pnr(d1, d2)}
    | {upnr}     l_bracket [d1]:number d_dot [d2]:number r_bracket
                     {-> New entrance_data.upnr(d1, d2)}
    | {lr}       l_bracket [l1]:letter d_dot [l2]:letter r_bracket
                     {-> New entrance_data.lr(l1, l2)}
    | {external} cop_lt static_string cop_gt
                     {-> New entrance_data.external(static_string)}
    ;




// Statement terminator; it contributes nothing to the AST. No other
// production in this file refers to it.
terminator {-> } =
    {semicolon} semicolon
        {-> };

Abstract Syntax Tree

// Root node: one or more statements.
program = statment+;


// Node shapes built by the `New statment...` transformations in the
// Productions section.
statment =
      {page}    page_description
    | {crawler} crawler_block
    | {sysconf} [confstring]:static_string
    | {execute} execute_block
    ;



// A page description holds its page statements.
page_description = page_statement+;

// A crawler block holds the raw free-text body.
crawler_block    = [statments]:cn_string;

// An execute block holds its execute statements.
execute_block    = execute_statment+;


// AST alternatives for page statements. The `extract` and `go_nextentr`
// alternatives are declared here but no transformation in the Productions
// section of this file constructs them.
page_statement =
      {recognize}           [name]:static_string [patten]:static_string
    | {execute}             [script]:static_string
    | {crawler_conf_name}   [crawler_conf_name]:static_string
    | {site_declare}        [sitename]:static_string
    | {page_declare}        [pagename]:static_string
    | {parent_page_declare} [pagename]:static_string
    | {entr}                entrance_data+ [conf]:conf_block?
    | {extract}             extract_declarations+
    | {inpage_nextentr}     [reg]:static_string [pagename]:static_string [exclude]:exclude_statment?
    | {outpage_nextentr}    [reg]:static_string [pagename]:static_string [exclude]:exclude_statment?
    | {go_nextentr}         [scriptname]:static_string [script]:static_string
    | {regmatch_declare}    regmatch_tag [s1]:static_string [s2]:static_string [action]:action_block?
    | {save_declare}        [name]:static_string [stat]:static_string
    | {node_declaration}    node_tag node_value+ [rule]:static_string [fields]:static_string [action]:action_block?
    ;


// Action and configuration blocks each keep only their free-text body.
action_block = [statments]:cn_string;
conf_block   = [statments]:cn_string;


// Crawl target: a site or a page, identified by a string.
execute_statment =
      {site} static_string
    | {page} static_string
    ;



// AST shapes for entrance data; the delimiter tokens of the concrete syntax
// are not kept.
entrance_data =
      {string}   [string]:static_string
    | {pnr}      [d1]:number [d2]:number
    | {upnr}     [d1]:number [d2]:number
    | {lr}       [l1]:letter [l2]:letter
    | {external} [list]:static_string
    ;



// Exclude clause: just the string.
exclude_statment = static_string;


// Declared here, but no transformation in the Productions section of this
// file constructs it.
extract_declarations =
    {node_value} node_value+;


// Node tag, with or without a name.
node_tag =
      {noname}
    | {hasname} [name]:static_string
    ;


// One element of a node chain: a number or '*'.
node_value =
      {number} number
    | {star}   star
    ;


// Match tag, with or without a name.
regmatch_tag =
      {noname}
    | {hasname} [name]:static_string
    ;
daizhenze 2009-08-08
问下运行上面代码得出:
-- Generating parser for irs.sable in E:\IRS\IRS
org.sablecc.sablecc.parser.ParserException: [266,46] expecting: ';'
是什么原因造成的?
roki 2009-08-12
用哪个版本的SableCC?  我用的3.0的
daizhenze 2009-08-12
我用的是sablecc-2.18.2,但每次都是到那里出错,我再试试
daizhenze 2009-08-12
还是不行,我现在用的是sablecc-3.2的,没找到你那个版本,能不能把你的源程序发一份给我,我自己慢慢找吧,看看有什么地方我做的和你不一样。谢谢!
我的邮箱daizhenze@126.com
Global site tag (gtag.js) - Google Analytics