From 66a38d816de5d5932fd2d99c74538c43422ad44a Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Thu, 19 Apr 2012 21:45:46 -0700 Subject: First cut at implementing \s, \d, \w, \S, \D and \W regex tokens. * lib.c (init): Call regex_init. * parser.l: return new REGTOKEN kind. * parser.y (REGTOKEN): New token type. (regterm): Translate REGTOKEN to keyword. (regclass): Restructured to handle inherited nodes as lists. (regclassterm): Produce $$ as list. Add handling for REGTOKEN occurring inside character class by expanding it. This might not be the best approach. (yybadtoken): Handle REGTOKEN in switch. * regex.c (struct any_char_set, struct small_char_set, struct displaced_char_set, struct large_char_set, struct xlarge_char_set): New bitfield member, stat. (char_set_create): New parameter for indicating static char set. (char_set_destroy): Do not free a static char set. (char_set_compile): Pass zero to new parameter of char_set_create. (spaces): New static array. (space_cs, digit_cs, word_cs, cspace_cs, cdigit_cs, cword_cs): New static pointers to char_set_t. (init_special_char_sets, nfa_compile_given_set): New static functions. (nfa_compile_regex, dv_compile_regex): Handle new character set token keywords. (space_k, digit_k, word_char_k, cspace_k, cdigit_k, cword_char_k, regex_space_chars): New variables. (regex_init): New function. * regex.h (space_k, digit_k, word_char_k, cspace_k, cdigit_k, cword_char_k, regex_space_chars, regex_init): Declared. --- parser.y | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'parser.y') diff --git a/parser.y b/parser.y index 7a058d60..c57bc901 100644 --- a/parser.y +++ b/parser.y @@ -76,7 +76,7 @@ static val parsed_spec; %token NUMBER METANUM -%token REGCHAR LITCHAR +%token REGCHAR REGTOKEN LITCHAR %token METAPAR METABKT SPLICE %type spec clauses clauses_opt clause @@ -105,7 +105,7 @@ static val parsed_spec; %left '|' '/' %left '&' %right '~' '*' '?' '+' '%' -%right '.' 
REGCHAR LITCHAR +%right '.' REGCHAR REGTOKEN LITCHAR %right DOTDOT %% @@ -796,18 +796,48 @@ regterm : regterm '*' { $$ = list(zeroplus_s, $1, nao); } | ']' { $$ = chr(']'); } | '-' { $$ = chr('-'); } | REGCHAR { $$ = chr($1); } + | REGTOKEN { switch ($1) + { case 's': + $$ = space_k; break; + case 'S': + $$ = cspace_k; break; + case 'd': + $$ = digit_k; break; + case 'D': + $$ = cdigit_k; break; + case 'w': + $$ = word_char_k; break; + case 'W': + $$ = cword_char_k; break; }} | '(' regexpr ')' { $$ = $2; } | '(' error { $$ = nil; yybadtoken(yychar, lit("regex subexpression")); } ; -regclass : regclassterm { $$ = cons($1, nil); } - | regclassterm regclass { $$ = cons($1, $2); } +regclass : regclassterm { $$ = $1; } + | regclassterm regclass { $$ = nappend2($1, $2); } ; -regclassterm : regrange { $$ = $1; } - | regchar { $$ = chr($1); } +regclassterm : regrange { $$ = cons($1, nil); } + | regchar { $$ = cons(chr($1), nil); } + | REGTOKEN { switch ($1) + { case 's': + $$ = regex_space_chars; + break; + case 'd': + $$ = cons(cons(chr('0'), chr('9')), nil); + break; + case 'w': + $$ = list(cons(chr('A'), chr('Z')), + cons(chr('a'), chr('z')), + chr('_'), nao); + break; + default: + yyerrorf(lit("complemented token " + "\\~a not allowed " + "in regex character class"), + chr($1), nao); } } ; regrange : regchar '-' regchar { $$ = cons(chr($1), chr($3)); } @@ -1130,6 +1160,7 @@ void yybadtoken(int tok, val context) case FINALLY: problem = lit("\"finally\""); break; case NUMBER: problem = lit("number"); break; case REGCHAR: problem = lit("regular expression character"); break; + case REGTOKEN: problem = lit("regular expression token"); break; case LITCHAR: problem = lit("string literal character"); break; case METAPAR: problem = lit("@("); break; case METABKT: problem = lit("@["); break; -- cgit v1.2.3