summaryrefslogtreecommitdiffstats
path: root/parser.y
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2012-04-19 21:45:46 -0700
committerKaz Kylheku <kaz@kylheku.com>2012-04-19 21:45:46 -0700
commit66a38d816de5d5932fd2d99c74538c43422ad44a (patch)
tree30342ab840f281739c11f181c273fce74cfa82af /parser.y
parent20a737a17009582fd3022fb2f67e4b472445bc4f (diff)
downloadtxr-66a38d816de5d5932fd2d99c74538c43422ad44a.tar.gz
txr-66a38d816de5d5932fd2d99c74538c43422ad44a.tar.bz2
txr-66a38d816de5d5932fd2d99c74538c43422ad44a.zip
First cut at implementing \s, \d, \w, \S, \D and \W regex tokens.
* lib.c (init): Call regex_init. * parser.l: Return new REGTOKEN token kind. * parser.y (REGTOKEN): New token type. (regterm): Translate REGTOKEN to keyword. (regclass): Restructured to handle inherited nodes as lists. (regclassterm): Produce $$ as list. Add handling for REGTOKEN occurring inside character class by expanding it. This might not be the best approach. (yybadtoken): Handle REGTOKEN in switch. * regex.c (struct any_char_set, struct small_char_set, struct displaced_char_set, struct large_char_set, struct xlarge_char_set): New bitfield member, stat. (char_set_create): New parameter for indicating static char set. (char_set_destroy): Do not free a static char set. (char_set_compile): Pass zero to new parameter of char_set_create. (spaces): New static array. (space_cs, digit_cs, word_cs, cspace_cs, cdigit_cs, cword_cs): New static pointers to char_set_t. (init_special_char_sets, nfa_compile_given_set): New static functions. (nfa_compile_regex, dv_compile_regex): Handle new character set token keywords. (space_k, digit_k, word_char_k, cspace_k, cdigit_k, cword_char_k, regex_space_chars): New variables. (regex_init): New function. * regex.h (space_k, digit_k, word_char_k, cspace_k, cdigit_k, cword_char_k, regex_space_chars, regex_init): Declared.
Diffstat (limited to 'parser.y')
-rw-r--r--parser.y43
1 files changed, 37 insertions, 6 deletions
diff --git a/parser.y b/parser.y
index 7a058d60..c57bc901 100644
--- a/parser.y
+++ b/parser.y
@@ -76,7 +76,7 @@ static val parsed_spec;
%token <val> NUMBER METANUM
-%token <chr> REGCHAR LITCHAR
+%token <chr> REGCHAR REGTOKEN LITCHAR
%token <chr> METAPAR METABKT SPLICE
%type <val> spec clauses clauses_opt clause
@@ -105,7 +105,7 @@ static val parsed_spec;
%left '|' '/'
%left '&'
%right '~' '*' '?' '+' '%'
-%right '.' REGCHAR LITCHAR
+%right '.' REGCHAR REGTOKEN LITCHAR
%right DOTDOT
%%
@@ -796,18 +796,48 @@ regterm : regterm '*' { $$ = list(zeroplus_s, $1, nao); }
| ']' { $$ = chr(']'); }
| '-' { $$ = chr('-'); }
| REGCHAR { $$ = chr($1); }
+ | REGTOKEN { switch ($1)
+ { case 's':
+ $$ = space_k; break;
+ case 'S':
+ $$ = cspace_k; break;
+ case 'd':
+ $$ = digit_k; break;
+ case 'D':
+ $$ = cdigit_k; break;
+ case 'w':
+ $$ = word_char_k; break;
+ case 'W':
+ $$ = cword_char_k; break; }}
| '(' regexpr ')' { $$ = $2; }
| '(' error { $$ = nil;
yybadtoken(yychar,
lit("regex subexpression")); }
;
-regclass : regclassterm { $$ = cons($1, nil); }
- | regclassterm regclass { $$ = cons($1, $2); }
+regclass : regclassterm { $$ = $1; }
+ | regclassterm regclass { $$ = nappend2($1, $2); }
;
-regclassterm : regrange { $$ = $1; }
- | regchar { $$ = chr($1); }
+regclassterm : regrange { $$ = cons($1, nil); }
+ | regchar { $$ = cons(chr($1), nil); }
+ | REGTOKEN { switch ($1)
+ { case 's':
+ $$ = regex_space_chars;
+ break;
+ case 'd':
+ $$ = cons(cons(chr('0'), chr('9')), nil);
+ break;
+ case 'w':
+ $$ = list(cons(chr('A'), chr('Z')),
+ cons(chr('a'), chr('z')),
+ chr('_'), nao);
+ break;
+ default:
+ yyerrorf(lit("complemented token "
+ "\\~a not allowed "
+ "in regex character class"),
+ chr($1), nao); } }
;
regrange : regchar '-' regchar { $$ = cons(chr($1), chr($3)); }
@@ -1130,6 +1160,7 @@ void yybadtoken(int tok, val context)
case FINALLY: problem = lit("\"finally\""); break;
case NUMBER: problem = lit("number"); break;
case REGCHAR: problem = lit("regular expression character"); break;
+ case REGTOKEN: problem = lit("regular expression token"); break;
case LITCHAR: problem = lit("string literal character"); break;
case METAPAR: problem = lit("@("); break;
case METABKT: problem = lit("@["); break;