From 66a38d816de5d5932fd2d99c74538c43422ad44a Mon Sep 17 00:00:00 2001
From: Kaz Kylheku <kaz@kylheku.com>
Date: Thu, 19 Apr 2012 21:45:46 -0700
Subject: First cut at implementing \s, \d, \w, \S, \D and \W regex tokens.

* lib.c (init): Call regex_init.

* parser.l: Return new REGTOKEN token kind.

* parser.y (REGTOKEN): New token type.
(regterm): Translate REGTOKEN to keyword.
(regclass): Restructured to handle inherited nodes as lists.
(regclassterm): Produce $$ as list. Add handling for REGTOKEN
occurring inside character class by expanding it. This might not
be the best approach.
(yybadtoken): Handle REGTOKEN in switch.

* regex.c (struct any_char_set, struct small_char_set,
struct displaced_char_set, struct large_char_set,
struct xlarge_char_set): New bitfield member, stat.
(char_set_create): New parameter for indicating static char set.
(char_set_destroy): Do not free a static char set.
(char_set_compile): Pass zero to new parameter of char_set_create.
(spaces): New static array.
(space_cs, digit_cs, word_cs, cspace_cs, cdigit_cs, cword_cs): New
static pointers to char_set_t.
(init_special_char_sets, nfa_compile_given_set): New static functions.
(nfa_compile_regex, dv_compile_regex): Handle new character set token
keywords.
(space_k, digit_k, word_char_k, cspace_k, cdigit_k, cword_char_k,
regex_space_chars): New variables.
(regex_init): New function.

* regex.h (space_k, digit_k, word_char_k, cspace_k, cdigit_k,
cword_char_k, regex_space_chars, regex_init): Declared.
---
 parser.y | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

(limited to 'parser.y')
diff --git a/parser.y b/parser.y
index 7a058d60..c57bc901 100644
--- a/parser.y
+++ b/parser.y
@@ -76,7 +76,7 @@ static val parsed_spec;
 
 %token <val> NUMBER METANUM
 
-%token <chr> REGCHAR LITCHAR
+%token <chr> REGCHAR REGTOKEN LITCHAR
 %token <chr> METAPAR METABKT SPLICE
 
 %type <val> spec clauses clauses_opt clause
@@ -105,7 +105,7 @@ static val parsed_spec;
 %left '|' '/'
 %left '&' 
 %right '~' '*' '?' '+' '%'
-%right '.' REGCHAR LITCHAR
+%right '.' REGCHAR REGTOKEN LITCHAR
 %right DOTDOT
 
 %%
@@ -796,18 +796,48 @@ regterm : regterm '*'           { $$ = list(zeroplus_s, $1, nao); }
         | ']'                   { $$ = chr(']'); }
         | '-'                   { $$ = chr('-'); }
         | REGCHAR               { $$ = chr($1); }
+        | REGTOKEN              { switch ($1)
+                                  { case 's':
+                                      $$ = space_k; break;
+                                    case 'S':
+                                      $$ = cspace_k; break;
+                                    case 'd':
+                                      $$ = digit_k; break;
+                                    case 'D':
+                                      $$ = cdigit_k; break;
+                                    case 'w':
+                                      $$ = word_char_k; break;
+                                    case 'W':
+                                      $$ = cword_char_k; break; }}
         | '(' regexpr ')'       { $$ = $2; }
         | '(' error             { $$ = nil;
                                   yybadtoken(yychar,
                                              lit("regex subexpression")); }
         ;
 
-regclass : regclassterm                 { $$ = cons($1, nil); }
-         | regclassterm regclass        { $$ = cons($1, $2); }
+regclass : regclassterm                 { $$ = $1; /* each regclassterm value is already a list */ }
+         | regclassterm regclass        { $$ = nappend2($1, $2); /* join this term's list onto the rest; NOTE(review): nappend2 presumably modifies $1 in place -- confirm in lib.c */ }
          ;
 
-regclassterm : regrange         { $$ = $1; }
-             | regchar          { $$ = chr($1); }
+regclassterm : regrange         { $$ = cons($1, nil); }
+             | regchar          { $$ = cons(chr($1), nil); }
+             | REGTOKEN         { switch ($1) /* expand \s \d \w to a list of class items */
+                                  { case 's': /* copy: regclass splices our result with nappend2 */
+                                      $$ = copy_list(regex_space_chars);
+                                      break;
+                                    case 'd':
+                                      $$ = cons(cons(chr('0'), chr('9')), nil);
+                                      break;
+                                    case 'w':
+                                      $$ = list(cons(chr('A'), chr('Z')),
+                                                cons(chr('a'), chr('z')),
+                                                chr('_'), nao);
+                                      break;
+                                    default: /* any other token (\S \D \W): report, yield empty list */
+                                      yyerrorf(lit("complemented token \\~a not allowed "
+                                                   "in regex character class"),
+                                               chr($1), nao);
+                                      $$ = nil; /* else $$ keeps the <chr> bits of $1 */ } }
              ;
 
 regrange : regchar '-' regchar  { $$ = cons(chr($1), chr($3)); }
@@ -1130,6 +1160,7 @@ void yybadtoken(int tok, val context)
   case FINALLY: problem = lit("\"finally\""); break;
   case NUMBER:  problem = lit("number"); break;
   case REGCHAR: problem = lit("regular expression character"); break;
+  case REGTOKEN: problem = lit("regular expression token"); break;
   case LITCHAR: problem = lit("string literal character"); break;
   case METAPAR: problem = lit("@("); break;
   case METABKT: problem = lit("@["); break;
-- 
cgit v1.2.3