First cut at implementing \s, \d, \w, \S, \D and \W regex tokens.

* lib.c (init): Call regex_init. * parser.l: return new REGTOKEN kind. * parser.y (REGTOKEN): New token type. (regterm): Translate REGTOKEN to keyword. (regclass): Restructured to handle inherited nodes as lists. (regclassterm): Produce $$ as list. Add handling for REGTOKEN occurring inside character class by expanding it. This might not be the best approach. (yybadtoken): Handle REGTOKEN in switch. * regex.c (struct any_char_set, struct small_char_set, struct displaced_char_set, struct large_char_set, struct xlarge_char_set): New bitfield member, stat. (char_set_create): New parameter for indicating static char set. (char_set_destroy): Do not free a static char set. (char_set_compile): Pass zero to new parameter of char_set_create. (spaces): New static array. (space_cs, digit_cs, word_cs, cspace_cs, cdigit_cs, cword_cs): New static pointers to char_set_t. (init_special_char_sets, nfa_compile_given_set): New static functions. (nfa_compile_regex, dv_compile_regex): Handle new character set token keywords. (space_k, digit_k, word_char_k, cspace_k, cdigit_k, cword_char_k, regex_space_chars): New variables. (regex_init): New function. * regex.h (space_k, digit_k, word_char_k, cspace_k, cdigit_k, cword_char_k, regex_space_chars, regex_init): Declared.
author: Kaz Kylheku <kaz@kylheku.com> 2012-04-19 21:45:46 -0700
committer: Kaz Kylheku <kaz@kylheku.com> 2012-04-19 21:45:46 -0700
commit: e63c7be49e144d2ed3967c28243109342e17dcaa (patch)
tree: 30342ab840f281739c11f181c273fce74cfa82af
parent: 12601aaa1ca2af7d685249b1c762458ad194889f (diff)
download: txr-e63c7be49e144d2ed3967c28243109342e17dcaa.tar.gz
txr-e63c7be49e144d2ed3967c28243109342e17dcaa.tar.bz2
txr-e63c7be49e144d2ed3967c28243109342e17dcaa.zip
6 files changed, 188 insertions, 9 deletions
diff --git a/ChangeLog b/ChangeLog
index b44a9c11..92f9cfcc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,38 @@
+2012-04-19  Kaz Kylheku  <kaz@kylheku.com>
+
+	First cut at implementing \s, \d, \w, \S, \D and \W regex tokens.
+
+	* lib.c (init): Call regex_init.
+
+	* parser.l: return new REGTOKEN kind.
+
+	* parser.y (REGTOKEN): New token type.
+	(regterm): Translate REGTOKEN to keyword.
+	(regclass): Restructured to handle inherited nodes as lists.
+	(regclassterm): Produce $$ as list. Add handling for REGTOKEN
+	occurring inside character class by expanding it. This might not
+	be the best approach.
+	(yybadtoken): Handle REGTOKEN in switch.
+
+	* regex.c (struct any_char_set, struct small_char_set,
+	struct displaced_char_set, struct large_char_set,
+	struct xlarge_char_set): New bitfield member, stat.
+	(char_set_create): New parameter for indicating static char set.
+	(char_set_destroy): Do not free a static char set.
+	(char_set_compile): Pass zero to new parameter of char_set_create.
+	(spaces): New static array.
+	(space_cs, digit_cs, word_cs, cspace_cs, cdigit_cs, cword_cs): New
+	static pointers to char_set_t.
+	(init_special_char_sets, nfa_compile_given_set): New static functions.
+	(nfa_compile_regex, dv_compile_regex): Handle new character set token
+	keywords.
+	(space_k, digit_k, word_char_k, cspace_k, cdigit_k, cword_char_k,
+	regex_space_chars): New variables.
+	(regex_init): New function.
+
+	* regex.h (space_k, digit_k, word_char_k, cspace_k, cdigit_k,
+	cword_char_k, regex_space_chars, regex_init): Declared.
+
 2012-04-15  Kaz Kylheku  <kaz@kylheku.com>
 
 	* eval.c (eval_init): New intrinsic functions remq*, remql*,
diff --git a/lib.c b/lib.c
index 979a4889..acaccf72 100644
--- a/lib.c
+++ b/lib.c
@@ -4622,6 +4622,7 @@ void init(const wchar_t *pn, mem_t *(*oom)(mem_t *, size_t),
   eval_init();
   filter_init();
   hash_init();
+  regex_init();
 
   gc_state(gc_save);
 }
diff --git a/parser.l b/parser.l
index 52aab27c..344684fe 100644
--- a/parser.l
+++ b/parser.l
@@ -574,6 +574,11 @@ UONLY   {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
   return REGCHAR;
 }
 
+<REGEX>[\\][sSdDwW] {
+  yylval.chr = yytext[1];
+  return REGTOKEN;
+}
+
 <REGEX>{WS}[\\]\n{WS} {
   lineno++;
 }
diff --git a/parser.y b/parser.y
index 7a058d60..c57bc901 100644
--- a/parser.y
+++ b/parser.y
@@ -76,7 +76,7 @@ static val parsed_spec;
 
 %token <val> NUMBER METANUM
 
-%token <chr> REGCHAR LITCHAR
+%token <chr> REGCHAR REGTOKEN LITCHAR
 %token <chr> METAPAR METABKT SPLICE
 
 %type <val> spec clauses clauses_opt clause
@@ -105,7 +105,7 @@ static val parsed_spec;
 %left '|' '/'
 %left '&' 
 %right '~' '*' '?' '+' '%'
-%right '.' REGCHAR LITCHAR
+%right '.' REGCHAR REGTOKEN LITCHAR
 %right DOTDOT
 
 %%
@@ -796,18 +796,48 @@ regterm : regterm '*'           { $$ = list(zeroplus_s, $1, nao); }
         | ']'                   { $$ = chr(']'); }
         | '-'                   { $$ = chr('-'); }
         | REGCHAR               { $$ = chr($1); }
+        | REGTOKEN              { switch ($1)
+                                  { case 's':
+                                      $$ = space_k; break;
+                                    case 'S':
+                                      $$ = cspace_k; break;
+                                    case 'd':
+                                      $$ = digit_k; break;
+                                    case 'D':
+                                      $$ = cdigit_k; break;
+                                    case 'w':
+                                      $$ = word_char_k; break;
+                                    case 'W':
+                                      $$ = cword_char_k; break; }}
         | '(' regexpr ')'       { $$ = $2; }
         | '(' error             { $$ = nil;
                                   yybadtoken(yychar,
                                              lit("regex subexpression")); }
         ;
 
-regclass : regclassterm                 { $$ = cons($1, nil); }
-         | regclassterm regclass        { $$ = cons($1, $2); }
+regclass : regclassterm                 { $$ = $1; }
+         | regclassterm regclass        { $$ = nappend2($1, $2); }
          ;
 
-regclassterm : regrange         { $$ = $1; }
-             | regchar          { $$ = chr($1); }
+regclassterm : regrange         { $$ = cons($1, nil); }
+             | regchar          { $$ = cons(chr($1), nil); }
+             | REGTOKEN         { switch ($1) /* \s \d \w inside [...] */
+                                  { case 's':
+                                      $$ = regex_space_chars; break;
+                                    case 'd':
+                                      $$ = cons(cons(chr('0'), chr('9')), nil);
+                                      break;
+                                    case 'w':
+                                      $$ = list(cons(chr('A'), chr('Z')),
+                                                cons(chr('a'), chr('z')),
+                                                chr('_'), nao); break;
+                                    default:
+                                      /* \S \D \W: error, but still yield nil */
+                                      $$ = nil;
+                                      yyerrorf(lit("complemented token "
+                                                   "\\~a not allowed "
+                                                   "in regex character class"),
+                                               chr($1), nao); } }
              ;
 
 regrange : regchar '-' regchar  { $$ = cons(chr($1), chr($3)); }
@@ -1130,6 +1160,7 @@ void yybadtoken(int tok, val context)
   case FINALLY: problem = lit("\"finally\""); break;
   case NUMBER:  problem = lit("number"); break;
   case REGCHAR: problem = lit("regular expression character"); break;
+  case REGTOKEN: problem = lit("regular expression token"); break;
   case LITCHAR: problem = lit("string literal character"); break;
   case METAPAR: problem = lit("@("); break;
   case METABKT: problem = lit("@["); break;
diff --git a/regex.c b/regex.c
index 20774651..9670b901 100644
--- a/regex.c
+++ b/regex.c
@@ -38,6 +38,7 @@
 #include "unwind.h"
 #include "regex.h"
 #include "txr.h"
+#include "gc.h"
 
 #if WCHAR_MAX > 65535
 #define FULL_UNICODE
@@ -99,17 +100,20 @@ typedef cset_L2_t *cset_L3_t[17];
 struct any_char_set {
   unsigned type : 3;
   unsigned comp : 1;
+  unsigned stat : 1;
 };
 
 struct small_char_set {
   unsigned type : 3;
   unsigned comp : 1;
+  unsigned stat : 1;
   cset_L0_t bitcell;
 };
 
 struct displaced_char_set {
   unsigned type : 3;
   unsigned comp : 1;
+  unsigned stat : 1;
   cset_L0_t bitcell;
   wchar_t base;
 };
@@ -118,6 +122,7 @@ struct displaced_char_set {
 struct large_char_set {
   unsigned type : 3;
   unsigned comp : 1;
+  unsigned stat : 1;
   cset_L2_t dir;
 };
 
@@ -125,6 +130,7 @@ struct large_char_set {
 struct xlarge_char_set {
   unsigned type : 3;
   unsigned comp : 1;
+  unsigned stat : 1;
   cset_L3_t dir;
 };
 #endif
@@ -472,12 +478,13 @@ static void L3_free(cset_L3_t *L3)
 
 #endif
 
-static char_set_t *char_set_create(chset_type_t type, wchar_t base)
+static char_set_t *char_set_create(chset_type_t type, wchar_t base, unsigned st)
 {
   static char_set_t blank;
   char_set_t *cs = (char_set_t *) chk_malloc(sizeof *cs);
   *cs = blank;
   cs->any.type = type;
+  cs->any.stat = st;
 
   if (type == CHSET_DISPLACED)
     cs->d.base = base;
@@ -487,6 +494,9 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base)
 
 static void char_set_destroy(char_set_t *set)
 {
+  if (set->any.stat)
+    return;
+
   switch (set->any.type) {
   case CHSET_DISPLACED:
   case CHSET_SMALL:
@@ -644,7 +654,7 @@ static char_set_t *char_set_compile(val args, val comp)
 
 
   {
-    char_set_t *set = char_set_create(cst, min);
+    char_set_t *set = char_set_create(cst, min, 0);
 
     for (iter = args; iter; iter = rest(iter)) {
       val item = first(iter);
@@ -669,6 +679,48 @@ static char_set_t *char_set_compile(val args, val comp)
   }
 }
 
+static wchar_t spaces[] = { /* code points matched by \s; 0 ends the list */
+  0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x0020, 0x00a0, 0x1680, 0x180e,
+  0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008,
+  0x2009, 0x200a, 0x2028, 0x2029, 0x205f, 0x3000, 0
+};
+
+static char_set_t *space_cs, *digit_cs, *word_cs;
+static char_set_t *cspace_cs, *cdigit_cs, *cword_cs;
+
+static void init_special_char_sets(void)
+{
+  int i;
+  /* Sets for \s \d \w and complements; stat=1 exempts them from destroy. */
+  space_cs = char_set_create(CHSET_LARGE, 0, 1);
+  cspace_cs = char_set_create(CHSET_LARGE, 0, 1);
+  digit_cs = char_set_create(CHSET_SMALL, 0, 1);
+  cdigit_cs = char_set_create(CHSET_SMALL, 0, 1);
+  word_cs = char_set_create(CHSET_SMALL, 0, 1);
+  cword_cs = char_set_create(CHSET_SMALL, 0, 1);
+  /* Apply char_set_compl to the c* sets up front, then fill all six. */
+  char_set_compl(cspace_cs);
+  char_set_compl(cdigit_cs);
+  char_set_compl(cword_cs);
+  /* Spaces feed both sets plus the regex_space_chars list used by parser.y. */
+  for (i = 0; spaces[i] != 0; i++) {
+    wchar_t sp = spaces[i];
+    char_set_add(space_cs, sp);
+    char_set_add(cspace_cs, sp);
+    push(chr(sp), &regex_space_chars);
+  }
+
+  char_set_add_range(digit_cs, '0', '9');
+  char_set_add_range(cdigit_cs, '0', '9');
+  /* Word chars: A-Z, a-z and '_', the same members parser.y expands \w to. */
+  char_set_add_range(word_cs, 'A', 'Z');
+  char_set_add_range(cword_cs, 'A', 'Z');
+  char_set_add_range(word_cs, 'a', 'z');
+  char_set_add_range(cword_cs, 'a', 'z');
+  char_set_add(word_cs, '_');
+  char_set_add(cword_cs, '_');
+}
+
 static void char_set_cobj_destroy(val chset)
 {
   char_set_t *set = (char_set_t *) chset->co.handle;
@@ -812,6 +864,13 @@ static nfa_t nfa_compile_set(val args, val comp)
   return nfa_make(s, acc);
 }
 
+/* Wrap an existing char set in an NFA: one set state plus an accept state. */
+static nfa_t nfa_compile_given_set(char_set_t *set)
+{
+  nfa_state_t *accept = nfa_state_accept();
+  return nfa_make(nfa_state_set(accept, set), accept);
+}
+
 static nfa_t nfa_compile_regex(val regex);
 
 /*
@@ -852,6 +911,18 @@ static nfa_t nfa_compile_regex(val exp)
     nfa_state_t *acc = nfa_state_accept();
     nfa_state_t *s = nfa_state_wild(acc);
     return nfa_make(s, acc);
+  } else if (exp == space_k) {
+    return nfa_compile_given_set(space_cs);
+  } else if (exp == digit_k) {
+    return nfa_compile_given_set(digit_cs);
+  } else if (exp == word_char_k) {
+    return nfa_compile_given_set(word_cs);
+  } else if (exp == cspace_k) {
+    return nfa_compile_given_set(cspace_cs);
+  } else if (exp == cdigit_k) {
+    return nfa_compile_given_set(cdigit_cs);
+  } else if (exp == cword_char_k) {
+    return nfa_compile_given_set(cword_cs);
   } else if (consp(exp)) {
     val sym = first(exp), args = rest(exp);
 
@@ -1178,7 +1249,19 @@ static val reg_nullable(val);
  */
 static val dv_compile_regex(val exp)
 {
-  if (symbolp(exp) || chrp(exp)) {
+  if (exp == space_k) {
+    return cobj((mem_t *) space_cs, chset_s, &char_set_obj_ops);
+  } else if (exp == digit_k) {
+    return cobj((mem_t *) digit_cs, chset_s, &char_set_obj_ops);
+  } else if (exp == word_char_k) {
+    return cobj((mem_t *) word_cs, chset_s, &char_set_obj_ops);
+  } else if (exp == cspace_k) {
+    return cobj((mem_t *) cspace_cs, chset_s, &char_set_obj_ops);
+  } else if (exp == cdigit_k) {
+    return cobj((mem_t *) cdigit_cs, chset_s, &char_set_obj_ops);
+  } else if (exp == cword_char_k) {
+    return cobj((mem_t *) cword_cs, chset_s, &char_set_obj_ops);
+  } else if (symbolp(exp) || chrp(exp)) {
     return exp;
   } else if (stringp(exp)) {
     return cons(compound_s, list_str(exp));
@@ -1766,3 +1849,21 @@ val regsub(val regex, val repl, val str)
 
   return cat_str(out, nil);
 }
+
+val space_k, digit_k, word_char_k;
+val cspace_k, cdigit_k, cword_char_k;
+val regex_space_chars;
+
+void regex_init(void) /* one-time regex setup; called from init() in lib.c */
+{
+  space_k = intern(lit("space"), keyword_package);          /* \s */
+  digit_k = intern(lit("digit"), keyword_package);          /* \d */
+  word_char_k = intern(lit("word-char"), keyword_package);  /* \w */
+  cspace_k = intern(lit("cspace"), keyword_package);        /* \S */
+  cdigit_k = intern(lit("cdigit"), keyword_package);        /* \D */
+  cword_char_k = intern(lit("cword-char"), keyword_package); /* \W */
+  /* prot1 (gc.h) presumably roots the list before it is filled - confirm. */
+  prot1(&regex_space_chars);
+  /* Builds the six shared char sets and pushes onto regex_space_chars. */
+  init_special_char_sets();
+}
diff --git a/regex.h b/regex.h
index 7c589798..4d137a0f 100644
--- a/regex.h
+++ b/regex.h
@@ -24,8 +24,14 @@
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  */
 
+extern val space_k, digit_k, word_char_k;
+extern val cspace_k, cdigit_k, cword_char_k;
+extern val regex_space_chars;
+
 val regex_compile(val regex_sexp);
 val regexp(val);
 val search_regex(val haystack, val needle_regex, val start_num, val from_end);
 val match_regex(val str, val regex, val pos);
 val regsub(val regex, val repl, val str);
+
+void regex_init(void);
author	Kaz Kylheku <kaz@kylheku.com>	2012-04-19 21:45:46 -0700
committer	Kaz Kylheku <kaz@kylheku.com>	2012-04-19 21:45:46 -0700
commit	e63c7be49e144d2ed3967c28243109342e17dcaa (patch)
tree	30342ab840f281739c11f181c273fce74cfa82af
parent	12601aaa1ca2af7d685249b1c762458ad194889f (diff)
download	txr-e63c7be49e144d2ed3967c28243109342e17dcaa.tar.gz txr-e63c7be49e144d2ed3967c28243109342e17dcaa.tar.bz2 txr-e63c7be49e144d2ed3967c28243109342e17dcaa.zip