summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2012-04-19 21:45:46 -0700
committerKaz Kylheku <kaz@kylheku.com>2012-04-19 21:45:46 -0700
commite63c7be49e144d2ed3967c28243109342e17dcaa (patch)
tree30342ab840f281739c11f181c273fce74cfa82af
parent12601aaa1ca2af7d685249b1c762458ad194889f (diff)
downloadtxr-e63c7be49e144d2ed3967c28243109342e17dcaa.tar.gz
txr-e63c7be49e144d2ed3967c28243109342e17dcaa.tar.bz2
txr-e63c7be49e144d2ed3967c28243109342e17dcaa.zip
First cut at implementing \s, \d, \w, \S, \D and \W regex tokens.
* lib.c (init): Call regex_init. * parser.l: return new REGTOKEN kind. * parser.y (REGTOKEN): New token type. (regterm): Translate REGTOKEN to keyword. (regclass): Restructured to handle inherited nodes as lists. (regclassterm): Produce $$ as list. Add handling for REGTOKEN occurring inside character class by expanding it. This might not be the best approach. (yybadtoken): Handle REGTOKEN in switch. * regex.c (struct any_char_set, struct small_char_set, struct displaced_char_set, struct large_char_set, struct xlarge_char_set): New bitfield member, stat. (char_set_create): New parameter for indicating static char set. (char_set_destroy): Do not free a static char set. (char_set_compile): Pass zero to new parameter of char_set_create. (spaces): New static array. (space_cs, digit_cs, word_cs, cspace_cs, cdigit_cs, cword_cs): New static pointers to char_set_t. (init_special_char_sets, nfa_compile_given_set): New static functions. (nfa_compile_regex, dv_compile_regex): Handle new character set token keywords. (space_k, digit_k, word_char_k, cspace_k, cdigit_k, cword_char_k, regex_space_chars): New variables. (regex_init): New function. * regex.h (space_k, digit_k, word_char_k, cspace_k, cdigit_k, cword_char_k, regex_space_chars, regex_init): Declared.
-rw-r--r--ChangeLog35
-rw-r--r--lib.c1
-rw-r--r--parser.l5
-rw-r--r--parser.y43
-rw-r--r--regex.c107
-rw-r--r--regex.h6
6 files changed, 188 insertions, 9 deletions
diff --git a/ChangeLog b/ChangeLog
index b44a9c11..92f9cfcc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,38 @@
+2012-04-19 Kaz Kylheku <kaz@kylheku.com>
+
+ First cut at implementing \s, \d, \w, \S, \D and \W regex tokens.
+
+ * lib.c (init): Call regex_init.
+
+ * parser.l: return new REGTOKEN kind.
+
+ * parser.y (REGTOKEN): New token type.
+ (regterm): Translate REGTOKEN to keyword.
+ (regclass): Restructured to handle inherited nodes as lists.
+ (regclassterm): Produce $$ as list. Add handling for REGTOKEN
+ occurring inside character class by expanding it. This might not
+ be the best approach.
+ (yybadtoken): Handle REGTOKEN in switch.
+
+ * regex.c (struct any_char_set, struct small_char_set,
+ struct displaced_char_set, struct large_char_set,
+ struct xlarge_char_set): New bitfield member, stat.
+ (char_set_create): New parameter for indicating static char set.
+ (char_set_destroy): Do not free a static char set.
+ (char_set_compile): Pass zero to new parameter of char_set_create.
+ (spaces): New static array.
+ (space_cs, digit_cs, word_cs, cspace_cs, cdigit_cs, cword_cs): New
+ static pointers to char_set_t.
+ (init_special_char_sets, nfa_compile_given_set): New static functions.
+ (nfa_compile_regex, dv_compile_regex): Handle new character set token
+ keywords.
+ (space_k, digit_k, word_char_k, cspace_k, cdigit_k, cword_char_k,
+ regex_space_chars): New variables.
+ (regex_init): New function.
+
+ * regex.h (space_k, digit_k, word_char_k, cspace_k, cdigit_k,
+ cword_char_k, regex_space_chars, regex_init): Declared.
+
2012-04-15 Kaz Kylheku <kaz@kylheku.com>
* eval.c (eval_init): New intrinsic functions remq*, remql*,
diff --git a/lib.c b/lib.c
index 979a4889..acaccf72 100644
--- a/lib.c
+++ b/lib.c
@@ -4622,6 +4622,7 @@ void init(const wchar_t *pn, mem_t *(*oom)(mem_t *, size_t),
eval_init();
filter_init();
hash_init();
+ regex_init();
gc_state(gc_save);
}
diff --git a/parser.l b/parser.l
index 52aab27c..344684fe 100644
--- a/parser.l
+++ b/parser.l
@@ -574,6 +574,11 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
return REGCHAR;
}
+<REGEX>[\\][sSdDwW] {
+ yylval.chr = yytext[1]; /* yytext[0] is the backslash; pass on only the class letter */
+ return REGTOKEN;
+}
+
<REGEX>{WS}[\\]\n{WS} {
lineno++;
}
diff --git a/parser.y b/parser.y
index 7a058d60..c57bc901 100644
--- a/parser.y
+++ b/parser.y
@@ -76,7 +76,7 @@ static val parsed_spec;
%token <val> NUMBER METANUM
-%token <chr> REGCHAR LITCHAR
+%token <chr> REGCHAR REGTOKEN LITCHAR
%token <chr> METAPAR METABKT SPLICE
%type <val> spec clauses clauses_opt clause
@@ -105,7 +105,7 @@ static val parsed_spec;
%left '|' '/'
%left '&'
%right '~' '*' '?' '+' '%'
-%right '.' REGCHAR LITCHAR
+%right '.' REGCHAR REGTOKEN LITCHAR
%right DOTDOT
%%
@@ -796,18 +796,48 @@ regterm : regterm '*' { $$ = list(zeroplus_s, $1, nao); }
| ']' { $$ = chr(']'); }
| '-' { $$ = chr('-'); }
| REGCHAR { $$ = chr($1); }
+ | REGTOKEN { switch ($1)
+ { case 's':
+ $$ = space_k; break;
+ case 'S':
+ $$ = cspace_k; break;
+ case 'd':
+ $$ = digit_k; break;
+ case 'D':
+ $$ = cdigit_k; break;
+ case 'w':
+ $$ = word_char_k; break;
+ case 'W':
+ $$ = cword_char_k; break; }}
| '(' regexpr ')' { $$ = $2; }
| '(' error { $$ = nil;
yybadtoken(yychar,
lit("regex subexpression")); }
;
-regclass : regclassterm { $$ = cons($1, nil); }
- | regclassterm regclass { $$ = cons($1, $2); }
+regclass : regclassterm { $$ = $1; /* a regclassterm is already a list of class items */ }
+ | regclassterm regclass { $$ = nappend2($1, $2); /* NOTE(review): nappend2 presumably splices destructively, so $1 must be a fresh list -- confirm */ }
;
-regclassterm : regrange { $$ = $1; }
- | regchar { $$ = chr($1); }
+regclassterm : regrange { $$ = cons($1, nil); }
+ | regchar { $$ = cons(chr($1), nil); }
+ | REGTOKEN { switch ($1) /* expand \s, \d, \w into explicit class items */
+ { case 's': /* copy: regclass splices term lists with nappend2, which must not touch the shared global */
+ $$ = copy_list(regex_space_chars);
+ break;
+ case 'd':
+ $$ = cons(cons(chr('0'), chr('9')), nil);
+ break;
+ case 'w':
+ $$ = list(cons(chr('A'), chr('Z')),
+ cons(chr('a'), chr('z')),
+ chr('_'), nao);
+ break;
+ default: $$ = nil; /* \S, \D, \W: report the error, but still give $$ a valid (empty) list */
+ yyerrorf(lit("complemented token "
+ "\\~a not allowed "
+ "in regex character class"),
+ chr($1), nao); } }
;
regrange : regchar '-' regchar { $$ = cons(chr($1), chr($3)); }
@@ -1130,6 +1160,7 @@ void yybadtoken(int tok, val context)
case FINALLY: problem = lit("\"finally\""); break;
case NUMBER: problem = lit("number"); break;
case REGCHAR: problem = lit("regular expression character"); break;
+ case REGTOKEN: problem = lit("regular expression token"); break;
case LITCHAR: problem = lit("string literal character"); break;
case METAPAR: problem = lit("@("); break;
case METABKT: problem = lit("@["); break;
diff --git a/regex.c b/regex.c
index 20774651..9670b901 100644
--- a/regex.c
+++ b/regex.c
@@ -38,6 +38,7 @@
#include "unwind.h"
#include "regex.h"
#include "txr.h"
+#include "gc.h"
#if WCHAR_MAX > 65535
#define FULL_UNICODE
@@ -99,17 +100,20 @@ typedef cset_L2_t *cset_L3_t[17];
struct any_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned stat : 1; /* nonzero marks a static set: char_set_destroy returns without freeing it */
};
struct small_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned stat : 1; /* same position as any_char_set.stat; nonzero means never freed */
cset_L0_t bitcell;
};
struct displaced_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned stat : 1; /* same position as any_char_set.stat; nonzero means never freed */
cset_L0_t bitcell;
wchar_t base;
};
@@ -118,6 +122,7 @@ struct displaced_char_set {
struct large_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned stat : 1; /* same position as any_char_set.stat; nonzero means never freed */
cset_L2_t dir;
};
@@ -125,6 +130,7 @@ struct large_char_set {
struct xlarge_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned stat : 1; /* same position as any_char_set.stat; nonzero means never freed */
cset_L3_t dir;
};
#endif
@@ -472,12 +478,13 @@ static void L3_free(cset_L3_t *L3)
#endif
-static char_set_t *char_set_create(chset_type_t type, wchar_t base)
+static char_set_t *char_set_create(chset_type_t type, wchar_t base, unsigned st)
{
static char_set_t blank;
char_set_t *cs = (char_set_t *) chk_malloc(sizeof *cs);
*cs = blank;
cs->any.type = type;
+ cs->any.stat = st;
if (type == CHSET_DISPLACED)
cs->d.base = base;
@@ -487,6 +494,9 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base)
static void char_set_destroy(char_set_t *set)
{
+ if (set->any.stat)
+ return;
+
switch (set->any.type) {
case CHSET_DISPLACED:
case CHSET_SMALL:
@@ -644,7 +654,7 @@ static char_set_t *char_set_compile(val args, val comp)
{
- char_set_t *set = char_set_create(cst, min);
+ char_set_t *set = char_set_create(cst, min, 0);
for (iter = args; iter; iter = rest(iter)) {
val item = first(iter);
@@ -669,6 +679,48 @@ static char_set_t *char_set_compile(val args, val comp)
}
}
+static wchar_t spaces[] = { /* code points matched by \s; zero-terminated for the loop in init_special_char_sets */
+ 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x0020, 0x00a0, 0x1680, 0x180e,
+ 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008,
+ 0x2009, 0x200a, 0x2028, 0x2029, 0x205f, 0x3000, 0
+};
+
+static char_set_t *space_cs, *digit_cs, *word_cs;
+static char_set_t *cspace_cs, *cdigit_cs, *cword_cs;
+
+static void init_special_char_sets(void) /* builds the six shared sets behind \s \d \w \S \D \W */
+{
+ int i;
+
+ space_cs = char_set_create(CHSET_LARGE, 0, 1); /* third argument 1: static set, skipped by char_set_destroy */
+ cspace_cs = char_set_create(CHSET_LARGE, 0, 1);
+ digit_cs = char_set_create(CHSET_SMALL, 0, 1);
+ cdigit_cs = char_set_create(CHSET_SMALL, 0, 1);
+ word_cs = char_set_create(CHSET_SMALL, 0, 1);
+ cword_cs = char_set_create(CHSET_SMALL, 0, 1);
+
+ char_set_compl(cspace_cs); /* the c* sets get the same members below, but are marked complemented */
+ char_set_compl(cdigit_cs);
+ char_set_compl(cword_cs);
+
+ for (i = 0; spaces[i] != 0; i++) { /* spaces[] ends with a 0 sentinel */
+ wchar_t sp = spaces[i];
+ char_set_add(space_cs, sp);
+ char_set_add(cspace_cs, sp);
+ push(chr(sp), &regex_space_chars); /* same characters as a list, used by the parser inside [...] classes */
+ }
+
+ char_set_add_range(digit_cs, '0', '9');
+ char_set_add_range(cdigit_cs, '0', '9');
+
+ char_set_add_range(word_cs, 'A', 'Z');
+ char_set_add_range(cword_cs, 'A', 'Z');
+ char_set_add_range(word_cs, 'a', 'z'); /* whole lowercase range; was 'a'..'a', which left out b-z */
+ char_set_add_range(cword_cs, 'a', 'z');
+ char_set_add(word_cs, '_');
+ char_set_add(cword_cs, '_');
+}
+
static void char_set_cobj_destroy(val chset)
{
char_set_t *set = (char_set_t *) chset->co.handle;
@@ -812,6 +864,13 @@ static nfa_t nfa_compile_set(val args, val comp)
return nfa_make(s, acc);
}
+static nfa_t nfa_compile_given_set(char_set_t *set) /* two-state NFA: one transition on the given set, then accept */
+{
+ nfa_state_t *accept = nfa_state_accept();
+ nfa_state_t *start = nfa_state_set(accept, set);
+ return nfa_make(start, accept);
+}
+
static nfa_t nfa_compile_regex(val regex);
/*
@@ -852,6 +911,18 @@ static nfa_t nfa_compile_regex(val exp)
nfa_state_t *acc = nfa_state_accept();
nfa_state_t *s = nfa_state_wild(acc);
return nfa_make(s, acc);
+ } else if (exp == space_k) {
+ return nfa_compile_given_set(space_cs);
+ } else if (exp == digit_k) {
+ return nfa_compile_given_set(digit_cs);
+ } else if (exp == word_char_k) {
+ return nfa_compile_given_set(word_cs);
+ } else if (exp == cspace_k) {
+ return nfa_compile_given_set(cspace_cs);
+ } else if (exp == cdigit_k) {
+ return nfa_compile_given_set(cdigit_cs);
+ } else if (exp == cword_char_k) {
+ return nfa_compile_given_set(cword_cs);
} else if (consp(exp)) {
val sym = first(exp), args = rest(exp);
@@ -1178,7 +1249,19 @@ static val reg_nullable(val);
*/
static val dv_compile_regex(val exp)
{
- if (symbolp(exp) || chrp(exp)) {
+ if (exp == space_k) {
+ return cobj((mem_t *) space_cs, chset_s, &char_set_obj_ops);
+ } else if (exp == digit_k) {
+ return cobj((mem_t *) digit_cs, chset_s, &char_set_obj_ops);
+ } else if (exp == word_char_k) {
+ return cobj((mem_t *) word_cs, chset_s, &char_set_obj_ops);
+ } else if (exp == cspace_k) {
+ return cobj((mem_t *) cspace_cs, chset_s, &char_set_obj_ops);
+ } else if (exp == cdigit_k) {
+ return cobj((mem_t *) cdigit_cs, chset_s, &char_set_obj_ops);
+ } else if (exp == cword_char_k) {
+ return cobj((mem_t *) cword_cs, chset_s, &char_set_obj_ops);
+ } else if (symbolp(exp) || chrp(exp)) {
return exp;
} else if (stringp(exp)) {
return cons(compound_s, list_str(exp));
@@ -1766,3 +1849,21 @@ val regsub(val regex, val repl, val str)
return cat_str(out, nil);
}
+
+val space_k, digit_k, word_char_k; /* keywords the parser emits for \s, \d, \w */
+val cspace_k, cdigit_k, cword_char_k; /* keywords the parser emits for \S, \D, \W */
+val regex_space_chars; /* list of chr objects, filled in by init_special_char_sets */
+
+void regex_init(void) /* one-time setup: intern the class keywords, then build the shared char sets */
+{
+ space_k = intern(lit("space"), keyword_package);
+ digit_k = intern(lit("digit"), keyword_package);
+ word_char_k = intern(lit("word-char"), keyword_package);
+ cspace_k = intern(lit("cspace"), keyword_package);
+ cdigit_k = intern(lit("cdigit"), keyword_package);
+ cword_char_k = intern(lit("cword-char"), keyword_package);
+
+ prot1(&regex_space_chars); /* NOTE(review): presumably registers the variable as a GC root; kept ahead of the pushes in init_special_char_sets -- confirm */
+
+ init_special_char_sets();
+}
diff --git a/regex.h b/regex.h
index 7c589798..4d137a0f 100644
--- a/regex.h
+++ b/regex.h
@@ -24,8 +24,14 @@
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*/
+extern val space_k, digit_k, word_char_k; /* regex AST keywords for \s, \d, \w */
+extern val cspace_k, cdigit_k, cword_char_k; /* regex AST keywords for \S, \D, \W */
+extern val regex_space_chars; /* list of the \s characters, used by parser.y inside character classes */
+
val regex_compile(val regex_sexp);
val regexp(val);
val search_regex(val haystack, val needle_regex, val start_num, val from_end);
val match_regex(val str, val regex, val pos);
val regsub(val regex, val repl, val str);
+
+void regex_init(void);