summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kaz Kylheku <kaz@kylheku.com> 2010-01-19 15:16:28 -0800
committer Kaz Kylheku <kaz@kylheku.com> 2010-01-19 15:16:28 -0800
commit 7c6391bb10adc88d156ec88148184bc3eb8681ce (patch)
tree 45752f1a9de6da1ab2584c5cb10d1044e0e82ce4
parent 1b29c92e9c0e92f73aab633d59d3417a1f6c405b (diff)
download txr-7c6391bb10adc88d156ec88148184bc3eb8681ce.tar.gz
txr-7c6391bb10adc88d156ec88148184bc3eb8681ce.tar.bz2
txr-7c6391bb10adc88d156ec88148184bc3eb8681ce.zip
More regex grammar work.
-rw-r--r-- ChangeLog 11
-rw-r--r-- parser.h 1
-rw-r--r-- parser.y 3
-rw-r--r-- txr.1 7
4 files changed, 17 insertions, 5 deletions
diff --git a/ChangeLog b/ChangeLog
index 93a450de..a8247e8b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,16 @@
2010-01-19 Kaz Kylheku <kkylheku@gmail.com>
+ * parser.y (regex): Getting rid of empty '/' '/' production
+ again.
+ (regexpr): Re-introducing empty production; this time using
+ %prec LOW trick to give this interpretation the lowest
+ possible precedence. Thus expressions like /&/ work again.
+ (regbranch): New production to allow R1~R2 to be valid.
+
+ * txr.1: Documented.
+
+2010-01-19 Kaz Kylheku <kkylheku@gmail.com>
+
* parser.l (grammar): The ^ character is no longer considered
a special regex token, just a regular character.
diff --git a/parser.h b/parser.h
index e4f712b9..3a7fb720 100644
--- a/parser.h
+++ b/parser.h
@@ -36,3 +36,4 @@ void yyerror(const char *s);
void yyerrorf(val s, ...);
void yybadtoken(int tok, val context);
void end_of_regex(void);
+int yylex(void);
diff --git a/parser.y b/parser.y
index cb9d320f..b2745c2f 100644
--- a/parser.y
+++ b/parser.y
@@ -451,7 +451,6 @@ expr : IDENT { $$ = intern(string_own($1), nil); }
;
regex : '/' regexpr '/' { $$ = $2; end_of_regex(); }
- | '/' '/' { $$ = nil; end_of_regex(); }
| '/' error { $$ = nil;
yybadtoken(yychar, lit("regex"));
end_of_regex(); }
@@ -463,10 +462,12 @@ regexpr : regbranch { $$ = if3(cdr($1),
| regexpr '|' regexpr { $$ = list(or_s, $1, $3, nao); }
| regexpr '&' regexpr { $$ = list(and_s, $1, $3, nao); }
| '~' regexpr { $$ = list(compl_s, $2, nao); }
+ | /* empty */ %prec LOW { $$ = nil; }
;
regbranch : regterm %prec LOW { $$ = cons($1, nil); }
| regterm regbranch { $$ = cons($1, $2); }
+ | regterm '~' regexpr { $$ = list($1, list(compl_s, $3, nao), nao); }
;
regterm : regterm '*' { $$ = list(zeroplus_s, $1, nao); }
diff --git a/txr.1 b/txr.1
index 64403966..729107be 100644
--- a/txr.1
+++ b/txr.1
@@ -695,7 +695,7 @@ string, then R1%R2 is equivalent to R1*.
.IP ~R
match the complement of the following expression R; i.e. match
those texts that R does not match. This operator is called complement,
-or logical not.
+or logical not. The form R1~R2 is permitted and means R1(~R2).
.IP R1R2
Two consecutive regular expressions denote catenation:
the left expression must match, and then the right.
@@ -735,9 +735,8 @@ means ab((c*)%(d*ef)). The left argument of % is c*, but the right is the
entire expression d*ef.
The unary complement operator has the next lower precedence, so
-that ~A* means the ~(A*): "match the all text that is not matched by zero
-or more repetitions of A", not "match zero or more times the text
-not matched by A".
+that ~AB means ~(AB) not (~A)B. AB~CD means (AB)~(CD) where
+the (CD) is complemented, and catenated to (AB).
Catenation is on the next lower precedence rung, so that AB? means A(B?), or
"match A, and then optionally B", not "match A and B, as one optional