summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kaz Kylheku <kaz@kylheku.com> 2012-02-26 03:01:10 -0800
committer Kaz Kylheku <kaz@kylheku.com> 2012-02-26 03:01:10 -0800
commit 7649d799041454321809da89a5716afc19c34f3d (patch)
tree a3d1e0c24558c9c5e7842bc14327a1fe88afc5a6
parent cdd51a57490deb19a0bd3d1b77d2e2aac4d6316b (diff)
download txr-7649d799041454321809da89a5716afc19c34f3d.tar.gz
txr-7649d799041454321809da89a5716afc19c34f3d.tar.bz2
txr-7649d799041454321809da89a5716afc19c34f3d.zip
Bug #35625
* parser.l (BSCHR, BSYM, BTOK): New lexical definitions. (BRACED): New state. (grammar): Refactored so that braced variables are now handled in the BRACED state, allowing for lexical differences between braced variables and Lisp. This allows us to have the /regex/ syntax in braces, but /regex/ is just a symbol in the Lisp. The new #/ token is recognized and returned as HASH_SLASH. All rules reformatted to a more easily maintainable convention. * parser.y (HASH_SLASH): New token. (modifiers, lisp_regex): New nonterminals. (var): Grammar changed to use modifiers nonterminal instead of exprs. (var_op): Rule moved closer to var. (expr): Produces lisp_regex rather than regex. (yybadtoken): Handle HASH_SLASH in the switch statement. Bugfix: HASH_BACKSLASH was not handled. * txr.1: Documented #/regex/ syntax.
-rw-r--r-- ChangeLog 24
-rw-r--r-- parser.l 954
-rw-r--r-- parser.y 36
-rw-r--r-- txr.1 6
4 files changed, 547 insertions, 473 deletions
diff --git a/ChangeLog b/ChangeLog
index d131ea71..dff5c576 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,27 @@
+2012-02-26 Kaz Kylheku <kaz@kylheku.com>
+
+ Bug #35625
+
+ * parser.l (BSCHR, BSYM, BTOK): New lexical definitions.
+ (BRACED): New state.
+ (grammar): Refactored so that braced variables are now handled
+ in the BRACED state, allowing for lexical differences between
+ braced variables and Lisp. This allows us to have
+ the /regex/ syntax in braces, but /regex/ is just a symbol
+ in the Lisp. The new #/ token is recognized and returned
+ as HASH_SLASH. All rules reformatted to a more easily
+ maintainable convention.
+
+ * parser.y (HASH_SLASH): New token.
+ (modifiers, lisp_regex): New nonterminals.
+ (var): Grammar changed to use modifiers nonterminal instead of exprs.
+ (var_op): Rule moved closer to var.
+ (expr): Produces lisp_regex rather than regex.
+ (yybadtoken): Handle HASH_SLASH in the switch statement.
+ Bugfix: HASH_BACKSLASH was not handled.
+
+ * txr.1: Documented #/regex/ syntax.
+
2012-02-25 Kaz Kylheku <kaz@kylheku.com>
* arith.c: Updated copyright year.
diff --git a/parser.l b/parser.l
index a53f8514..732e2ce0 100644
--- a/parser.l
+++ b/parser.l
@@ -150,10 +150,13 @@ static wchar_t num_esc(char *num)
SYM [a-zA-Z0-9_]+
NUM [+\-]?[0-9]+
-NSCHR [a-zA-Z0-9!$%&*+\-<=>?\\^_~]
+BSCHR [a-zA-Z0-9!$%&*+\-<=>?\\^_~]
+BSYM {BSCHR}({BSCHR}|#)*
+NSCHR [a-zA-Z0-9!$%&*+\-<=>?\\^_~/]
NSYM {NSCHR}({NSCHR}|#)*
TOK :?{SYM}
ATNUM @{NUM}
+BTOK [:@]?{BSYM}
NTOK [:@]?{NSYM}
ID_END [^a-zA-Z0-9_]
WS [\t ]*
@@ -171,497 +174,520 @@ UANY {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
UANYN {ASCN}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
-%x SPECIAL NESTED REGEX STRLIT CHRLIT QSILIT
+%x SPECIAL BRACED NESTED REGEX STRLIT CHRLIT QSILIT
%%
-<SPECIAL,NESTED>{NUM} {
- val str = string_own(utf8_dup_from(yytext));
-
- if (yy_top_state() == INITIAL
- || yy_top_state() == QSILIT)
- yy_pop_state();
-
- yylval.num = int_str(str, num(10));
- return NUMBER;
- }
-
-<NESTED,QSILIT>{ATNUM} {
- val str = string_own(utf8_dup_from(yytext + 1));
-
- if (yy_top_state() == INITIAL
- || yy_top_state() == QSILIT)
- yy_pop_state();
- yylval.num = int_str(str, num(10));
- return METANUM;
- }
-
-<SPECIAL>{TOK} |
-<NESTED>{NTOK} {
- if (yy_top_state() == INITIAL
- || yy_top_state() == QSILIT)
- yy_pop_state();
-
- switch (yytext[0]) {
- case ':':
- yylval.lexeme = utf8_dup_from(yytext + 1);
- return KEYWORD;
- case '@':
- yylval.lexeme = utf8_dup_from(yytext + 1);
- return METAVAR;
- default:
- yylval.lexeme = utf8_dup_from(yytext);
- return IDENT;
- }
- }
-<NESTED>: {
- if (yy_top_state() == INITIAL
- || yy_top_state() == QSILIT)
- yy_pop_state();
- yylval.lexeme = utf8_dup_from("");
- return KEYWORD;
- }
-
-<SPECIAL>\({WS}all{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return ALL;
- }
-
-<SPECIAL>\({WS}some/{ID_END} {
- yy_push_state(NESTED);
- yylval.lineno = lineno;
- return SOME;
- }
-
-<SPECIAL>\({WS}none{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return NONE;
- }
-
-<SPECIAL>\({WS}maybe{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return MAYBE;
- }
-
-<SPECIAL>\({WS}cases{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return CASES;
- }
-
-<SPECIAL>\({WS}choose/{ID_END} {
- yy_push_state(NESTED);
- yylval.lineno = lineno;
- return CHOOSE;
- }
-
-<SPECIAL>\({WS}gather/{ID_END} {
- yy_push_state(NESTED);
- yylval.lineno = lineno;
- return GATHER;
- }
-
-<SPECIAL>\({WS}and{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return AND;
- }
-
-<SPECIAL>\({WS}or{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return OR;
- }
-
-<SPECIAL>\({WS}end{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return END;
- }
+<SPECIAL,NESTED,BRACED>{NUM} {
+ val str = string_own(utf8_dup_from(yytext));
+
+ if (yy_top_state() == INITIAL
+ || yy_top_state() == QSILIT)
+ yy_pop_state();
+
+ yylval.num = int_str(str, num(10));
+ return NUMBER;
+}
+
+<NESTED,QSILIT>{ATNUM} {
+ val str = string_own(utf8_dup_from(yytext + 1));
+
+ if (yy_top_state() == INITIAL
+ || yy_top_state() == QSILIT)
+ yy_pop_state();
+ yylval.num = int_str(str, num(10));
+ return METANUM;
+}
+
+<SPECIAL>{TOK} |
+<BRACED>{BTOK} |
+<NESTED>{NTOK} {
+ if (yy_top_state() == INITIAL
+ || yy_top_state() == QSILIT)
+ yy_pop_state();
+
+ switch (yytext[0]) {
+ case ':':
+ yylval.lexeme = utf8_dup_from(yytext + 1);
+ return KEYWORD;
+ case '@':
+ yylval.lexeme = utf8_dup_from(yytext + 1);
+ return METAVAR;
+ default:
+ yylval.lexeme = utf8_dup_from(yytext);
+ return IDENT;
+ }
+}
+
+<BRACED,NESTED>: {
+ if (yy_top_state() == INITIAL
+ || yy_top_state() == QSILIT)
+ yy_pop_state();
+ yylval.lexeme = utf8_dup_from("");
+ return KEYWORD;
+}
+
+<SPECIAL>\({WS}all{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return ALL;
+}
+
+<SPECIAL>\({WS}some/{ID_END} {
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return SOME;
+}
+
+<SPECIAL>\({WS}none{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return NONE;
+}
+
+<SPECIAL>\({WS}maybe{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return MAYBE;
+}
+
+<SPECIAL>\({WS}cases{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return CASES;
+}
+
+<SPECIAL>\({WS}choose/{ID_END} {
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return CHOOSE;
+}
+
+<SPECIAL>\({WS}gather/{ID_END} {
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return GATHER;
+}
+
+<SPECIAL>\({WS}and{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return AND;
+}
+
+<SPECIAL>\({WS}or{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return OR;
+}
+
+<SPECIAL>\({WS}end{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return END;
+}
<SPECIAL>\({WS}collect/{ID_END} {
- yy_push_state(NESTED);
- yylval.lineno = lineno;
- return COLLECT;
- }
-
-<SPECIAL>\({WS}coll/{ID_END} {
- yy_push_state(NESTED);
- yylval.lineno = lineno;
- return COLL;
- }
-
-<SPECIAL>\({WS}until{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return UNTIL;
- }
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return COLLECT;
+}
+
+<SPECIAL>\({WS}coll/{ID_END} {
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return COLL;
+}
+
+<SPECIAL>\({WS}until{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return UNTIL;
+}
<SPECIAL>\({WS}output/{ID_END} {
- yy_push_state(NESTED);
- yylval.lineno = lineno;
- return OUTPUT;
- }
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return OUTPUT;
+}
<SPECIAL>\({WS}repeat/{ID_END} {
- yy_push_state(NESTED);
- yylval.lineno = lineno;
- return REPEAT;
- }
-
-
-<SPECIAL>\({WS}rep/{ID_END} {
- yy_push_state(NESTED);
- yylval.lineno = lineno;
- return REP;
- }
-
-<SPECIAL>\({WS}single{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return SINGLE;
- }
-
-<SPECIAL>\({WS}first{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return FIRST;
- }
-
-<SPECIAL>\({WS}last{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return LAST;
- }
-
-<SPECIAL>\({WS}empty{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return EMPTY;
- }
-
-<SPECIAL>\({WS}mod/{ID_END} {
- yy_push_state(NESTED);
- yylval.lineno = lineno;
- return MOD;
- }
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return REPEAT;
+}
+
+
+<SPECIAL>\({WS}rep/{ID_END} {
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return REP;
+}
+
+<SPECIAL>\({WS}single{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return SINGLE;
+}
+
+<SPECIAL>\({WS}first{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return FIRST;
+}
+
+<SPECIAL>\({WS}last{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return LAST;
+}
+
+<SPECIAL>\({WS}empty{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return EMPTY;
+}
+
+<SPECIAL>\({WS}mod/{ID_END} {
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return MOD;
+}
<SPECIAL>\({WS}modlast/{ID_END} {
- yy_push_state(NESTED);
- yylval.lineno = lineno;
- return MODLAST;
- }
-
-<SPECIAL>\({WS}define/{ID_END} {
- yy_push_state(NESTED);
- yylval.lineno = lineno;
- return DEFINE;
- }
-
-<SPECIAL>\({WS}try{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return TRY;
- }
-
-<SPECIAL>\({WS}catch/{ID_END} {
- yy_push_state(NESTED);
- yylval.lineno = lineno;
- return CATCH;
- }
-
-<SPECIAL>\({WS}finally{WS}\) {
- yy_pop_state();
- yylval.lineno = lineno;
- return FINALLY;
- }
-
-<NESTED>@[\(\[] |
-<SPECIAL,NESTED>[{(\[] {
- yy_push_state(NESTED);
- if (yytext[0] == '@') {
- yylval.chr = yytext[1];
- return yytext[1] == '(' ? METAPAR : METABKT;
- }
- yylval.lineno = lineno;
- return yytext[0];
- }
-
-<SPECIAL,NESTED>,[*] {
- yylval.chr = '*';
- return SPLICE;
- }
-
-<SPECIAL,NESTED>[,'] {
- yylval.chr = yytext[0];
- return yytext[0];
- }
-
-<SPECIAL,NESTED>[})\]] {
- yy_pop_state();
- if (yy_top_state() == INITIAL
- || yy_top_state() == QSILIT)
- yy_pop_state();
- return yytext[0];
- }
-
-<SPECIAL,NESTED>{WS} { /* Eat whitespace in directive */ }
-
-<SPECIAL,NESTED>\" {
- yy_push_state(STRLIT);
- return '"';
- }
-
-<SPECIAL,NESTED>#\\ {
- yy_push_state(CHRLIT);
- return HASH_BACKSLASH;
- }
-
-<SPECIAL,NESTED>` {
- yy_push_state(QSILIT);
- return '`';
- }
-
-<NESTED># {
- return '#';
- }
-
-<NESTED>\.\. {
- yylval.lineno = lineno;
- return DOTDOT;
- }
-
-<SPECIAL>@ {
- yy_pop_state();
- yylval.lexeme = chk_strdup(L"@");
- return TEXT;
- }
-
-<SPECIAL,NESTED>\n {
- lineno++;
- }
-
-<SPECIAL,NESTED>[/] {
- yy_push_state(REGEX);
- return '/';
- }
-
-<SPECIAL,NESTED>\. {
- yylval.chr = '.';
- return '.';
- }
-
-<SPECIAL,NESTED>[\\]\n{WS} {
- yy_pop_state();
- lineno++;
- }
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return MODLAST;
+}
+
+<SPECIAL>\({WS}define/{ID_END} {
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return DEFINE;
+}
+
+<SPECIAL>\({WS}try{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return TRY;
+}
+
+<SPECIAL>\({WS}catch/{ID_END} {
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return CATCH;
+}
+
+<SPECIAL>\({WS}finally{WS}\) {
+ yy_pop_state();
+ yylval.lineno = lineno;
+ return FINALLY;
+}
+
+<SPECIAL>[{] {
+ yy_push_state(BRACED);
+ yylval.lineno = lineno;
+ return yytext[0];
+}
+
+<SPECIAL>[(\[] |
+<NESTED,BRACED>@?[(\[] {
+ yy_push_state(NESTED);
+ if (yytext[0] == '@') {
+ yylval.chr = yytext[1];
+ return yytext[1] == '(' ? METAPAR : METABKT;
+ }
+ yylval.lineno = lineno;
+ return yytext[0];
+}
+
+<NESTED>,[*] {
+ yylval.chr = '*';
+ return SPLICE;
+}
+
+<NESTED>[,'] {
+ yylval.chr = yytext[0];
+ return yytext[0];
+}
+
+<BRACED>[}] {
+ yy_pop_state();
+ if (yy_top_state() == INITIAL
+ || yy_top_state() == QSILIT)
+ yy_pop_state();
+ return yytext[0];
+}
+
+<SPECIAL,NESTED>[)\]] {
+ yy_pop_state();
+ if (yy_top_state() == INITIAL
+ || yy_top_state() == QSILIT)
+ yy_pop_state();
+ return yytext[0];
+}
+
+<SPECIAL,NESTED,BRACED>{WS} {
+ /* Eat whitespace in directive */
+}
+
+<SPECIAL,NESTED,BRACED>\" {
+ yy_push_state(STRLIT);
+ return '"';
+}
+
+<SPECIAL,NESTED,BRACED>#\\ {
+ yy_push_state(CHRLIT);
+ return HASH_BACKSLASH;
+}
+
+<SPECIAL,NESTED,BRACED>#[/] {
+ yy_push_state(REGEX);
+ return HASH_SLASH;
+}
+
+<SPECIAL,NESTED,BRACED>` {
+ yy_push_state(QSILIT);
+ return '`';
+}
+
+<NESTED,BRACED># {
+ return '#';
+}
+
+<NESTED>\.\. {
+ yylval.lineno = lineno;
+ return DOTDOT;
+}
+
+<SPECIAL>@ {
+ yy_pop_state();
+ yylval.lexeme = chk_strdup(L"@");
+ return TEXT;
+}
+
+<SPECIAL,NESTED,BRACED>\n {
+ lineno++;
+}
+
+<SPECIAL,BRACED>[/] {
+ yy_push_state(REGEX);
+ return '/';
+}
+
+<SPECIAL,NESTED>\. {
+ yylval.chr = '.';
+ return '.';
+}
+
+<SPECIAL,NESTED,BRACED>[\\]\n{WS} {
+ yy_pop_state();
+ lineno++;
+}
<SPECIAL>[\\][abtnvfre ] {
- wchar_t lexeme[2];
- lexeme[0] = char_esc(yytext[1]);
- lexeme[1] = 0;
- yylval.lexeme = chk_strdup(lexeme);
- yy_pop_state();
- return TEXT;
- }
-
-<SPECIAL>[\\](x{HEX}+|{OCT}+) {
- wchar_t lexeme[2];
- lexeme[0] = num_esc(yytext + 1);
- lexeme[1] = 0;
- yylval.lexeme = chk_strdup(lexeme);
- yy_pop_state();
- return TEXT;
- }
-
-<SPECIAL,NESTED>[;].* {
- /* comment */
- }
-<SPECIAL,NESTED>{UANYN} {
- yyerrprepf(lit("bad character in directive: '~a'"),
- string_utf8(yytext), nao);
- return ERRTOK;
- }
-
-<SPECIAL,NESTED>. {
- yyerrprepf(lit("non-UTF-8 byte in directive: "
- "'\\x~02x'"),
- num((unsigned char) yytext[0]), nao);
- return ERRTOK;
- }
-
-<REGEX>[/] {
- yylval.chr = '/';
- return '/';
- }
-
-
-<REGEX>[\\][abtnvfre\\ ] {
- yylval.chr = char_esc(yytext[1]);
- return REGCHAR;
- }
-
-<REGEX>[\\](x{HEX}+|{OCT}+);? {
- yylval.chr = num_esc(yytext + 1);
- return REGCHAR;
- }
-
-<REGEX>{WS}[\\]\n{WS} {
- lineno++;
- }
-
-<REGEX>\n {
- lineno++;
- yyerrprepf(lit("newline in regex"), nao);
- return ERRTOK;
- }
-
-<REGEX>[.*?+~&%] {
- yylval.chr = yytext[0];
- return yytext[0];
- }
+ wchar_t lexeme[2];
+ lexeme[0] = char_esc(yytext[1]);
+ lexeme[1] = 0;
+ yylval.lexeme = chk_strdup(lexeme);
+ yy_pop_state();
+ return TEXT;
+}
+
+<SPECIAL>[\\](x{HEX}+|{OCT}+) {
+ wchar_t lexeme[2];
+ lexeme[0] = num_esc(yytext + 1);
+ lexeme[1] = 0;
+ yylval.lexeme = chk_strdup(lexeme);
+ yy_pop_state();
+ return TEXT;
+}
+
+<SPECIAL,NESTED,BRACED>[;].* {
+ /* comment */
+}
+<SPECIAL,NESTED,BRACED>{UANYN} {
+ yyerrprepf(lit("bad character in directive: '~a'"),
+ string_utf8(yytext), nao);
+ return ERRTOK;
+}
+
+<SPECIAL,NESTED,BRACED>. {
+ yyerrprepf(lit("non-UTF-8 byte in directive: "
+ "'\\x~02x'"),
+ num((unsigned char) yytext[0]), nao);
+ return ERRTOK;
+}
+
+<REGEX>[/] {
+ yylval.chr = '/';
+ return '/';
+}
+
+
+<REGEX>[\\][abtnvfre\\ ] {
+ yylval.chr = char_esc(yytext[1]);
+ return REGCHAR;
+}
+
+<REGEX>[\\](x{HEX}+|{OCT}+);? {
+ yylval.chr = num_esc(yytext + 1);
+ return REGCHAR;
+}
+
+<REGEX>{WS}[\\]\n{WS} {
+ lineno++;
+}
+
+<REGEX>\n {
+ lineno++;
+ yyerrprepf(lit("newline in regex"), nao);
+ return ERRTOK;
+}
+
+<REGEX>[.*?+~&%] {
+ yylval.chr = yytext[0];
+ return yytext[0];
+}
<REGEX>[\[\]\-] {
- yylval.chr = yytext[0];
- return yytext[0];
- }
+ yylval.chr = yytext[0];
+ return yytext[0];
+}
-<REGEX>[()|] {
- yylval.chr = yytext[0];
- return yytext[0];
- }
+<REGEX>[()|] {
+ yylval.chr = yytext[0];
+ return yytext[0];
+}
-<REGEX>[\\]. {
- yylval.chr = yytext[1];
- return REGCHAR;
- }
+<REGEX>[\\]. {
+ yylval.chr = yytext[1];
+ return REGCHAR;
+}
<REGEX>{UANYN} {
- wchar_t buf[8];
- utf8_from(buf, yytext);
- yylval.chr = buf[0];
- return REGCHAR;
- }
-
-<REGEX>. {
- yyerrprepf(lit("non-UTF-8 byte in regex: '\\x~02x'"),
- num((unsigned char) yytext[0]), nao);
- return ERRTOK;
- }
-
-<INITIAL>[ ]+ {
- yylval.lexeme = utf8_dup_from(yytext);
- return SPACE;
- }
-
-<INITIAL>({UONLY}|[^@\n ])+ {
- yylval.lexeme = utf8_dup_from(yytext);
- return TEXT;
- }
-
-<INITIAL>\n {
- lineno++;
- return '\n';
- }
-
-<INITIAL>@{WS}\* {
- yy_push_state(SPECIAL);
- return '*';
- }
-
-<INITIAL>@ {
- yy_push_state(SPECIAL);
- }
-
-<INITIAL>^@[#;].*\n {
- /* eat whole line comment */
- lineno++;
- }
-
-<INITIAL>@[#;].* {
- /* comment to end of line */
- }
-
-<STRLIT>\" {
- yy_pop_state();
- return yytext[0];
- }
-
-<QSILIT>` {
- yy_pop_state();
- return yytext[0];
- }
-
-<STRLIT,QSILIT>[\\][abtnvfre"`'\\] {
- yylval.chr = char_esc(yytext[1]);
- return LITCHAR;
- }
-
-<STRLIT,QSILIT>{WS}[\\]\n{WS} {
- lineno++;
- }
+ wchar_t buf[8];
+ utf8_from(buf, yytext);
+ yylval.chr = buf[0];
+ return REGCHAR;
+}
+
+<REGEX>. {
+ yyerrprepf(lit("non-UTF-8 byte in regex: '\\x~02x'"),
+ num((unsigned char) yytext[0]), nao);
+ return ERRTOK;
+}
+
+<INITIAL>[ ]+ {
+ yylval.lexeme = utf8_dup_from(yytext);
+ return SPACE;
+}
+
+<INITIAL>({UONLY}|[^@\n ])+ {
+ yylval.lexeme = utf8_dup_from(yytext);
+ return TEXT;
+}
+
+<INITIAL>\n {
+ lineno++;
+ return '\n';
+}
+
+<INITIAL>@{WS}\* {
+ yy_push_state(SPECIAL);
+ return '*';
+}
+
+<INITIAL>@ {
+ yy_push_state(SPECIAL);
+}
+
+<INITIAL>^@[#;].*\n {
+ /* eat whole line comment */
+ lineno++;
+}
+
+<INITIAL>@[#;].* {
+ /* comment to end of line */
+}
+
+<STRLIT>\" {
+ yy_pop_state();
+ return yytext[0];
+}
+
+<QSILIT>` {
+ yy_pop_state();
+ return yytext[0];
+}
+
+<STRLIT,QSILIT>[\\][abtnvfre"`'\\] {
+ yylval.chr = char_esc(yytext[1]);
+ return LITCHAR;
+}
+
+<STRLIT,QSILIT>{WS}[\\]\n{WS} {
+ lineno++;
+}
<STRLIT,QSILIT>[\\](x{HEX}+|{OCT}+);? {
- yylval.chr = num_esc(yytext+1);
- return LITCHAR;
- }
+ yylval.chr = num_esc(yytext+1);
+ return LITCHAR;
+}
<CHRLIT>(x{HEX}+|o{OCT}+) {
- yylval.chr = num_esc(yytext);
- return LITCHAR;
- }
-
-<CHRLIT>{SYM} {
- yylval.lexeme = utf8_dup_from(yytext);
- return IDENT;
- }
-
-<CHRLIT>[^ \t\n] {
- yylval.lexeme = utf8_dup_from(yytext);
- return IDENT; /* hack */
- }
-
-<STRLIT>\n {
- yyerrprepf(lit("newline in string literal"), nao);
- lineno++;
- yylval.chr = yytext[0];
- return ERRTOK;
- }
-
-<CHRLIT>\n {
- yyerrprepf(lit("newline in character literal"), nao);
- lineno++;
- yylval.chr = yytext[0];
- return ERRTOK;
- }
-
-<QSILIT>\n {
- yyerrprepf(lit("newline in string quasiliteral"), nao);
- lineno++;
- yylval.chr = yytext[0];
- return ERRTOK;
- }
-
-<QSILIT>@ {
- yy_push_state(SPECIAL);
- }
+ yylval.chr = num_esc(yytext);
+ return LITCHAR;
+}
+
+<CHRLIT>{SYM} {
+ yylval.lexeme = utf8_dup_from(yytext);
+ return IDENT;
+}
+
+<CHRLIT>[^ \t\n] {
+ yylval.lexeme = utf8_dup_from(yytext);
+ return IDENT; /* hack */
+}
+
+<STRLIT>\n {
+ yyerrprepf(lit("newline in string literal"), nao);
+ lineno++;
+ yylval.chr = yytext[0];
+ return ERRTOK;
+}
+
+<CHRLIT>\n {
+ yyerrprepf(lit("newline in character literal"), nao);
+ lineno++;
+ yylval.chr = yytext[0];
+ return ERRTOK;
+}
+
+<QSILIT>\n {
+ yyerrprepf(lit("newline in string quasiliteral"), nao);
+ lineno++;
+ yylval.chr = yytext[0];
+ return ERRTOK;
+}
+
+<QSILIT>@ {
+ yy_push_state(SPECIAL);
+}
<STRLIT,CHRLIT,QSILIT>{UANYN} {
- wchar_t buf[8];
- utf8_from(buf, yytext);
- yylval.chr = buf[0];
- return LITCHAR;
- }
+ wchar_t buf[8];
+ utf8_from(buf, yytext);
+ yylval.chr = buf[0];
+ return LITCHAR;
+}
<STRLIT,CHRLIT,QSILIT>. {
- yyerrprepf(lit("non-UTF-8 byte in literal: '\\x~02x'"),
- num((unsigned char) yytext[0]), nao);
- return ERRTOK;
- }
+ yyerrprepf(lit("non-UTF-8 byte in literal: '\\x~02x'"),
+ num((unsigned char) yytext[0]), nao);
+ return ERRTOK;
+}
%%
diff --git a/parser.y b/parser.y
index 98f408a5..e5f06b54 100644
--- a/parser.y
+++ b/parser.y
@@ -72,7 +72,7 @@ static val parsed_spec;
%token <lineno> UNTIL COLL OUTPUT REPEAT REP SINGLE FIRST LAST EMPTY
%token <lineno> MOD MODLAST DEFINE TRY CATCH FINALLY
%token <lineno> ERRTOK /* deliberately not used in grammar */
-%token <lineno> HASH_BACKSLASH DOTDOT
+%token <lineno> HASH_BACKSLASH HASH_SLASH DOTDOT
%token <val> NUMBER METANUM
@@ -85,11 +85,11 @@ static val parsed_spec;
%type <val> clause_parts additional_parts gather_parts additional_gather_parts
%type <val> output_clause define_clause try_clause catch_clauses_opt
%type <val> line elems_opt elems clause_parts_h additional_parts_h
-%type <val> text texts elem var var_op meta_expr vector
+%type <val> text texts elem var var_op modifiers meta_expr vector
%type <val> list exprs exprs_opt expr out_clauses out_clauses_opt out_clause
%type <val> repeat_clause repeat_parts_opt o_line
%type <val> o_elems_opt o_elems_opt2 o_elems o_elem o_var rep_elem rep_parts_opt
-%type <val> regex regexpr regbranch
+%type <val> regex lisp_regex regexpr regbranch
%type <val> regterm regclass regclassterm regrange
%type <val> strlit chrlit quasilit quasi_items quasi_item litchars
%type <chr> regchar
@@ -607,9 +607,10 @@ var : IDENT { $$ = list(var_s, intern(string_own($1), nil),
nao); }
| '{' IDENT '}' elem { $$ = list(var_s, intern(string_own($2), nil),
$4, nao); }
- | '{' IDENT exprs '}' { $$ = list(var_s, intern(string_own($2), nil),
+ | '{' IDENT modifiers '}' { $$ = list(var_s, intern(string_own($2), nil),
nil, $3, nao); }
- | '{' IDENT exprs '}' elem { $$ = list(var_s, intern(string_own($2), nil),
+ | '{' IDENT modifiers '}' elem
+ { $$ = list(var_s, intern(string_own($2), nil),
$5, $3, nao); }
| var_op IDENT { $$ = list(var_s, intern(string_own($2), nil),
nil, $1, nao); }
@@ -632,6 +633,16 @@ var : IDENT { $$ = list(var_s, intern(string_own($1), nil),
yybadtoken(yychar, lit("variable spec")); }
;
+var_op : '*' { $$ = list(t, nao); }
+ ;
+
+modifiers : NUMBER { $$ = cons($1, nil); }
+ | regex { $$ = cons(cons(regex_compile(rest($1)),
+ rest($1)), nil);
+ rlcp($$, $1); }
+ | list { $$ = cons($1, nil); }
+ ;
+
o_var : IDENT { $$ = list(var_s, intern(string_own($1), nil),
nao); }
| IDENT o_elem { $$ = list(var_s, intern(string_own($1), nil),
@@ -644,9 +655,6 @@ o_var : IDENT { $$ = list(var_s, intern(string_own($1), nil),
yybadtoken(yychar, lit("variable spec")); }
;
-var_op : '*' { $$ = list(t, nao); }
- ;
-
vector : '#' list { $$ = rlcp(vector_list($2), $2); }
;
@@ -705,7 +713,7 @@ expr : IDENT { $$ = rl(intern(string_own($1), nil),
| list { $$ = $1; }
| vector { $$ = $1; }
| meta_expr { $$ = $1; }
- | regex { $$ = cons(regex_compile(rest($1)),
+ | lisp_regex { $$ = cons(regex_compile(rest($1)),
rest($1));
rlcp($$, $1); }
| chrlit { $$ = rl($1, num(lineno)); }
@@ -721,6 +729,14 @@ regex : '/' regexpr '/' { $$ = cons(regex_s, $2); end_of_regex();
end_of_regex(); }
;
+lisp_regex : HASH_SLASH regexpr '/'
+ { $$ = cons(regex_s, $2); end_of_regex();
+ rl($$, num(lineno)); }
+ | HASH_SLASH error { $$ = nil;
+ yybadtoken(yychar, lit("regex"));
+ end_of_regex(); }
+ ;
+
regexpr : regbranch { $$ = if3(cdr($1),
cons(compound_s, $1),
car($1)); }
@@ -1071,6 +1087,8 @@ void yybadtoken(int tok, val context)
case METAPAR: problem = lit("@("); break;
case METABKT: problem = lit("@["); break;
case DOTDOT: problem = lit(".."); break;
+ case HASH_BACKSLASH: problem = lit("#\\"); break;
+ case HASH_SLASH: problem = lit("#/"); break;
}
if (problem != 0)
diff --git a/txr.1 b/txr.1
index 275a6e8a..34f34bcd 100644
--- a/txr.1
+++ b/txr.1
@@ -4499,6 +4499,12 @@ according to a modified namespace lookup rule.
More details are given in the documentation for the dwim operator.
+.SS Regular Expressions
+
+In TXR Lisp, the / character can occur in symbol names, and the / token
+is a symbol. Therefore the /regex/ syntax is absent, replaced with the
+#/regex/ syntax.
+
.SS Lisp Operators
When the first element of a compound expression is an operator symbol,