about summary refs log tree commit diff stats
path: root/dfa.c
diff options
context:
space:
mode:
author Arnold D. Robbins <arnold@skeeve.com> 2013-10-10 09:07:54 +0300
committer Arnold D. Robbins <arnold@skeeve.com> 2013-10-10 09:07:54 +0300
commit 9bac49a90fec1886de5ae898d84a0022a2a4f2f6 (patch)
tree 5280b24f5607659aa5017daee95b40f70ea253c5 /dfa.c
parent 8e4ebdf25bb000f7c84e6cba7c01975c01536f44 (diff)
download egawk-9bac49a90fec1886de5ae898d84a0022a2a4f2f6.tar.gz
egawk-9bac49a90fec1886de5ae898d84a0022a2a4f2f6.tar.bz2
egawk-9bac49a90fec1886de5ae898d84a0022a2a4f2f6.zip
Fix multibyte \s and \S in dfa.c from GNU grep.
Diffstat (limited to 'dfa.c')
-rw-r--r-- dfa.c 47
1 files changed, 39 insertions, 8 deletions
diff --git a/dfa.c b/dfa.c
index 84ccbc0c..b12d2d8b 100644
--- a/dfa.c
+++ b/dfa.c
@@ -1488,14 +1488,45 @@ lex (void)
case 'S':
if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- zeroset (ccl);
- for (c2 = 0; c2 < NOTCHAR; ++c2)
- if (isspace (c2))
- setbit (c2, ccl);
- if (c == 'S')
- notset (ccl);
- laststart = 0;
- return lasttok = CSET + charclass_index (ccl);
+ if (MB_CUR_MAX == 1)
+ {
+ zeroset (ccl);
+ for (c2 = 0; c2 < NOTCHAR; ++c2)
+ if (isspace (c2))
+ setbit (c2, ccl);
+ if (c == 'S')
+ notset (ccl);
+ laststart = 0;
+ return lasttok = CSET + charclass_index (ccl);
+ }
+
+#define PUSH_LEX_STATE(s) \
+ do \
+ { \
+ char const *lexptr_saved = lexptr; \
+ size_t lexleft_saved = lexleft; \
+ lexptr = (s); \
+ lexleft = strlen (lexptr)
+
+#define POP_LEX_STATE() \
+ lexptr = lexptr_saved; \
+ lexleft = lexleft_saved; \
+ } \
+ while (0)
+
+ /* FIXME: see if optimizing this, as is done with ANYCHAR and
+ add_utf8_anychar, makes sense. */
+
+ /* \s and \S are documented to be equivalent to [[:space:]] and
+ [^[:space:]] respectively, so tell the lexer to process those
+ strings, each minus its "already processed" '['. */
+ PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]");
+
+ lasttok = parse_bracket_exp ();
+
+ POP_LEX_STATE ();
+
+ return lasttok;
case 'w':
case 'W':