diff options
Diffstat (limited to 'dfa.c')
-rw-r--r-- | dfa.c | 142 |
1 files changed, 73 insertions, 69 deletions
@@ -219,7 +219,8 @@ enum EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches the empty string. */ - BACKREF, /* BACKREF is generated by \<digit>; it + BACKREF, /* BACKREF is generated by \<digit> + or by any other construct that is not completely handled. If the scanner detects a transition on backref, it returns a kind of "semi-success" indicating that @@ -811,6 +812,45 @@ using_utf8 (void) return utf8; } +/* Return true if the current locale is known to be a unibyte locale + without multicharacter collating sequences and where range + comparisons simply use the native encoding. These locales can be + processed more efficiently. */ + +static bool +using_simple_locale (void) +{ + /* True if the native character set is known to be compatible with + the C locale. The following test isn't perfect, but it's good + enough in practice, as only ASCII and EBCDIC are in common use + and this test correctly accepts ASCII and rejects EBCDIC. */ + enum { native_c_charset = + ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12 + && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35 + && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41 + && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46 + && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59 + && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65 + && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94 + && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124 + && '}' == 125 && '~' == 126) + }; + + if (! native_c_charset || MB_CUR_MAX > 1) + return false; + else + { + static int unibyte_c = -1; + if (unibyte_c < 0) + { + char *locale = setlocale (LC_ALL, 0); + unibyte_c = (locale && (STREQ (locale, "C") + || STREQ (locale, "POSIX"))); + } + return unibyte_c; + } +} + /* Lexical analyzer. All the dross that deals with the obnoxious GNU Regex syntax bits is located here. The poor, suffering reader is referred to the GNU Regex documentation for the @@ -928,7 +968,7 @@ static const struct dfa_ctype prednames[] = { {"upper", isupper, false}, {"lower", islower, false}, {"digit", isdigit, true}, - {"xdigit", isxdigit, true}, + {"xdigit", isxdigit, false}, {"space", isspace, false}, {"punct", ispunct, false}, {"alnum", isalnum, false}, @@ -959,6 +999,10 @@ parse_bracket_exp (void) int c, c1, c2; charclass ccl; + /* True if this is a bracket expression that dfaexec is known to + process correctly. */ + bool known_bracket_exp = true; + /* Used to warn about [:space:]. Bit 0 = first character is a colon. Bit 1 = last character is a colon. @@ -1000,6 +1044,7 @@ parse_bracket_exp (void) { FETCH_WC (c, wc, _("unbalanced [")); invert = 1; + known_bracket_exp = using_simple_locale (); } else invert = 0; @@ -1014,16 +1059,14 @@ parse_bracket_exp (void) we just treat it as a bunch of ordinary characters. We can do this because we assume regex has checked for syntax errors before dfa is ever called. */ - if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) + if (c == '[') { #define MAX_BRACKET_STRING_LEN 32 char str[MAX_BRACKET_STRING_LEN + 1]; FETCH_WC (c1, wc1, _("unbalanced [")); - /* If pattern contains '[[:', '[[.', or '[[='. */ - if (c1 == ':' - /* TODO: handle '[[.' and '[[=' also for MB_CUR_MAX == 1. */ - || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '='))) + if ((c1 == ':' && (syntax_bits & RE_CHAR_CLASSES)) + || c1 == '.' || c1 == '=') { size_t len = 0; for (;;) @@ -1042,7 +1085,10 @@ parse_bracket_exp (void) /* Fetch bracket. */ FETCH_WC (c, wc, _("unbalanced [")); if (c1 == ':') - /* build character class. */ + /* Build character class. POSIX allows character + classes to match multicharacter collating elements, + but the regex code does not support that, so do not + worry about that possibility. */ { char const *class = (case_fold && (STREQ (str, "upper") @@ -1066,28 +1112,9 @@ parse_bracket_exp (void) if (pred->func (c2)) setbit_case_fold_c (c2, ccl); } + else + known_bracket_exp = false; - else if (MBS_SUPPORT && (c1 == '=' || c1 == '.')) - { - char *elem = xmemdup (str, len + 1); - - if (c1 == '=') - /* build equivalence class. */ - { - REALLOC_IF_NECESSARY (work_mbc->equivs, - equivs_al, work_mbc->nequivs + 1); - work_mbc->equivs[work_mbc->nequivs++] = elem; - } - - if (c1 == '.') - /* build collating element. */ - { - REALLOC_IF_NECESSARY (work_mbc->coll_elems, - coll_elems_al, - work_mbc->ncoll_elems + 1); - work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; - } - } colon_warning_state |= 8; /* Fetch new lookahead character. */ @@ -1109,6 +1136,16 @@ parse_bracket_exp (void) /* build range characters. */ { FETCH_WC (c2, wc2, _("unbalanced [")); + + /* A bracket expression like [a-[.aa.]] matches an unknown set. + Treat it like [-a[.aa.]] while parsing it, and + remember that the set is unknown. */ + if (c2 == '[' && *lexptr == '.') + { + known_bracket_exp = false; + c2 = ']'; + } + if (c2 == ']') { /* In the case [x-], the - is an ordinary hyphen, @@ -1146,47 +1183,11 @@ parse_bracket_exp (void) work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2); } } + else if (using_simple_locale ()) + for (; c <= c2; c++) + setbit_case_fold_c (c, ccl); else - { -#ifdef GAWK - c1 = c; - if (case_fold) - { - c1 = tolower (c1); - c2 = tolower (c2); - } - for (c = c1; c <= c2; c++) - setbit_case_fold_c (c, ccl); -#else - /* Defer to the system regex library about the meaning - of range expressions. */ - struct re_pattern_buffer re = { 0 }; - char const *compile_msg; -#if 199901 <= __STDC_VERSION__ - char pattern[] = { '[', '\\', c, '-', '\\', c2, ']' }; -#else - char pattern[] = { '[', '\\', 0, '-', '\\', 0, ']' }; - pattern[2] = c; - pattern[5] = c2; -#endif - re_set_syntax (syntax_bits | RE_BACKSLASH_ESCAPE_IN_LISTS); - compile_msg = re_compile_pattern (pattern, sizeof pattern, &re); - if (compile_msg) - dfaerror (compile_msg); - for (c = 0; c < NOTCHAR; c++) - { - char subject = c; - switch (re_match (&re, &subject, 1, 0, NULL)) - { - case 1: setbit (c, ccl); break; - case -1: break; - default: xalloc_die (); - } - } - regfree (&re); - re_set_syntax (syntax_bits); -#endif - } + known_bracket_exp = false; colon_warning_state |= 8; FETCH_WC (c1, wc1, _("unbalanced [")); @@ -1224,6 +1225,9 @@ parse_bracket_exp (void) if (colon_warning_state == 7) dfawarn (_("character class syntax is [[:space:]], not [:space:]")); + if (! known_bracket_exp) + return BACKREF; + if (MB_CUR_MAX > 1) { static charclass zeroclass; |