diff options
-rw-r--r-- | ChangeLog | 19 | ||||
-rw-r--r-- | dfa.c | 3 | ||||
-rw-r--r-- | re.c | 165 | ||||
-rw-r--r-- | regcomp.c | 37 | ||||
-rw-r--r-- | regex.h | 5 | ||||
-rw-r--r-- | test/ChangeLog | 5 | ||||
-rw-r--r-- | test/Makefile.am | 6 | ||||
-rw-r--r-- | test/Makefile.in | 11 | ||||
-rw-r--r-- | test/Maketests | 5 | ||||
-rw-r--r-- | test/regrange.awk | 30 | ||||
-rw-r--r-- | test/regrange.ok | 6 |
11 files changed, 114 insertions, 178 deletions
@@ -1,3 +1,22 @@ +Tue May 31 22:23:41 2011 Arnold D. Robbins <arnold@skeeve.com> + + In order to attain the goal of having ranges act like they are + always in the C locale, bit the bullet and did the work in + the regex and dfa engines. The pre-processing routine was missing + too many cases that a full regexp parser would catch. + + * regex.h [RE_RANGES_IGNORE_LOCALES]: New syntax bit. + (RE_SYNTAX_GNU_AWK): Use it. + * dfa.c (parse_bracket_exp): If the RE_RANGES_IGNORE_LOCALES + bit is set, ignore locales when building a range. + * re.c (expand_range): Remove function and declaration. + (add_char): Remove function and declaration. + (make_regexp): Remove use of expand_range. + (resetup): Add RE_RANGES_IGNORE_LOCALES if --traditional. + * regcomp.c (build_range_exp): Add syntax variable as last argument. + Add code to check for RE_RANGES_IGNORE_LOCALES and do the right thing. + Adjust all calls. + Sun May 29 22:48:41 2011 Arnold D. Robbins <arnold@skeeve.com> * re.c (expand_range): Handle cases where expanded range @@ -993,7 +993,8 @@ parse_bracket_exp (void) c1 = tolower (c1); c2 = tolower (c2); } - if (!hard_LC_COLLATE) + if (!hard_LC_COLLATE + || (syntax_bits & RE_RANGES_IGNORE_LOCALES)) for (c = c1; c <= c2; c++) setbit_case_fold (c, ccl); else @@ -27,7 +27,6 @@ static reg_syntax_t syn; static void check_bracket_exp(char *s, size_t len); -static char *expand_range(char *s, size_t *len); /* make_regexp --- generate compiled regular expressions */ @@ -46,8 +45,6 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal) static short no_dfa = FALSE; int has_anchor = FALSE; int may_have_range = 0; - char *newbuf; - size_t newlen; reg_syntax_t dfa_syn; /* @@ -176,24 +173,6 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal) *dest = '\0'; len = dest - buf; - if ( ! 
do_posix - && may_have_range >= 3 - && memchr(buf, '-', len) != NULL) { - newlen = len; - newbuf = expand_range(buf, & newlen); - - /* song and dance since buf & buflen are static */ - if (newlen > buflen) { - free(buf); - buf = newbuf; - buflen = newlen; - } else { - memcpy(buf, newbuf, newlen); - free(newbuf); - } - len = newlen; - } - emalloc(rp, Regexp *, sizeof(*rp), "make_regexp"); memset((char *) rp, 0, sizeof(*rp)); rp->dfareg = NULL; @@ -403,9 +382,10 @@ resetup() { if (do_posix) syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */ - else if (do_traditional) + else if (do_traditional) { syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */ - else + syn |= RE_RANGES_IGNORE_LOCALES; + } else syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */ /* @@ -622,142 +602,3 @@ again: done: s[length] = save; } - -/* add_char --- add a character to the buffer, grow it if needed */ - -static void -add_char(char **bufp, size_t *lenp, char ch, char **ptr) -{ - size_t newlen; - size_t offset; - - if (*ptr - *bufp < *lenp) { - **ptr = ch; - (*ptr)++; - return; - } - - /* have to grow the buffer and adjust the pointers */ - offset = (*ptr - *bufp); - newlen = offset * 2; - erealloc(*bufp, char *, newlen + 2, "add_char"); - *ptr = *bufp + offset; - **ptr = ch; - *lenp = newlen + 2; - (*ptr)++; -} - -/* expand_range --- turn [b-e] into [bcde] */ - -static char * -expand_range(char *s, size_t *lenp) -{ - char *sp, *sp2, *newbuf; - size_t len; - int count = 0; - size_t newbuf_len = *lenp * 2; - - emalloc(newbuf, char *, newbuf_len, "expand_range"); - - sp = s; - sp2 = newbuf; - len = *lenp; -#define copy() (add_char(& newbuf, & newbuf_len, *sp++, & sp2), len--) -#define copych(ch) (add_char(& newbuf, & newbuf_len, ch, & sp2)) -again: - while (len > 0) { - if (*sp == '\\') { - copy(); - copy(); - } - else if (*sp == '[') { - count++; - break; - } - else - copy(); - } - if (len == 0) - goto done; - - copy(); /* copy in the [ */ - if (*sp == '^') /* allow for negation of range */ - 
copy(); - - /* - * Minus as first character after [ or ^ is literal, - * just copy it and skip over. - */ - if (*sp == '-') - copy(); - - while (count > 0 && len > 0) { - if (*sp == '\\') { - copy(); - copy(); - continue; - } - if (*sp == '[') { - count++; - copy(); - continue; - } - if (*sp == ']') { - count--; - copy(); - if (count == 0) - goto again; - else - continue; - } - - if (count == 1) { - /* inside [...] but not inside [[:...:]] */ - if (*sp == '-') { - int start, end; - int i; - - if (sp[1] == ']') { /* also literal */ - copy(); - continue; - } - - /* It's a range, expand it. */ - start = sp[-1]; - if (sp[1] == '\\') { - sp++; - len--; - } - end = sp[1]; - if (end < start) - fatal(_("Invalid range end: /%.*s/"), - *lenp, s); - for (i = start + 1; i < end; i++) { - /* - * Will the special cases never end? - */ - if (i == '\\' || i == ']') { - copych('\\'); - } - copych(i); - } - sp++; - len--; - continue; - } - else - copy(); - } else { - copy(); - } - } - - if (len > 0) - goto again; - -done: - *lenp = sp2 - newbuf; - return newbuf; -} -#undef copy -#undef copych @@ -2643,13 +2643,15 @@ static reg_errcode_t internal_function # ifdef RE_ENABLE_I18N build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, - bracket_elem_t *start_elem, bracket_elem_t *end_elem) + bracket_elem_t *start_elem, bracket_elem_t *end_elem, reg_syntax_t syntax) # else /* not RE_ENABLE_I18N */ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, - bracket_elem_t *end_elem) + bracket_elem_t *end_elem, reg_syntax_t syntax) # endif /* not RE_ENABLE_I18N */ { unsigned int start_ch, end_ch; + int ignore_locales = (syntax & RE_RANGES_IGNORE_LOCALES) != 0; + /* Equivalence Classes and Character Classes can't be a range start/end. 
*/ if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, @@ -2697,7 +2699,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, return REG_ECOLLATE; cmp_buf[0] = start_wc; cmp_buf[4] = end_wc; - if (wcscoll (cmp_buf, cmp_buf + 4) > 0) + if (ignore_locales && start_wc > end_wc) + return REG_ERANGE; + else if (wcscoll (cmp_buf, cmp_buf + 4) > 0) return REG_ERANGE; /* Got valid collation sequence values, add them as a new entry. @@ -2736,12 +2740,23 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, } /* Build the table for single byte characters. */ - for (wc = 0; wc < SBC_MAX; ++wc) + if (ignore_locales) + { + for (wc = 0; wc < SBC_MAX; ++wc) + { + if (start_wc <= wc && wc <= end_wc) + bitset_set (sbcset, wc); + } + } + else { - cmp_buf[2] = wc; - if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 - && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) - bitset_set (sbcset, wc); + for (wc = 0; wc < SBC_MAX; ++wc) + { + cmp_buf[2] = wc; + if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 + && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) + bitset_set (sbcset, wc); + } } } # else /* not RE_ENABLE_I18N */ @@ -3201,14 +3216,14 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, #ifdef _LIBC *err = build_range_exp (sbcset, mbcset, &range_alloc, - &start_elem, &end_elem); + &start_elem, &end_elem, syntax); #else # ifdef RE_ENABLE_I18N *err = build_range_exp (sbcset, dfa->mb_cur_max > 1 ? mbcset : NULL, - &range_alloc, &start_elem, &end_elem); + &range_alloc, &start_elem, &end_elem, syntax); # else - *err = build_range_exp (sbcset, &start_elem, &end_elem); + *err = build_range_exp (sbcset, &start_elem, &end_elem, syntax); # endif #endif /* RE_ENABLE_I18N */ if (BE (*err != REG_NOERROR, 0)) @@ -184,6 +184,10 @@ typedef unsigned long int reg_syntax_t; /* If this bit is set, then no_sub will be set to 1 during re_compile_pattern. 
*/ # define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1) + +/* If this bit is set, then ranges act like they are in + the "C" locale. */ +# define RE_RANGES_IGNORE_LOCALES (RE_NO_SUB << 1) #endif /* This global variable defines the particular regexp syntax to use (for @@ -209,6 +213,7 @@ extern reg_syntax_t re_syntax_options; #define RE_SYNTAX_GNU_AWK \ ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \ + | RE_RANGES_IGNORE_LOCALES \ | RE_INVALID_INTERVAL_ORD) \ & ~(RE_DOT_NOT_NULL | RE_CONTEXT_INDEP_OPS \ | RE_CONTEXT_INVALID_OPS )) diff --git a/test/ChangeLog b/test/ChangeLog index f28153ab..764fd90d 100644 --- a/test/ChangeLog +++ b/test/ChangeLog @@ -1,3 +1,8 @@ +Tue May 31 22:50:28 2011 Arnold D. Robbins <arnold@skeeve.com> + + * regrange.awk, regrange.ok: New files. + * Makefile.am (regrange): New test. + Thu May 26 22:08:27 2011 Arnold D. Robbins <arnold@skeeve.com> * fpat2.awk, fpat2.ok: New files. Thanks to Pat Rankin for the cases. diff --git a/test/Makefile.am b/test/Makefile.am index 065bff1d..e3dbce04 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -579,6 +579,8 @@ EXTRA_DIST = \ regeq.awk \ regeq.in \ regeq.ok \ + regrange.awk \ + regrange.ok \ regtest.sh \ regx8bit.awk \ regx8bit.ok \ @@ -771,8 +773,8 @@ BASIC_TESTS = \ ofmta ofmtbig ofmtfidl ofmts onlynl opasnidx opasnslf paramdup \ paramres paramtyp parse1 parsefld parseme pcntplus posix2008sub \ prdupval prec printf0 printf1 prmarscl prmreuse prt1eval prtoeval \ - rand range1 rebt8b1 redfilnm regeq reindops reparse resplit rs \ - rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 rstest3 rstest4 \ + rand range1 rebt8b1 redfilnm regeq regrange reindops reparse resplit \ + rs rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 rstest3 rstest4 \ rstest5 rswhite scalar sclforin sclifin sortempty splitargv \ splitarr splitdef splitvar splitwht strcat1 strnum1 strtod subamp \ subi18n subsepnm subslash substr swaplns synerr1 synerr2 tradanch \ diff --git a/test/Makefile.in b/test/Makefile.in index 
1fa42e36..3de264e3 100644 --- a/test/Makefile.in +++ b/test/Makefile.in @@ -764,6 +764,8 @@ EXTRA_DIST = \ regeq.awk \ regeq.in \ regeq.ok \ + regrange.awk \ + regrange.ok \ regtest.sh \ regx8bit.awk \ regx8bit.ok \ @@ -956,8 +958,8 @@ BASIC_TESTS = \ ofmta ofmtbig ofmtfidl ofmts onlynl opasnidx opasnslf paramdup \ paramres paramtyp parse1 parsefld parseme pcntplus posix2008sub \ prdupval prec printf0 printf1 prmarscl prmreuse prt1eval prtoeval \ - rand range1 rebt8b1 redfilnm regeq reindops reparse resplit rs \ - rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 rstest3 rstest4 \ + rand range1 rebt8b1 redfilnm regeq regrange reindops reparse resplit \ + rs rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 rstest3 rstest4 \ rstest5 rswhite scalar sclforin sclifin sortempty splitargv \ splitarr splitdef splitvar splitwht strcat1 strnum1 strtod subamp \ subi18n subsepnm subslash substr swaplns synerr1 synerr2 tradanch \ @@ -2367,6 +2369,11 @@ regeq: @AWKPATH=$(srcdir) $(AWK) -f $@.awk < $(srcdir)/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) $(srcdir)/$@.ok _$@ && rm -f _$@ +regrange: + @echo regrange + @AWKPATH=$(srcdir) $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-$(CMP) $(srcdir)/$@.ok _$@ && rm -f _$@ + reindops: @echo reindops @AWKPATH=$(srcdir) $(AWK) -f $@.awk < $(srcdir)/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ diff --git a/test/Maketests b/test/Maketests index 7577b893..243d07b7 100644 --- a/test/Maketests +++ b/test/Maketests @@ -645,6 +645,11 @@ regeq: @AWKPATH=$(srcdir) $(AWK) -f $@.awk < $(srcdir)/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) $(srcdir)/$@.ok _$@ && rm -f _$@ +regrange: + @echo regrange + @AWKPATH=$(srcdir) $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-$(CMP) $(srcdir)/$@.ok _$@ && rm -f _$@ + reindops: @echo reindops @AWKPATH=$(srcdir) $(AWK) -f $@.awk < $(srcdir)/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? 
>>_$@ diff --git a/test/regrange.awk b/test/regrange.awk new file mode 100644 index 00000000..71879316 --- /dev/null +++ b/test/regrange.awk @@ -0,0 +1,30 @@ +# Tests due to John Haque, May 2011 +# +# The following should be fatal; can't catch them inside awk, though +# $> echo 'a' | ./gawk '/[z-a]/ { print }' +# $> echo 'A' | ./gawk '/[+-[:digit:]]/' + +BEGIN { + char[1] = "." + pat[1] = "[--\\/]" + + char[2] = "a" + pat[2] = "[]-c]" + + char[3] = "c" + pat[3] = "[[a-d]" + + char[4] = "\\" + pat[4] = "[\\[-\\]]" + + char[5] = "[.c.]" + pat[5] = "[a-[.e.]]" + + char[6] = "[.d.]" + pat[6] = "[[.c.]-[.z.]]" + + for (i = 1; i in char; i++) { + printf("\"%s\" ~ /%s/ --> %d\n", char[i], pat[i], + char[i] ~ pat[i]) + } +} diff --git a/test/regrange.ok b/test/regrange.ok new file mode 100644 index 00000000..1fa00c70 --- /dev/null +++ b/test/regrange.ok @@ -0,0 +1,6 @@ +"." ~ /[--\/]/ --> 1 +"a" ~ /[]-c]/ --> 1 +"c" ~ /[[a-d]/ --> 1 +"\" ~ /[\[-\]]/ --> 1 +"[.c.]" ~ /[a-[.e.]]/ --> 1 +"[.d.]" ~ /[[.c.]-[.z.]]/ --> 0 |