about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Arnold D. Robbins <arnold@skeeve.com> 2011-05-31 22:52:23 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2011-05-31 22:52:23 +0300
commit: c2efba1b80559c475a74622d16aa7361fa566251 (patch)
tree: d4bb02d4240e5e59670e45bdaff0876131d6545d
parent: 2c126c4972966714e2c3af8826c4161c30570041 (diff)
download: egawk-c2efba1b80559c475a74622d16aa7361fa566251.tar.gz
download: egawk-c2efba1b80559c475a74622d16aa7361fa566251.tar.bz2
download: egawk-c2efba1b80559c475a74622d16aa7361fa566251.zip
Rationalize range expansion in regexps.
-rw-r--r-- ChangeLog 19
-rw-r--r-- dfa.c 3
-rw-r--r-- re.c 165
-rw-r--r-- regcomp.c 37
-rw-r--r-- regex.h 5
-rw-r--r-- test/ChangeLog 5
-rw-r--r-- test/Makefile.am 6
-rw-r--r-- test/Makefile.in 11
-rw-r--r-- test/Maketests 5
-rw-r--r-- test/regrange.awk 30
-rw-r--r-- test/regrange.ok 6
11 files changed, 114 insertions, 178 deletions
diff --git a/ChangeLog b/ChangeLog
index d3ae5ebb..28603eda 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+Tue May 31 22:23:41 2011 Arnold D. Robbins <arnold@skeeve.com>
+
+ In order to attain the goal of having ranges act like they are
+ always in the C locale, bit the bullet and did the work in
+ the regex and dfa engines. The pre-processing routine was failing
+ to handle too many cases that a full regexp parser would catch.
+
+ * regex.h [RE_RANGES_IGNORE_LOCALES]: New syntax bit.
+ (RE_SYNTAX_GNU_AWK): Use it.
+ * dfa.c (parse_bracket_exp): If the RE_RANGES_IGNORE_LOCALES bit
+ is set, ignore locales when building a range.
+ * re.c (expand_range): Remove function and declaration.
+ (add_char): Remove function and declaration.
+ (make_regexp): Remove use of expand_range.
+ (resetup): Add RE_RANGES_IGNORE_LOCALES if --traditional.
+ * regcomp.c (build_range_exp): Add syntax variable as last argument.
+ Add code to check for RE_RANGES_IGNORE_LOCALES and do the right thing.
+ Adjust all calls.
+
Sun May 29 22:48:41 2011 Arnold D. Robbins <arnold@skeeve.com>
* re.c (expand_range): Handle cases where expanded range
diff --git a/dfa.c b/dfa.c
index 7dbc4e9b..02f3291d 100644
--- a/dfa.c
+++ b/dfa.c
@@ -993,7 +993,8 @@ parse_bracket_exp (void)
c1 = tolower (c1);
c2 = tolower (c2);
}
- if (!hard_LC_COLLATE)
+ if (!hard_LC_COLLATE
+ || (syntax_bits & RE_RANGES_IGNORE_LOCALES))
for (c = c1; c <= c2; c++)
setbit_case_fold (c, ccl);
else
diff --git a/re.c b/re.c
index b317b096..1f220b3e 100644
--- a/re.c
+++ b/re.c
@@ -27,7 +27,6 @@
static reg_syntax_t syn;
static void check_bracket_exp(char *s, size_t len);
-static char *expand_range(char *s, size_t *len);
/* make_regexp --- generate compiled regular expressions */
@@ -46,8 +45,6 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
static short no_dfa = FALSE;
int has_anchor = FALSE;
int may_have_range = 0;
- char *newbuf;
- size_t newlen;
reg_syntax_t dfa_syn;
/*
@@ -176,24 +173,6 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
*dest = '\0';
len = dest - buf;
- if ( ! do_posix
- && may_have_range >= 3
- && memchr(buf, '-', len) != NULL) {
- newlen = len;
- newbuf = expand_range(buf, & newlen);
-
- /* song and dance since buf & buflen are static */
- if (newlen > buflen) {
- free(buf);
- buf = newbuf;
- buflen = newlen;
- } else {
- memcpy(buf, newbuf, newlen);
- free(newbuf);
- }
- len = newlen;
- }
-
emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
memset((char *) rp, 0, sizeof(*rp));
rp->dfareg = NULL;
@@ -403,9 +382,10 @@ resetup()
{
if (do_posix)
syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */
- else if (do_traditional)
+ else if (do_traditional) {
syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */
- else
+ syn |= RE_RANGES_IGNORE_LOCALES;
+ } else
syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
/*
@@ -622,142 +602,3 @@ again:
done:
s[length] = save;
}
-
-/* add_char --- add a character to the buffer, grow it if needed */
-
-static void
-add_char(char **bufp, size_t *lenp, char ch, char **ptr)
-{
- size_t newlen;
- size_t offset;
-
- if (*ptr - *bufp < *lenp) {
- **ptr = ch;
- (*ptr)++;
- return;
- }
-
- /* have to grow the buffer and adjust the pointers */
- offset = (*ptr - *bufp);
- newlen = offset * 2;
- erealloc(*bufp, char *, newlen + 2, "add_char");
- *ptr = *bufp + offset;
- **ptr = ch;
- *lenp = newlen + 2;
- (*ptr)++;
-}
-
-/* expand_range --- turn [b-e] into [bcde] */
-
-static char *
-expand_range(char *s, size_t *lenp)
-{
- char *sp, *sp2, *newbuf;
- size_t len;
- int count = 0;
- size_t newbuf_len = *lenp * 2;
-
- emalloc(newbuf, char *, newbuf_len, "expand_range");
-
- sp = s;
- sp2 = newbuf;
- len = *lenp;
-#define copy() (add_char(& newbuf, & newbuf_len, *sp++, & sp2), len--)
-#define copych(ch) (add_char(& newbuf, & newbuf_len, ch, & sp2))
-again:
- while (len > 0) {
- if (*sp == '\\') {
- copy();
- copy();
- }
- else if (*sp == '[') {
- count++;
- break;
- }
- else
- copy();
- }
- if (len == 0)
- goto done;
-
- copy(); /* copy in the [ */
- if (*sp == '^') /* allow for negation of range */
- copy();
-
- /*
- * Minus as first character after [ or ^ is literal,
- * just copy it and skip over.
- */
- if (*sp == '-')
- copy();
-
- while (count > 0 && len > 0) {
- if (*sp == '\\') {
- copy();
- copy();
- continue;
- }
- if (*sp == '[') {
- count++;
- copy();
- continue;
- }
- if (*sp == ']') {
- count--;
- copy();
- if (count == 0)
- goto again;
- else
- continue;
- }
-
- if (count == 1) {
- /* inside [...] but not inside [[:...:]] */
- if (*sp == '-') {
- int start, end;
- int i;
-
- if (sp[1] == ']') { /* also literal */
- copy();
- continue;
- }
-
- /* It's a range, expand it. */
- start = sp[-1];
- if (sp[1] == '\\') {
- sp++;
- len--;
- }
- end = sp[1];
- if (end < start)
- fatal(_("Invalid range end: /%.*s/"),
- *lenp, s);
- for (i = start + 1; i < end; i++) {
- /*
- * Will the special cases never end?
- */
- if (i == '\\' || i == ']') {
- copych('\\');
- }
- copych(i);
- }
- sp++;
- len--;
- continue;
- }
- else
- copy();
- } else {
- copy();
- }
- }
-
- if (len > 0)
- goto again;
-
-done:
- *lenp = sp2 - newbuf;
- return newbuf;
-}
-#undef copy
-#undef copych
diff --git a/regcomp.c b/regcomp.c
index ab37a6f7..e0b158d0 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2643,13 +2643,15 @@ static reg_errcode_t
internal_function
# ifdef RE_ENABLE_I18N
build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
- bracket_elem_t *start_elem, bracket_elem_t *end_elem)
+ bracket_elem_t *start_elem, bracket_elem_t *end_elem, reg_syntax_t syntax)
# else /* not RE_ENABLE_I18N */
build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
- bracket_elem_t *end_elem)
+ bracket_elem_t *end_elem, reg_syntax_t syntax)
# endif /* not RE_ENABLE_I18N */
{
unsigned int start_ch, end_ch;
+ int ignore_locales = (syntax & RE_RANGES_IGNORE_LOCALES) != 0;
+
/* Equivalence Classes and Character Classes can't be a range start/end. */
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
@@ -2697,7 +2699,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
return REG_ECOLLATE;
cmp_buf[0] = start_wc;
cmp_buf[4] = end_wc;
- if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
+ if (ignore_locales && start_wc > end_wc)
+ return REG_ERANGE;
+ else if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
return REG_ERANGE;
/* Got valid collation sequence values, add them as a new entry.
@@ -2736,12 +2740,23 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
}
/* Build the table for single byte characters. */
- for (wc = 0; wc < SBC_MAX; ++wc)
+ if (ignore_locales)
+ {
+ for (wc = 0; wc < SBC_MAX; ++wc)
+ {
+ if (start_wc <= wc && wc <= end_wc)
+ bitset_set (sbcset, wc);
+ }
+ }
+ else
{
- cmp_buf[2] = wc;
- if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
- && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
- bitset_set (sbcset, wc);
+ for (wc = 0; wc < SBC_MAX; ++wc)
+ {
+ cmp_buf[2] = wc;
+ if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
+ && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+ bitset_set (sbcset, wc);
+ }
}
}
# else /* not RE_ENABLE_I18N */
@@ -3201,14 +3216,14 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
#ifdef _LIBC
*err = build_range_exp (sbcset, mbcset, &range_alloc,
- &start_elem, &end_elem);
+ &start_elem, &end_elem, syntax);
#else
# ifdef RE_ENABLE_I18N
*err = build_range_exp (sbcset,
dfa->mb_cur_max > 1 ? mbcset : NULL,
- &range_alloc, &start_elem, &end_elem);
+ &range_alloc, &start_elem, &end_elem, syntax);
# else
- *err = build_range_exp (sbcset, &start_elem, &end_elem);
+ *err = build_range_exp (sbcset, &start_elem, &end_elem, syntax);
# endif
#endif /* RE_ENABLE_I18N */
if (BE (*err != REG_NOERROR, 0))
diff --git a/regex.h b/regex.h
index 6bc503b2..a2d120f9 100644
--- a/regex.h
+++ b/regex.h
@@ -184,6 +184,10 @@ typedef unsigned long int reg_syntax_t;
/* If this bit is set, then no_sub will be set to 1 during
re_compile_pattern. */
# define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)
+
+/* If this bit is set, then ranges act like they are in
+ the "C" locale. */
+# define RE_RANGES_IGNORE_LOCALES (RE_NO_SUB << 1)
#endif
/* This global variable defines the particular regexp syntax to use (for
@@ -209,6 +213,7 @@ extern reg_syntax_t re_syntax_options;
#define RE_SYNTAX_GNU_AWK \
((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \
+ | RE_RANGES_IGNORE_LOCALES \
| RE_INVALID_INTERVAL_ORD) \
& ~(RE_DOT_NOT_NULL | RE_CONTEXT_INDEP_OPS \
| RE_CONTEXT_INVALID_OPS ))
diff --git a/test/ChangeLog b/test/ChangeLog
index f28153ab..764fd90d 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,8 @@
+Tue May 31 22:50:28 2011 Arnold D. Robbins <arnold@skeeve.com>
+
+ * regrange.awk, regrange.ok: New files.
+ * Makefile.am (regrange): New test.
+
Thu May 26 22:08:27 2011 Arnold D. Robbins <arnold@skeeve.com>
* fpat2.awk, fpat2.ok: New files. Thanks to Pat Rankin for the cases.
diff --git a/test/Makefile.am b/test/Makefile.am
index 065bff1d..e3dbce04 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -579,6 +579,8 @@ EXTRA_DIST = \
regeq.awk \
regeq.in \
regeq.ok \
+ regrange.awk \
+ regrange.ok \
regtest.sh \
regx8bit.awk \
regx8bit.ok \
@@ -771,8 +773,8 @@ BASIC_TESTS = \
ofmta ofmtbig ofmtfidl ofmts onlynl opasnidx opasnslf paramdup \
paramres paramtyp parse1 parsefld parseme pcntplus posix2008sub \
prdupval prec printf0 printf1 prmarscl prmreuse prt1eval prtoeval \
- rand range1 rebt8b1 redfilnm regeq reindops reparse resplit rs \
- rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 rstest3 rstest4 \
+ rand range1 rebt8b1 redfilnm regeq regrange reindops reparse resplit \
+ rs rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 rstest3 rstest4 \
rstest5 rswhite scalar sclforin sclifin sortempty splitargv \
splitarr splitdef splitvar splitwht strcat1 strnum1 strtod subamp \
subi18n subsepnm subslash substr swaplns synerr1 synerr2 tradanch \
diff --git a/test/Makefile.in b/test/Makefile.in
index 1fa42e36..3de264e3 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -764,6 +764,8 @@ EXTRA_DIST = \
regeq.awk \
regeq.in \
regeq.ok \
+ regrange.awk \
+ regrange.ok \
regtest.sh \
regx8bit.awk \
regx8bit.ok \
@@ -956,8 +958,8 @@ BASIC_TESTS = \
ofmta ofmtbig ofmtfidl ofmts onlynl opasnidx opasnslf paramdup \
paramres paramtyp parse1 parsefld parseme pcntplus posix2008sub \
prdupval prec printf0 printf1 prmarscl prmreuse prt1eval prtoeval \
- rand range1 rebt8b1 redfilnm regeq reindops reparse resplit rs \
- rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 rstest3 rstest4 \
+ rand range1 rebt8b1 redfilnm regeq regrange reindops reparse resplit \
+ rs rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 rstest3 rstest4 \
rstest5 rswhite scalar sclforin sclifin sortempty splitargv \
splitarr splitdef splitvar splitwht strcat1 strnum1 strtod subamp \
subi18n subsepnm subslash substr swaplns synerr1 synerr2 tradanch \
@@ -2367,6 +2369,11 @@ regeq:
@AWKPATH=$(srcdir) $(AWK) -f $@.awk < $(srcdir)/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) $(srcdir)/$@.ok _$@ && rm -f _$@
+regrange:
+ @echo regrange
+ @AWKPATH=$(srcdir) $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-$(CMP) $(srcdir)/$@.ok _$@ && rm -f _$@
+
reindops:
@echo reindops
@AWKPATH=$(srcdir) $(AWK) -f $@.awk < $(srcdir)/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/Maketests b/test/Maketests
index 7577b893..243d07b7 100644
--- a/test/Maketests
+++ b/test/Maketests
@@ -645,6 +645,11 @@ regeq:
@AWKPATH=$(srcdir) $(AWK) -f $@.awk < $(srcdir)/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) $(srcdir)/$@.ok _$@ && rm -f _$@
+regrange:
+ @echo regrange
+ @AWKPATH=$(srcdir) $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-$(CMP) $(srcdir)/$@.ok _$@ && rm -f _$@
+
reindops:
@echo reindops
@AWKPATH=$(srcdir) $(AWK) -f $@.awk < $(srcdir)/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/regrange.awk b/test/regrange.awk
new file mode 100644
index 00000000..71879316
--- /dev/null
+++ b/test/regrange.awk
@@ -0,0 +1,30 @@
+# Tests due to John Haque, May 2011
+#
+# The following should be fatal; can't catch them inside awk, though
+# $> echo 'a' | ./gawk '/[z-a]/ { print }'
+# $> echo 'A' | ./gawk '/[+-[:digit:]]/'
+
+BEGIN {
+ char[1] = "."
+ pat[1] = "[--\\/]"
+
+ char[2] = "a"
+ pat[2] = "[]-c]"
+
+ char[3] = "c"
+ pat[3] = "[[a-d]"
+
+ char[4] = "\\"
+ pat[4] = "[\\[-\\]]"
+
+ char[5] = "[.c.]"
+ pat[5] = "[a-[.e.]]"
+
+ char[6] = "[.d.]"
+ pat[6] = "[[.c.]-[.z.]]"
+
+ for (i = 1; i in char; i++) {
+ printf("\"%s\" ~ /%s/ --> %d\n", char[i], pat[i],
+ char[i] ~ pat[i])
+ }
+}
diff --git a/test/regrange.ok b/test/regrange.ok
new file mode 100644
index 00000000..1fa00c70
--- /dev/null
+++ b/test/regrange.ok
@@ -0,0 +1,6 @@
+"." ~ /[--\/]/ --> 1
+"a" ~ /[]-c]/ --> 1
+"c" ~ /[[a-d]/ --> 1
+"\" ~ /[\[-\]]/ --> 1
+"[.c.]" ~ /[a-[.e.]]/ --> 1
+"[.d.]" ~ /[[.c.]-[.z.]]/ --> 0