11 files changed, 114 insertions, 178 deletions
diff --git a/ChangeLog b/ChangeLog
index d3ae5ebb..28603eda 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+Tue May 31 22:23:41 2011  Arnold D. Robbins  <arnold@skeeve.com>
+
+	In order to attain the goal of having ranges act like they are
+	always in the C locale, bit the bullet and did the work in
+	the regex and dfa engines. The pre-processing routine was failing
+	to handle too many cases that a full regexp parser would catch.
+
+	* regex.h [RE_RANGES_IGNORE_LOCALES]: New syntax bit.
+	(RE_SYNTAX_GNU_AWK): Use it.
+	* dfa.c (parse_bracket_exp): If the RE_RANGES_IGNORE_LOCALES
+	syntax bit is set, ignore locales when building a range.
+	* re.c (expand_range): Remove function and declaration.
+	(add_char): Remove function and declaration.
+	(make_regexp): Remove use of expand_range.
+	(resetup): Add RE_RANGES_IGNORE_LOCALES if --traditional.
+	* regcomp.c (build_range_exp): Add syntax variable as last argument.
+	Add code to check for RE_RANGES_IGNORE_LOCALES and do the right thing.
+	Adjust all calls.
+
 Sun May 29 22:48:41 2011  Arnold D. Robbins  <arnold@skeeve.com>
 
 	* re.c (expand_range): Handle cases where expanded range
diff --git a/dfa.c b/dfa.c
index 7dbc4e9b..02f3291d 100644
--- a/dfa.c
+++ b/dfa.c
@@ -993,7 +993,8 @@ parse_bracket_exp (void)
                   c1 = tolower (c1);
                   c2 = tolower (c2);
                 }
-              if (!hard_LC_COLLATE)
+              if (!hard_LC_COLLATE
+                  || (syntax_bits & RE_RANGES_IGNORE_LOCALES))
                 for (c = c1; c <= c2; c++)
                   setbit_case_fold (c, ccl);
               else
diff --git a/re.c b/re.c
index b317b096..1f220b3e 100644
--- a/re.c
+++ b/re.c
@@ -27,7 +27,6 @@
 
 static reg_syntax_t syn;
 static void check_bracket_exp(char *s, size_t len);
-static char *expand_range(char *s, size_t *len);
 
 /* make_regexp --- generate compiled regular expressions */
 
@@ -46,8 +45,6 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
 	static short no_dfa = FALSE;
 	int has_anchor = FALSE;
 	int may_have_range = 0;
-	char *newbuf;
-	size_t newlen;
 	reg_syntax_t dfa_syn;
 
 	/*
@@ -176,24 +173,6 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
 	*dest = '\0';
 	len = dest - buf;
 
-	if (   ! do_posix
-	    && may_have_range >= 3
-	    && memchr(buf, '-', len) != NULL) {
-		newlen = len;
-		newbuf = expand_range(buf, & newlen);
-
-		/* song and dance since buf & buflen are static */
-		if (newlen > buflen) {
-			free(buf);
-			buf = newbuf;
-			buflen = newlen;
-		} else {
-			memcpy(buf, newbuf, newlen);
-			free(newbuf);
-		}
-		len = newlen;
-	}
-
 	emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
 	memset((char *) rp, 0, sizeof(*rp));
 	rp->dfareg = NULL;
@@ -403,9 +382,10 @@ resetup()
 {
 	if (do_posix)
 		syn = RE_SYNTAX_POSIX_AWK;	/* strict POSIX re's */
-	else if (do_traditional)
+	else if (do_traditional) {
 		syn = RE_SYNTAX_AWK;		/* traditional Unix awk re's */
-	else
+		syn |= RE_RANGES_IGNORE_LOCALES;
+	} else
 		syn = RE_SYNTAX_GNU_AWK;	/* POSIX re's + GNU ops */
 
 	/*
@@ -622,142 +602,3 @@ again:
 done:
 	s[length] = save;
 }
-
-/* add_char --- add a character to the buffer, grow it if needed */
-
-static void
-add_char(char **bufp, size_t *lenp, char ch, char **ptr)
-{
-	size_t newlen;
-	size_t offset;
-
-	if (*ptr - *bufp < *lenp) {
-		**ptr = ch;
-		(*ptr)++;
-		return;
-	}
-
-	/* have to grow the buffer and adjust the pointers */
-	offset = (*ptr - *bufp);
-	newlen = offset * 2;
-	erealloc(*bufp, char *, newlen + 2, "add_char");
-	*ptr = *bufp + offset;
-	**ptr = ch;
-	*lenp = newlen + 2;
-	(*ptr)++;
-}
-
-/* expand_range --- turn [b-e] into [bcde] */
-
-static char *
-expand_range(char *s, size_t *lenp)
-{
-	char *sp, *sp2, *newbuf;
-	size_t len;
-	int count = 0;
-	size_t newbuf_len = *lenp * 2;
-
-	emalloc(newbuf, char *, newbuf_len, "expand_range");
-
-	sp = s;
-	sp2 = newbuf;
-	len = *lenp;
-#define copy() (add_char(& newbuf, & newbuf_len, *sp++, & sp2), len--)
-#define copych(ch) (add_char(& newbuf, & newbuf_len, ch, & sp2))
-again:
-	while (len > 0) {
-		if (*sp == '\\') {
-			copy();
-			copy();
-		}
-		else if (*sp == '[') {
-			count++;
-			break;
-		}
-		else
-			copy();
-	}
-	if (len == 0)
-		goto done;
-
-	copy();		/* copy in the [ */
-	if (*sp == '^')	/* allow for negation of range */
-		copy();
-
-	/*
-	 * Minus as first character after [ or ^ is literal,
-	 * just copy it and skip over.
-	 */
-	if (*sp == '-')
-		copy();
-
-	while (count > 0 && len > 0) {
-		if (*sp == '\\') {
-			copy();
-			copy();
-			continue;
-		}
-		if (*sp == '[') {
-			count++;
-			copy();
-			continue;
-		}
-		if (*sp == ']') {
-			count--;
-			copy();
-			if (count == 0)
-				goto again;
-			else
-				continue;
-		}
-
-		if (count == 1) {
-			/* inside [...] but not inside [[:...:]] */
-			if (*sp == '-') {
-				int start, end;
-				int i;
-
-				if (sp[1] == ']') {	/* also literal */
-					copy();
-					continue;
-				}
-
-				/* It's a range, expand it. */
-				start = sp[-1];
-				if (sp[1] == '\\') {
-					sp++;
-					len--;
-				}
-				end = sp[1];
-				if (end < start)
-					fatal(_("Invalid range end: /%.*s/"),
-								*lenp, s);
-				for (i = start + 1; i < end; i++) {
-					/*
-					 * Will the special cases never end?
-					 */
-					if (i == '\\' || i == ']') {
-						copych('\\');
-					}
-					copych(i);
-				}
-				sp++;
-				len--;
-				continue;
-			}
-			else
-				copy();
-		} else {
-			copy();
-		}
-	}
-
-	if (len > 0)
-		goto again;
-
-done:
-	*lenp = sp2 - newbuf;
-	return newbuf;
-}
-#undef copy
-#undef copych
diff --git a/regcomp.c b/regcomp.c
index ab37a6f7..e0b158d0 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2643,13 +2643,15 @@ static reg_errcode_t
 internal_function
 # ifdef RE_ENABLE_I18N
 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
-		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
+		 bracket_elem_t *start_elem, bracket_elem_t *end_elem, reg_syntax_t syntax)
 # else /* not RE_ENABLE_I18N */
 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
-		 bracket_elem_t *end_elem)
+		 bracket_elem_t *end_elem, reg_syntax_t syntax)
 # endif /* not RE_ENABLE_I18N */
 {
   unsigned int start_ch, end_ch;
+  int ignore_locales = (syntax & RE_RANGES_IGNORE_LOCALES) != 0;
+
   /* Equivalence Classes and Character Classes can't be a range start/end.  */
   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
@@ -2697,7 +2699,9 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
       return REG_ECOLLATE;
     cmp_buf[0] = start_wc;
     cmp_buf[4] = end_wc;
-    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
+    if (ignore_locales && start_wc > end_wc)
+      return REG_ERANGE;
+    else if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
       return REG_ERANGE;
 
     /* Got valid collation sequence values, add them as a new entry.
@@ -2736,12 +2740,23 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
       }
 
     /* Build the table for single byte characters.  */
-    for (wc = 0; wc < SBC_MAX; ++wc)
+    if (ignore_locales)
+      {
+        for (wc = 0; wc < SBC_MAX; ++wc)
+          {
+    	     if (start_wc <= wc && wc <= end_wc)
+    	       bitset_set (sbcset, wc);
+          }
+      }
+    else
       {
-	cmp_buf[2] = wc;
-	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
-	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
-	  bitset_set (sbcset, wc);
+        for (wc = 0; wc < SBC_MAX; ++wc)
+          {
+    	     cmp_buf[2] = wc;
+    	     if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
+    	         && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+    	       bitset_set (sbcset, wc);
+          }
       }
   }
 # else /* not RE_ENABLE_I18N */
@@ -3201,14 +3216,14 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 
 #ifdef _LIBC
 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
-				  &start_elem, &end_elem);
+				  &start_elem, &end_elem, syntax);
 #else
 # ifdef RE_ENABLE_I18N
 	  *err = build_range_exp (sbcset,
 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
-				  &range_alloc, &start_elem, &end_elem);
+				  &range_alloc, &start_elem, &end_elem, syntax);
 # else
-	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
+	  *err = build_range_exp (sbcset, &start_elem, &end_elem, syntax);
 # endif
 #endif /* RE_ENABLE_I18N */
 	  if (BE (*err != REG_NOERROR, 0))
diff --git a/regex.h b/regex.h
index 6bc503b2..a2d120f9 100644
--- a/regex.h
+++ b/regex.h
@@ -184,6 +184,10 @@ typedef unsigned long int reg_syntax_t;
 /* If this bit is set, then no_sub will be set to 1 during
    re_compile_pattern.  */
 # define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)
+
+/* If this bit is set, then ranges act like they are in
+   the "C" locale.  */
+# define RE_RANGES_IGNORE_LOCALES (RE_NO_SUB << 1)
 #endif
 
 /* This global variable defines the particular regexp syntax to use (for
@@ -209,6 +213,7 @@ extern reg_syntax_t re_syntax_options;
 
 #define RE_SYNTAX_GNU_AWK						\
   ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS		\
+    | RE_RANGES_IGNORE_LOCALES                                          \
     | RE_INVALID_INTERVAL_ORD)						\
    & ~(RE_DOT_NOT_NULL | RE_CONTEXT_INDEP_OPS				\
       | RE_CONTEXT_INVALID_OPS ))
diff --git a/test/ChangeLog b/test/ChangeLog
index f28153ab..764fd90d 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,8 @@
+Tue May 31 22:50:28 2011  Arnold D. Robbins  <arnold@skeeve.com>
+
+	* regrange.awk, regrange.ok: New files.
+	* Makefile.am (regrange): New test.
+
 Thu May 26 22:08:27 2011  Arnold D. Robbins  <arnold@skeeve.com>
 
 	* fpat2.awk, fpat2.ok: New files. Thanks to Pat Rankin for the cases.
diff --git a/test/Makefile.am b/test/Makefile.am
index 065bff1d..e3dbce04 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -579,6 +579,8 @@ EXTRA_DIST = \
 	regeq.awk \
 	regeq.in \
 	regeq.ok \
+	regrange.awk \
+	regrange.ok \
 	regtest.sh \
 	regx8bit.awk \
 	regx8bit.ok \
@@ -771,8 +773,8 @@ BASIC_TESTS = \
 	ofmta ofmtbig ofmtfidl ofmts onlynl opasnidx opasnslf paramdup \
 	paramres paramtyp parse1 parsefld parseme pcntplus posix2008sub \
 	prdupval prec printf0 printf1 prmarscl prmreuse prt1eval prtoeval \
-	rand range1 rebt8b1 redfilnm regeq reindops reparse resplit rs \
-	rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 rstest3 rstest4 \
+	rand range1 rebt8b1 redfilnm regeq regrange reindops reparse resplit \
+	rs rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 rstest3 rstest4 \
 	rstest5 rswhite scalar sclforin sclifin sortempty splitargv \
 	splitarr splitdef splitvar splitwht strcat1 strnum1 strtod subamp \
 	subi18n subsepnm subslash substr swaplns synerr1 synerr2 tradanch \
diff --git a/test/Makefile.in b/test/Makefile.in
index 1fa42e36..3de264e3 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -764,6 +764,8 @@ EXTRA_DIST = \
 	regeq.awk \
 	regeq.in \
 	regeq.ok \
+	regrange.awk \
+	regrange.ok \
 	regtest.sh \
 	regx8bit.awk \
 	regx8bit.ok \
@@ -956,8 +958,8 @@ BASIC_TESTS = \
 	ofmta ofmtbig ofmtfidl ofmts onlynl opasnidx opasnslf paramdup \
 	paramres paramtyp parse1 parsefld parseme pcntplus posix2008sub \
 	prdupval prec printf0 printf1 prmarscl prmreuse prt1eval prtoeval \
-	rand range1 rebt8b1 redfilnm regeq reindops reparse resplit rs \
-	rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 rstest3 rstest4 \
+	rand range1 rebt8b1 redfilnm regeq regrange reindops reparse resplit \
+	rs rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 rstest3 rstest4 \
 	rstest5 rswhite scalar sclforin sclifin sortempty splitargv \
 	splitarr splitdef splitvar splitwht strcat1 strnum1 strtod subamp \
 	subi18n subsepnm subslash substr swaplns synerr1 synerr2 tradanch \
@@ -2367,6 +2369,11 @@ regeq:
 	@AWKPATH=$(srcdir) $(AWK) -f $@.awk  < $(srcdir)/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
 	@-$(CMP) $(srcdir)/$@.ok _$@ && rm -f _$@
 
+regrange:
+	@echo regrange
+	@AWKPATH=$(srcdir) $(AWK) -f $@.awk  >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+	@-$(CMP) $(srcdir)/$@.ok _$@ && rm -f _$@
+
 reindops:
 	@echo reindops
 	@AWKPATH=$(srcdir) $(AWK) -f $@.awk  < $(srcdir)/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/Maketests b/test/Maketests
index 7577b893..243d07b7 100644
--- a/test/Maketests
+++ b/test/Maketests
@@ -645,6 +645,11 @@ regeq:
 	@AWKPATH=$(srcdir) $(AWK) -f $@.awk  < $(srcdir)/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
 	@-$(CMP) $(srcdir)/$@.ok _$@ && rm -f _$@
 
+regrange:
+	@echo regrange
+	@AWKPATH=$(srcdir) $(AWK) -f $@.awk  >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+	@-$(CMP) $(srcdir)/$@.ok _$@ && rm -f _$@
+
 reindops:
 	@echo reindops
 	@AWKPATH=$(srcdir) $(AWK) -f $@.awk  < $(srcdir)/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/regrange.awk b/test/regrange.awk
new file mode 100644
index 00000000..71879316
--- /dev/null
+++ b/test/regrange.awk
@@ -0,0 +1,30 @@
+# Tests due to John Haque, May 2011
+#
+# The following should be fatal; can't catch them inside awk, though
+# $> echo 'a' | ./gawk '/[z-a]/ { print }'
+# $> echo 'A' | ./gawk '/[+-[:digit:]]/'
+
+BEGIN {	# build parallel tables: char[i] is the subject string, pat[i] the dynamic regexp
+	char[1] = "."
+	pat[1] = "[--\\/]"	# regexp text is [--\/]: '-' is the range start, escaped '/' the range end
+
+	char[2] = "a"
+	pat[2] = "[]-c]"	# ']' right after '[' is used as the range start
+
+	char[3] = "c"
+	pat[3] = "[[a-d]"	# a '[' inside the list, followed by the range a-d
+
+	char[4] = "\\"	# the string holds a single backslash
+	pat[4] = "[\\[-\\]]"	# regexp text is [\[-\]]: escaped '[' and ']' as range endpoints
+
+	char[5] = "[.c.]"	# subject is the five characters [ . c . ]
+	pat[5] = "[a-[.e.]]"	# range end written as a collating symbol
+
+	char[6] = "[.d.]"	# subject is the five characters [ . d . ]
+	pat[6] = "[[.c.]-[.z.]]"	# both range endpoints written as collating symbols
+
+	for (i = 1; i in char; i++) {	# stops at the first index not present in char
+		printf("\"%s\" ~ /%s/ --> %d\n", char[i], pat[i],
+			char[i] ~ pat[i])	# match result is printed with %d
+	}
+}
diff --git a/test/regrange.ok b/test/regrange.ok
new file mode 100644
index 00000000..1fa00c70
--- /dev/null
+++ b/test/regrange.ok
@@ -0,0 +1,6 @@
+"." ~ /[--\/]/ --> 1
+"a" ~ /[]-c]/ --> 1
+"c" ~ /[[a-d]/ --> 1
+"\" ~ /[\[-\]]/ --> 1
+"[.c.]" ~ /[a-[.e.]]/ --> 1
+"[.d.]" ~ /[[.c.]-[.z.]]/ --> 0