1 files changed, 74 insertions, 82 deletions
diff --git a/re.c b/re.c
index 878c884e..73e75cbb 100644
--- a/re.c
+++ b/re.c
@@ -2,22 +2,22 @@
  * re.c - compile regular expressions.
  */
 
-/* 
+/*
  * Copyright (C) 1991-2016 the Free Software Foundation, Inc.
- * 
+ *
  * This file is part of GAWK, the GNU implementation of the
  * AWK Programming Language.
- * 
+ *
  * GAWK is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * GAWK is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
@@ -25,10 +25,14 @@
 
 #include "awk.h"
 
+#include "localeinfo.h"
+
 static reg_syntax_t syn;
 static void check_bracket_exp(char *s, size_t len);
 const char *regexflags2str(int flags);
 
+static struct localeinfo localeinfo;
+
 /* make_regexp --- generate compiled regular expressions */
 
 Regexp *
@@ -45,9 +49,8 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 	int c, c2;
 	static bool first = true;
 	static bool no_dfa = false;
-	bool has_anchor = false;
-	reg_syntax_t dfa_syn;
 	int i;
+	static struct dfa* dfaregs[2] = { NULL, NULL };
 
 	/*
 	 * The number of bytes in the current multibyte character.
@@ -59,9 +62,9 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 	memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize.  */
 
 	if (first) {
-		first = false;
 		/* for debugging and testing */
 		no_dfa = (getenv("GAWK_NO_DFA") != NULL);
+		/* don't set first to false here, we do it below */
 	}
 
 	/* always check */
@@ -72,13 +75,13 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 	/*
 	 * Build a copy of the string (in buf) with the
 	 * escaped characters translated, and generate the regex
-	 * from that. 
+	 * from that.
 	 */
 	if (buf == NULL) {
-		emalloc(buf, char *, len + 2, "make_regexp");
+		emalloc(buf, char *, len + 1, "make_regexp");
 		buflen = len;
 	} else if (len > buflen) {
-		erealloc(buf, char *, len + 2, "make_regexp");
+		erealloc(buf, char *, len + 1, "make_regexp");
 		buflen = len;
 	}
 	dest = buf;
@@ -156,9 +159,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 			} /* switch */
 		} else {
 			c = *src;
-			if (c == '^' || c == '$')
-				has_anchor = true;
-
 			*dest++ = *src++;	/* not '\\' */
 		}
 		if (gawk_mb_cur_max > 1 && is_multibyte)
@@ -170,7 +170,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 
 	emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
 	memset((char *) rp, 0, sizeof(*rp));
-	rp->dfareg = NULL;
 	rp->pat.allocated = 0;	/* regex will allocate the buffer */
 	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
 
@@ -203,10 +202,14 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 		syn &= ~RE_ICASE;
 	}
 
-	dfa_syn = syn;
-	/* FIXME: dfa doesn't pay attention RE_ICASE */
-	if (ignorecase)
-		dfa_syn |= RE_ICASE;
+	/* initialize dfas to hold syntax */
+	if (first) {
+		first = false;
+		dfaregs[0] = dfaalloc();
+		dfaregs[1] = dfaalloc();
+		dfasyntax(dfaregs[0], & localeinfo, syn, DFA_ANCHOR);
+		dfasyntax(dfaregs[1], & localeinfo, syn | RE_ICASE, DFA_ANCHOR);
+	}
 
 	re_set_syntax(syn);
 
@@ -223,13 +226,11 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 	/* gack. this must be done *after* re_compile_pattern */
 	rp->pat.newline_anchor = false; /* don't get \n in middle of string */
 	if (dfa && ! no_dfa) {
-		rp->dfa = true;
 		rp->dfareg = dfaalloc();
-		dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n');
+		dfacopysyntax(rp->dfareg, dfaregs[ignorecase]);
 		dfacomp(buf, len, rp->dfareg, true);
 	} else
-		rp->dfa = false;
-	rp->has_anchor = has_anchor;
+		rp->dfareg = NULL;
 
 	/* Additional flags that help with RS as regexp. */
 	for (i = 0; i < len; i++) {
@@ -245,7 +246,7 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 			break;
 		}
 	}
- 
+
 	return rp;
 }
 
@@ -278,26 +279,24 @@ research(Regexp *rp, char *str, int start,
 	 * starts in the middle of a string, so don't bother trying it
 	 * in that case.
 	 */
-	if (rp->dfa && ! no_bol && start == 0) {
-		char save;
-		size_t count = 0;
+	if (rp->dfareg != NULL && ! no_bol && start == 0) {
 		struct dfa *superset = dfasuperset(rp->dfareg);
-		/*
-		 * dfa likes to stick a '\n' right after the matched
-		 * text.  So we just save and restore the character.
-		 */
-		save = str[start+len];
 		if (superset)
 			ret = dfaexec(superset, str+start, str+start+len,
 							true, NULL, NULL);
-		if (ret)
+
+		if (ret && (! need_start
+				|| (! superset && dfaisfast(rp->dfareg))))
 			ret = dfaexec(rp->dfareg, str+start, str+start+len,
-						true, &count, &try_backref);
-		str[start+len] = save;
+						true, NULL, &try_backref);
 	}
 
 	if (ret) {
-		if (need_start || rp->dfa == false || try_backref) {
+		if (   rp->dfareg == NULL
+			|| start != 0
+			|| no_bol
+			|| need_start
+			|| try_backref) {
 			/*
 			 * Passing NULL as last arg speeds up search for cases
 			 * where we don't need the start/end info.
@@ -319,14 +318,14 @@ void
 refree(Regexp *rp)
 {
 	if (rp == NULL)
-		return; 
+		return;
 	rp->pat.translate = NULL;
 	regfree(& rp->pat);
 	if (rp->regs.start)
 		free(rp->regs.start);
 	if (rp->regs.end)
 		free(rp->regs.end);
-	if (rp->dfa) {
+	if (rp->dfareg != NULL) {
 		dfafree(rp->dfareg);
 		free(rp->dfareg);
 	}
@@ -349,48 +348,49 @@ re_update(NODE *t)
 {
 	NODE *t1;
 
-	if ((t->re_flags & CASE) == IGNORECASE) {
-		/* regex was compiled with settings matching IGNORECASE */
-		if ((t->re_flags & CONSTANT) != 0) {
-			/* it's a constant, so just return it as is */
-			assert(t->type == Node_regex);
-			return t->re_reg;
-		}
-		t1 = t->re_exp;
-		if (t->re_text != NULL) {
-			/* if contents haven't changed, just return it */
-			if (cmp_nodes(t->re_text, t1) == 0)
-				return t->re_reg;
-			/* things changed, fall through to recompile */
-			unref(t->re_text);
-		}
-		/* get fresh copy of the text of the regexp */
-		t->re_text = dupnode(t1);
+	if (t->type == Node_val && (t->flags & REGEX) != 0)
+		return t->typed_re->re_reg[IGNORECASE];
+
+	if ((t->re_flags & CONSTANT) != 0) {
+		/* it's a constant, so just return it as is */
+		assert(t->type == Node_regex);
+		return t->re_reg[IGNORECASE];
 	}
-	/* was compiled with different IGNORECASE or text changed */
+	t1 = t->re_exp;
+	if (t->re_text != NULL) {
+		/* if contents haven't changed, just return it */
+		if (cmp_nodes(t->re_text, t1, true) == 0)
+			return t->re_reg[IGNORECASE];
+		/* things changed, fall through to recompile */
+		unref(t->re_text);
+	}
+	/* get fresh copy of the text of the regexp */
+	t->re_text = dupnode(t1);
+
+	/* text changed */
 
 	/* free old */
-	if (t->re_reg != NULL)
-		refree(t->re_reg);
+	if (t->re_reg[0] != NULL)
+		refree(t->re_reg[0]);
+	if (t->re_reg[1] != NULL)
+		refree(t->re_reg[1]);
 	if (t->re_cnt > 0)
 		t->re_cnt++;
 	if (t->re_cnt > 10)
 		t->re_cnt = 0;
-	if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
+	if (t->re_text == NULL) {
 		/* reset regexp text if needed */
 		t1 = t->re_exp;
 		unref(t->re_text);
 		t->re_text = dupnode(t1);
 	}
 	/* compile it */
-	t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
-				IGNORECASE, t->re_cnt, true);
-
-	/* clear case flag */
-	t->re_flags &= ~CASE;
-	/* set current value of case flag */
-	t->re_flags |= IGNORECASE;
-	return t->re_reg;
+	t->re_reg[0] = make_regexp(t->re_text->stptr, t->re_text->stlen,
+				false, t->re_cnt, true);
+	t->re_reg[1] = make_regexp(t->re_text->stptr, t->re_text->stlen,
+				true, t->re_cnt, true);
+
+	return t->re_reg[IGNORECASE];
 }
 
 /* resetup --- choose what kind of regexps we match */
@@ -398,6 +398,9 @@ re_update(NODE *t)
 void
 resetup()
 {
+	// init localeinfo for dfa
+	init_localeinfo(& localeinfo);
+
 	/*
 	 * Syntax bits: _that_ is yet another mind trip.  Recreational drugs
 	 * are helpful for recovering from the experience.
@@ -421,25 +424,14 @@ resetup()
 		syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
 
 	(void) re_set_syntax(syn);
-
-	dfa_init();
 }
 
-/* avoid_dfa --- return true if we should not use the DFA matcher */
+/* using_utf8 --- are we using utf8 */
 
-int
-avoid_dfa(NODE *re, char *str, size_t len)
+bool
+using_utf8(void)
 {
-	char *end;
-
-	if (! re->re_reg->has_anchor)
-		return false;
-
-	for (end = str + len; str < end; str++)
-		if (*str == '\n')
-			return true;
-
-	return false;
+	return localeinfo.using_utf8;
 }
 
 /* reisstring --- return true if the RE match is a simple string match */