aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog10
-rw-r--r--re.c78
2 files changed, 57 insertions, 31 deletions
diff --git a/ChangeLog b/ChangeLog
index 666a435f..2a3fcdbb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+Mon Jan 24 22:05:26 2011 Arnold D. Robbins <arnold@skeeve.com>
+
+ * re.c (make_regexp): Add separate variable for dfa syntax.
+ General formatting cleanup.
+ (research): General formatting cleanup.
+ (refree): Remove out of date comment.
+ (re_update): Comment the routine.
+ (check_bracket_exp): Improve check for range to not get [^-/]
+ kinds of things. Thanks to Nelson Beebe for pointing out the bug.
+
Wed Jan 19 20:31:17 2011 Arnold D. Robbins <arnold@skeeve.com>
* awk.h (is_valid_character): Add `& 0XFF' and remove casts to
diff --git a/re.c b/re.c
index 4f52fb21..379ff144 100644
--- a/re.c
+++ b/re.c
@@ -48,9 +48,12 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
int may_have_range = 0;
char *newbuf;
size_t newlen;
+ reg_syntax_t dfa_syn;
- /* The number of bytes in the current multibyte character.
- It is 0, when the current character is a singlebyte character. */
+ /*
+ * The number of bytes in the current multibyte character.
+ * It is 0, when the current character is a singlebyte character.
+ */
size_t is_multibyte = 0;
#ifdef MBS_SUPPORT
mbstate_t mbs;
@@ -61,11 +64,12 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
if (first) {
first = FALSE;
- no_dfa = (getenv("GAWK_NO_DFA") != NULL); /* for debugging and testing */
+ /* for debugging and testing */
+ no_dfa = (getenv("GAWK_NO_DFA") != NULL);
}
- /* if (do_lint) */ /* always check */
- check_bracket_exp((char *) s, len);
+ /* always check */
+ check_bracket_exp((char *) s, len);
/* Handle escaped characters first. */
@@ -89,8 +93,10 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
/* The previous byte is a singlebyte character, or last byte
of a multibyte character. We check the next character. */
is_multibyte = mbrlen(src, end - src, &mbs);
- if ((is_multibyte == 1) || (is_multibyte == (size_t) -1)
- || (is_multibyte == (size_t) -2 || (is_multibyte == 0))) {
+ if ( (is_multibyte == 1)
+ || (is_multibyte == (size_t) -1)
+ || (is_multibyte == (size_t) -2
+ || (is_multibyte == 0))) {
/* We treat it as a singlebyte character. */
is_multibyte = 0;
}
@@ -127,7 +133,9 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
* literally in re's, so escape regexp
* metacharacters.
*/
- if (do_traditional && ! do_posix && (isdigit(c) || c == 'x')
+ if (do_traditional
+ && ! do_posix
+ && (isdigit(c) || c == 'x')
&& strchr("()|*+?.^$\\[]", c2) != NULL)
*dest++ = '\\';
*dest++ = (char) c2;
@@ -207,6 +215,7 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
* discussion by the definition of casetable[] in eval.c.
*/
+ ignorecase = !! ignorecase; /* force to 1 or 0 */
if (ignorecase) {
if (gawk_mb_cur_max > 1) {
syn |= RE_ICASE;
@@ -220,13 +229,17 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
syn &= ~RE_ICASE;
}
- dfasyntax(syn | (ignorecase ? RE_ICASE : 0), ignorecase ? TRUE : FALSE, '\n');
+ dfa_syn = syn;
+ if (ignorecase)
+ dfa_syn |= RE_ICASE;
+ dfasyntax(dfa_syn, ignorecase, '\n');
re_set_syntax(syn);
if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) {
refree(rp);
if (! canfatal) {
- error("%s: /%s/", rerr, buf); /* rerr already gettextized inside regex routines */
+ /* rerr already gettextized inside regex routines */
+ error("%s: /%s/", rerr, buf);
return NULL;
}
fatal("%s: /%s/", rerr, buf);
@@ -270,15 +283,9 @@ research(Regexp *rp, char *str, int start,
* The dfa matcher doesn't have a no_bol flag, so don't bother
* trying it in that case.
*
- * 4/2007: Grrrr. The dfa matcher has bugs in certain multibyte
- * cases that are just too deeply buried to ferret out. Don't
- * let this kill us if we need_start. (This may be too narrowly
- * focused, perhaps we should relegate the DFA matcher to the
- * single byte case all the time. OTOH, the speed difference
- * between the matchers in non-trivial... Sigh.)
- *
- * 7/2008: Simplify: skip dfa matcher if need_start. The above
- * problems are too much to deal with.
+ * 7/2008: Skip the dfa matcher if need_start. The dfa matcher
+ * has bugs in certain multibyte cases and it's too difficult
+ * to try to special case things.
*/
if (rp->dfa && ! no_bol && ! need_start) {
char save;
@@ -315,12 +322,6 @@ research(Regexp *rp, char *str, int start,
void
refree(Regexp *rp)
{
- /*
- * This isn't malloced, don't let regfree free it.
- * (This is strictly necessary only for the old
- * version of regex, but it's a good idea to keep it
- * here in case regex internals change in the future.)
- */
if (rp == NULL)
return;
rp->pat.translate = NULL;
@@ -352,18 +353,26 @@ re_update(NODE *t)
NODE *t1;
if ((t->re_flags & CASE) == IGNORECASE) {
+ /* regex was compiled with settings matching IGNORECASE */
if ((t->re_flags & CONSTANT) != 0) {
+ /* it's a constant, so just return it as is */
assert(t->type == Node_regex);
return t->re_reg;
}
t1 = t->re_exp;
if (t->re_text != NULL) {
+ /* if contents haven't changed, just return it */
if (cmp_nodes(t->re_text, t1) == 0)
return t->re_reg;
+ /* things changed, fall through to recompile */
unref(t->re_text);
}
+ /* get fresh copy of the text of the regexp */
t->re_text = dupnode(t1);
}
+ /* was compiled with different IGNORECASE or text changed */
+
+ /* free old */
if (t->re_reg != NULL)
refree(t->re_reg);
if (t->re_cnt > 0)
@@ -371,14 +380,18 @@ re_update(NODE *t)
if (t->re_cnt > 10)
t->re_cnt = 0;
if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
+ /* reset regexp text if needed */
t1 = t->re_exp;
unref(t->re_text);
t->re_text = dupnode(t1);
}
+ /* compile it */
t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
IGNORECASE, t->re_cnt, TRUE);
+ /* clear case flag */
t->re_flags &= ~CASE;
+ /* set current value of case flag */
t->re_flags |= IGNORECASE;
return t->re_reg;
}
@@ -407,7 +420,7 @@ resetup()
dfasyntax(syn, FALSE, '\n');
}
-/* avoid_dfa --- FIXME: temporary kludge function until we have a new dfa.c */
+/* avoid_dfa --- return true if we should not use the DFA matcher */
int
avoid_dfa(NODE *re, char *str, size_t len)
@@ -503,9 +516,12 @@ reflags2str(int flagval)
return genflags2str(flagval, values);
}
-/* dfawarn() is called by the dfa routines whenever a regex is compiled
- must supply a dfawarn. */
-extern void
+/*
+ * dfawarn() is called by the dfa routines whenever a regex is compiled
+ * must supply a dfawarn.
+ */
+
+void
dfawarn(const char *dfa_warning)
{
/*
@@ -567,8 +583,8 @@ again:
count--;
if (*sp == '-' && do_lint && ! range_warned && count == 1
&& sp[-1] != '[' && sp[1] != ']'
- && ! isdigit(sp[-1]) && ! isdigit(sp[1])) {
- /* found a range, we think */
+ && ! isdigit(sp[-1]) && ! isdigit(sp[1])
+ && ! (sp[-2] == '[' && sp[-1] == '^')) {
range_warned = TRUE;
warning(_("range of the form `[%c-%c]' is locale dependant"),
sp[-1], sp[1]);