diff options
Diffstat (limited to 're.c')
-rw-r--r-- | re.c | 68 |
1 files changed, 28 insertions, 40 deletions
@@ -25,10 +25,14 @@ #include "awk.h" +#include "localeinfo.h" + static reg_syntax_t syn; static void check_bracket_exp(char *s, size_t len); const char *regexflags2str(int flags); +static struct localeinfo localeinfo; + /* make_regexp --- generate compiled regular expressions */ Regexp * @@ -45,7 +49,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) int c, c2; static bool first = true; static bool no_dfa = false; - bool has_anchor = false; reg_syntax_t dfa_syn; int i; @@ -75,10 +78,10 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) * from that. */ if (buf == NULL) { - emalloc(buf, char *, len + 2, "make_regexp"); + emalloc(buf, char *, len + 1, "make_regexp"); buflen = len; } else if (len > buflen) { - erealloc(buf, char *, len + 2, "make_regexp"); + erealloc(buf, char *, len + 1, "make_regexp"); buflen = len; } dest = buf; @@ -156,9 +159,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) } /* switch */ } else { c = *src; - if (c == '^' || c == '$') - has_anchor = true; - *dest++ = *src++; /* not '\\' */ } if (gawk_mb_cur_max > 1 && is_multibyte) @@ -170,7 +170,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) emalloc(rp, Regexp *, sizeof(*rp), "make_regexp"); memset((char *) rp, 0, sizeof(*rp)); - rp->dfareg = NULL; rp->pat.allocated = 0; /* regex will allocate the buffer */ emalloc(rp->pat.fastmap, char *, 256, "make_regexp"); @@ -223,13 +222,12 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) /* gack. this must be done *after* re_compile_pattern */ rp->pat.newline_anchor = false; /* don't get \n in middle of string */ if (dfa && ! no_dfa) { - rp->dfa = true; rp->dfareg = dfaalloc(); - dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n'); + dfasyntax(rp->dfareg, & localeinfo, dfa_syn, + (ignorecase ? DFA_CASE_FOLD : 0) | DFA_ANCHOR); dfacomp(buf, len, rp->dfareg, true); } else - rp->dfa = false; - rp->has_anchor = has_anchor; + rp->dfareg = NULL; /* Additional flags that help with RS as regexp. */ for (i = 0; i < len; i++) { @@ -278,26 +276,24 @@ research(Regexp *rp, char *str, int start, * starts in the middle of a string, so don't bother trying it * in that case. */ - if (rp->dfa && ! no_bol && start == 0) { - char save; - size_t count = 0; + if (rp->dfareg != NULL && ! no_bol && start == 0) { struct dfa *superset = dfasuperset(rp->dfareg); - /* - * dfa likes to stick a '\n' right after the matched - * text. So we just save and restore the character. - */ - save = str[start+len]; if (superset) ret = dfaexec(superset, str+start, str+start+len, true, NULL, NULL); - if (ret) + + if (ret && (! need_start + || (! superset && dfaisfast(rp->dfareg)))) ret = dfaexec(rp->dfareg, str+start, str+start+len, - true, &count, &try_backref); - str[start+len] = save; + true, NULL, &try_backref); } if (ret) { - if (need_start || rp->dfa == false || try_backref) { + if ( rp->dfareg == NULL + || start != 0 + || no_bol + || need_start + || try_backref) { /* * Passing NULL as last arg speeds up search for cases * where we don't need the start/end info. @@ -326,7 +322,7 @@ refree(Regexp *rp) free(rp->regs.start); if (rp->regs.end) free(rp->regs.end); - if (rp->dfa) { + if (rp->dfareg != NULL) { dfafree(rp->dfareg); free(rp->dfareg); } @@ -359,7 +355,7 @@ re_update(NODE *t) t1 = t->re_exp; if (t->re_text != NULL) { /* if contents haven't changed, just return it */ - if (cmp_nodes(t->re_text, t1) == 0) + if (cmp_nodes(t->re_text, t1, true) == 0) return t->re_reg; /* things changed, fall through to recompile */ unref(t->re_text); @@ -398,6 +394,9 @@ re_update(NODE *t) void resetup() { + // init localeinfo for dfa + init_localeinfo(& localeinfo); + /* * Syntax bits: _that_ is yet another mind trip. Recreational drugs * are helpful for recovering from the experience. @@ -421,25 +420,14 @@ resetup() syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES; (void) re_set_syntax(syn); - - dfa_init(); } -/* avoid_dfa --- return true if we should not use the DFA matcher */ +/* using_utf8 --- are we using utf8 */ -int -avoid_dfa(NODE *re, char *str, size_t len) +bool +using_utf8(void) { - char *end; - - if (! re->re_reg->has_anchor) - return false; - - for (end = str + len; str < end; str++) - if (*str == '\n') - return true; - - return false; + return localeinfo.using_utf8; } /* reisstring --- return true if the RE match is a simple string match */ |