diff options
Diffstat (limited to 're.c')
-rw-r--r-- | re.c | 63 |
1 files changed, 28 insertions, 35 deletions
@@ -25,10 +25,14 @@ #include "awk.h" +#include "localeinfo.h" + static reg_syntax_t syn; static void check_bracket_exp(char *s, size_t len); const char *regexflags2str(int flags); +static struct localeinfo localeinfo; + /* make_regexp --- generate compiled regular expressions */ Regexp * @@ -75,10 +79,10 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) * from that. */ if (buf == NULL) { - emalloc(buf, char *, len + 2, "make_regexp"); + emalloc(buf, char *, len + 1, "make_regexp"); buflen = len; } else if (len > buflen) { - erealloc(buf, char *, len + 2, "make_regexp"); + erealloc(buf, char *, len + 1, "make_regexp"); buflen = len; } dest = buf; @@ -170,7 +174,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) emalloc(rp, Regexp *, sizeof(*rp), "make_regexp"); memset((char *) rp, 0, sizeof(*rp)); - rp->dfareg = NULL; rp->pat.allocated = 0; /* regex will allocate the buffer */ emalloc(rp->pat.fastmap, char *, 256, "make_regexp"); @@ -223,12 +226,11 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) /* gack. this must be done *after* re_compile_pattern */ rp->pat.newline_anchor = false; /* don't get \n in middle of string */ if (dfa && ! no_dfa) { - rp->dfa = true; rp->dfareg = dfaalloc(); - dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n'); + dfasyntax(rp->dfareg, & localeinfo, dfa_syn, ignorecase, '\n'); dfacomp(buf, len, rp->dfareg, true); } else - rp->dfa = false; + rp->dfareg = NULL; rp->has_anchor = has_anchor; /* Additional flags that help with RS as regexp. */ @@ -278,26 +280,25 @@ research(Regexp *rp, char *str, int start, * starts in the middle of a string, so don't bother trying it * in that case. */ - if (rp->dfa && ! no_bol && start == 0) { - char save; - size_t count = 0; + if (rp->dfareg != NULL && ! no_bol && start == 0) { struct dfa *superset = dfasuperset(rp->dfareg); - /* - * dfa likes to stick a '\n' right after the matched - * text. So we just save and restore the character. - */ - save = str[start+len]; if (superset) ret = dfaexec(superset, str+start, str+start+len, true, NULL, NULL); - if (ret) + + if (ret && ((! need_start && ! rp->has_anchor) + || (! superset && dfaisfast(rp->dfareg)))) ret = dfaexec(rp->dfareg, str+start, str+start+len, - true, &count, &try_backref); - str[start+len] = save; + true, NULL, &try_backref); } if (ret) { - if (need_start || rp->dfa == false || try_backref) { + if ( rp->dfareg == NULL + || start != 0 + || no_bol + || need_start + || rp->has_anchor + || try_backref) { /* * Passing NULL as last arg speeds up search for cases * where we don't need the start/end info. @@ -326,7 +327,7 @@ refree(Regexp *rp) free(rp->regs.start); if (rp->regs.end) free(rp->regs.end); - if (rp->dfa) { + if (rp->dfareg != NULL) { dfafree(rp->dfareg); free(rp->dfareg); } @@ -359,7 +360,7 @@ re_update(NODE *t) t1 = t->re_exp; if (t->re_text != NULL) { /* if contents haven't changed, just return it */ - if (cmp_nodes(t->re_text, t1) == 0) + if (cmp_nodes(t->re_text, t1, true) == 0) return t->re_reg; /* things changed, fall through to recompile */ unref(t->re_text); @@ -398,6 +399,9 @@ re_update(NODE *t) void resetup() { + // init localeinfo for dfa + init_localeinfo(& localeinfo); + /* * Syntax bits: _that_ is yet another mind trip. Recreational drugs * are helpful for recovering from the experience. @@ -421,25 +425,14 @@ resetup() syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES; (void) re_set_syntax(syn); - - dfa_init(); } -/* avoid_dfa --- return true if we should not use the DFA matcher */ +/* using_utf8 --- are we using utf8 */ -int -avoid_dfa(NODE *re, char *str, size_t len) +bool +using_utf8(void) { - char *end; - - if (! re->re_reg->has_anchor) - return false; - - for (end = str + len; str < end; str++) - if (*str == '\n') - return true; - - return false; + return localeinfo.using_utf8; } /* reisstring --- return true if the RE match is a simple string match */ |