aboutsummaryrefslogtreecommitdiffstats
path: root/re.c
diff options
context:
space:
mode:
authorArnold D. Robbins <arnold@skeeve.com>2010-12-13 20:21:22 +0200
committerArnold D. Robbins <arnold@skeeve.com>2010-12-13 20:21:22 +0200
commit74b4f4e5a35f07bc17092e8c9ee01ba77cc8175d (patch)
treeccc7771b3c2f4c8730174b89b5f57db62e55bc04 /re.c
parenta7a9cbdc136cb0f5cb67b94fb76dc3b6a9ca6c62 (diff)
downloadegawk-74b4f4e5a35f07bc17092e8c9ee01ba77cc8175d.tar.gz
egawk-74b4f4e5a35f07bc17092e8c9ee01ba77cc8175d.tar.bz2
egawk-74b4f4e5a35f07bc17092e8c9ee01ba77cc8175d.zip
Expand ranges before compiling the regexp.
Diffstat (limited to 're.c')
-rw-r--r--re.c155
1 files changed, 152 insertions, 3 deletions
diff --git a/re.c b/re.c
index 7dcbe488..60fa517b 100644
--- a/re.c
+++ b/re.c
@@ -27,6 +27,7 @@
static reg_syntax_t syn;
static void check_bracket_exp(char *s, size_t len);
+static char *expand_range(char *s, size_t *len);
/* make_regexp --- generate compiled regular expressions */
@@ -44,6 +45,9 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
static short first = TRUE;
static short no_dfa = FALSE;
int has_anchor = FALSE;
+ int may_have_range = 0;
+ char *newbuf;
+ size_t newlen;
/* The number of bytes in the current multibyte character.
It is 0, when the current character is a singlebyte character. */
@@ -152,13 +156,36 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
c = *src;
if (c == '^' || c == '$')
has_anchor = TRUE;
+ if (c == '[' || c == '-' || c == ']')
+ may_have_range++;
+
*dest++ = *src++; /* not '\\' */
}
if (gawk_mb_cur_max > 1 && is_multibyte)
is_multibyte--;
} /* while */
- *dest = '\0' ; /* Only necessary if we print dest ? */
+ *dest = '\0';
+ len = dest - buf;
+
+ if ( ! do_posix
+ && may_have_range >= 3
+ && memchr(buf, '-', len) != NULL) {
+ newlen = len;
+ newbuf = expand_range(buf, & newlen);
+
+ /* song and dance since buf & buflen are static */
+ if (newlen > buflen) {
+ free(buf);
+ buf = newbuf;
+ buflen = newlen;
+ } else {
+ memcpy(buf, newbuf, newlen);
+ free(newbuf);
+ }
+ len = newlen;
+ }
+
emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
memset((char *) rp, 0, sizeof(*rp));
rp->dfareg = NULL;
@@ -196,7 +223,6 @@ make_regexp(const char *s, size_t len, int ignorecase, int dfa, int canfatal)
dfasyntax(syn | (ignorecase ? RE_ICASE : 0), ignorecase ? TRUE : FALSE, '\n');
re_set_syntax(syn);
- len = dest - buf;
if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) {
refree(rp);
if (! canfatal) {
@@ -539,7 +565,7 @@ again:
count++;
else if (*sp == ']')
count--;
- if (*sp == '-' && ! range_warned && count == 1
+ if (*sp == '-' && do_lint && ! range_warned && count == 1
&& sp[-1] != '[' && sp[1] != ']'
&& ! isdigit(sp[-1]) && ! isdigit(sp[1])) {
/* found a range, we think */
@@ -580,3 +606,126 @@ again:
done:
s[length] = save;
}
+
+/* add_char --- add a character to the buffer, grow it if needed */
+
+static void
+add_char(char **bufp, size_t *lenp, char ch, char **ptr)
+{
+ size_t newlen;
+ size_t offset;
+
+ if (*ptr - *bufp < *lenp) {
+ **ptr = ch;
+ (*ptr)++;
+ return;
+ }
+
+ /* have to grow the buffer and adjust the pointers */
+ offset = (*ptr - *bufp);
+ newlen = offset * 2;
+ erealloc(*bufp, char *, newlen + 2, "add_char");
+ *ptr = *bufp + offset;
+ **ptr = ch;
+ (*ptr)++;
+}
+
+/* expand_range --- turn [b-e] into [bcde] */
+
+static char *
+expand_range(char *s, size_t *lenp)
+{
+ int i;
+ int found = FALSE;
+ char *sp, *sp2, *newbuf;
+ size_t len;
+ int count = 0;
+ size_t newbuf_len = *lenp * 2;
+
+ emalloc(newbuf, char *, newbuf_len, "expand_range");
+
+ sp = s;
+ sp2 = newbuf;
+ len = *lenp;
+#define copy() (add_char(& newbuf, & newbuf_len, *sp++, & sp2), len--)
+#define copych(ch) (add_char(& newbuf, & newbuf_len, ch, & sp2))
+again:
+ while (len > 0) {
+ if (*sp == '\\') {
+ copy();
+ copy();
+ }
+ else if (*sp == '[') {
+ count++;
+ break;
+ }
+ else
+ copy();
+ }
+ if (len == 0)
+ goto done;
+
+ copy(); /* copy in the [ */
+ if (*sp == '-') /* it's literal, just copy it and skip over */
+ copy();
+
+ while (count > 0 && len > 0) {
+ if (*sp == '\\') {
+ copy();
+ copy();
+ continue;
+ }
+ if (*sp == '[') {
+ count++;
+ copy();
+ continue;
+ }
+ if (*sp == ']') {
+ count--;
+ copy();
+ if (count == 0)
+ goto again;
+ else
+ continue;
+ }
+
+ if (count == 1) {
+ /* inside [...] but not inside [[:...:]] */
+ if (*sp == '-') {
+ int start, end;
+ char i;
+
+ if (sp[1] == ']') { /* also literal */
+ copy();
+ continue;
+ }
+
+ /* It's a range, expand it. */
+ start = sp[-1];
+ if (sp[1] == '\\') {
+ sp++;
+ len--;
+ }
+ end = sp[1];
+ for (i = start + 1; i <= end; i++)
+ copych(i);
+ sp++;
+ len--;
+ continue;
+ }
+ else
+ copy();
+ } else {
+ copy();
+ }
+ }
+
+ if (len > 0)
+ goto again;
+
+done:
+ *lenp = sp2 - newbuf;
+ return newbuf;
+}
+#undef copy
+#undef copych