about | summary | refs | log | tree | commit | diff | stats
path: root/re.c
diff options
context:
space:
mode:
Diffstat (limited to 're.c')
-rw-r--r-- re.c 153
1 files changed, 118 insertions, 35 deletions
diff --git a/re.c b/re.c
index cd11d495..497f7214 100644
--- a/re.c
+++ b/re.c
@@ -6,7 +6,7 @@
* Copyright (C) 1991-1995 the Free Software Foundation, Inc.
*
* This file is part of GAWK, the GNU implementation of the
- * AWK Progamming Language.
+ * AWK Programming Language.
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -19,13 +19,15 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with GAWK; see the file COPYING. If not, write to
- * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*/
#include "awk.h"
-/* Generate compiled regular expressions */
+static reg_syntax_t syn;
+
+/* make_regexp --- generate compiled regular expressions */
Regexp *
make_regexp(s, len, ignorecase, dfa)
@@ -40,14 +42,15 @@ int dfa;
char *temp;
char *end = s + len;
register char *dest;
- register int c;
+ register int c, c2;
/* Handle escaped characters first. */
- /* Build a copy of the string (in dest) with the
- escaped characters translated, and generate the regex
- from that.
- */
+ /*
+ * Build a copy of the string (in dest) with the
+ * escaped characters translated, and generate the regex
+ * from that.
+ */
emalloc(dest, char *, len + 2, "make_regexp");
temp = dest;
@@ -71,27 +74,42 @@ int dfa;
case '5':
case '6':
case '7':
- c = parse_escape(&src);
- if (c < 0)
+ c2 = parse_escape(&src);
+ if (c2 < 0)
cant_happen();
- *dest++ = (char)c;
+ /*
+ * Unix awk treats octal (and hex?) chars
+ * literally in re's, so escape regexp
+ * metacharacters.
+ */
+ if (do_traditional && ! do_posix && (isdigit(c) || c == 'x')
+ && strchr("()|*+?.^$\\[]", c2) != NULL)
+ *dest++ = '\\';
+ *dest++ = (char) c2;
break;
+ case 'y': /* normally \b */
+ /* gnu regex op */
+ if (! do_traditional) {
+ *dest++ = '\\';
+ *dest++ = 'b';
+ src++;
+ break;
+ }
+ /* else, fall through */
default:
*dest++ = '\\';
- *dest++ = (char)c;
+ *dest++ = (char) c;
src++;
break;
} /* switch */
- } else {
+ } else
*dest++ = *src++; /* not '\\' */
- }
} /* for */
*dest = '\0' ; /* Only necessary if we print dest ? */
emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
memset((char *) rp, 0, sizeof(*rp));
- emalloc(rp->pat.buffer, unsigned char *, 16, "make_regexp");
- rp->pat.allocated = 16;
+ rp->pat.allocated = 0; /* regex will allocate the buffer */
emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
if (ignorecase)
@@ -103,17 +121,19 @@ int dfa;
fatal("%s: /%s/", rerr, temp);
/* gack. this must be done *after* re_compile_pattern */
- rp->pat.newline_anchor = 0; /* don't get \n in middle of string */
- if (dfa && !ignorecase) {
- dfacomp(temp, len, &(rp->dfareg), 1);
- rp->dfa = 1;
+ rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
+ if (dfa && ! ignorecase) {
+ dfacomp(temp, len, &(rp->dfareg), TRUE);
+ rp->dfa = TRUE;
} else
- rp->dfa = 0;
+ rp->dfa = FALSE;
free(temp);
return rp;
}
+/* research --- do a regexp search. use dfa if possible */
+
int
research(rp, str, start, len, need_start)
Regexp *rp;
@@ -123,34 +143,42 @@ register size_t len;
int need_start;
{
char *ret = str;
+ int try_backref;
+ /*
+ * Always do dfa search if can; if it fails, then even if
+ * need_start is true, we won't bother with the regex search.
+ */
if (rp->dfa) {
char save;
int count = 0;
- int try_backref;
/*
* dfa likes to stick a '\n' right after the matched
* text. So we just save and restore the character.
*/
save = str[start+len];
- ret = dfaexec(&(rp->dfareg), str+start, str+start+len, 1,
+ ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
&count, &try_backref);
str[start+len] = save;
}
if (ret) {
- if (need_start || rp->dfa == 0) {
+ if (need_start || rp->dfa == FALSE || try_backref) {
int result = re_search(&(rp->pat), str, start+len,
start, len, &(rp->regs));
/* recover any space from C based alloca */
+#ifdef C_ALLOCA
(void) alloca(0);
+#endif
return result;
} else
return 1;
- } else
+ } else
return -1;
}
+/* refree --- free up the dynamic memory used by a compiled regexp */
+
void
refree(rp)
Regexp *rp;
@@ -166,25 +194,29 @@ Regexp *rp;
free(rp);
}
+/* dfaerror --- print an error message for the dfa routines */
+
void
dfaerror(s)
const char *s;
{
- fatal(s);
+ fatal("%s", s);
}
+/* re_update --- recompile a dynamic regexp */
+
Regexp *
re_update(t)
NODE *t;
{
NODE *t1;
-# define CASE 1
+/* # define CASE 1 */
if ((t->re_flags & CASE) == IGNORECASE) {
- if (t->re_flags & CONST)
+ if ((t->re_flags & CONST) != 0)
return t->re_reg;
t1 = force_string(tree_eval(t->re_exp));
- if (t->re_text) {
+ if (t->re_text != NULL) {
if (cmp_nodes(t->re_text, t1) == 0) {
free_temp(t1);
return t->re_reg;
@@ -194,13 +226,13 @@ NODE *t;
t->re_text = dupnode(t1);
free_temp(t1);
}
- if (t->re_reg)
+ if (t->re_reg != NULL)
refree(t->re_reg);
- if (t->re_cnt)
+ if (t->re_cnt > 0)
t->re_cnt++;
if (t->re_cnt > 10)
t->re_cnt = 0;
- if (!t->re_text) {
+ if (t->re_text == NULL) {
t1 = force_string(tree_eval(t->re_exp));
t->re_text = dupnode(t1);
free_temp(t1);
@@ -212,11 +244,62 @@ NODE *t;
return t->re_reg;
}
+/* resetup --- choose what kind of regexps we match */
+
void
resetup()
{
- reg_syntax_t syn = RE_SYNTAX_AWK;
+ if (do_posix)
+ syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */
+ else if (do_traditional)
+ syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */
+ else
+ syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
+
+ /*
+ * Interval expressions are off by default, since it's likely to
+ * break too many old programs to have them on.
+ */
+ if (do_intervals)
+ syn |= RE_INTERVALS;
(void) re_set_syntax(syn);
- dfasyntax(syn, 0);
+ dfasyntax(syn, FALSE);
+}
+
+/* avoid_dfa --- temporary kludge function until we have a new dfa.c */
+
+int
+avoid_dfa(re, str, len)
+NODE *re;
+char *str;
+size_t len;
+{
+ char *restr;
+ int relen;
+ int anchor, i;
+ char *end;
+
+ if ((re->re_flags & CONST) != 0) {
+ restr = re->re_exp->stptr;
+ relen = re->re_exp->stlen;
+ } else {
+ restr = re->re_text->stptr;
+ relen = re->re_text->stlen;
+ }
+
+ for (anchor = FALSE, i = 0; i < relen; i++) {
+ if (restr[i] == '^' || restr[i] == '$') {
+ anchor = TRUE;
+ break;
+ }
+ }
+ if (! anchor)
+ return FALSE;
+
+ for (end = str + len; str < end; str++)
+ if (*str == '\n')
+ return TRUE;
+
+ return FALSE;
}