Now at 2.02.

author: Arnold D. Robbins <arnold@skeeve.com> 2010-07-02 15:46:31 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2010-07-02 15:46:31 +0300
commit: 3711eedc1b995eb1926c9ffb902d5d796cacf8d0 (patch)
tree: 5642fdee11499774e0b7401f195931cd3a143d18 /regex.c
parent: ec6415f1ba061b2fb78808b7dba3246745a15398 (diff)
download: egawk-3711eedc1b995eb1926c9ffb902d5d796cacf8d0.tar.gz
egawk-3711eedc1b995eb1926c9ffb902d5d796cacf8d0.tar.bz2
egawk-3711eedc1b995eb1926c9ffb902d5d796cacf8d0.zip
1 files changed, 393 insertions, 264 deletions
diff --git a/regex.c b/regex.c
index ebfd612e..40118055 100644
--- a/regex.c
+++ b/regex.c
@@ -32,7 +32,8 @@ as you receive it, in any medium, provided that you conspicuously and
 appropriately publish on each copy a valid copyright notice "Copyright
 (C) 1985 Free Software Foundation, Inc."; and include following the
 copyright notice a verbatim copy of the above disclaimer of warranty
-and of this License.
+and of this License.  You may charge a distribution fee for the
+physical act of transferring a copy.
 
   2. You may modify your copy or copies of this source file or
 any portion of it, and copy and distribute such modifications under
@@ -43,31 +44,42 @@ the terms of Paragraph 1 above, provided that you also do the following:
 
     b) cause the whole of any work that you distribute or publish,
     that in whole or in part contains or is a derivative of this
-    program or any part thereof, to be freely distributed
-    and licensed to all third parties on terms identical to those
-    contained in this License Agreement (except that you may choose
-    to grant more extensive warranty protection to third parties,
-    at your option).
-
-  3. You may copy and distribute this program or any portion of it in
-compiled, executable or object code form under the terms of Paragraphs
-1 and 2 above provided that you do the following:
-
-    a) cause each such copy to be accompanied by the
-    corresponding machine-readable source code, which must
-    be distributed under the terms of Paragraphs 1 and 2 above; or,
-
-    b) cause each such copy to be accompanied by a
-    written offer, with no time limit, to give any third party
-    free (except for a nominal shipping charge) a machine readable
-    copy of the corresponding source code, to be distributed
-    under the terms of Paragraphs 1 and 2 above; or,
-
-    c) in the case of a recipient of this program in compiled, executable
-    or object code form (without the corresponding source code) you
-    shall cause copies you distribute to be accompanied by a copy
-    of the written offer of source code which you received along
-    with the copy you received.
+    program or any part thereof, to be licensed at no charge to all
+    third parties on terms identical to those contained in this
+    License Agreement (except that you may choose to grant more extensive
+    warranty protection to some or all third parties, at your option).
+
+    c) You may charge a distribution fee for the physical act of
+    transferring a copy, and you may at your option offer warranty
+    protection in exchange for a fee.
+
+Mere aggregation of another unrelated program with this program (or its
+derivative) on a volume of a storage or distribution medium does not bring
+the other program under the scope of these terms.
+
+  3. You may copy and distribute this program (or a portion or derivative
+of it, under Paragraph 2) in object code or executable form under the terms
+of Paragraphs 1 and 2 above provided that you also do one of the following:
+
+    a) accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of
+    Paragraphs 1 and 2 above; or,
+
+    b) accompany it with a written offer, valid for at least three
+    years, to give any third party free (except for a nominal
+    shipping charge) a complete machine-readable copy of the
+    corresponding source code, to be distributed under the terms of
+    Paragraphs 1 and 2 above; or,
+
+    c) accompany it with the information you received as to where the
+    corresponding source code may be obtained.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form alone.)
+
+For an executable file, complete source code means all the source code for
+all modules it contains; but, as a special exception, it need not include
+source code for modules which are standard libraries that accompany the
+operating system on which the executable file runs.
 
   4. You may not copy, sublicense, distribute or transfer this program
 except as expressly provided under this License Agreement.  Any attempt
@@ -77,6 +89,14 @@ automatically terminated.  However, parties who have received computer
 software programs from you with this License Agreement will not have
 their licenses terminated so long as such parties remain in full compliance.
 
+  5. If you wish to incorporate parts of this program into other free
+programs whose distribution conditions are different, write to the Free
+Software Foundation at 675 Mass Ave, Cambridge, MA 02139.  We have not yet
+worked out a simple rule that can be stated here, but we will often permit
+this.  We will be guided by the two goals of preserving the free status of
+all derivatives of our free software and of promoting the sharing and reuse of
+software.
+
 
 In other words, you are welcome to use, share and improve this program.
 You are forbidden to forbid anyone else to use, share and improve
@@ -88,11 +108,6 @@ what you give them.   Help stamp out software-hoarding!  */
  which reads a pattern, describes how it compiles,
  then reads a string and searches for it.  */
 
-/* JF this var has taken on whole new meanings as time goes by.  Various bits
-in this int tell how certain pieces of syntax should work */
-
-static int obscure_syntax = 0;
-
 #ifdef emacs
 
 /* The `emacs' switch turns on certain special matching commands
@@ -105,6 +120,21 @@ static int obscure_syntax = 0;
 
 #else  /* not emacs */
 
+#ifdef USG
+#define bcopy(s,d,n)	memcpy((d),(s),(n))
+#define bcmp(s1,s2,n)	memcmp((s1),(s2),(n))
+#define bzero(s,n)	memset((s),0,(n))
+#endif
+
+/* Make alloca work the best possible way.  */
+#ifdef __GNUC__
+#define alloca __builtin_alloca
+#else
+#ifdef sparc
+#include <alloca.h>
+#endif
+#endif
+
 /*
  * Define the syntax stuff, so we can do the \<...\> things.
  */
@@ -157,7 +187,7 @@ init_syntax_once ()
 
 #ifndef NFAILURES
 #define NFAILURES 80
-#endif NFAILURES
+#endif /* NFAILURES */
 
 /* width of a byte in bits */
 
@@ -167,20 +197,39 @@ init_syntax_once ()
 #define SIGN_EXTEND_CHAR(x) (x)
 #endif
 
-/* compile_pattern takes a regular-expression descriptor string in the user's format
-  and converts it into a buffer full of byte commands for matching.
+static int obscure_syntax = 0;
+
+/* Specify the precise syntax of regexp for compilation.
+   This provides for compatibility for various utilities
+   which historically have different, incompatible syntaxes.
+
+   The argument SYNTAX is a bit-mask of RE_ syntax bits such as
+   RE_NO_BK_PARENS and RE_NO_BK_VBAR.  Returns the previous mask.  */
+
+int
+re_set_syntax (syntax)
+     int syntax;
+{
+  int ret = obscure_syntax;
+
+  obscure_syntax = syntax;
+  return ret;
+}
+
+/* re_compile_pattern takes a regular-expression string
+   and converts it into a buffer full of byte commands for matching.
 
-  pattern   is the address of the pattern string
-  size      is the length of it.
-  bufp	    is a  struct re_pattern_buffer *  which points to the info
+  PATTERN   is the address of the pattern string
+  SIZE      is the length of it.
+  BUFP	    is a  struct re_pattern_buffer *  which points to the info
 	    on where to store the byte commands.
 	    This structure contains a  char *  which points to the
 	    actual space, which should have been obtained with malloc.
-	    compile_pattern may use  realloc  to grow the buffer space.
+	    re_compile_pattern may use  realloc  to grow the buffer space.
 
   The number of bytes of commands can be found out by looking in
   the  struct re_pattern_buffer  that bufp pointed to,
-  after compile_pattern returns.
+  after re_compile_pattern returns.
 */
 
 #define PATPUSH(ch) (*b++ = (char) (ch))
@@ -216,20 +265,6 @@ init_syntax_once ()
 
 static int store_jump (), insert_jump ();
 
-/* JF this function is used to compile UN*X style regexps.  In particular,
-   ( ) and | don't have to be \ed to have a special meaning */
-
-int
-re_set_syntax(syntax)
-{
-	int ret;
-
-	ret=obscure_syntax;
-	obscure_syntax=syntax;
-	return ret;
-}
-
-
 char *
 re_compile_pattern (pattern, size, bufp)
      char *pattern;
@@ -325,9 +360,28 @@ re_compile_pattern (pattern, size, bufp)
       switch (c)
 	{
 	case '$':
+	  if (obscure_syntax & RE_TIGHT_VBAR)
+	    {
+	      if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p != pend)
+		goto normal_char;
+	      /* Make operand of last vbar end before this `$'.  */
+	      if (fixup_jump)
+		store_jump (fixup_jump, jump, b);
+	      fixup_jump = 0;
+	      PATPUSH (endline);
+	      break;
+	    }
+
 	  /* $ means succeed if at end of line, but only in special contexts.
-	    If randonly in the middle of a pattern, it is a normal character. */
-	  if (p == pend || (*p == '\\' && (p[1] == ')' || p[1] == '|')))
+	    If randomly in the middle of a pattern, it is a normal character. */
+	  if (p == pend || *p == '\n'
+	      || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
+	      || (obscure_syntax & RE_NO_BK_PARENS
+		  ? *p == ')'
+		  : *p == '\\' && p[1] == ')')
+	      || (obscure_syntax & RE_NO_BK_VBAR
+		  ? *p == '|'
+		  : *p == '\\' && p[1] == '|'))
 	    {
 	      PATPUSH (endline);
 	      break;
@@ -336,15 +390,30 @@ re_compile_pattern (pattern, size, bufp)
 
 	case '^':
 	  /* ^ means succeed if at beg of line, but only if no preceding pattern. */
-	  if (laststart) goto normal_char;
-	  PATPUSH (begline);
+
+	  if (laststart && p[-2] != '\n'
+	      && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
+	    goto normal_char;
+	  if (obscure_syntax & RE_TIGHT_VBAR)
+	    {
+	      if (p != pattern + 1
+		  && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
+		goto normal_char;
+	      PATPUSH (begline);
+	      begalt = b;
+	    }
+	  else
+	    PATPUSH (begline);
 	  break;
 
-	case '*':
 	case '+':
 	case '?':
+	  if (obscure_syntax & RE_BK_PLUS_QM)
+	    goto normal_char;
+	handle_plus:
+	case '*':
 	  /* If there is no previous pattern, char not special. */
-	  if (!laststart)
+	  if (!laststart && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
 	    goto normal_char;
 	  /* If there is a sequence of repetition chars,
 	     collapse it down to equivalent to just one.  */
@@ -357,13 +426,36 @@ re_compile_pattern (pattern, size, bufp)
 	      if (p == pend)
 		break;
 	      PATFETCH (c);
-	      if (!(c == '*' || c == '+' || c == '?'))
+	      if (c == '*')
+		;
+	      else if (!(obscure_syntax & RE_BK_PLUS_QM)
+		       && (c == '+' || c == '?'))
+		;
+	      else if ((obscure_syntax & RE_BK_PLUS_QM)
+		       && c == '\\')
+		{
+		  int c1;
+		  PATFETCH (c1);
+		  if (!(c1 == '+' || c1 == '?'))
+		    {
+		      PATUNFETCH;
+		      PATUNFETCH;
+		      break;
+		    }
+		  c = c1;
+		}
+	      else
 		{
 		  PATUNFETCH;
 		  break;
 		}
 	    }
 
+	  /* Star, etc. applied to an empty pattern is equivalent
+	     to an empty pattern.  */
+	  if (!laststart)
+	    break;
+
 	  /* Now we know whether 0 matches is allowed,
 	     and whether 2 or more matches is allowed.  */
 	  if (many_times_ok)
@@ -391,8 +483,8 @@ re_compile_pattern (pattern, size, bufp)
 	  break;
 
 	case '[':
-	  if (b - bufp->buffer
-	      > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
+	  while (b - bufp->buffer
+		 > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
 	    /* Note that EXTEND_BUFFER clobbers c */
 	    EXTEND_BUFFER;
 
@@ -411,7 +503,7 @@ re_compile_pattern (pattern, size, bufp)
 	    {
 	      PATFETCH (c);
 	      if (c == ']' && p != p1 + 1) break;
-	      if (*p == '-')
+	      if (*p == '-' && p[1] != ']')
 		{
 		  PATFETCH (c1);
 		  PATFETCH (c1);
@@ -425,86 +517,44 @@ re_compile_pattern (pattern, size, bufp)
 	    }
 	  /* Discard any bitmap bytes that are all 0 at the end of the map.
 	     Decrement the map-length byte too. */
-	  while (b[-1] > 0 && b[b[-1] - 1] == 0)
+	  while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
 	    b[-1]--;
 	  b += b[-1];
 	  break;
 
-	  case '(':
-	    if(!(obscure_syntax&RE_NO_BK_PARENS)) goto normal_char;
-	    if (stackp == stacke) goto nesting_too_deep;
-	    if (regnum < RE_NREGS)
-	      {
-		PATPUSH (start_memory);
-		PATPUSH (regnum);
-	      }
-	    *stackp++ = b - bufp->buffer;
-	    *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
-	    *stackp++ = regnum++;
-	    *stackp++ = begalt - bufp->buffer;
-	    fixup_jump = 0;
-	    laststart = 0;
-	    begalt = b;
-	    break;
+	case '(':
+	  if (! (obscure_syntax & RE_NO_BK_PARENS))
+	    goto normal_char;
+	  else
+	    goto handle_open;
 
-	  case ')':
-	    if(!(obscure_syntax&RE_NO_BK_PARENS)) goto normal_char;
-	    if (stackp == stackb) goto unmatched_close;
-	    begalt = *--stackp + bufp->buffer;
-	    if (fixup_jump)
-	      store_jump (fixup_jump, jump, b);
-	    if (stackp[-1] < RE_NREGS)
-	      {
-		PATPUSH (stop_memory);
-		PATPUSH (stackp[-1]);
-	      }
-	    stackp -= 2;
-	    fixup_jump = 0;
-	    if (*stackp)
-	      fixup_jump = *stackp + bufp->buffer - 1;
-	    laststart = *--stackp + bufp->buffer;
-	    break;
+	case ')':
+	  if (! (obscure_syntax & RE_NO_BK_PARENS))
+	    goto normal_char;
+	  else
+	    goto handle_close;
 
-	  case '|':
-	    if(!(obscure_syntax&RE_NO_BK_VBAR)) goto normal_char;
-	    insert_jump (on_failure_jump, begalt, b + 6, b);
-	    pending_exact = 0;
-	    b += 3;
-	    if (fixup_jump)
-	      store_jump (fixup_jump, jump, b);
-	    fixup_jump = b;
-	    b += 3;
-	    laststart = 0;
-	    begalt = b;
-	    break;
+	case '\n':
+	  if (! (obscure_syntax & RE_NEWLINE_OR))
+	    goto normal_char;
+	  else
+	    goto handle_bar;
+
+	case '|':
+	  if (! (obscure_syntax & RE_NO_BK_VBAR))
+	    goto normal_char;
+	  else
+	    goto handle_bar;
 
         case '\\':
 	  if (p == pend) goto invalid_pattern;
 	  PATFETCH_RAW (c);
 	  switch (c)
 	    {
-#ifdef emacs
-	    case '=':
-	      PATPUSH (at_dot);
-	      break;
-
-	    case 's':	
-	      laststart = b;
-	      PATPUSH (syntaxspec);
-	      PATFETCH (c);
-	      PATPUSH (syntax_spec_code[c]);
-	      break;
-
-	    case 'S':
-	      laststart = b;
-	      PATPUSH (notsyntaxspec);
-	      PATFETCH (c);
-	      PATPUSH (syntax_spec_code[c]);
-	      break;
-#endif emacs
-
 	    case '(':
-	      if(obscure_syntax&RE_NO_BK_PARENS) goto normal_backsl;
+	      if (obscure_syntax & RE_NO_BK_PARENS)
+		goto normal_backsl;
+	    handle_open:
 	      if (stackp == stacke) goto nesting_too_deep;
 	      if (regnum < RE_NREGS)
 	        {
@@ -521,7 +571,9 @@ re_compile_pattern (pattern, size, bufp)
 	      break;
 
 	    case ')':
-	      if(obscure_syntax&RE_NO_BK_PARENS) goto normal_backsl;
+	      if (obscure_syntax & RE_NO_BK_PARENS)
+		goto normal_backsl;
+	    handle_close:
 	      if (stackp == stackb) goto unmatched_close;
 	      begalt = *--stackp + bufp->buffer;
 	      if (fixup_jump)
@@ -539,7 +591,9 @@ re_compile_pattern (pattern, size, bufp)
 	      break;
 
 	    case '|':
-	      if(obscure_syntax&RE_NO_BK_VBAR) goto normal_backsl;
+	      if (obscure_syntax & RE_NO_BK_VBAR)
+		goto normal_backsl;
+	    handle_bar:
 	      insert_jump (on_failure_jump, begalt, b + 6, b);
 	      pending_exact = 0;
 	      b += 3;
@@ -551,6 +605,26 @@ re_compile_pattern (pattern, size, bufp)
 	      begalt = b;
 	      break;
 
+#ifdef emacs
+	    case '=':
+	      PATPUSH (at_dot);
+	      break;
+
+	    case 's':	
+	      laststart = b;
+	      PATPUSH (syntaxspec);
+	      PATFETCH (c);
+	      PATPUSH (syntax_spec_code[c]);
+	      break;
+
+	    case 'S':
+	      laststart = b;
+	      PATPUSH (notsyntaxspec);
+	      PATFETCH (c);
+	      PATPUSH (syntax_spec_code[c]);
+	      break;
+#endif /* emacs */
+
 	    case 'w':
 	      laststart = b;
 	      PATPUSH (wordchar);
@@ -604,9 +678,15 @@ re_compile_pattern (pattern, size, bufp)
 	      PATPUSH (duplicate);
 	      PATPUSH (c1);
 	      break;
+
+	    case '+':
+	    case '?':
+	      if (obscure_syntax & RE_BK_PLUS_QM)
+		goto handle_plus;
+
 	    default:
 	    normal_backsl:
-	      /* You might think it wuld be useful for \ to mean
+	      /* You might think it would be useful for \ to mean
 		 not to translate; but if we don't translate it
 		 it will never match anything.  */
 	      if (translate) c = translate[c];
@@ -618,7 +698,9 @@ re_compile_pattern (pattern, size, bufp)
 	normal_char:
 	  if (!pending_exact || pending_exact + *pending_exact + 1 != b
 	      || *pending_exact == 0177 || *p == '*' || *p == '^'
-	      || *p == '+' || *p == '?')
+	      || ((obscure_syntax & RE_BK_PLUS_QM)
+		  ? *p == '\\' && (p[1] == '+' || p[1] == '?')
+		  : (*p == '+' || *p == '?')))
 	    {
 	      laststart = b;
 	      PATPUSH (exactn);
@@ -833,11 +915,12 @@ re_compile_fastmap (bufp)
 	  break;
 
 	case notsyntaxspec:
+	  k = *p++;
 	  for (j = 0; j < (1 << BYTEWIDTH); j++)
 	    if (SYNTAX (j) != (enum syntaxcode) k)
 	      fastmap[j] = 1;
 	  break;
-#endif emacs
+#endif /* emacs */
 
 	case charset:
 	  for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
@@ -891,16 +974,17 @@ re_search (pbufp, string, size, startpos, range, regs)
   return re_search_2 (pbufp, 0, 0, string, size, startpos, range, regs, size);
 }
 
-/* Like re_match_2 but tries first a match starting at index `startpos',
- then at startpos + 1, and so on.
- `range' is the number of places to try before giving up.
- If `range' is negative, the starting positions tried are
-  startpos, startpos - 1, etc.
- It is up to the caller to make sure that range is not so large
-  as to take the starting position outside of the input strings.
+/* Like re_match_2 but tries first a match starting at index STARTPOS,
+   then at STARTPOS + 1, and so on.
+   RANGE is the number of places to try before giving up.
+   If RANGE is negative, the starting positions tried are
+    STARTPOS, STARTPOS - 1, etc.
+   It is up to the caller to make sure that range is not so large
+   as to take the starting position outside of the input strings.
 
 The value returned is the position at which the match was found,
- or -1 if no match was found. */
+ or -1 if no match was found,
+ or -2 if error (such as failure stack overflow).  */
 
 int
 re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop)
@@ -913,12 +997,24 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop
      int mstop;
 {
   register char *fastmap = pbufp->fastmap;
-  register char *translate = pbufp->translate;
+  register unsigned char *translate = (unsigned char *) pbufp->translate;
   int total = size1 + size2;
+  int val;
 
   /* Update the fastmap now if not correct already */
   if (fastmap && !pbufp->fastmap_accurate)
     re_compile_fastmap (pbufp);
+  
+  /* Don't waste time in a long search for a pattern
+     that says it is anchored.  */
+  if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
+      && range > 0)
+    {
+      if (startpos > 0)
+	return -1;
+      else
+	range = 1;
+    }
 
   while (1)
     {
@@ -933,12 +1029,13 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop
 	  if (range > 0)
 	    {
 	      register int lim = 0;
-	      register char *p;
+	      register unsigned char *p;
 	      int irange = range;
 	      if (startpos < size1 && startpos + range >= size1)
 		lim = range - (size1 - startpos);
 
-	      p = &(startpos >= size1 ? string2 - size1 : string1)[startpos];
+	      p = ((unsigned char *)
+		   &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
 
 	      if (translate)
 		{
@@ -954,9 +1051,12 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop
 	    }
 	  else
 	    {
-	      register char c;
-	      if (startpos >= size1) c = string2[startpos - size1];
-	      else c = string1[startpos];
+	      register unsigned char c;
+	      if (startpos >= size1)
+		c = string2[startpos - size1];
+	      else
+		c = string1[startpos];
+	      c &= 0xff;
 	      if (translate ? !fastmap[translate[c]] : !fastmap[c])
 		goto advance;
 	    }
@@ -966,8 +1066,13 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop
 	  && fastmap && pbufp->can_be_null == 0)
 	return -1;
 
-      if (0 <= re_match_2 (pbufp, string1, size1, string2, size2, startpos, regs, mstop))
-	return startpos;
+      val = re_match_2 (pbufp, string1, size1, string2, size2, startpos, regs, mstop);
+      if (0 <= val)
+	return startpos;
+      /* -2 is re_match_2's error code (e.g. failure stack overflow);
+	 report it to the caller instead of searching further.  */
+      if (val == -2)
+	return -2;
 
 #ifdef C_ALLOCA
       alloca (0);
@@ -992,42 +1097,47 @@ re_match (pbufp, string, size, pos, regs)
 }
 #endif /* emacs */
 
-/* Match the pattern described by `pbufp'
-  against data which is the virtual concatenation of `string1' and `string2'.
-  `size1' and `size2' are the sizes of the two data strings.
-  Start the match at position `pos'.
-  Do not consider matching past the position `mstop'.
+/* Maximum size of failure stack.  Beyond this, overflow is an error.  */
 
-  If pbufp->fastmap is nonzero, then it had better be up to date.
+int re_max_failures = 2000;
 
-  The reason that the data to match is specified as two components
-  which are to be regarded as concatenated
-  is so that this function can be used directly on the contents of an Emacs buffer.
+static int bcmp_translate();
+/* Match the pattern described by PBUFP
+   against data which is the virtual concatenation of STRING1 and STRING2.
+   SIZE1 and SIZE2 are the sizes of the two data strings.
+   Start the match at position POS.
+   Do not consider matching past the position MSTOP.
 
-  -1 is returned if there is no match.  Otherwise the value is the length
-  of the substring which was matched.
-*/
+   If pbufp->fastmap is nonzero, then it had better be up to date.
+
+   The reason that the data to match are specified as two components
+   which are to be regarded as concatenated
+   is so this function can be used directly on the contents of an Emacs buffer.
+
+   -1 is returned if there is no match.  -2 is returned if there is
+   an error (such as match stack overflow).  Otherwise the value is the length
+   of the substring which was matched.  */
 
 int
 re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
      struct re_pattern_buffer *pbufp;
-     char *string1, *string2;
+     unsigned char *string1, *string2;
      int size1, size2;
      int pos;
      struct re_registers *regs;
      int mstop;
 {
-  register char *p = pbufp->buffer;
-  register char *pend = p + pbufp->used;
+  register unsigned char *p = (unsigned char *) pbufp->buffer;
+  register unsigned char *pend = p + pbufp->used;
   /* End of first string */
-  char *end1;
+  unsigned char *end1;
   /* End of second string */
-  char *end2;
+  unsigned char *end2;
   /* Pointer just past last char to consider matching */
-  char *end_match_1, *end_match_2;
-  register char *d, *dend;
+  unsigned char *end_match_1, *end_match_2;
+  register unsigned char *d, *dend;
   register int mcnt;
-  char *translate = pbufp->translate;
+  unsigned char *translate = (unsigned char *) pbufp->translate;
 
  /* Failure point stack.  Each place that can handle a failure further down the line
     pushes a failure point on this stack.  It consists of two char *'s.
@@ -1037,8 +1147,9 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
     If a failure happens and the innermost failure point is dormant,
     it discards that failure point and tries the next one. */
 
-  char **stackb = (char **) alloca (2 * NFAILURES * sizeof (char *));
-  char **stackp = stackb, **stacke = &stackb[2 * NFAILURES];
+  unsigned char *initial_stack[2 * NFAILURES];
+  unsigned char **stackb = initial_stack;
+  unsigned char **stackp = stackb, **stacke = &stackb[2 * NFAILURES];
 
   /* Information on the "contents" of registers.
      These are pointers into the input strings; they record
@@ -1048,14 +1159,12 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 
      At that point, regstart[regnum] points to the first character in the register,
      regend[regnum] points to the first character beyond the end of the register,
-     and regstart_segend[regnum] is either the same as regend[regnum]
-     or else points to the end of the input string into which regstart[regnum] points.
-     The latter case happens when regstart[regnum] is in string1 and
-     regend[regnum] is in string2.  */
+     regstart_seg1[regnum] is true iff regstart[regnum] points into string1,
+     and regend_seg1[regnum] is true iff regend[regnum] points into string1.  */
 
-  char *regstart[RE_NREGS];
-  char *regstart_segend[RE_NREGS];
-  char *regend[RE_NREGS];
+  unsigned char *regstart[RE_NREGS];
+  unsigned char *regend[RE_NREGS];
+  unsigned char regstart_seg1[RE_NREGS], regend_seg1[RE_NREGS];
 
   /* Set up pointers to ends of strings.
      Don't allow the second string to be empty unless both are empty.  */
@@ -1081,11 +1190,11 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
       end_match_2 = string2 + mstop - size1;
     }
 
-  /* Initialize \( and \) text positions to -1
+  /* Initialize \) text positions to -1
      to mark ones that no \( or \) has been seen for.  */
 
-  for (mcnt = 0; mcnt < sizeof (regstart) / sizeof (*regstart); mcnt++)
-    regstart[mcnt] = (char *) -1;
+  for (mcnt = 0; mcnt < sizeof (regend) / sizeof (*regend); mcnt++)
+    regend[mcnt] = (unsigned char *) -1;
 
   /* `p' scans through the pattern as `d' scans through the data.
      `dend' is the end of the input string that `d' points within.
@@ -1119,31 +1228,31 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 	  /* If caller wants register contents data back, convert it to indices */
 	  if (regs)
 	    {
-	      regend[0] = d;
-	      regstart[0] = string1;
-	      for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
+ 	      regs->start[0] = pos;
+ 	      if (dend == end_match_1)
+ 		regs->end[0] = d - string1;
+ 	      else
+ 		regs->end[0] = d - string2 + size1;
+ 	      for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
 		{
-		  if ((mcnt != 0) && regstart[mcnt] == (char *) -1)
+		  if (regend[mcnt] == (unsigned char *) -1)
 		    {
 		      regs->start[mcnt] = -1;
 		      regs->end[mcnt] = -1;
 		      continue;
 		    }
-		  if (regstart[mcnt] - string1 < 0 ||
-		      regstart[mcnt] - string1 > size1)
-		    regs->start[mcnt] = regstart[mcnt] - string2 + size1;
-		  else
+ 		  if (regstart_seg1[mcnt])
 		    regs->start[mcnt] = regstart[mcnt] - string1;
-		  if (regend[mcnt] - string1 < 0 ||
-		      regend[mcnt] - string1 > size1)
-		    regs->end[mcnt] = regend[mcnt] - string2 + size1;
 		  else
+		    regs->start[mcnt] = regstart[mcnt] - string2 + size1;
+ 		  if (regend_seg1[mcnt])
 		    regs->end[mcnt] = regend[mcnt] - string1;
+		  else
+		    regs->end[mcnt] = regend[mcnt] - string2 + size1;
 		}
-	      regs->start[0] = pos;
 	    }
-	  if (d - string1 >= 0 && d - string1 <= size1)
-	    return d - string1 - pos;
+ 	  if (dend == end_match_1)
+	    return (d - string1 - pos);
 	  else
 	    return d - string2 + size1 - pos;
 	}
@@ -1164,23 +1273,22 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 
 	case start_memory:
 	  regstart[*p] = d;
-	  regstart_segend[*p++] = dend;
+ 	  regstart_seg1[*p++] = (dend == end_match_1);
 	  break;
 
 	case stop_memory:
 	  regend[*p] = d;
-	  if (regstart_segend[*p] == dend)
-	    regstart_segend[*p] = d;
-	  p++;
+ 	  regend_seg1[*p++] = (dend == end_match_1);
 	  break;
 
 	case duplicate:
 	  {
 	    int regno = *p++;   /* Get which register to match against */
-	    register char *d2, *dend2;
+	    register unsigned char *d2, *dend2;
 
 	    d2 = regstart[regno];
-	    dend2 = regstart_segend[regno];
+ 	    dend2 = ((regstart_seg1[regno] == regend_seg1[regno])
+		     ? regend[regno] : end_match_1);
 	    while (1)
 	      {
 		/* Advance to next segment in register contents, if necessary */
@@ -1222,16 +1330,16 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 	    /* Nonzero for charset_not */
 	    int not = 0;
 	    register int c;
-	    if (*(p - 1) == (char) charset_not)
+	    if (*(p - 1) == (unsigned char) charset_not)
 	      not = 1;
 
 	    /* fetch a data character */
 	    PREFETCH;
 
 	    if (translate)
-	      c = translate [*(unsigned char *)d];
+	      c = translate [*d];
 	    else
-	      c = *(unsigned char *)d;
+	      c = *d;
 
 	    if (c < *p * BYTEWIDTH
 		&& p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
@@ -1273,14 +1381,18 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 	case on_failure_jump:
 	  if (stackp == stacke)
 	    {
-	      char **stackx = (char **) alloca (2 * (stacke - stackb) * sizeof (char *));
+	      unsigned char **stackx;
+	      if (stacke - stackb > re_max_failures * 2)
+		return -2;
+	      stackx = (unsigned char **) alloca (2 * (stacke - stackb)
+					 * sizeof (char *));
 	      bcopy (stackb, stackx, (stacke - stackb) * sizeof (char *));
-	      stackp += stackx - stackb;
+	      stackp = stackx + (stackp - stackb);
 	      stacke = stackx + 2 * (stacke - stackb);
 	      stackb = stackx;
 	    }
 	  mcnt = *p++ & 0377;
-	  mcnt += SIGN_EXTEND_CHAR (*p) << 8;
+	  mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
 	  p++;
 	  *stackp++ = mcnt + p;
 	  *stackp++ = d;
@@ -1291,37 +1403,46 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 
 	case maybe_finalize_jump:
 	  mcnt = *p++ & 0377;
-	  mcnt += SIGN_EXTEND_CHAR (*p) << 8;
+	  mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
 	  p++;
-	  /* Compare what follows with the begining of the repeat.
-	     If we can establish that there is nothing that they would
-	     both match, we can change to finalize_jump */
-	  if (p == pend)
-	    p[-3] = (char) finalize_jump;
-	  else if (*p == (char) exactn || *p == (char) endline)
-	    {
-	      register int c = *p == (char) endline ? '\n' : p[2];
-	      register char *p1 = p + mcnt;
-	      /* p1[0] ... p1[2] are an on_failure_jump.
-		 Examine what follows that */
-	      if (p1[3] == (char) exactn && p1[5] != c)
-		p[-3] = (char) finalize_jump;
-	      else if (p1[3] == (char) charset || p1[3] == (char) charset_not)
-		{
-		  int not = p1[3] == (char) charset_not;
-		  if (c < p1[4] * BYTEWIDTH
-		      && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
-		    not = !not;
-		  /* not is 1 if c would match */
-		  /* That means it is not safe to finalize */
-		  if (!not)
-		    p[-3] = (char) finalize_jump;
-		}
-	    }
+	  {
+	    register unsigned char *p2 = p;
+	    /* Compare what follows with the beginning of the repeat.
+	       If we can establish that there is nothing that they would
+	       both match, we can change to finalize_jump */
+	    while (p2 != pend
+		   && (*p2 == (unsigned char) stop_memory
+		       || *p2 == (unsigned char) start_memory))
+	      p2++;
+	    if (p2 == pend)
+	      p[-3] = (unsigned char) finalize_jump;
+	    else if (*p2 == (unsigned char) exactn
+		     || *p2 == (unsigned char) endline)
+	      {
+		register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
+		register unsigned char *p1 = p + mcnt;
+		/* p1[0] ... p1[2] are an on_failure_jump.
+		   Examine what follows that */
+		if (p1[3] == (unsigned char) exactn && p1[5] != c)
+		  p[-3] = (unsigned char) finalize_jump;
+		else if (p1[3] == (unsigned char) charset
+			 || p1[3] == (unsigned char) charset_not)
+		  {
+		    int not = p1[3] == (unsigned char) charset_not;
+		    if (c < p1[4] * BYTEWIDTH
+			&& p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
+		      not = !not;
+		    /* not is 1 if c would match */
+		    /* That means it is not safe to finalize */
+		    if (!not)
+		      p[-3] = (unsigned char) finalize_jump;
+		  }
+	      }
+	  }
 	  p -= 2;
-	  if (p[-1] != (char) finalize_jump)
+	  if (p[-1] != (unsigned char) finalize_jump)
 	    {
-	      p[-1] = (char) jump;
+	      p[-1] = (unsigned char) jump;
 	      goto nofinalize;
 	    }
 
@@ -1335,16 +1456,18 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 	case jump:
 	nofinalize:
 	  mcnt = *p++ & 0377;
-	  mcnt += SIGN_EXTEND_CHAR (*p) << 8;
+	  mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
 	  p += mcnt + 1;	/* The 1 compensates for missing ++ above */
 	  break;
 
 	case dummy_failure_jump:
 	  if (stackp == stacke)
 	    {
-	      char **stackx = (char **) alloca (2 * (stacke - stackb) * sizeof (char *));
+	      unsigned char **stackx
+		= (unsigned char **) alloca (2 * (stacke - stackb)
+					     * sizeof (char *));
 	      bcopy (stackb, stackx, (stacke - stackb) * sizeof (char *));
-	      stackp += stackx - stackb;
+	      stackp = stackx + (stackp - stackb);
 	      stacke = stackx + 2 * (stacke - stackb);
 	      stackb = stackx;
 	    }
@@ -1357,8 +1480,8 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 	      || d == end2  /* Points to end */
 	      || (d == end1 && size2 == 0)) /* Points to end */
 	    break;
-	  if ((SYNTAX (((unsigned char *)d)[-1]) == Sword)
-	      != (SYNTAX (d == end1 ? *(unsigned char *)string2 : *(unsigned char *)d) == Sword))
+	  if ((SYNTAX (d[-1]) == Sword)
+	      != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
 	    break;
 	  goto fail;
 
@@ -1367,49 +1490,49 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 	      || d == end2  /* Points to end */
 	      || (d == end1 && size2 == 0)) /* Points to end */
 	    goto fail;
-	  if ((SYNTAX (((unsigned char *)d)[-1]) == Sword)
-	      != (SYNTAX (d == end1 ? *(unsigned char *)string2 : *(unsigned char *)d) == Sword))
+	  if ((SYNTAX (d[-1]) == Sword)
+	      != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
 	    goto fail;
 	  break;
 
 	case wordbeg:
 	  if (d == end2  /* Points to end */
 	      || (d == end1 && size2 == 0) /* Points to end */
-	      || SYNTAX (*(unsigned char *) (d == end1 ? string2 : d)) != Sword) /* Next char not a letter */
+	      || SYNTAX (* (d == end1 ? string2 : d)) != Sword) /* Next char not a letter */
 	    goto fail;
 	  if (d == string1  /* Points to first char */
-	      || SYNTAX (((unsigned char *)d)[-1]) != Sword)  /* prev char not letter */
+	      || SYNTAX (d[-1]) != Sword)  /* prev char not letter */
 	    break;
 	  goto fail;
 
 	case wordend:
 	  if (d == string1  /* Points to first char */
-	      || SYNTAX (((unsigned char *)d)[-1]) != Sword)  /* prev char not letter */
+	      || SYNTAX (d[-1]) != Sword)  /* prev char not letter */
 	    goto fail;
 	  if (d == end2  /* Points to end */
 	      || (d == end1 && size2 == 0) /* Points to end */
-	      || SYNTAX (d == end1 ? *(unsigned char *)string2 : *(unsigned char *)d) != Sword) /* Next char not a letter */
+	      || SYNTAX (d == end1 ? *string2 : *d) != Sword) /* Next char not a letter */
 	    break;
 	  goto fail;
 
 #ifdef emacs
 	case before_dot:
 	  if (((d - string2 <= (unsigned) size2)
-	       ? d - (char *) bf_p2 : d - (char *) bf_p1)
+	       ? d - bf_p2 : d - bf_p1)
 	      <= point)
 	    goto fail;
 	  break;
 
 	case at_dot:
 	  if (((d - string2 <= (unsigned) size2)
-	       ? d - (char *) bf_p2 : d - (char *) bf_p1)
+	       ? d - bf_p2 : d - bf_p1)
 	      == point)
 	    goto fail;
 	  break;
 
 	case after_dot:
 	  if (((d - string2 <= (unsigned) size2)
-	       ? d - (char *) bf_p2 : d - (char *) bf_p1)
+	       ? d - bf_p2 : d - bf_p1)
 	      >= point)
 	    goto fail;
 	  break;
@@ -1422,7 +1545,7 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 	  mcnt = *p++;
 	matchsyntax:
 	  PREFETCH;
-	  if (SYNTAX (*(unsigned char *)d++) != (enum syntaxcode) mcnt) goto fail;
+	  if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
 	  break;
 	  
 	case notwordchar:
@@ -1433,19 +1556,19 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 	  mcnt = *p++;
 	matchnotsyntax:
 	  PREFETCH;
-	  if (SYNTAX (*(unsigned char *)d++) == (enum syntaxcode) mcnt) goto fail;
+	  if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
 	  break;
 #else
 	case wordchar:
 	  PREFETCH;
-	  if (SYNTAX (*(unsigned char *)d++) == 0) goto fail;
+	  if (SYNTAX (*d++) == 0) goto fail;
 	  break;
 	  
 	case notwordchar:
 	  PREFETCH;
-	  if (SYNTAX (*(unsigned char *)d++) != 0) goto fail;
+	  if (SYNTAX (*d++) != 0) goto fail;
 	  break;
-#endif not emacs
+#endif /* not emacs */
 
 	case begbuf:
 	  if (d == string1)	/* Note, d cannot equal string2 */
@@ -1466,7 +1589,7 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 	      do
 		{
 		  PREFETCH;
-		  if (translate[*(unsigned char *)d++] != *p++) goto fail;
+		  if (translate[*d++] != *p++) goto fail;
 		}
 	      while (--mcnt);
 	    }
@@ -1505,11 +1628,11 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
 
 static int
 bcmp_translate (s1, s2, len, translate)
-     char *s1, *s2;
+     unsigned char *s1, *s2;
      register int len;
-     char *translate;
+     unsigned char *translate;
 {
-  register char *p1 = s1, *p2 = s2;
+  register unsigned char *p1 = s1, *p2 = s2;
   while (len)
     {
       if (translate [*p1++] != translate [*p2++]) return 1;
@@ -1597,22 +1720,28 @@ static char upcase[0400] =
     0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
   };
 
-main ()
+main (argc, argv)
+     int argc;
+     char **argv;
 {
   char pat[80];
   struct re_pattern_buffer buf;
   int i;
   char c;
   char fastmap[(1 << BYTEWIDTH)];
-  char *gets();
+
+  /* Allow a command argument to specify the style of syntax.  */
+  if (argc > 1)
+    obscure_syntax = atoi (argv[1]);
 
   buf.allocated = 40;
   buf.buffer = (char *) malloc (buf.allocated);
   buf.fastmap = fastmap;
   buf.translate = upcase;
 
-  while (gets(pat))
+  while (1)
     {
+      gets (pat);
 
       if (*pat)
 	{
@@ -1686,4 +1815,4 @@ error (string)
   exit (1);
 }
 
-#endif test
+#endif /* test */
author	Arnold D. Robbins <arnold@skeeve.com>	2010-07-02 15:46:31 +0300
committer	Arnold D. Robbins <arnold@skeeve.com>	2010-07-02 15:46:31 +0300
commit	3711eedc1b995eb1926c9ffb902d5d796cacf8d0 (patch)
tree	5642fdee11499774e0b7401f195931cd3a143d18 /regex.c
parent	ec6415f1ba061b2fb78808b7dba3246745a15398 (diff)
download	egawk-3711eedc1b995eb1926c9ffb902d5d796cacf8d0.tar.gz egawk-3711eedc1b995eb1926c9ffb902d5d796cacf8d0.tar.bz2 egawk-3711eedc1b995eb1926c9ffb902d5d796cacf8d0.zip