aboutsummaryrefslogtreecommitdiffstats
path: root/regex.c
diff options
context:
space:
mode:
authorArnold D. Robbins <arnold@skeeve.com>2010-07-16 11:58:26 +0300
committerArnold D. Robbins <arnold@skeeve.com>2010-07-16 11:58:26 +0300
commit765c7494b3dac62207e6cd57fb839997e237f292 (patch)
treef7da12ffdb85d9f82671cb3122775b2ce73f7ad9 /regex.c
parentcce5115e21db1702e0617afdca36633e7e2c9eae (diff)
downloadegawk-765c7494b3dac62207e6cd57fb839997e237f292.tar.gz
egawk-765c7494b3dac62207e6cd57fb839997e237f292.tar.bz2
egawk-765c7494b3dac62207e6cd57fb839997e237f292.zip
Moving to 2.13.2.
Diffstat (limited to 'regex.c')
-rw-r--r--regex.c2440
1 files changed, 1714 insertions, 726 deletions
diff --git a/regex.c b/regex.c
index a04a4398..e59a169a 100644
--- a/regex.c
+++ b/regex.c
@@ -1,177 +1,108 @@
-/* Extended regular expression matching and search.
- Copyright (C) 1985 Free Software Foundation, Inc.
-
- NO WARRANTY
-
- BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY
-NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT
-WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC,
-RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS"
-WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
-BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY
-AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
-DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
-CORRECTION.
-
- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M.
-STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY
-WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE
-LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR
-OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR
-DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR
-A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS
-PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY.
-
- GENERAL PUBLIC LICENSE TO COPY
-
- 1. You may copy and distribute verbatim copies of this source file
-as you receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy a valid copyright notice "Copyright
-(C) 1985 Free Software Foundation, Inc."; and include following the
-copyright notice a verbatim copy of the above disclaimer of warranty
-and of this License. You may charge a distribution fee for the
-physical act of transferring a copy.
-
- 2. You may modify your copy or copies of this source file or
-any portion of it, and copy and distribute such modifications under
-the terms of Paragraph 1 above, provided that you also do the following:
-
- a) cause the modified files to carry prominent notices stating
- that you changed the files and the date of any change; and
-
- b) cause the whole of any work that you distribute or publish,
- that in whole or in part contains or is a derivative of this
- program or any part thereof, to be licensed at no charge to all
- third parties on terms identical to those contained in this
- License Agreement (except that you may choose to grant more extensive
- warranty protection to some or all third parties, at your option).
-
- c) You may charge a distribution fee for the physical act of
- transferring a copy, and you may at your option offer warranty
- protection in exchange for a fee.
-
-Mere aggregation of another unrelated program with this program (or its
-derivative) on a volume of a storage or distribution medium does not bring
-the other program under the scope of these terms.
-
- 3. You may copy and distribute this program (or a portion or derivative
-of it, under Paragraph 2) in object code or executable form under the terms
-of Paragraphs 1 and 2 above provided that you also do one of the following:
-
- a) accompany it with the complete corresponding machine-readable
- source code, which must be distributed under the terms of
- Paragraphs 1 and 2 above; or,
-
- b) accompany it with a written offer, valid for at least three
- years, to give any third party free (except for a nominal
- shipping charge) a complete machine-readable copy of the
- corresponding source code, to be distributed under the terms of
- Paragraphs 1 and 2 above; or,
-
- c) accompany it with the information you received as to where the
- corresponding source code may be obtained. (This alternative is
- allowed only for noncommercial distribution and only if you
- received the program in object code or executable form alone.)
-
-For an executable file, complete source code means all the source code for
-all modules it contains; but, as a special exception, it need not include
-source code for modules which are standard libraries that accompany the
-operating system on which the executable file runs.
-
- 4. You may not copy, sublicense, distribute or transfer this program
-except as expressly provided under this License Agreement. Any attempt
-otherwise to copy, sublicense, distribute or transfer this program is void and
-your rights to use the program under this License agreement shall be
-automatically terminated. However, parties who have received computer
-software programs from you with this License Agreement will not have
-their licenses terminated so long as such parties remain in full compliance.
-
- 5. If you wish to incorporate parts of this program into other free
-programs whose distribution conditions are different, write to the Free
-Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet
-worked out a simple rule that can be stated here, but we will often permit
-this. We will be guided by the two goals of preserving the free status of
-all derivatives of our free software and of promoting the sharing and reuse of
-software.
-
-
-In other words, you are welcome to use, share and improve this program.
-You are forbidden to forbid anyone else to use, share and improve
-what you give them. Help stamp out software-hoarding! */
-
-#ifdef MSDOS
-#include <malloc.h>
-static void init_syntax_once(void );
-extern int re_set_syntax(int syntax);
-extern char *re_compile_pattern(char *pattern,int size,struct re_pattern_buffer *bufp);
-static int store_jump(char *from,char opcode,char *to);
-static int insert_jump(char op,char *from,char *to,char *current_end);
-extern void re_compile_fastmap(struct re_pattern_buffer *bufp);
-extern int re_search(struct re_pattern_buffer *pbufp,char *string,int size,int startpos,int range,struct re_registers *regs);
-extern int re_search_2(struct re_pattern_buffer *pbufp,char *string1,int size1,char *string2,int size2,int startpos,int range,struct re_registers *regs,int mstop);
-extern int re_match(struct re_pattern_buffer *pbufp,char *string,int size,int pos,struct re_registers *regs);
-extern int re_match_2(struct re_pattern_buffer *pbufp,unsigned char *string1,int size1,unsigned char *string2,int size2,int pos,struct re_registers *regs,int mstop);
-static int bcmp_translate(unsigned char *s1,unsigned char *s2,int len,unsigned char *translate);
-extern char *re_comp(char *s);
-extern int re_exec(char *s);
-#endif
+/* Extended regular expression matching and search library.
+ Copyright (C) 1985, 1989-90 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 1, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+
+/* To test, compile with -Dtest. This Dtestable feature turns this into
+ a self-contained program which reads a pattern, describes how it
+ compiles, then reads a string and searches for it.
+
+ On the other hand, if you compile with both -Dtest and -Dcanned you
+ can run some tests we've already thought of. */
-/* To test, compile with -Dtest.
- This Dtestable feature turns this into a self-contained program
- which reads a pattern, describes how it compiles,
- then reads a string and searches for it. */
#ifdef emacs
/* The `emacs' switch turns on certain special matching commands
- that make sense only in emacs. */
+ that make sense only in emacs. */
-#include "config.h"
#include "lisp.h"
#include "buffer.h"
#include "syntax.h"
-#else /* not emacs */
+/* We write fatal error messages on standard error. */
+#include <stdio.h>
-#ifdef BCOPY_MISSING
-#define bcopy(s,d,n) memcpy((d),(s),(n))
-#define bcmp(s1,s2,n) memcmp((s1),(s2),(n))
-#define bzero(s,n) memset((s),0,(n))
-#else
-void bcopy();
-int bcmp();
-void bzero();
-#endif
+/* isalpha(3) etc. are used for the character classes. */
+#include <ctype.h>
+
+#else /* not emacs */
+
+#include "awk.h"
+#define NO_ALLOCA /* try it out for now */
+#ifndef NO_ALLOCA
/* Make alloca work the best possible way. */
#ifdef __GNUC__
+#ifndef atarist
+#ifndef alloca
#define alloca __builtin_alloca
+#endif
+#endif /* atarist */
#else
#ifdef sparc
#include <alloca.h>
+#else
+char *alloca ();
#endif
-#endif
-
-/*
- * Define the syntax stuff, so we can do the \<...\> things.
- */
-
-#ifndef Sword /* must be non-zero in some of the tests below... */
+#endif /* __GNUC__ */
+
+#define FREE_AND_RETURN_VOID(stackb) return
+#define FREE_AND_RETURN(stackb,val) return(val)
+#define DOUBLE_STACK(stackx,stackb,len) \
+ (stackx = (unsigned char **) alloca (2 * len \
+ * sizeof (unsigned char *)),\
+ /* Only copy what is in use. */ \
+ (unsigned char **) memcpy (stackx, stackb, len * sizeof (char *)))
+#else /* NO_ALLOCA defined */
+#define FREE_AND_RETURN_VOID(stackb) free(stackb);return
+#define FREE_AND_RETURN(stackb,val) free(stackb);return(val)
+#define DOUBLE_STACK(stackx,stackb,len) \
+ (unsigned char **) realloc (stackb, 2 * len * sizeof (unsigned char *))
+#endif /* NO_ALLOCA */
+
+static void store_jump P((char *, int, char *));
+static void insert_jump P((int, char *, char *, char *));
+static void store_jump_n P((char *, int, char *, unsigned));
+static void insert_jump_n P((int, char *, char *, char *, unsigned));
+static void insert_op_2 P((int, char *, char *, int, int ));
+static int memcmp_translate P((unsigned char *, unsigned char *,
+ int, unsigned char *));
+
+
+/* Define the syntax stuff, so we can do the \<, \>, etc. */
+
+/* This must be nonzero for the wordchar and notwordchar pattern
+ commands in re_match_2. */
+#ifndef Sword
#define Sword 1
#endif
#define SYNTAX(c) re_syntax_table[c]
+
#ifdef SYNTAX_TABLE
char *re_syntax_table;
-#else
+#else /* not SYNTAX_TABLE */
static char re_syntax_table[256];
+static void init_syntax_once P((void));
+
static void
init_syntax_once ()
@@ -182,7 +113,7 @@ init_syntax_once ()
if (done)
return;
- bzero (re_syntax_table, sizeof re_syntax_table);
+ memset (re_syntax_table, 0, sizeof re_syntax_table);
for (c = 'a'; c <= 'z'; c++)
re_syntax_table[c] = Sword;
@@ -192,42 +123,167 @@ init_syntax_once ()
for (c = '0'; c <= '9'; c++)
re_syntax_table[c] = Sword;
+
+ /* Add specific syntax for ISO Latin-1. */
+ for (c = 0300; c <= 0377; c++)
+ re_syntax_table[c] = Sword;
+ re_syntax_table[0327] = 0;
+ re_syntax_table[0367] = 0;
done = 1;
}
#endif /* SYNTAX_TABLE */
-#endif /* not emacs */
+#undef P
+#endif /* emacs */
+
+/* Sequents are missing isgraph. */
+#ifndef isgraph
+#define isgraph(c) (isprint((c)) && !isspace((c)))
+#endif
+
+/* Get the interface, including the syntax bits. */
#include "regex.h"
+
+/* These are the command codes that appear in compiled regular
+ expressions, one per byte. Some command codes are followed by
+ argument bytes. A command code can specify any interpretation
+ whatsoever for its arguments. Zero-bytes may appear in the compiled
+ regular expression.
+
+ The value of `exactn' is needed in search.c (search_buffer) in emacs.
+ So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
+ `exactn' we use here must also be 1. */
+
+enum regexpcode
+ {
+ unused=0,
+ exactn=1, /* Followed by one byte giving n, then by n literal bytes. */
+ begline, /* Fail unless at beginning of line. */
+ endline, /* Fail unless at end of line. */
+ jump, /* Followed by two bytes giving relative address to jump to. */
+ on_failure_jump, /* Followed by two bytes giving relative address of
+ place to resume at in case of failure. */
+ finalize_jump, /* Throw away latest failure point and then jump to
+ address. */
+ maybe_finalize_jump, /* Like jump but finalize if safe to do so.
+ This is used to jump back to the beginning
+ of a repeat. If the command that follows
+ this jump is clearly incompatible with the
+ one at the beginning of the repeat, such that
+ we can be sure that there is no use backtracking
+ out of repetitions already completed,
+ then we finalize. */
+ dummy_failure_jump, /* Jump, and push a dummy failure point. This
+ failure point will be thrown away if an attempt
+ is made to use it for a failure. A + construct
+ makes this before the first repeat. Also
+ use it as an intermediary kind of jump when
+ compiling an or construct. */
+ succeed_n, /* Used like on_failure_jump except has to succeed n times;
+ then gets turned into an on_failure_jump. The relative
+ address following it is useless until then. The
+ address is followed by two bytes containing n. */
+ jump_n, /* Similar to jump, but jump n times only; also the relative
+ address following is in turn followed by yet two more bytes
+ containing n. */
+ set_number_at, /* Set the following relative location to the
+ subsequent number. */
+ anychar, /* Matches any (more or less) one character. */
+ charset, /* Matches any one char belonging to specified set.
+ First following byte is number of bitmap bytes.
+ Then come bytes for a bitmap saying which chars are in.
+ Bits in each byte are ordered low-bit-first.
+ A character is in the set if its bit is 1.
+ A character too large to have a bit in the map
+ is automatically not in the set. */
+ charset_not, /* Same parameters as charset, but match any character
+ that is not one of those specified. */
+ start_memory, /* Start remembering the text that is matched, for
+ storing in a memory register. Followed by one
+ byte containing the register number. Register numbers
+ must be in the range 0 through RE_NREGS. */
+ stop_memory, /* Stop remembering the text that is matched
+ and store it in a memory register. Followed by
+ one byte containing the register number. Register
+ numbers must be in the range 0 through RE_NREGS. */
+ duplicate, /* Match a duplicate of something remembered.
+ Followed by one byte containing the index of the memory
+ register. */
+ before_dot, /* Succeeds if before point. */
+ at_dot, /* Succeeds if at point. */
+ after_dot, /* Succeeds if after point. */
+ begbuf, /* Succeeds if at beginning of buffer. */
+ endbuf, /* Succeeds if at end of buffer. */
+ wordchar, /* Matches any word-constituent character. */
+ notwordchar, /* Matches any char that is not a word-constituent. */
+ wordbeg, /* Succeeds if at word beginning. */
+ wordend, /* Succeeds if at word end. */
+ wordbound, /* Succeeds if at a word boundary. */
+ notwordbound,/* Succeeds if not at a word boundary. */
+ syntaxspec, /* Matches any character whose syntax is specified.
+ followed by a byte which contains a syntax code,
+ e.g., Sword. */
+ notsyntaxspec /* Matches any character whose syntax differs from
+ that specified. */
+ };
+
+
/* Number of failure points to allocate space for initially,
- when matching. If this number is exceeded, more space is allocated,
- so it is not a hard limit. */
+ when matching. If this number is exceeded, more space is allocated,
+ so it is not a hard limit. */
#ifndef NFAILURES
#define NFAILURES 80
-#endif /* NFAILURES */
-
-/* width of a byte in bits */
-
-#define BYTEWIDTH 8
+#endif
+#ifdef CHAR_UNSIGNED
+#define SIGN_EXTEND_CHAR(c) ((c)>(char)127?(c)-256:(c)) /* for IBM RT */
+#endif
#ifndef SIGN_EXTEND_CHAR
#define SIGN_EXTEND_CHAR(x) (x)
#endif
-
-static int obscure_syntax = 0;
+
-/* Specify the precise syntax of regexp for compilation.
- This provides for compatibility for various utilities
- which historically have different, incompatible syntaxes.
-
- The argument SYNTAX is a bit-mask containing the two bits
- RE_NO_BK_PARENS and RE_NO_BK_VBAR. */
+/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
+#define STORE_NUMBER(destination, number) \
+ { (destination)[0] = (number) & 0377; \
+ (destination)[1] = (number) >> 8; }
+
+/* Same as STORE_NUMBER, except increment the destination pointer to
+ the byte after where the number is stored. Watch out that values for
+ DESTINATION such as p + 1 won't work, whereas p will. */
+#define STORE_NUMBER_AND_INCR(destination, number) \
+ { STORE_NUMBER(destination, number); \
+ (destination) += 2; }
+
+
+/* Put into DESTINATION a number stored in two contingous bytes starting
+ at SOURCE. */
+#define EXTRACT_NUMBER(destination, source) \
+ { (destination) = *(source) & 0377; \
+ (destination) += SIGN_EXTEND_CHAR (*(char *)((source) + 1)) << 8; }
+
+/* Same as EXTRACT_NUMBER, except increment the pointer for source to
+ point to second byte of SOURCE. Note that SOURCE has to be a value
+ such as p, not, e.g., p + 1. */
+#define EXTRACT_NUMBER_AND_INCR(destination, source) \
+ { EXTRACT_NUMBER (destination, source); \
+ (source) += 2; }
+
+
+/* Specify the precise syntax of regexps for compilation. This provides
+ for compatibility for various utilities which historically have
+ different, incompatible syntaxes.
+
+ The argument SYNTAX is a bit-mask comprised of the various bits
+ defined in regex.h. */
int
re_set_syntax (syntax)
+ int syntax;
{
int ret;
@@ -235,91 +291,115 @@ re_set_syntax (syntax)
obscure_syntax = syntax;
return ret;
}
-
-/* re_compile_pattern takes a regular-expression string
- and converts it into a buffer full of byte commands for matching.
- PATTERN is the address of the pattern string
- SIZE is the length of it.
- BUFP is a struct re_pattern_buffer * which points to the info
- on where to store the byte commands.
- This structure contains a char * which points to the
- actual space, which should have been obtained with malloc.
- re_compile_pattern may use realloc to grow the buffer space.
+/* Set by re_set_syntax to the current regexp syntax to recognize. */
+int obscure_syntax = 0;
- The number of bytes of commands can be found out by looking in
- the struct re_pattern_buffer that bufp pointed to,
- after re_compile_pattern returns.
-*/
-#define PATPUSH(ch) (*b++ = (char) (ch))
+
+/* Macros for re_compile_pattern, which is found below these definitions. */
-#define PATFETCH(c) \
- {if (p == pend) goto end_of_pattern; \
- c = * (unsigned char *) p++; \
+#define CHAR_CLASS_MAX_LENGTH 6
+
+/* Fetch the next character in the uncompiled pattern, translating it if
+ necessary. */
+#define PATFETCH(c) \
+ {if (p == pend) goto end_of_pattern; \
+ c = * (unsigned char *) p++; \
if (translate) c = translate[c]; }
-#define PATFETCH_RAW(c) \
- {if (p == pend) goto end_of_pattern; \
+/* Fetch the next character in the uncompiled pattern, with no
+ translation. */
+#define PATFETCH_RAW(c) \
+ {if (p == pend) goto end_of_pattern; \
c = * (unsigned char *) p++; }
#define PATUNFETCH p--
-#ifdef MSDOS
-#define MaxAllocation (1<<14)
-#else
-#define MaxAllocation (1<<16)
-#endif
-#define EXTEND_BUFFER \
- { char *old_buffer = bufp->buffer; \
- if (bufp->allocated == MaxAllocation) goto too_big; \
- bufp->allocated *= 2; \
- if (bufp->allocated > MaxAllocation) bufp->allocated = MaxAllocation; \
- if (!(bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated))) \
- goto memory_exhausted; \
- c = bufp->buffer - old_buffer; \
- b += c; \
- if (fixup_jump) \
- fixup_jump += c; \
- if (laststart) \
- laststart += c; \
- begalt += c; \
- if (pending_exact) \
- pending_exact += c; \
+
+/* If the buffer isn't allocated when it comes in, use this. */
+#define INIT_BUF_SIZE 28
+
+/* Make sure we have at least N more bytes of space in buffer. */
+#define GET_BUFFER_SPACE(n) \
+ { \
+ while (b - bufp->buffer + (n) >= bufp->allocated) \
+ EXTEND_BUFFER; \
}
-#ifdef NEVER
-#define EXTEND_BUFFER \
- { unsigned b_off = b - bufp->buffer, \
- f_off, l_off, p_off, \
- beg_off = begalt - bufp->buffer; \
- if (fixup_jump) \
- f_off = fixup_jump - bufp->buffer; \
- if (laststart) \
- l_off = laststart - bufp->buffer; \
- if (pending_exact) \
- p_off = pending_exact - bufp->buffer; \
- if (bufp->allocated == MaxAllocation) goto too_big; \
- bufp->allocated *= 2; \
- if (bufp->allocated > MaxAllocation) bufp->allocated = MaxAllocation; \
- if (!(bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated))) \
- goto memory_exhausted; \
- b = bufp->buffer + b_off; \
- if (fixup_jump) \
- fixup_jump = bufp->buffer + f_off; \
- if (laststart) \
- laststart = bufp->buffer + l_off; \
- begalt = bufp->buffer + beg_off; \
- if (pending_exact) \
- pending_exact = bufp->buffer + p_off; \
+/* Make sure we have one more byte of buffer space and then add CH to it. */
+#define BUFPUSH(ch) \
+ { \
+ GET_BUFFER_SPACE (1); \
+ *b++ = (char) (ch); \
}
-#endif
-static int store_jump (), insert_jump ();
+
+/* Extend the buffer by twice its current size via reallociation and
+ reset the pointers that pointed into the old allocation to point to
+ the correct places in the new allocation. If extending the buffer
+ results in it being larger than 1 << 16, then flag memory exhausted. */
+#define EXTEND_BUFFER \
+ { char *old_buffer = bufp->buffer; \
+ if (bufp->allocated == (1L<<16)) goto too_big; \
+ bufp->allocated *= 2; \
+ if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \
+ bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated); \
+ if (bufp->buffer == 0) \
+ goto memory_exhausted; \
+ b = (b - old_buffer) + bufp->buffer; \
+ if (fixup_jump) \
+ fixup_jump = (fixup_jump - old_buffer) + bufp->buffer; \
+ if (laststart) \
+ laststart = (laststart - old_buffer) + bufp->buffer; \
+ begalt = (begalt - old_buffer) + bufp->buffer; \
+ if (pending_exact) \
+ pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
+ }
+
+/* Set the bit for character C in a character set list. */
+#define SET_LIST_BIT(c) (b[(c) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
+
+/* Get the next unsigned number in the uncompiled pattern. */
+#define GET_UNSIGNED_NUMBER(num) \
+ { if (p != pend) \
+ { \
+ PATFETCH (c); \
+ while (isdigit (c)) \
+ { \
+ if (num < 0) \
+ num = 0; \
+ num = num * 10 + c - '0'; \
+ if (p == pend) \
+ break; \
+ PATFETCH (c); \
+ } \
+ } \
+ }
+
+/* Subroutines for re_compile_pattern. */
+static void store_jump (), insert_jump (), store_jump_n (),
+ insert_jump_n (), insert_op_2 ();
+
+
+/* re_compile_pattern takes a regular-expression string
+ and converts it into a buffer full of byte commands for matching.
+
+ PATTERN is the address of the pattern string
+ SIZE is the length of it.
+ BUFP is a struct re_pattern_buffer * which points to the info
+ on where to store the byte commands.
+ This structure contains a char * which points to the
+ actual space, which should have been obtained with malloc.
+ re_compile_pattern may use realloc to grow the buffer space.
+
+ The number of bytes of commands can be found out by looking in
+ the `struct re_pattern_buffer' that bufp pointed to, after
+ re_compile_pattern returns. */
char *
re_compile_pattern (pattern, size, bufp)
char *pattern;
- int size;
+ size_t size;
struct re_pattern_buffer *bufp;
{
register char *b = bufp->buffer;
@@ -329,36 +409,46 @@ re_compile_pattern (pattern, size, bufp)
char *p1;
unsigned char *translate = (unsigned char *) bufp->translate;
- /* address of the count-byte of the most recently inserted "exactn" command.
- This makes it possible to tell whether a new exact-match character
- can be added to that command or requires a new "exactn" command. */
+ /* Address of the count-byte of the most recently inserted `exactn'
+ command. This makes it possible to tell whether a new exact-match
+ character can be added to that command or requires a new `exactn'
+ command. */
char *pending_exact = 0;
- /* address of the place where a forward-jump should go
- to the end of the containing expression.
- Each alternative of an "or", except the last, ends with a forward-jump
- of this sort. */
+ /* Address of the place where a forward-jump should go to the end of
+ the containing expression. Each alternative of an `or', except the
+ last, ends with a forward-jump of this sort. */
char *fixup_jump = 0;
- /* address of start of the most recently finished expression.
- This tells postfix * where to find the start of its operand. */
+ /* Address of start of the most recently finished expression.
+ This tells postfix * where to find the start of its operand. */
char *laststart = 0;
- /* In processing a repeat, 1 means zero matches is allowed */
+ /* In processing a repeat, 1 means zero matches is allowed. */
char zero_times_ok;
- /* In processing a repeat, 1 means many matches is allowed */
+ /* In processing a repeat, 1 means many matches is allowed. */
char many_times_ok;
- /* address of beginning of regexp, or inside of last \( */
+ /* Address of beginning of regexp, or inside of last \(. */
char *begalt = b;
+ /* In processing an interval, at least this many matches must be made. */
+ int lower_bound;
+
+ /* In processing an interval, at most this many matches can be made. */
+ int upper_bound;
+
+ /* Place in pattern (i.e., the {) to which to go back if the interval
+ is invalid. */
+ char *beg_interval = 0;
+
/* Stack of information saved by \( and restored by \).
Four stack elements are pushed by each \(:
First, the value of b.
@@ -372,7 +462,8 @@ re_compile_pattern (pattern, size, bufp)
int *stackt;
/* Counts \('s as they are encountered. Remembered for the matching \),
- where it becomes the "register number" to put in the stop_memory command */
+ where it becomes the register number to put in the stop_memory
+ command. */
int regnum = 1;
@@ -380,94 +471,117 @@ re_compile_pattern (pattern, size, bufp)
#ifndef emacs
#ifndef SYNTAX_TABLE
- /*
- * Initialize the syntax table.
- */
+ /* Initialize the syntax table. */
init_syntax_once();
#endif
#endif
if (bufp->allocated == 0)
{
- bufp->allocated = 28;
+ bufp->allocated = INIT_BUF_SIZE;
if (bufp->buffer)
- /* EXTEND_BUFFER loses when bufp->allocated is 0 */
- bufp->buffer = (char *) realloc (bufp->buffer, 28);
+ /* EXTEND_BUFFER loses when bufp->allocated is 0. */
+ bufp->buffer = (char *) realloc (bufp->buffer, INIT_BUF_SIZE);
else
- /* Caller did not allocate a buffer. Do it for him */
- bufp->buffer = (char *) malloc (28);
+ /* Caller did not allocate a buffer. Do it for them. */
+ bufp->buffer = (char *) malloc (INIT_BUF_SIZE);
if (!bufp->buffer) goto memory_exhausted;
begalt = b = bufp->buffer;
}
while (p != pend)
{
- if (b - bufp->buffer > bufp->allocated - 10)
- /* Note that EXTEND_BUFFER clobbers c */
- EXTEND_BUFFER;
-
PATFETCH (c);
switch (c)
{
case '$':
- if (obscure_syntax & RE_TIGHT_VBAR)
- {
- if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p != pend)
- goto normal_char;
- /* Make operand of last vbar end before this `$'. */
- if (fixup_jump)
- store_jump (fixup_jump, jump, b);
- fixup_jump = 0;
- PATPUSH (endline);
- break;
- }
-
- /* $ means succeed if at end of line, but only in special contexts.
- If randomly in the middle of a pattern, it is a normal character. */
- if (p == pend || *p == '\n'
- || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
- || (obscure_syntax & RE_NO_BK_PARENS
- ? *p == ')'
- : *p == '\\' && p[1] == ')')
- || (obscure_syntax & RE_NO_BK_VBAR
- ? *p == '|'
- : *p == '\\' && p[1] == '|'))
- {
- PATPUSH (endline);
- break;
- }
- goto normal_char;
-
+ {
+ char *p1 = p;
+ /* When testing what follows the $,
+ look past the \-constructs that don't consume anything. */
+ if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
+ while (p1 != pend)
+ {
+ if (*p1 == '\\' && p1 + 1 != pend
+ && (p1[1] == '<' || p1[1] == '>'
+ || p1[1] == '`' || p1[1] == '\''
+#ifdef emacs
+ || p1[1] == '='
+#endif
+ || p1[1] == 'b' || p1[1] == 'B'))
+ p1 += 2;
+ else
+ break;
+ }
+ if (obscure_syntax & RE_TIGHT_VBAR)
+ {
+ if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p1 != pend)
+ goto normal_char;
+ /* Make operand of last vbar end before this `$'. */
+ if (fixup_jump)
+ store_jump (fixup_jump, jump, b);
+ fixup_jump = 0;
+ BUFPUSH (endline);
+ break;
+ }
+ /* $ means succeed if at end of line, but only in special contexts.
+ If validly in the middle of a pattern, it is a normal character. */
+
+ if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && p1 != pend)
+ goto invalid_pattern;
+ if (p1 == pend || *p1 == '\n'
+ || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
+ || (obscure_syntax & RE_NO_BK_PARENS
+ ? *p1 == ')'
+ : *p1 == '\\' && p1[1] == ')')
+ || (obscure_syntax & RE_NO_BK_VBAR
+ ? *p1 == '|'
+ : *p1 == '\\' && p1[1] == '|'))
+ {
+ BUFPUSH (endline);
+ break;
+ }
+ goto normal_char;
+ }
case '^':
- /* ^ means succeed if at beg of line, but only if no preceding pattern. */
-
- if (laststart && p[-2] != '\n'
- && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
+ /* ^ means succeed if at beg of line, but only if no preceding
+ pattern. */
+
+ if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && laststart)
+ goto invalid_pattern;
+ if (laststart && p - 2 >= pattern && p[-2] != '\n'
+ && !(obscure_syntax & RE_CONTEXT_INDEP_OPS))
goto normal_char;
if (obscure_syntax & RE_TIGHT_VBAR)
{
if (p != pattern + 1
&& ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
goto normal_char;
- PATPUSH (begline);
+ BUFPUSH (begline);
begalt = b;
}
else
- PATPUSH (begline);
+ BUFPUSH (begline);
break;
case '+':
case '?':
- if (obscure_syntax & RE_BK_PLUS_QM)
+ if ((obscure_syntax & RE_BK_PLUS_QM)
+ || (obscure_syntax & RE_LIMITED_OPS))
goto normal_char;
handle_plus:
case '*':
/* If there is no previous pattern, char not special. */
- if (!laststart && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
- goto normal_char;
+ if (!laststart)
+ {
+ if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
+ goto invalid_pattern;
+ else if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
+ goto normal_char;
+ }
/* If there is a sequence of repetition chars,
- collapse it down to equivalent to just one. */
+ collapse it down to just one. */
zero_times_ok = 0;
many_times_ok = 0;
while (1)
@@ -504,83 +618,201 @@ re_compile_pattern (pattern, size, bufp)
/* Star, etc. applied to an empty pattern is equivalent
to an empty pattern. */
- if (!laststart)
+ if (!laststart)
break;
- /* Now we know whether 0 matches is allowed,
- and whether 2 or more matches is allowed. */
+ /* Now we know whether or not zero matches is allowed
+ and also whether or not two or more matches is allowed. */
if (many_times_ok)
{
- /* If more than one repetition is allowed,
- put in a backward jump at the end. */
+ /* If more than one repetition is allowed, put in at the
+ end a backward relative jump from b to before the next
+ jump we're going to put in below (which jumps from
+ laststart to after this jump). */
+ GET_BUFFER_SPACE (3);
store_jump (b, maybe_finalize_jump, laststart - 3);
- b += 3;
+ b += 3; /* Because store_jump put stuff here. */
}
+ /* On failure, jump from laststart to b + 3, which will be the
+ end of the buffer after this jump is inserted. */
+ GET_BUFFER_SPACE (3);
insert_jump (on_failure_jump, laststart, b + 3, b);
pending_exact = 0;
b += 3;
if (!zero_times_ok)
{
- /* At least one repetition required: insert before the loop
- a skip over the initial on-failure-jump instruction */
- insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
+ /* At least one repetition is required, so insert a
+ dummy-failure before the initial on-failure-jump
+ instruction of the loop. This effects a skip over that
+ instruction the first time we hit that loop. */
+ GET_BUFFER_SPACE (6);
+ insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
b += 3;
}
break;
case '.':
laststart = b;
- PATPUSH (anychar);
+ BUFPUSH (anychar);
break;
- case '[':
+ case '[':
+ if (p == pend)
+ goto invalid_pattern;
while (b - bufp->buffer
> bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
- /* Note that EXTEND_BUFFER clobbers c */
EXTEND_BUFFER;
laststart = b;
if (*p == '^')
- PATPUSH (charset_not), p++;
+ {
+ BUFPUSH (charset_not);
+ p++;
+ }
else
- PATPUSH (charset);
+ BUFPUSH (charset);
p1 = p;
- PATPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
+ BUFPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
/* Clear the whole map */
- bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
- /* Read in characters and ranges, setting map bits */
+ memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
+
+ if ((obscure_syntax & RE_HAT_NOT_NEWLINE) && b[-2] == charset_not)
+ SET_LIST_BIT ('\n');
+
+
+ /* Read in characters and ranges, setting map bits. */
while (1)
{
- PATFETCH (c);
+ /* Don't translate while fetching, in case it's a range bound.
+ When we set the bit for the character, we translate it. */
+ PATFETCH_RAW (c);
- /* If awk, \ escapes characters inside [...]. */
+ /* If set, \ escapes characters when inside [...]. */
if ((obscure_syntax & RE_AWK_CLASS_HACK) && c == '\\')
{
PATFETCH(c1);
- b[c1 / BYTEWIDTH] |= 1 << (c1 % BYTEWIDTH);
+ SET_LIST_BIT (c1);
continue;
}
-
- if (c == ']' && p != p1 + 1) break;
- if (*p == '-' && p[1] != ']')
+ if (c == ']')
+ {
+ if (p == p1 + 1)
+ {
+ /* If this is an empty bracket expression. */
+ if ((obscure_syntax & RE_NO_EMPTY_BRACKETS)
+ && p == pend)
+ goto invalid_pattern;
+ }
+ else
+ /* Stop if this isn't merely a ] inside a bracket
+ expression, but rather the end of a bracket
+ expression. */
+ break;
+ }
+ /* Get a range. */
+ if (p[0] == '-' && p[1] != ']')
{
- PATFETCH (c1);
- PATFETCH (c1);
- while (c <= c1)
- b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH), c++;
- }
+ PATFETCH (c1);
+ /* Don't translate the range bounds while fetching them. */
+ PATFETCH_RAW (c1);
+
+ if ((obscure_syntax & RE_NO_EMPTY_RANGES) && c > c1)
+ goto invalid_pattern;
+
+ if ((obscure_syntax & RE_NO_HYPHEN_RANGE_END)
+ && c1 == '-' && *p != ']')
+ goto invalid_pattern;
+
+ while (c <= c1)
+ {
+ /* Translate each char that's in the range. */
+ if (translate)
+ SET_LIST_BIT (translate[c]);
+ else
+ SET_LIST_BIT (c);
+ c++;
+ }
+ }
+ else if ((obscure_syntax & RE_CHAR_CLASSES)
+ && c == '[' && p[0] == ':')
+ {
+ /* Longest valid character class word has six characters. */
+ char str[CHAR_CLASS_MAX_LENGTH];
+ PATFETCH (c);
+ c1 = 0;
+ /* If no ] at end. */
+ if (p == pend)
+ goto invalid_pattern;
+ while (1)
+ {
+ /* Don't translate the ``character class'' characters. */
+ PATFETCH_RAW (c);
+ if (c == ':' || c == ']' || p == pend
+ || c1 == CHAR_CLASS_MAX_LENGTH)
+ break;
+ str[c1++] = c;
+ }
+ str[c1] = '\0';
+ if (p == pend
+ || c == ']' /* End of the bracket expression. */
+ || p[0] != ']'
+ || p + 1 == pend
+ || (strcmp (str, "alpha") != 0
+ && strcmp (str, "upper") != 0
+ && strcmp (str, "lower") != 0
+ && strcmp (str, "digit") != 0
+ && strcmp (str, "alnum") != 0
+ && strcmp (str, "xdigit") != 0
+ && strcmp (str, "space") != 0
+ && strcmp (str, "print") != 0
+ && strcmp (str, "punct") != 0
+ && strcmp (str, "graph") != 0
+ && strcmp (str, "cntrl") != 0))
+ {
+ /* Undo the ending character, the letters, and leave
+ the leading : and [ (but set bits for them). */
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ SET_LIST_BIT ('[');
+ SET_LIST_BIT (':');
+ }
+ else
+ {
+ /* The ] at the end of the character class. */
+ PATFETCH (c);
+ if (c != ']')
+ goto invalid_pattern;
+ for (c = 0; c < (1 << BYTEWIDTH); c++)
+ {
+ if ((strcmp (str, "alpha") == 0 && isalpha (c))
+ || (strcmp (str, "upper") == 0 && isupper (c))
+ || (strcmp (str, "lower") == 0 && islower (c))
+ || (strcmp (str, "digit") == 0 && isdigit (c))
+ || (strcmp (str, "alnum") == 0 && isalnum (c))
+ || (strcmp (str, "xdigit") == 0 && isxdigit (c))
+ || (strcmp (str, "space") == 0 && isspace (c))
+ || (strcmp (str, "print") == 0 && isprint (c))
+ || (strcmp (str, "punct") == 0 && ispunct (c))
+ || (strcmp (str, "graph") == 0 && isgraph (c))
+ || (strcmp (str, "cntrl") == 0 && iscntrl (c)))
+ SET_LIST_BIT (c);
+ }
+ }
+ }
+ else if (translate)
+ SET_LIST_BIT (translate[c]);
else
- {
- b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH);
- }
+ SET_LIST_BIT (c);
}
- /* Discard any bitmap bytes that are all 0 at the end of the map.
- Decrement the map-length byte too. */
- while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
- b[-1]--;
- b += b[-1];
- break;
+
+ /* Discard any character set/class bitmap bytes that are all
+ 0 at the end of the map. Decrement the map-length byte too. */
+ while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
+ b[-1]--;
+ b += b[-1];
+ break;
case '(':
if (! (obscure_syntax & RE_NO_BK_PARENS))
@@ -594,18 +826,28 @@ re_compile_pattern (pattern, size, bufp)
else
goto handle_close;
- case '\n':
+ case '\n':
if (! (obscure_syntax & RE_NEWLINE_OR))
goto normal_char;
else
goto handle_bar;
case '|':
- if (! (obscure_syntax & RE_NO_BK_VBAR))
+ if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
+ && (! laststart || p == pend))
+ goto invalid_pattern;
+ else if (! (obscure_syntax & RE_NO_BK_VBAR))
goto normal_char;
else
goto handle_bar;
+ case '{':
+ if (! ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
+ && (obscure_syntax & RE_INTERVALS)))
+ goto normal_char;
+ else
+ goto handle_interval;
+
case '\\':
if (p == pend) goto invalid_pattern;
PATFETCH_RAW (c);
@@ -616,12 +858,15 @@ re_compile_pattern (pattern, size, bufp)
goto normal_backsl;
handle_open:
if (stackp == stacke) goto nesting_too_deep;
+
+ /* Laststart should point to the start_memory that we are about
+ to push (unless the pattern has RE_NREGS or more ('s). */
+ *stackp++ = b - bufp->buffer;
if (regnum < RE_NREGS)
{
- PATPUSH (start_memory);
- PATPUSH (regnum);
+ BUFPUSH (start_memory);
+ BUFPUSH (regnum);
}
- *stackp++ = b - bufp->buffer;
*stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
*stackp++ = regnum++;
*stackp++ = begalt - bufp->buffer;
@@ -640,83 +885,242 @@ re_compile_pattern (pattern, size, bufp)
store_jump (fixup_jump, jump, b);
if (stackp[-1] < RE_NREGS)
{
- PATPUSH (stop_memory);
- PATPUSH (stackp[-1]);
+ BUFPUSH (stop_memory);
+ BUFPUSH (stackp[-1]);
}
stackp -= 2;
- fixup_jump = 0;
- if (*stackp)
- fixup_jump = *stackp + bufp->buffer - 1;
- laststart = *--stackp + bufp->buffer;
+ fixup_jump = *stackp ? *stackp + bufp->buffer - 1 : 0;
+ laststart = *--stackp + bufp->buffer;
break;
case '|':
- if (obscure_syntax & RE_NO_BK_VBAR)
+ if ((obscure_syntax & RE_LIMITED_OPS)
+ || (obscure_syntax & RE_NO_BK_VBAR))
goto normal_backsl;
handle_bar:
- insert_jump (on_failure_jump, begalt, b + 6, b);
+ if (obscure_syntax & RE_LIMITED_OPS)
+ goto normal_char;
+ /* Insert before the previous alternative a jump which
+ jumps to this alternative if the former fails. */
+ GET_BUFFER_SPACE (6);
+ insert_jump (on_failure_jump, begalt, b + 6, b);
pending_exact = 0;
b += 3;
- if (fixup_jump)
+ /* The alternative before the previous alternative has a
+ jump after it which gets executed if it gets matched.
+ Adjust that jump so it will jump to the previous
+ alternative's analogous jump (put in below, which in
+ turn will jump to the next (if any) alternative's such
+ jump, etc.). The last such jump jumps to the correct
+ final destination. */
+ if (fixup_jump)
store_jump (fixup_jump, jump, b);
- fixup_jump = b;
- b += 3;
- laststart = 0;
+
+ /* Leave space for a jump after previous alternative---to be
+ filled in later. */
+ fixup_jump = b;
+ b += 3;
+
+ laststart = 0;
begalt = b;
break;
+ case '{':
+ if (! (obscure_syntax & RE_INTERVALS)
+ /* Let \{ be a literal. */
+ || ((obscure_syntax & RE_INTERVALS)
+ && (obscure_syntax & RE_NO_BK_CURLY_BRACES))
+ /* If it's the string "\{". */
+ || (p - 2 == pattern && p == pend))
+ goto normal_backsl;
+ handle_interval:
+ beg_interval = p - 1; /* The {. */
+ /* If there is no previous pattern, this isn't an interval. */
+ if (!laststart)
+ {
+ if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
+ goto invalid_pattern;
+ else
+ goto normal_backsl;
+ }
+ /* It also isn't an interval if not preceded by an re
+ matching a single character or subexpression, or if
+ the current type of intervals can't handle back
+ references and the previous thing is a back reference. */
+ if (! (*laststart == anychar
+ || *laststart == charset
+ || *laststart == charset_not
+ || *laststart == start_memory
+ || (*laststart == exactn && laststart[1] == 1)
+ || (! (obscure_syntax & RE_NO_BK_REFS)
+ && *laststart == duplicate)))
+ {
+ if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
+ goto normal_char;
+
+ /* Posix extended syntax is handled in previous
+ statement; this is for Posix basic syntax. */
+ if (obscure_syntax & RE_INTERVALS)
+ goto invalid_pattern;
+
+ goto normal_backsl;
+ }
+ lower_bound = -1; /* So can see if are set. */
+ upper_bound = -1;
+ GET_UNSIGNED_NUMBER (lower_bound);
+ if (c == ',')
+ {
+ GET_UNSIGNED_NUMBER (upper_bound);
+ if (upper_bound < 0)
+ upper_bound = RE_DUP_MAX;
+ }
+ if (upper_bound < 0)
+ upper_bound = lower_bound;
+ if (! (obscure_syntax & RE_NO_BK_CURLY_BRACES))
+ {
+ if (c != '\\')
+ goto invalid_pattern;
+ PATFETCH (c);
+ }
+ if (c != '}' || lower_bound < 0 || upper_bound > RE_DUP_MAX
+ || lower_bound > upper_bound
+ || ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
+ && p != pend && *p == '{'))
+ {
+ if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
+ goto unfetch_interval;
+ else
+ goto invalid_pattern;
+ }
+
+ /* If upper_bound is zero, don't want to succeed at all;
+ jump from laststart to b + 3, which will be the end of
+ the buffer after this jump is inserted. */
+
+ if (upper_bound == 0)
+ {
+ GET_BUFFER_SPACE (3);
+ insert_jump (jump, laststart, b + 3, b);
+ b += 3;
+ }
+
+ /* Otherwise, after lower_bound number of succeeds, jump
+ to after the jump_n which will be inserted at the end
+ of the buffer, and insert that jump_n. */
+ else
+ { /* Set to 5 if only one repetition is allowed and
+ hence no jump_n is inserted at the current end of
+ the buffer; then only space for the succeed_n is
+ needed. Otherwise, need space for both the
+ succeed_n and the jump_n. */
+
+ unsigned slots_needed = upper_bound == 1 ? 5 : 10;
+
+ GET_BUFFER_SPACE (slots_needed);
+ /* Initialize the succeed_n to n, even though it will
+ be set by its attendant set_number_at, because
+ re_compile_fastmap will need to know it. Jump to
+ what the end of buffer will be after inserting
+ this succeed_n and possibly appending a jump_n. */
+ insert_jump_n (succeed_n, laststart, b + slots_needed,
+ b, lower_bound);
+ b += 5; /* Just increment for the succeed_n here. */
+
+ /* More than one repetition is allowed, so put in at
+ the end of the buffer a backward jump from b to the
+ succeed_n we put in above. By the time we've gotten
+ to this jump when matching, we'll have matched once
+ already, so jump back only upper_bound - 1 times. */
+
+ if (upper_bound > 1)
+ {
+ store_jump_n (b, jump_n, laststart, upper_bound - 1);
+ b += 5;
+ /* When hit this when matching, reset the
+ preceding jump_n's n to upper_bound - 1. */
+ BUFPUSH (set_number_at);
+ GET_BUFFER_SPACE (2);
+ STORE_NUMBER_AND_INCR (b, -5);
+ STORE_NUMBER_AND_INCR (b, upper_bound - 1);
+ }
+ /* When hit this when matching, set the succeed_n's n. */
+ GET_BUFFER_SPACE (5);
+ insert_op_2 (set_number_at, laststart, b, 5, lower_bound);
+ b += 5;
+ }
+ pending_exact = 0;
+ beg_interval = 0;
+ break;
+
+
+ unfetch_interval:
+ /* If an invalid interval, match the characters as literals. */
+ if (beg_interval)
+ p = beg_interval;
+ else
+ {
+ fprintf (stderr,
+ "regex: no interval beginning to which to backtrack.\n");
+ exit (1);
+ }
+
+ beg_interval = 0;
+ PATFETCH (c); /* normal_char expects char in `c'. */
+ goto normal_char;
+ break;
+
#ifdef emacs
case '=':
- PATPUSH (at_dot);
+ BUFPUSH (at_dot);
break;
case 's':
laststart = b;
- PATPUSH (syntaxspec);
+ BUFPUSH (syntaxspec);
PATFETCH (c);
- PATPUSH (syntax_spec_code[c]);
+ BUFPUSH (syntax_spec_code[c]);
break;
case 'S':
laststart = b;
- PATPUSH (notsyntaxspec);
+ BUFPUSH (notsyntaxspec);
PATFETCH (c);
- PATPUSH (syntax_spec_code[c]);
+ BUFPUSH (syntax_spec_code[c]);
break;
#endif /* emacs */
case 'w':
laststart = b;
- PATPUSH (wordchar);
+ BUFPUSH (wordchar);
break;
case 'W':
laststart = b;
- PATPUSH (notwordchar);
+ BUFPUSH (notwordchar);
break;
case '<':
- PATPUSH (wordbeg);
+ BUFPUSH (wordbeg);
break;
case '>':
- PATPUSH (wordend);
+ BUFPUSH (wordend);
break;
case 'b':
- PATPUSH (wordbound);
+ BUFPUSH (wordbound);
break;
case 'B':
- PATPUSH (notwordbound);
+ BUFPUSH (notwordbound);
break;
case '`':
- PATPUSH (begbuf);
+ BUFPUSH (begbuf);
break;
case '\'':
- PATPUSH (endbuf);
+ BUFPUSH (endbuf);
break;
case '1':
@@ -728,23 +1132,34 @@ re_compile_pattern (pattern, size, bufp)
case '7':
case '8':
case '9':
- c1 = c - '0';
+ if (obscure_syntax & RE_NO_BK_REFS)
+ goto normal_char;
+ c1 = c - '0';
if (c1 >= regnum)
- goto normal_char;
- for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
+ {
+ if (obscure_syntax & RE_NO_EMPTY_BK_REF)
+ goto invalid_pattern;
+ else
+ goto normal_char;
+ }
+ /* Can't back reference to a subexpression if inside of it. */
+ for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
if (*stackt == c1)
goto normal_char;
laststart = b;
- PATPUSH (duplicate);
- PATPUSH (c1);
+ BUFPUSH (duplicate);
+ BUFPUSH (c1);
break;
case '+':
case '?':
if (obscure_syntax & RE_BK_PLUS_QM)
goto handle_plus;
+ else
+ goto normal_backsl;
+ break;
- default:
+ default:
normal_backsl:
/* You might think it would be useful for \ to mean
not to translate; but if we don't translate it
@@ -755,19 +1170,23 @@ re_compile_pattern (pattern, size, bufp)
break;
default:
- normal_char:
+ normal_char: /* Expects the character in `c'. */
if (!pending_exact || pending_exact + *pending_exact + 1 != b
|| *pending_exact == 0177 || *p == '*' || *p == '^'
|| ((obscure_syntax & RE_BK_PLUS_QM)
? *p == '\\' && (p[1] == '+' || p[1] == '?')
- : (*p == '+' || *p == '?')))
+ : (*p == '+' || *p == '?'))
+ || ((obscure_syntax & RE_INTERVALS)
+ && ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
+ ? *p == '{'
+ : (p[0] == '\\' && p[1] == '{'))))
{
laststart = b;
- PATPUSH (exactn);
+ BUFPUSH (exactn);
pending_exact = b;
- PATPUSH (0);
+ BUFPUSH (0);
}
- PATPUSH (c);
+ BUFPUSH (c);
(*pending_exact)++;
}
}
@@ -802,46 +1221,137 @@ re_compile_pattern (pattern, size, bufp)
return "Memory exhausted";
}
-/* Store where `from' points a jump operation to jump to where `to' points.
- `opcode' is the opcode to store. */
-static int
+/* Store a jump of the form <OPCODE> <relative address>.
+ Store in the location FROM a jump operation to jump to relative
+ address FROM - TO. OPCODE is the opcode to store. */
+
+static void
store_jump (from, opcode, to)
char *from, *to;
+#ifndef MSDOS
char opcode;
+#else
+ int opcode;
+#endif /* MSDOS */
{
from[0] = opcode;
- from[1] = (to - (from + 3)) & 0377;
- from[2] = (to - (from + 3)) >> 8;
+ STORE_NUMBER(from + 1, to - (from + 3));
}
-/* Open up space at char FROM, and insert there a jump to TO.
- CURRENT_END gives te end of the storage no in use,
- so we know how much data to copy up.
- OP is the opcode of the jump to insert.
+
+/* Open up space before char FROM, and insert there a jump to TO.
+ CURRENT_END gives the end of the storage not in use, so we know
+ how much data to copy up. OP is the opcode of the jump to insert.
If you call this function, you must zero out pending_exact. */
-static int
+static void
insert_jump (op, from, to, current_end)
+#ifndef MSDOS
char op;
+#else
+ int op;
+#endif /* MSDOS */
char *from, *to, *current_end;
{
- register char *pto = current_end + 3;
- register char *pfrom = current_end;
- while (pfrom != from)
+ register char *pfrom = current_end; /* Copy from here... */
+ register char *pto = current_end + 3; /* ...to here. */
+
+ while (pfrom != from)
*--pto = *--pfrom;
store_jump (from, op, to);
}
+
+
+/* Store a jump of the form <opcode> <relative address> <n> .
+
+ Store in the location FROM a jump operation to jump to relative
+ address FROM - TO. OPCODE is the opcode to store, N is a number the
+ jump uses, say, to decide how many times to jump.
+
+ If you call this function, you must zero out pending_exact. */
+
+static void
+store_jump_n (from, opcode, to, n)
+ char *from, *to;
+#ifndef MSDOS
+ char opcode;
+#else
+ int opcode;
+#endif /* MSDOS */
+ unsigned n;
+{
+ from[0] = opcode;
+ STORE_NUMBER (from + 1, to - (from + 3));
+ STORE_NUMBER (from + 3, n);
+}
+
+
+/* Similar to insert_jump, but handles a jump which needs an extra
+ number to handle minimum and maximum cases. Open up space at
+ location FROM, and insert there a jump to TO. CURRENT_END gives the
+ end of the storage in use, so we know how much data to copy up. OP is
+ the opcode of the jump to insert.
+
+ If you call this function, you must zero out pending_exact. */
+
+static void
+insert_jump_n (op, from, to, current_end, n)
+#ifndef MSDOS
+ char op;
+#else
+ int op;
+#endif /* MSDOS */
+ char *from, *to, *current_end;
+ unsigned n;
+{
+ register char *pfrom = current_end; /* Copy from here... */
+ register char *pto = current_end + 5; /* ...to here. */
+
+ while (pfrom != from)
+ *--pto = *--pfrom;
+ store_jump_n (from, op, to, n);
+}
+
+
+/* Open up space at location THERE, and insert operation OP followed by
+ NUM_1 and NUM_2. CURRENT_END gives the end of the storage in use, so
+ we know how much data to copy up.
+
+ If you call this function, you must zero out pending_exact. */
+
+static void
+insert_op_2 (op, there, current_end, num_1, num_2)
+#ifndef MSDOS
+ char op;
+#else
+ int op;
+#endif /* MSDOS */
+ char *there, *current_end;
+ int num_1, num_2;
+{
+ register char *pfrom = current_end; /* Copy from here... */
+ register char *pto = current_end + 5; /* ...to here. */
+
+ while (pfrom != there)
+ *--pto = *--pfrom;
+
+ there[0] = op;
+ STORE_NUMBER (there + 1, num_1);
+ STORE_NUMBER (there + 3, num_2);
+}
+
+
-/* Given a pattern, compute a fastmap from it.
- The fastmap records which of the (1 << BYTEWIDTH) possible characters
- can start a string that matches the pattern.
- This fastmap is used by re_search to skip quickly over totally implausible text.
+/* Given a pattern, compute a fastmap from it. The fastmap records
+ which of the (1 << BYTEWIDTH) possible characters can start a string
+ that matches the pattern. This fastmap is used by re_search to skip
+ quickly over totally implausible text.
- The caller must supply the address of a (1 << BYTEWIDTH)-byte data area
- as bufp->fastmap.
- The other components of bufp describe the pattern to be used. */
+ The caller must supply the address of a (1 << BYTEWIDTH)-byte data
+ area as bufp->fastmap.
+ The other components of bufp describe the pattern to be used. */
void
re_compile_fastmap (bufp)
@@ -854,16 +1364,26 @@ re_compile_fastmap (bufp)
register unsigned char *pend = pattern + size;
register int j, k;
unsigned char *translate = (unsigned char *) bufp->translate;
+ unsigned is_a_succeed_n;
+#ifndef NO_ALLOCA
unsigned char *stackb[NFAILURES];
unsigned char **stackp = stackb;
- bzero (fastmap, (1 << BYTEWIDTH));
+#else
+ unsigned char **stackb;
+ unsigned char **stackp;
+ stackb = (unsigned char **) malloc (NFAILURES * sizeof (unsigned char *));
+ stackp = stackb;
+
+#endif /* NO_ALLOCA */
+ memset (fastmap, 0, (1 << BYTEWIDTH));
bufp->fastmap_accurate = 1;
bufp->can_be_null = 0;
while (p)
{
+ is_a_succeed_n = 0;
if (p == pend)
{
bufp->can_be_null = 1;
@@ -892,51 +1412,70 @@ re_compile_fastmap (bufp)
case notwordbound:
case wordbeg:
case wordend:
- continue;
+ continue;
case endline:
if (translate)
fastmap[translate['\n']] = 1;
else
fastmap['\n'] = 1;
+
if (bufp->can_be_null != 1)
bufp->can_be_null = 2;
break;
- case finalize_jump:
+ case jump_n:
+ case finalize_jump:
case maybe_finalize_jump:
case jump:
case dummy_failure_jump:
- bufp->can_be_null = 1;
- j = *p++ & 0377;
- j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
- p += j + 1; /* The 1 compensates for missing ++ above */
+ EXTRACT_NUMBER_AND_INCR (j, p);
+ p += j;
if (j > 0)
continue;
- /* Jump backward reached implies we just went through
+ /* Jump backward reached implies we just went through
the body of a loop and matched nothing.
Opcode jumped to should be an on_failure_jump.
Just treat it like an ordinary jump.
For a * loop, it has pushed its failure point already;
- if so, discard that as redundant. */
- if ((enum regexpcode) *p != on_failure_jump)
+ If so, discard that as redundant. */
+
+ if ((enum regexpcode) *p != on_failure_jump
+ && (enum regexpcode) *p != succeed_n)
continue;
- p++;
- j = *p++ & 0377;
- j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
- p += j + 1; /* The 1 compensates for missing ++ above */
- if (stackp != stackb && *stackp == p)
- stackp--;
- continue;
+ p++;
+ EXTRACT_NUMBER_AND_INCR (j, p);
+ p += j;
+ if (stackp != stackb && *stackp == p)
+ stackp--;
+ continue;
- case on_failure_jump:
- j = *p++ & 0377;
- j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
- p++;
- *++stackp = p + j;
+ case on_failure_jump:
+ handle_on_failure_jump:
+ EXTRACT_NUMBER_AND_INCR (j, p);
+ *++stackp = p + j;
+ if (is_a_succeed_n)
+ EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
continue;
- case start_memory:
+ case succeed_n:
+ is_a_succeed_n = 1;
+ /* Get to the number of times to succeed. */
+ p += 2;
+ /* Increment p past the n for when k != 0. */
+ EXTRACT_NUMBER_AND_INCR (k, p);
+ if (k == 0)
+ {
+ p -= 4;
+ goto handle_on_failure_jump;
+ }
+ continue;
+
+ case set_number_at:
+ p += 4;
+ continue;
+
+ case start_memory:
case stop_memory:
p++;
continue;
@@ -949,7 +1488,9 @@ re_compile_fastmap (bufp)
if (j != '\n')
fastmap[j] = 1;
if (bufp->can_be_null)
- return;
+ {
+ FREE_AND_RETURN_VOID(stackb);
+ }
/* Don't return; check the alternative paths
so we can set can_be_null if appropriate. */
break;
@@ -980,7 +1521,7 @@ re_compile_fastmap (bufp)
if (SYNTAX (j) != (enum syntaxcode) k)
fastmap[j] = 1;
break;
-#endif /* emacs */
+#endif /* not emacs */
case charset:
for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
@@ -1012,17 +1553,22 @@ re_compile_fastmap (bufp)
break;
}
- /* Get here means we have successfully found the possible starting characters
- of one path of the pattern. We need not follow this path any farther.
- Instead, look at the next alternative remembered in the stack. */
- if (stackp != stackb)
+ /* Get here means we have successfully found the possible starting
+ characters of one path of the pattern. We need not follow this
+ path any farther. Instead, look at the next alternative
+ remembered in the stack. */
+ if (stackp != stackb)
p = *stackp--;
else
break;
}
+ FREE_AND_RETURN_VOID(stackb);
}
+
+
-/* Like re_search_2, below, but only one string is specified. */
+/* Like re_search_2, below, but only one string is specified, and
+ doesn't let you say where to stop matching. */
int
re_search (pbufp, string, size, startpos, range, regs)
@@ -1031,23 +1577,29 @@ re_search (pbufp, string, size, startpos, range, regs)
int size, startpos, range;
struct re_registers *regs;
{
- return re_search_2 (pbufp, 0, 0, string, size, startpos, range, regs, size);
+ return re_search_2 (pbufp, (char *) 0, 0, string, size, startpos, range,
+ regs, size);
}
-/* Like re_match_2 but tries first a match starting at index STARTPOS,
- then at STARTPOS + 1, and so on.
- RANGE is the number of places to try before giving up.
- If RANGE is negative, the starting positions tried are
- STARTPOS, STARTPOS - 1, etc.
- It is up to the caller to make sure that range is not so large
- as to take the starting position outside of the input strings.
-The value returned is the position at which the match was found,
- or -1 if no match was found,
- or -2 if error (such as failure stack overflow). */
+/* Using the compiled pattern in PBUFP->buffer, first tries to match the
+ virtual concatenation of STRING1 and STRING2, starting first at index
+ STARTPOS, then at STARTPOS + 1, and so on. RANGE is the number of
+ places to try before giving up. If RANGE is negative, it searches
+ backwards, i.e., the starting positions tried are STARTPOS, STARTPOS
+ - 1, etc. STRING1 and STRING2 are of SIZE1 and SIZE2, respectively.
+ In REGS, return the indices of the virtual concatenation of STRING1
+ and STRING2 that matched the entire PBUFP->buffer and its contained
+ subexpressions. Do not consider matching one past the index MSTOP in
+ the virtual concatenation of STRING1 and STRING2.
+
+ The value returned is the position in the strings at which the match
+ was found, or -1 if no match was found, or -2 if error (such as
+ failure stack overflow). */
int
-re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop)
+re_search_2 (pbufp, string1, size1, string2, size2, startpos, range,
+ regs, mstop)
struct re_pattern_buffer *pbufp;
char *string1, *string2;
int size1, size2;
@@ -1058,15 +1610,27 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop
{
register char *fastmap = pbufp->fastmap;
register unsigned char *translate = (unsigned char *) pbufp->translate;
- int total = size1 + size2;
+ int total_size = size1 + size2;
+ int endpos = startpos + range;
int val;
- /* Update the fastmap now if not correct already */
+ /* Check for out-of-range starting position. */
+ if (startpos < 0 || startpos > total_size)
+ return -1;
+
+ /* Fix up range if it would eventually take startpos outside of the
+ virtual concatenation of string1 and string2. */
+ if (endpos < -1)
+ range = -1 - startpos;
+ else if (endpos > total_size)
+ range = total_size - startpos;
+
+ /* Update the fastmap now if not correct already. */
if (fastmap && !pbufp->fastmap_accurate)
re_compile_fastmap (pbufp);
- /* Don't waste time in a long search for a pattern
- that says it is anchored. */
+ /* If the search isn't to be a backwards one, don't waste time in a
+ long search for a pattern that says it is anchored. */
if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
&& range > 0)
{
@@ -1077,16 +1641,16 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop
}
while (1)
- {
- /* If a fastmap is supplied, skip quickly over characters
- that cannot possibly be the start of a match.
- Note, however, that if the pattern can possibly match
- the null string, we must test it at each starting point
- so that we take the first null string we get. */
-
- if (fastmap && startpos < total && pbufp->can_be_null != 1)
+ {
+ /* If a fastmap is supplied, skip quickly over characters that
+ cannot possibly be the start of a match. Note, however, that
+ if the pattern can possibly match the null string, we must
+ test it at each starting point so that we take the first null
+ string we get. */
+
+ if (fastmap && startpos < total_size && pbufp->can_be_null != 1)
{
- if (range > 0)
+ if (range > 0) /* Searching forwards. */
{
register int lim = 0;
register unsigned char *p;
@@ -1097,55 +1661,64 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop
p = ((unsigned char *)
&(startpos >= size1 ? string2 - size1 : string1)[startpos]);
- if (translate)
- {
- while (range > lim && !fastmap[translate[*p++]])
+ while (range > lim && !fastmap[translate
+ ? translate[*p++]
+ : *p++])
range--;
- }
- else
- {
- while (range > lim && !fastmap[*p++])
- range--;
- }
startpos += irange - range;
}
- else
+ else /* Searching backwards. */
{
register unsigned char c;
- if (startpos >= size1)
+
+ if (string1 == 0 || startpos >= size1)
c = string2[startpos - size1];
- else
+ else
c = string1[startpos];
- c &= 0xff;
+
+ c &= 0xff;
if (translate ? !fastmap[translate[c]] : !fastmap[c])
goto advance;
}
}
- if (range >= 0 && startpos == total
+ if (range >= 0 && startpos == total_size
&& fastmap && pbufp->can_be_null == 0)
return -1;
- val = re_match_2 (pbufp, string1, size1, string2, size2, startpos, regs, mstop);
- if (0 <= val)
- {
- if (val == -2)
- return -2;
- return startpos;
- }
+ val = re_match_2 (pbufp, string1, size1, string2, size2, startpos,
+ regs, mstop);
+ if (val >= 0)
+ return startpos;
+ if (val == -2)
+ return -2;
+#ifndef NO_ALLOCA
#ifdef C_ALLOCA
alloca (0);
#endif /* C_ALLOCA */
+#endif /* NO_ALLOCA */
advance:
- if (!range) break;
- if (range > 0) range--, startpos++; else range++, startpos--;
+ if (!range)
+ break;
+ else if (range > 0)
+ {
+ range--;
+ startpos++;
+ }
+ else
+ {
+ range++;
+ startpos--;
+ }
}
return -1;
}
+
+
-#ifndef emacs /* emacs never uses this */
+#ifndef emacs /* emacs never uses this. */
int
re_match (pbufp, string, size, pos, regs)
struct re_pattern_buffer *pbufp;
@@ -1153,82 +1726,295 @@ re_match (pbufp, string, size, pos, regs)
int size, pos;
struct re_registers *regs;
{
- return re_match_2 (pbufp, 0, 0, string, size, pos, regs, size);
+ return re_match_2 (pbufp, (char *) 0, 0, string, size, pos, regs, size);
}
-#endif /* emacs */
+#endif /* not emacs */
+
-/* Maximum size of failure stack. Beyond this, overflow is an error. */
+/* The following are used for re_match_2, defined below: */
+/* Roughly the maximum number of failure points on the stack. Would be
+ exactly that if always pushed MAX_NUM_FAILURE_ITEMS each time we failed. */
+
int re_max_failures = 2000;
-static int bcmp_translate();
-/* Match the pattern described by PBUFP
- against data which is the virtual concatenation of STRING1 and STRING2.
- SIZE1 and SIZE2 are the sizes of the two data strings.
- Start the match at position POS.
- Do not consider matching past the position MSTOP.
+/* Routine used by re_match_2. */
+static int memcmp_translate ();
+
+
+/* Structure and accessing macros used in re_match_2: */
+
+struct register_info
+{
+ unsigned is_active : 1;
+ unsigned matched_something : 1;
+};
+
+#define IS_ACTIVE(R) ((R).is_active)
+#define MATCHED_SOMETHING(R) ((R).matched_something)
+
+
+/* Macros used by re_match_2: */
+
+
+/* I.e., regstart, regend, and reg_info. */
+
+#define NUM_REG_ITEMS 3
+
+/* We push at most this many things on the stack whenever we
+ fail. The `+ 2' refers to PATTERN_PLACE and STRING_PLACE, which are
+ arguments to the PUSH_FAILURE_POINT macro. */
+
+#define MAX_NUM_FAILURE_ITEMS (RE_NREGS * NUM_REG_ITEMS + 2)
+
+
+/* We push this many things on the stack whenever we fail. */
+
+#define NUM_FAILURE_ITEMS (last_used_reg * NUM_REG_ITEMS + 2)
+
+
+/* This pushes most of the information about the current state we will want
+ if we ever fail back to it. */
+
+#define PUSH_FAILURE_POINT(pattern_place, string_place) \
+ { \
+ short last_used_reg, this_reg; \
+ \
+ /* Find out how many registers are active or have been matched. \
+ (Aside from register zero, which is only set at the end.) */ \
+ for (last_used_reg = RE_NREGS - 1; last_used_reg > 0; last_used_reg--)\
+ if (regstart[last_used_reg] != (unsigned char *) -1) \
+ break; \
+ \
+ if (stacke - stackp < NUM_FAILURE_ITEMS) \
+ { \
+ unsigned char **stackx; \
+ unsigned int len = stacke - stackb; \
+ if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \
+ { \
+ FREE_AND_RETURN(stackb,(-2)); \
+ } \
+ \
+ /* Roughly double the size of the stack. */ \
+ stackx = DOUBLE_STACK(stackx,stackb,len); \
+ /* Rearrange the pointers. */ \
+ stackp = stackx + (stackp - stackb); \
+ stackb = stackx; \
+ stacke = stackb + 2 * len; \
+ } \
+ \
+ /* Now push the info for each of those registers. */ \
+ for (this_reg = 1; this_reg <= last_used_reg; this_reg++) \
+ { \
+ *stackp++ = regstart[this_reg]; \
+ *stackp++ = regend[this_reg]; \
+ *stackp++ = (unsigned char *) &reg_info[this_reg]; \
+ } \
+ \
+ /* Push how many registers we saved. */ \
+ *stackp++ = (unsigned char *) last_used_reg; \
+ \
+ *stackp++ = pattern_place; \
+ *stackp++ = string_place; \
+ }
+
+
+/* This pops what PUSH_FAILURE_POINT pushes. */
+
+#define POP_FAILURE_POINT() \
+ { \
+ int temp; \
+ stackp -= 2; /* Remove failure points. */ \
+ temp = (int) *--stackp; /* How many regs pushed. */ \
+ temp *= NUM_REG_ITEMS; /* How much to take off the stack. */ \
+ stackp -= temp; /* Remove the register info. */ \
+ }
+
+
+#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
+
+/* Is true if there is a first string and if PTR is pointing anywhere
+ inside it or just past the end. */
+
+#define IS_IN_FIRST_STRING(ptr) \
+ (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
+
+/* Call before fetching a character with *d. This switches over to
+ string2 if necessary. */
+
+#define PREFETCH \
+ while (d == dend) \
+ { \
+ /* end of string2 => fail. */ \
+ if (dend == end_match_2) \
+ goto fail; \
+ /* end of string1 => advance to string2. */ \
+ d = string2; \
+ dend = end_match_2; \
+ }
+
+
+/* Call this when have matched something; it sets `matched' flags for the
+ registers corresponding to the subexpressions of which we currently
+ are inside. */
+#define SET_REGS_MATCHED \
+ { unsigned this_reg; \
+ for (this_reg = 0; this_reg < RE_NREGS; this_reg++) \
+ { \
+ if (IS_ACTIVE(reg_info[this_reg])) \
+ MATCHED_SOMETHING(reg_info[this_reg]) = 1; \
+ else \
+ MATCHED_SOMETHING(reg_info[this_reg]) = 0; \
+ } \
+ }
+
+/* Test if at very beginning or at very end of the virtual concatenation
+ of string1 and string2. If there is only one string, we've put it in
+ string2. */
+
+#define AT_STRINGS_BEG (d == (size1 ? string1 : string2) || !size2)
+#define AT_STRINGS_END (d == end2)
+
+#define AT_WORD_BOUNDARY \
+ (AT_STRINGS_BEG || AT_STRINGS_END || IS_A_LETTER (d - 1) != IS_A_LETTER (d))
+
+/* We have two special cases to check for:
+ 1) if we're past the end of string1, we have to look at the first
+ character in string2;
+ 2) if we're before the beginning of string2, we have to look at the
+ last character in string1; we assume there is a string1, so use
+ this in conjunction with AT_STRINGS_BEG. */
+#define IS_A_LETTER(d) \
+ (SYNTAX ((d) == end1 ? *string2 : (d) == string2 - 1 ? *(end1 - 1) : *(d))\
+ == Sword)
+
+
+/* Match the pattern described by PBUFP against the virtual
+ concatenation of STRING1 and STRING2, which are of SIZE1 and SIZE2,
+ respectively. Start the match at index POS in the virtual
+ concatenation of STRING1 and STRING2. In REGS, return the indices of
+ the virtual concatenation of STRING1 and STRING2 that matched the
+ entire PBUFP->buffer and its contained subexpressions. Do not
+ consider matching one past the index MSTOP in the virtual
+ concatenation of STRING1 and STRING2.
If pbufp->fastmap is nonzero, then it had better be up to date.
The reason that the data to match are specified as two components
- which are to be regarded as concatenated
- is so this function can be used directly on the contents of an Emacs buffer.
+ which are to be regarded as concatenated is so this function can be
+ used directly on the contents of an Emacs buffer.
- -1 is returned if there is no match. -2 is returned if there is
- an error (such as match stack overflow). Otherwise the value is the length
- of the substring which was matched. */
+ -1 is returned if there is no match. -2 is returned if there is an
+ error (such as match stack overflow). Otherwise the value is the
+ length of the substring which was matched. */
int
-re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
+re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
struct re_pattern_buffer *pbufp;
- unsigned char *string1, *string2;
+ char *string1_arg, *string2_arg;
int size1, size2;
int pos;
struct re_registers *regs;
int mstop;
{
register unsigned char *p = (unsigned char *) pbufp->buffer;
+
+ /* Pointer to beyond end of buffer. */
register unsigned char *pend = p + pbufp->used;
- /* End of first string */
- unsigned char *end1;
- /* End of second string */
- unsigned char *end2;
- /* Pointer just past last char to consider matching */
+
+ unsigned char *string1 = (unsigned char *) string1_arg;
+ unsigned char *string2 = (unsigned char *) string2_arg;
+ unsigned char *end1; /* Just past end of first string. */
+ unsigned char *end2; /* Just past end of second string. */
+
+ /* Pointers into string1 and string2, just past the last characters in
+ each to consider matching. */
unsigned char *end_match_1, *end_match_2;
+
register unsigned char *d, *dend;
- register int mcnt;
+ register int mcnt; /* Multipurpose. */
unsigned char *translate = (unsigned char *) pbufp->translate;
-
- /* Failure point stack. Each place that can handle a failure further down the line
- pushes a failure point on this stack. It consists of two char *'s.
- The first one pushed is where to resume scanning the pattern;
- the second pushed is where to resume scanning the strings.
- If the latter is zero, the failure point is a "dummy".
- If a failure happens and the innermost failure point is dormant,
- it discards that failure point and tries the next one. */
-
- unsigned char *initial_stack[2 * NFAILURES];
- unsigned char **stackb = initial_stack;
- unsigned char **stackp = stackb, **stacke = &stackb[2 * NFAILURES];
-
- /* Information on the "contents" of registers.
- These are pointers into the input strings; they record
- just what was matched (on this attempt) by some part of the pattern.
- The start_memory command stores the start of a register's contents
- and the stop_memory command stores the end.
-
- At that point, regstart[regnum] points to the first character in the register,
- regend[regnum] points to the first character beyond the end of the register,
- regstart_seg1[regnum] is true iff regstart[regnum] points into string1,
- and regend_seg1[regnum] is true iff regend[regnum] points into string1. */
-
+ unsigned is_a_jump_n = 0;
+
+ /* Failure point stack. Each place that can handle a failure further
+ down the line pushes a failure point on this stack. It consists of
+ restart, regend, and reg_info for all registers corresponding to the
+ subexpressions we're currently inside, plus the number of such
+ registers, and, finally, two char *'s. The first char * is where to
+ resume scanning the pattern; the second one is where to resume
+ scanning the strings. If the latter is zero, the failure point is a
+ ``dummy''; if a failure happens and the failure point is a dummy, it
+ gets discarded and the next next one is tried. */
+
+#ifndef NO_ALLOCA
+ unsigned char *initial_stack[MAX_NUM_FAILURE_ITEMS * NFAILURES];
+#endif
+ unsigned char **stackb;
+ unsigned char **stackp;
+ unsigned char **stacke;
+
+
+ /* Information on the contents of registers. These are pointers into
+ the input strings; they record just what was matched (on this
+ attempt) by a subexpression part of the pattern, that is, the
+ regnum-th regstart pointer points to where in the pattern we began
+ matching and the regnum-th regend points to right after where we
+ stopped matching the regnum-th subexpression. (The zeroth register
+ keeps track of what the whole pattern matches.) */
+
unsigned char *regstart[RE_NREGS];
unsigned char *regend[RE_NREGS];
- unsigned char regstart_seg1[RE_NREGS], regend_seg1[RE_NREGS];
+
+ /* The is_active field of reg_info helps us keep track of which (possibly
+ nested) subexpressions we are currently in. The matched_something
+ field of reg_info[reg_num] helps us tell whether or not we have
+ matched any of the pattern so far this time through the reg_num-th
+ subexpression. These two fields get reset each time through any
+ loop their register is in. */
+
+ struct register_info reg_info[RE_NREGS];
+
+
+ /* The following record the register info as found in the above
+ variables when we find a match better than any we've seen before.
+ This happens as we backtrack through the failure points, which in
+ turn happens only if we have not yet matched the entire string. */
+
+ unsigned best_regs_set = 0;
+ unsigned char *best_regstart[RE_NREGS];
+ unsigned char *best_regend[RE_NREGS];
+
+ /* Initialize the stack. */
+#ifdef NO_ALLOCA
+ stackb = (unsigned char **) malloc (MAX_NUM_FAILURE_ITEMS * NFAILURES * sizeof (char *));
+#else
+ stackb = initial_stack;
+#endif
+ stackp = stackb;
+ stacke = &stackb[MAX_NUM_FAILURE_ITEMS * NFAILURES];
+
+#ifdef DEBUG_REGEX
+ fprintf (stderr, "Entering re_match_2(%s%s)\n", string1_arg, string2_arg);
+#endif
+
+ /* Initialize subexpression text positions to -1 to mark ones that no
+ \( or ( and \) or ) has been seen for. Also set all registers to
+ inactive and mark them as not having matched anything or ever
+ failed. */
+ for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
+ {
+ regstart[mcnt] = regend[mcnt] = (unsigned char *) -1;
+ IS_ACTIVE (reg_info[mcnt]) = 0;
+ MATCHED_SOMETHING (reg_info[mcnt]) = 0;
+ }
+
+ if (regs)
+ for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
+ regs->start[mcnt] = regs->end[mcnt] = -1;
/* Set up pointers to ends of strings.
Don't allow the second string to be empty unless both are empty. */
- if (!size2)
+ if (size2 == 0)
{
string2 = string1;
size2 = size1;
@@ -1238,7 +2024,7 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
end1 = string1 + size1;
end2 = string2 + size2;
- /* Compute where to stop matching, within the two strings */
+ /* Compute where to stop matching, within the two strings. */
if (mstop <= size1)
{
end_match_1 = string1 + mstop;
@@ -1250,50 +2036,87 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
end_match_2 = string2 + mstop - size1;
}
- /* Initialize \) text positions to -1
- to mark ones that no \( or \) has been seen for. */
-
- for (mcnt = 0; mcnt < sizeof (regend) / sizeof (*regend); mcnt++)
- regend[mcnt] = (unsigned char *) -1;
-
- /* `p' scans through the pattern as `d' scans through the data.
- `dend' is the end of the input string that `d' points within.
- `d' is advanced into the following input string whenever necessary,
- but this happens before fetching;
- therefore, at the beginning of the loop,
- `d' can be pointing at the end of a string,
- but it cannot equal string2. */
+ /* `p' scans through the pattern as `d' scans through the data. `dend'
+ is the end of the input string that `d' points within. `d' is
+ advanced into the following input string whenever necessary, but
+ this happens before fetching; therefore, at the beginning of the
+ loop, `d' can be pointing at the end of a string, but it cannot
+ equal string2. */
- if (pos <= size1)
+ if (size1 != 0 && pos <= size1)
d = string1 + pos, dend = end_match_1;
else
d = string2 + pos - size1, dend = end_match_2;
-/* Write PREFETCH; just before fetching a character with *d. */
-#define PREFETCH \
- while (d == dend) \
- { if (dend == end_match_2) goto fail; /* end of string2 => failure */ \
- d = string2; /* end of string1 => advance to string2. */ \
- dend = end_match_2; }
- /* This loop loops over pattern commands.
- It exits by returning from the function if match is complete,
- or it drops through if match fails at this starting point in the input data. */
+ /* This loops over pattern commands. It exits by returning from the
+ function if match is complete, or it drops through if match fails
+ at this starting point in the input data. */
while (1)
{
+#ifdef DEBUG_REGEX
+ fprintf (stderr,
+ "regex loop(%d): matching 0x%02d\n",
+ p - (unsigned char *) pbufp->buffer,
+ *p);
+#endif
+ is_a_jump_n = 0;
+ /* End of pattern means we might have succeeded. */
if (p == pend)
- /* End of pattern means we have succeeded! */
{
- /* If caller wants register contents data back, convert it to indices */
+ /* If not end of string, try backtracking. Otherwise done. */
+ if (d != end_match_2)
+ {
+ if (stackp != stackb)
+ {
+ /* More failure points to try. */
+
+ unsigned in_same_string =
+ IS_IN_FIRST_STRING (best_regend[0])
+ == MATCHING_IN_FIRST_STRING;
+
+ /* If exceeds best match so far, save it. */
+ if (! best_regs_set
+ || (in_same_string && d > best_regend[0])
+ || (! in_same_string && ! MATCHING_IN_FIRST_STRING))
+ {
+ best_regs_set = 1;
+ best_regend[0] = d; /* Never use regstart[0]. */
+
+ for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
+ {
+ best_regstart[mcnt] = regstart[mcnt];
+ best_regend[mcnt] = regend[mcnt];
+ }
+ }
+ goto fail;
+ }
+ /* If no failure points, don't restore garbage. */
+ else if (best_regs_set)
+ {
+ restore_best_regs:
+ /* Restore best match. */
+ d = best_regend[0];
+
+ for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
+ {
+ regstart[mcnt] = best_regstart[mcnt];
+ regend[mcnt] = best_regend[mcnt];
+ }
+ }
+ }
+
+ /* If caller wants register contents data back, convert it
+ to indices. */
if (regs)
{
- regs->start[0] = pos;
- if (dend == end_match_1)
- regs->end[0] = d - string1;
- else
- regs->end[0] = d - string2 + size1;
- for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
+ regs->start[0] = pos;
+ if (MATCHING_IN_FIRST_STRING)
+ regs->end[0] = d - string1;
+ else
+ regs->end[0] = d - string2 + size1;
+ for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
{
if (regend[mcnt] == (unsigned char *) -1)
{
@@ -1301,23 +2124,24 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
regs->end[mcnt] = -1;
continue;
}
- if (regstart_seg1[mcnt])
+ if (IS_IN_FIRST_STRING (regstart[mcnt]))
regs->start[mcnt] = regstart[mcnt] - string1;
else
regs->start[mcnt] = regstart[mcnt] - string2 + size1;
- if (regend_seg1[mcnt])
+
+ if (IS_IN_FIRST_STRING (regend[mcnt]))
regs->end[mcnt] = regend[mcnt] - string1;
else
regs->end[mcnt] = regend[mcnt] - string2 + size1;
}
}
- if (dend == end_match_1)
- return (d - string1 - pos);
- else
- return d - string2 + size1 - pos;
- }
+ FREE_AND_RETURN(stackb,
+ (d - pos - (MATCHING_IN_FIRST_STRING ?
+ string1 :
+ string2 - size1)));
+ }
- /* Otherwise match next pattern command */
+ /* Otherwise match next pattern command. */
#ifdef SWITCH_ENUM_BUG
switch ((int) ((enum regexpcode) *p++))
#else
@@ -1325,33 +2149,80 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
#endif
{
- /* \( is represented by a start_memory, \) by a stop_memory.
- Both of those commands contain a "register number" argument.
- The text matched within the \( and \) is recorded under that number.
- Then, \<digit> turns into a `duplicate' command which
- is followed by the numeric value of <digit> as the register number. */
-
+ /* \( [or `(', as appropriate] is represented by start_memory,
+ \) by stop_memory. Both of those commands are followed by
+ a register number in the next byte. The text matched
+ within the \( and \) is recorded under that number. */
case start_memory:
- regstart[*p] = d;
- regstart_seg1[*p++] = (dend == end_match_1);
- break;
+ regstart[*p] = d;
+ IS_ACTIVE (reg_info[*p]) = 1;
+ MATCHED_SOMETHING (reg_info[*p]) = 0;
+ p++;
+ break;
case stop_memory:
- regend[*p] = d;
- regend_seg1[*p++] = (dend == end_match_1);
- break;
-
- case duplicate:
+ regend[*p] = d;
+ IS_ACTIVE (reg_info[*p]) = 0;
+
+ /* If just failed to match something this time around with a sub-
+ expression that's in a loop, try to force exit from the loop. */
+ if ((! MATCHED_SOMETHING (reg_info[*p])
+ || (enum regexpcode) p[-3] == start_memory)
+ && (p + 1) != pend)
+ {
+ register unsigned char *p2 = p + 1;
+ mcnt = 0;
+ switch (*p2++)
+ {
+ case jump_n:
+ is_a_jump_n = 1;
+ case finalize_jump:
+ case maybe_finalize_jump:
+ case jump:
+ case dummy_failure_jump:
+ EXTRACT_NUMBER_AND_INCR (mcnt, p2);
+ if (is_a_jump_n)
+ p2 += 2;
+ break;
+ }
+ p2 += mcnt;
+
+ /* If the next operation is a jump backwards in the pattern
+ to an on_failure_jump, exit from the loop by forcing a
+ failure after pushing on the stack the on_failure_jump's
+ jump in the pattern, and d. */
+ if (mcnt < 0 && (enum regexpcode) *p2++ == on_failure_jump)
+ {
+ EXTRACT_NUMBER_AND_INCR (mcnt, p2);
+ PUSH_FAILURE_POINT (p2 + mcnt, d);
+ goto fail;
+ }
+ }
+ p++;
+ break;
+
+ /* \<digit> has been turned into a `duplicate' command which is
+ followed by the numeric value of <digit> as the register number. */
+ case duplicate:
{
int regno = *p++; /* Get which register to match against */
register unsigned char *d2, *dend2;
- d2 = regstart[regno];
- dend2 = ((regstart_seg1[regno] == regend_seg1[regno])
+ /* Where in input to try to start matching. */
+ d2 = regstart[regno];
+
+ /* Where to stop matching; if both the place to start and
+ the place to stop matching are in the same string, then
+ set to the place to stop, otherwise, for now have to use
+ the end of the first string. */
+
+ dend2 = ((IS_IN_FIRST_STRING (regstart[regno])
+ == IS_IN_FIRST_STRING (regend[regno]))
? regend[regno] : end_match_1);
while (1)
{
- /* Advance to next segment in register contents, if necessary */
+ /* If necessary, advance to next segment in register
+ contents. */
while (d2 == dend2)
{
if (dend2 == end_match_2) break;
@@ -1361,15 +2232,22 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
/* At end of register contents => success */
if (d2 == dend2) break;
- /* Advance to next segment in data being matched, if necessary */
+ /* If necessary, advance to next segment in data. */
PREFETCH;
- /* mcnt gets # consecutive chars to compare */
+ /* How many characters left in this segment to match. */
mcnt = dend - d;
- if (mcnt > dend2 - d2)
+
+ /* Want how many consecutive characters we can match in
+ one shot, so, if necessary, adjust the count. */
+ if (mcnt > dend2 - d2)
mcnt = dend2 - d2;
- /* Compare that many; failure if mismatch, else skip them. */
- if (translate ? bcmp_translate (d, d2, mcnt, translate) : bcmp (d, d2, mcnt))
+
+ /* Compare that many; failure if mismatch, else move
+ past them. */
+ if (translate
+ ? memcmp_translate (d, d2, mcnt, translate)
+ : memcmp ((char *)d, (char *)d2, mcnt))
goto fail;
d += mcnt, d2 += mcnt;
}
@@ -1377,27 +2255,28 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
break;
case anychar:
- /* fetch a data character */
- PREFETCH;
- /* Match anything but a newline. */
- if ((translate ? translate[*d++] : *d++) == '\n')
+ PREFETCH; /* Fetch a data character. */
+ /* Match anything but a newline, maybe even a null. */
+ if ((translate ? translate[*d] : *d) == '\n'
+ || ((obscure_syntax & RE_DOT_NOT_NULL)
+ && (translate ? translate[*d] : *d) == '\000'))
goto fail;
+ SET_REGS_MATCHED;
+ d++;
break;
case charset:
case charset_not:
{
- /* Nonzero for charset_not */
- int not = 0;
+ int not = 0; /* Nonzero for charset_not. */
register int c;
if (*(p - 1) == (unsigned char) charset_not)
not = 1;
- /* fetch a data character */
- PREFETCH;
+ PREFETCH; /* Fetch a data character. */
if (translate)
- c = translate [*d];
+ c = translate[*d];
else
c = *d;
@@ -1408,72 +2287,61 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
p += 1 + *p;
if (!not) goto fail;
- d++;
+ SET_REGS_MATCHED;
+ d++;
break;
}
case begline:
- if (d == string1 || d[-1] == '\n')
- break;
- goto fail;
-
+ if ((size1 != 0 && d == string1)
+ || (size1 == 0 && size2 != 0 && d == string2)
+ || (d && d[-1] == '\n')
+ || (size1 == 0 && size2 == 0))
+ break;
+ else
+ goto fail;
+
case endline:
if (d == end2
|| (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
break;
goto fail;
- /* "or" constructs ("|") are handled by starting each alternative
- with an on_failure_jump that points to the start of the next alternative.
- Each alternative except the last ends with a jump to the joining point.
- (Actually, each jump except for the last one really jumps
- to the following jump, because tensioning the jumps is a hassle.) */
+ /* `or' constructs are handled by starting each alternative with
+ an on_failure_jump that points to the start of the next
+ alternative. Each alternative except the last ends with a
+ jump to the joining point. (Actually, each jump except for
+ the last one really jumps to the following jump, because
+ tensioning the jumps is a hassle.) */
/* The start of a stupid repeat has an on_failure_jump that points
- past the end of the repeat text.
- This makes a failure point so that, on failure to match a repetition,
- matching restarts past as many repetitions have been found
- with no way to fail and look for another one. */
+ past the end of the repeat text. This makes a failure point so
+ that on failure to match a repetition, matching restarts past
+ as many repetitions have been found with no way to fail and
+ look for another one. */
/* A smart repeat is similar but loops back to the on_failure_jump
- so that each repetition makes another failure point. */
+ so that each repetition makes another failure point. */
case on_failure_jump:
- if (stackp == stacke)
- {
- unsigned char **stackx;
- if (stacke - stackb > re_max_failures * 2)
- return -2;
- stackx = (unsigned char **) alloca (2 * (stacke - stackb)
- * sizeof (char *));
- bcopy (stackb, stackx, (stacke - stackb) * sizeof (char *));
- stackp = stackx + (stackp - stackb);
- stacke = stackx + 2 * (stacke - stackb);
- stackb = stackx;
- }
- mcnt = *p++ & 0377;
- mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
- p++;
- *stackp++ = mcnt + p;
- *stackp++ = d;
- break;
-
- /* The end of a smart repeat has an maybe_finalize_jump back.
- Change it either to a finalize_jump or an ordinary jump. */
+ on_failure:
+ EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ PUSH_FAILURE_POINT (p + mcnt, d);
+ break;
+ /* The end of a smart repeat has a maybe_finalize_jump back.
+ Change it either to a finalize_jump or an ordinary jump. */
case maybe_finalize_jump:
- mcnt = *p++ & 0377;
- mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
- p++;
+ EXTRACT_NUMBER_AND_INCR (mcnt, p);
{
register unsigned char *p2 = p;
- /* Compare what follows with the begining of the repeat.
+ /* Compare what follows with the beginning of the repeat.
If we can establish that there is nothing that they would
- both match, we can change to finalize_jump */
- while (p2 != pend
+ both match, we can change to finalize_jump. */
+ while (p2 + 1 != pend
&& (*p2 == (unsigned char) stop_memory
|| *p2 == (unsigned char) start_memory))
- p2++;
+ p2 += 2; /* Skip over reg number. */
if (p2 == pend)
p[-3] = (unsigned char) finalize_jump;
else if (*p2 == (unsigned char) exactn
@@ -1482,7 +2350,7 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
register unsigned char *p1 = p + mcnt;
/* p1[0] ... p1[2] are an on_failure_jump.
- Examine what follows that */
+ Examine what follows that. */
if (p1[3] == (unsigned char) exactn && p1[5] != c)
p[-3] = (unsigned char) finalize_jump;
else if (p1[3] == (unsigned char) charset
@@ -1492,108 +2360,139 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
if (c < p1[4] * BYTEWIDTH
&& p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
not = !not;
- /* not is 1 if c would match */
- /* That means it is not safe to finalize */
+ /* `not' is 1 if c would match. */
+ /* That means it is not safe to finalize. */
if (!not)
p[-3] = (unsigned char) finalize_jump;
}
}
}
- p -= 2;
+ p -= 2; /* Point at relative address again. */
if (p[-1] != (unsigned char) finalize_jump)
{
- p[-1] = (unsigned char) jump;
+ p[-1] = (unsigned char) jump;
goto nofinalize;
}
-
- /* The end of a stupid repeat has a finalize-jump
- back to the start, where another failure point will be made
- which will point after all the repetitions found so far. */
-
- case finalize_jump:
- stackp -= 2;
-
- case jump:
+ /* Note fall through. */
+
+ /* The end of a stupid repeat has a finalize_jump back to the
+ start, where another failure point will be made which will
+ point to after all the repetitions found so far. */
+
+ /* Take off failure points put on by matching on_failure_jump
+ because didn't fail. Also remove the register information
+ put on by the on_failure_jump. */
+ case finalize_jump:
+ POP_FAILURE_POINT ();
+ /* Note fall through. */
+
+ /* Jump without taking off any failure points. */
+ case jump:
nofinalize:
- mcnt = *p++ & 0377;
- mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
- p += mcnt + 1; /* The 1 compensates for missing ++ above */
+ EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ p += mcnt;
break;
- case dummy_failure_jump:
- if (stackp == stacke)
- {
- unsigned char **stackx
- = (unsigned char **) alloca (2 * (stacke - stackb)
- * sizeof (char *));
- bcopy (stackb, stackx, (stacke - stackb) * sizeof (char *));
- stackp = stackx + (stackp - stackb);
- stacke = stackx + 2 * (stacke - stackb);
- stackb = stackx;
+ case dummy_failure_jump:
+ /* Normally, the on_failure_jump pushes a failure point, which
+ then gets popped at finalize_jump. We will end up at
+ finalize_jump, also, and with a pattern of, say, `a+', we
+ are skipping over the on_failure_jump, so we have to push
+ something meaningless for finalize_jump to pop. */
+ PUSH_FAILURE_POINT (0, 0);
+ goto nofinalize;
+
+
+ /* Have to succeed matching what follows at least n times. Then
+ just handle like an on_failure_jump. */
+ case succeed_n:
+ EXTRACT_NUMBER (mcnt, p + 2);
+ /* Originally, this is how many times we HAVE to succeed. */
+ if (mcnt)
+ {
+ mcnt--;
+ p += 2;
+ STORE_NUMBER_AND_INCR (p, mcnt);
+ }
+ else if (mcnt == 0)
+ {
+ p[2] = unused;
+ p[3] = unused;
+ goto on_failure;
+ }
+ else
+ {
+ fprintf (stderr, "regex: the succeed_n's n is not set.\n");
+ exit (1);
}
- *stackp++ = 0;
- *stackp++ = 0;
- goto nofinalize;
-
- case wordbound:
- if (d == string1 /* Points to first char */
- || d == end2 /* Points to end */
- || (d == end1 && size2 == 0)) /* Points to end */
- break;
- if ((SYNTAX (d[-1]) == Sword)
- != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
+ break;
+
+ case jump_n:
+ EXTRACT_NUMBER (mcnt, p + 2);
+ /* Originally, this is how many times we CAN jump. */
+ if (mcnt)
+ {
+ mcnt--;
+ STORE_NUMBER(p + 2, mcnt);
+ goto nofinalize; /* Do the jump without taking off
+ any failure points. */
+ }
+ /* If don't have to jump any more, skip over the rest of command. */
+ else
+ p += 4;
+ break;
+
+ case set_number_at:
+ {
+ register unsigned char *p1;
+
+ EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ p1 = p + mcnt;
+ EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ STORE_NUMBER (p1, mcnt);
+ break;
+ }
+
+ /* Ignore these. Used to ignore the n of succeed_n's which
+ currently have n == 0. */
+ case unused:
+ break;
+
+ case wordbound:
+ if (AT_WORD_BOUNDARY)
break;
goto fail;
case notwordbound:
- if (d == string1 /* Points to first char */
- || d == end2 /* Points to end */
- || (d == end1 && size2 == 0)) /* Points to end */
- goto fail;
- if ((SYNTAX (d[-1]) == Sword)
- != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
+ if (AT_WORD_BOUNDARY)
goto fail;
break;
case wordbeg:
- if (d == end2 /* Points to end */
- || (d == end1 && size2 == 0) /* Points to end */
- || SYNTAX (* (d == end1 ? string2 : d)) != Sword) /* Next char not a letter */
- goto fail;
- if (d == string1 /* Points to first char */
- || SYNTAX (d[-1]) != Sword) /* prev char not letter */
+ if (IS_A_LETTER (d) && (!IS_A_LETTER (d - 1) || AT_STRINGS_BEG))
break;
goto fail;
case wordend:
- if (d == string1 /* Points to first char */
- || SYNTAX (d[-1]) != Sword) /* prev char not letter */
- goto fail;
- if (d == end2 /* Points to end */
- || (d == end1 && size2 == 0) /* Points to end */
- || SYNTAX (d == end1 ? *string2 : *d) != Sword) /* Next char not a letter */
+ /* Have to check if AT_STRINGS_BEG before looking at d - 1. */
+ if (!AT_STRINGS_BEG && IS_A_LETTER (d - 1)
+ && (!IS_A_LETTER (d) || AT_STRINGS_END))
break;
goto fail;
#ifdef emacs
case before_dot:
- if (((d - string2 <= (unsigned) size2)
- ? d - bf_p2 : d - bf_p1)
- <= point)
+ if (PTR_CHAR_POS (d) >= point)
goto fail;
break;
case at_dot:
- if (((d - string2 <= (unsigned) size2)
- ? d - bf_p2 : d - bf_p1)
- == point)
+ if (PTR_CHAR_POS (d) != point)
goto fail;
break;
case after_dot:
- if (((d - string2 <= (unsigned) size2)
- ? d - bf_p2 : d - bf_p1)
- >= point)
+ if (PTR_CHAR_POS (d) <= point)
goto fail;
break;
@@ -1606,6 +2505,7 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
matchsyntax:
PREFETCH;
if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
+ SET_REGS_MATCHED;
break;
case notwordchar:
@@ -1617,34 +2517,44 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
matchnotsyntax:
PREFETCH;
if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
- break;
-#else
+ SET_REGS_MATCHED;
+ break;
+
+#else /* not emacs */
+
case wordchar:
PREFETCH;
- if (SYNTAX (*d++) == 0) goto fail;
+ if (!IS_A_LETTER (d))
+ goto fail;
+ SET_REGS_MATCHED;
break;
case notwordchar:
PREFETCH;
- if (SYNTAX (*d++) != 0) goto fail;
+ if (IS_A_LETTER (d))
+ goto fail;
+ SET_REGS_MATCHED;
break;
+
#endif /* not emacs */
case begbuf:
- if (d == string1) /* Note, d cannot equal string2 */
- break; /* unless string1 == string2. */
- goto fail;
+ if (AT_STRINGS_BEG)
+ break;
+ goto fail;
- case endbuf:
- if (d == end2 || (d == end1 && size2 == 0))
+ case endbuf:
+ if (AT_STRINGS_END)
break;
goto fail;
case exactn:
/* Match the next few pattern characters exactly.
- mcnt is how many characters to match. */
+ mcnt is how many characters to match. */
mcnt = *p++;
- if (translate)
+ /* This is written out as an if-else so we don't waste time
+ testing `translate' inside the loop. */
+ if (translate)
{
do
{
@@ -1662,32 +2572,63 @@ re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
}
while (--mcnt);
}
- break;
+ SET_REGS_MATCHED;
+ break;
}
- continue; /* Successfully matched one pattern command; keep matching */
+ continue; /* Successfully executed one pattern command; keep going. */
- /* Jump here if any matching operation fails. */
+ /* Jump here if any matching operation fails. */
fail:
if (stackp != stackb)
/* A restart point is known. Restart there and pop it. */
{
+ short last_used_reg, this_reg;
+
+ /* If this failure point is from a dummy_failure_point, just
+ skip it. */
if (!stackp[-2])
- { /* If innermost failure point is dormant, flush it and keep looking */
- stackp -= 2;
- goto fail;
- }
- d = *--stackp;
+ {
+ POP_FAILURE_POINT ();
+ goto fail;
+ }
+
+ d = *--stackp;
p = *--stackp;
- if (d >= string1 && d <= end1)
+ if (d >= string1 && d <= end1)
dend = end_match_1;
+ /* Restore register info. */
+ last_used_reg = (short) *--stackp;
+
+ /* Make the ones that weren't saved -1 or 0 again. */
+ for (this_reg = RE_NREGS - 1; this_reg > last_used_reg; this_reg--)
+ {
+ regend[this_reg] = (unsigned char *) -1;
+ regstart[this_reg] = (unsigned char *) -1;
+ IS_ACTIVE (reg_info[this_reg]) = 0;
+ MATCHED_SOMETHING (reg_info[this_reg]) = 0;
+ }
+
+ /* And restore the rest from the stack. */
+ for ( ; this_reg > 0; this_reg--)
+ {
+ reg_info[this_reg] = *(struct register_info *) *--stackp;
+ regend[this_reg] = *--stackp;
+ regstart[this_reg] = *--stackp;
+ }
}
- else break; /* Matching at this starting point really fails! */
+ else
+ break; /* Matching at this starting point really fails. */
}
- return -1; /* Failure to match */
+
+ if (best_regs_set)
+ goto restore_best_regs;
+
+ FREE_AND_RETURN(stackb,(-1)); /* Failure to match. */
}
+
static int
-bcmp_translate (s1, s2, len, translate)
+memcmp_translate (s1, s2, len, translate)
unsigned char *s1, *s2;
register int len;
unsigned char *translate;
@@ -1700,8 +2641,10 @@ bcmp_translate (s1, s2, len, translate)
}
return 0;
}
+
+
-/* Entry points compatible with bsd4.2 regex library */
+/* Entry points compatible with 4.2 BSD regex library. */
#ifndef emacs
@@ -1734,18 +2677,24 @@ re_exec (s)
char *s;
{
int len = strlen (s);
- return 0 <= re_search (&re_comp_buf, s, len, 0, len, 0);
+ return 0 <= re_search (&re_comp_buf, s, len, 0, len,
+ (struct re_registers *) 0);
}
+#endif /* not emacs */
+
-#endif /* emacs */
#ifdef test
+#ifdef atarist
+long _stksize = 2L; /* reserve memory for stack */
+#endif
#include <stdio.h>
-/* Indexed by a character, gives the upper case equivalent of the character */
+/* Indexed by a character, gives the upper case equivalent of the
+ character. */
-static char upcase[0400] =
+char upcase[0400] =
{ 000, 001, 002, 003, 004, 005, 006, 007,
010, 011, 012, 013, 014, 015, 016, 017,
020, 021, 022, 023, 024, 025, 026, 027,
@@ -1780,6 +2729,36 @@ static char upcase[0400] =
0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
};
+#ifdef canned
+
+#include "tests.h"
+
+typedef enum { extended_test, basic_test } test_type;
+
+/* Use this to run the tests we've thought of. */
+
+void
+main ()
+{
+ test_type t = extended_test;
+
+ if (t == basic_test)
+ {
+ printf ("Running basic tests:\n\n");
+ test_posix_basic ();
+ }
+ else if (t == extended_test)
+ {
+ printf ("Running extended tests:\n\n");
+ test_posix_extended ();
+ }
+}
+
+#else /* not canned */
+
+/* Use this to run interactive tests. */
+
+void
main (argc, argv)
int argc;
char **argv;
@@ -1828,6 +2807,9 @@ main (argc, argv)
}
}
+#endif
+
+
#ifdef NOTDEF
print_buf (bufp)
struct re_pattern_buffer *bufp;
@@ -1852,12 +2834,12 @@ print_buf (bufp)
printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
}
-#endif
+#endif /* NOTDEF */
printchar (c)
char c;
{
- if (c < 041 || c >= 0177)
+ if (c < 040 || c >= 0177)
{
putchar ('\\');
putchar (((c >> 6) & 3) + '0');
@@ -1868,4 +2850,10 @@ printchar (c)
putchar (c);
}
+error (string)
+ char *string;
+{
+ puts (string);
+ exit (1);
+}
#endif /* test */