aboutsummaryrefslogtreecommitdiffstats
path: root/regex.c
diff options
context:
space:
mode:
Diffstat (limited to 'regex.c')
-rw-r--r--regex.c5135
1 files changed, 3932 insertions, 1203 deletions
diff --git a/regex.c b/regex.c
index e59a169a..2cb3f2bd 100644
--- a/regex.c
+++ b/regex.c
@@ -1,9 +1,10 @@
/* Extended regular expression matching and search library.
- Copyright (C) 1985, 1989-90 Free Software Foundation, Inc.
+ Version 0.1.
+ Copyright (C) 1985, 89, 90, 91 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 1, or (at your option)
+ the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
@@ -19,68 +20,95 @@
/* To test, compile with -Dtest. This Dtestable feature turns this into
a self-contained program which reads a pattern, describes how it
compiles, then reads a string and searches for it.
-
+
On the other hand, if you compile with both -Dtest and -Dcanned you
can run some tests we've already thought of. */
+#ifdef GAWK
+#include "config.h"
+#endif
+
+#ifdef REGEX_MALLOC
+
+#define REGEX_ALLOCATE malloc
+#define REGEX_REALLOCATE(source, size) (realloc (source, size))
+
+#else /* not REGEX_MALLOC */
+
+
+/* Make alloca work the best possible way. */
+#ifdef __GNUC__
+#define alloca __builtin_alloca
+#else
+#ifdef sparc
+#include <alloca.h>
+#else
+#ifdef _AIX
+ #pragma alloca
+#else /* not __GNUC__ or sparc or _AIX */
+char *alloca ();
+#endif /* _AIX */
+#endif /* sparc */
+#endif /* not __GNUC__ */
+
+/* Still not defined (REGEX_MALLOC) */
+
+#define REGEX_ALLOCATE alloca
+
+/* Requires a `void *destination' declared. */
+#define REGEX_REALLOCATE(source, size) \
+ (destination = alloca (size), \
+ bcopy (source, destination, size), \
+ destination)
+
+#endif /* not defined (REGEX_MALLOC) */
+
+
#ifdef emacs
/* The `emacs' switch turns on certain special matching commands
that make sense only in emacs. */
+#include "config.h"
#include "lisp.h"
#include "buffer.h"
#include "syntax.h"
-/* We write fatal error messages on standard error. */
-#include <stdio.h>
+/* Emacs uses `NULL' as a predicate. */
+#undef NULL
-/* isalpha(3) etc. are used for the character classes. */
-#include <ctype.h>
-#else /* not emacs */
+#else /* not emacs */
-#include "awk.h"
-#define NO_ALLOCA /* try it out for now */
-#ifndef NO_ALLOCA
-/* Make alloca work the best possible way. */
-#ifdef __GNUC__
-#ifndef atarist
-#ifndef alloca
-#define alloca __builtin_alloca
-#endif
-#endif /* atarist */
-#else
-#ifdef sparc
-#include <alloca.h>
+#include <sys/types.h> /* POSIX types. */
+
+#if defined(GAWK) || defined (USG) || defined (POSIX) || defined (STDC_HEADERS)
+#ifndef BSTRING
+#include <string.h>
+#define bcopy(s,d,n) memcpy ((d), (s), (n))
+#define bcmp(s1,s2,n) memcmp ((s1), (s2), (n))
+#define bzero(s,n) memset ((s), 0, (n))
+#endif /* not BSTRING */
+#endif /* USG or POSIX or STDC_HEADERS */
+
+#if defined (STDC_HEADERS)
+#include <stdlib.h>
#else
-char *alloca ();
-#endif
-#endif /* __GNUC__ */
-
-#define FREE_AND_RETURN_VOID(stackb) return
-#define FREE_AND_RETURN(stackb,val) return(val)
-#define DOUBLE_STACK(stackx,stackb,len) \
- (stackx = (unsigned char **) alloca (2 * len \
- * sizeof (unsigned char *)),\
- /* Only copy what is in use. */ \
- (unsigned char **) memcpy (stackx, stackb, len * sizeof (char *)))
-#else /* NO_ALLOCA defined */
-#define FREE_AND_RETURN_VOID(stackb) free(stackb);return
-#define FREE_AND_RETURN(stackb,val) free(stackb);return(val)
-#define DOUBLE_STACK(stackx,stackb,len) \
- (unsigned char **) realloc (stackb, 2 * len * sizeof (unsigned char *))
-#endif /* NO_ALLOCA */
-
-static void store_jump P((char *, int, char *));
-static void insert_jump P((int, char *, char *, char *));
-static void store_jump_n P((char *, int, char *, unsigned));
-static void insert_jump_n P((int, char *, char *, char *, unsigned));
-static void insert_op_2 P((int, char *, char *, int, int ));
-static int memcmp_translate P((unsigned char *, unsigned char *,
- int, unsigned char *));
+#ifdef __STDC__
+void *malloc (size_t);
+void *realloc (void *, size_t);
+#else /* not __STDC__ */
+char *malloc ();
+char *realloc ();
+#endif /* not __STDC__ */
+#endif /* not (POSIX or STDC_HEADERS) */
+
+
+
+/* How many characters in the character set. */
+#define CHAR_SET_SIZE 256
/* Define the syntax stuff, so we can do the \<, \>, etc. */
@@ -89,20 +117,18 @@ static int memcmp_translate P((unsigned char *, unsigned char *,
commands in re_match_2. */
#ifndef Sword
#define Sword 1
-#endif
+#endif /* not Sword */
#define SYNTAX(c) re_syntax_table[c]
#ifdef SYNTAX_TABLE
-char *re_syntax_table;
+extern char *re_syntax_table;
#else /* not SYNTAX_TABLE */
-static char re_syntax_table[256];
-static void init_syntax_once P((void));
-
+static char re_syntax_table[CHAR_SET_SIZE];
static void
init_syntax_once ()
@@ -113,7 +139,7 @@ init_syntax_once ()
if (done)
return;
- memset (re_syntax_table, 0, sizeof re_syntax_table);
+ bzero (re_syntax_table, sizeof re_syntax_table);
for (c = 'a'; c <= 'z'; c++)
re_syntax_table[c] = Sword;
@@ -123,161 +149,244 @@ init_syntax_once ()
for (c = '0'; c <= '9'; c++)
re_syntax_table[c] = Sword;
-
- /* Add specific syntax for ISO Latin-1. */
- for (c = 0300; c <= 0377; c++)
- re_syntax_table[c] = Sword;
- re_syntax_table[0327] = 0;
- re_syntax_table[0367] = 0;
+
+ re_syntax_table['_'] = Sword;
done = 1;
}
-#endif /* SYNTAX_TABLE */
-#undef P
-#endif /* emacs */
+#endif /* not SYNTAX_TABLE */
+#endif /* not emacs */
+/* We write fatal error messages on standard error. */
+#include <stdio.h>
+
+/* isalpha(3) etc. are used for the character classes. */
+#include <ctype.h>
/* Sequents are missing isgraph. */
-#ifndef isgraph
-#define isgraph(c) (isprint((c)) && !isspace((c)))
+#ifdef sequent
+#define ISGRAPH_MISSING
+#endif
+
+#ifdef ISGRAPH_MISSING
+#define isgraph(c) (isprint (c) && !isspace (c))
#endif
+
/* Get the interface, including the syntax bits. */
#include "regex.h"
+/* We will need this constant several times. */
+#define BYTEWIDTH 8
+
+
+
/* These are the command codes that appear in compiled regular
expressions, one per byte. Some command codes are followed by
argument bytes. A command code can specify any interpretation
whatsoever for its arguments. Zero-bytes may appear in the compiled
regular expression.
-
+
The value of `exactn' is needed in search.c (search_buffer) in emacs.
So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
`exactn' we use here must also be 1. */
enum regexpcode
{
- unused=0,
- exactn=1, /* Followed by one byte giving n, then by n literal bytes. */
- begline, /* Fail unless at beginning of line. */
- endline, /* Fail unless at end of line. */
- jump, /* Followed by two bytes giving relative address to jump to. */
- on_failure_jump, /* Followed by two bytes giving relative address of
- place to resume at in case of failure. */
- finalize_jump, /* Throw away latest failure point and then jump to
- address. */
- maybe_finalize_jump, /* Like jump but finalize if safe to do so.
- This is used to jump back to the beginning
- of a repeat. If the command that follows
- this jump is clearly incompatible with the
- one at the beginning of the repeat, such that
- we can be sure that there is no use backtracking
- out of repetitions already completed,
- then we finalize. */
- dummy_failure_jump, /* Jump, and push a dummy failure point. This
- failure point will be thrown away if an attempt
- is made to use it for a failure. A + construct
- makes this before the first repeat. Also
- use it as an intermediary kind of jump when
- compiling an or construct. */
- succeed_n, /* Used like on_failure_jump except has to succeed n times;
- then gets turned into an on_failure_jump. The relative
- address following it is useless until then. The
- address is followed by two bytes containing n. */
- jump_n, /* Similar to jump, but jump n times only; also the relative
- address following is in turn followed by yet two more bytes
- containing n. */
- set_number_at, /* Set the following relative location to the
- subsequent number. */
- anychar, /* Matches any (more or less) one character. */
- charset, /* Matches any one char belonging to specified set.
- First following byte is number of bitmap bytes.
- Then come bytes for a bitmap saying which chars are in.
- Bits in each byte are ordered low-bit-first.
- A character is in the set if its bit is 1.
- A character too large to have a bit in the map
- is automatically not in the set. */
- charset_not, /* Same parameters as charset, but match any character
- that is not one of those specified. */
- start_memory, /* Start remembering the text that is matched, for
- storing in a memory register. Followed by one
- byte containing the register number. Register numbers
- must be in the range 0 through RE_NREGS. */
- stop_memory, /* Stop remembering the text that is matched
- and store it in a memory register. Followed by
- one byte containing the register number. Register
- numbers must be in the range 0 through RE_NREGS. */
- duplicate, /* Match a duplicate of something remembered.
- Followed by one byte containing the index of the memory
- register. */
- before_dot, /* Succeeds if before point. */
- at_dot, /* Succeeds if at point. */
- after_dot, /* Succeeds if after point. */
- begbuf, /* Succeeds if at beginning of buffer. */
- endbuf, /* Succeeds if at end of buffer. */
- wordchar, /* Matches any word-constituent character. */
- notwordchar, /* Matches any char that is not a word-constituent. */
- wordbeg, /* Succeeds if at word beginning. */
- wordend, /* Succeeds if at word end. */
- wordbound, /* Succeeds if at a word boundary. */
- notwordbound,/* Succeeds if not at a word boundary. */
- syntaxspec, /* Matches any character whose syntax is specified.
- followed by a byte which contains a syntax code,
- e.g., Sword. */
- notsyntaxspec /* Matches any character whose syntax differs from
- that specified. */
+ no_op=0,
+ exactn=1, /* Followed by one byte giving n, then by n
+ literal bytes. */
+ begline, /* Fail unless at beginning of line. */
+ endline, /* Fail unless at end of line. */
+ endline_in_repeat, /* If in trailing position, turn into an endline,
+ otherwise, turn into a no_op. This should
+ never end up in the final compiled pattern! */
+ endline_before_newline,/* If after an endline, don't that endline turn into
+ an exactn for '$' when RE_CONTEXT_INDEP_ANCHORS
+ is set. Should never end up in the compiled
+ pattern! */
+ repeated_endline_before_newline, /* A combination of above two. */
+ no_pop_jump, /* Followed by two byte relative address to
+ which to jump. */
+ jump_past_next_alt, /* Same as no_pop_jump, but don't jump if the
+ current group (the largest-numbered active
+ one) hasn't matched anything. */
+ on_failure_jump, /* Followed by two byte relative address of
+ place to resume at in case of failure. */
+ pop_failure_jump, /* Throw away latest failure point and then jump to
+ address. */
+ maybe_pop_jump,
+ /* Like jump but change to pop_failure_jump
+ only if know won't have to backtrack to
+ match; otherwise change to no_pop_jump.
+ This is used to jump back to the
+ beginning of a repeat. If what follows
+ this jump clearly won't match what the
+ repeat does, such that we can be sure
+ that there is no use backtracking out of
+ repetitions already matched, then we
+ change it to a pop_failure_jump. */
+ dummy_failure_jump, /* Jump, and push a dummy failure point. This
+ failure point will be thrown away if an
+ attempt is made to use it for a failure. A
+ `+' construct makes this before the first
+ repeat. Also use it as an intermediary kind
+ of jump when compiling an alternative. */
+ succeed_n, /* Used like on_failure_jump except has to
+ succeed n times; The two-byte relative
+ address following it is useless until then.
+ The address is followed by two bytes
+ containing n. */
+ no_pop_jump_n, /* Similar to no_pop_jump, but jump n times
+ only; also the relative address following is
+ in turn followed by yet two more bytes
+ containing n. */
+ set_number_at, /* Set the following relative location (two
+ bytes) to the subsequent (two-byte) number. */
+ anychar, /* Matches any (more or less) character. */
+ charset, /* Matches any one char belonging to specified set.
+ First following byte is number of bitmap
+ bytes. Then come bytes for a bitmap saying
+ which chars are in. Bits in each byte are
+ ordered low-bit-first. A character is in the
+ set if its bit is 1. A character too large
+ to have a bit in the map is automatically not
+ in the set. */
+ charset_not, /* Same parameters as charset, but match any
+ character that is not one of those specified. */
+ start_memory, /* Start remembering the text that is matched, for
+ storing in a memory register. Followed by
+ one byte containing the register number.
+ Register numbers will be in the range 0
+ through one less than the pattern buffer's
+ re_nsub field. */
+ stop_memory, /* Stop remembering the text that is matched
+ and store it in a memory register. Followed
+ by one byte containing the register number.
+ Register numbers will be in the range 0
+ through one less than the pattern buffer's
+ re_nsub field. */
+ duplicate, /* Match a duplicate of something remembered.
+ Followed by one byte containing the register
+ number. */
+ before_dot, /* Succeeds if before point. */
+ at_dot, /* Succeeds if at point. */
+ after_dot, /* Succeeds if after point. */
+ begbuf, /* Succeeds if at beginning of buffer. */
+ endbuf, /* Succeeds if at end of buffer. */
+ wordchar, /* Matches any word-constituent character. */
+ notwordchar, /* Matches any char that is not a word-constituent. */
+ wordbeg, /* Succeeds if at word beginning. */
+ wordend, /* Succeeds if at word end. */
+ wordbound, /* Succeeds if at a word boundary. */
+ notwordbound, /* Succeeds if not at a word boundary. */
+ syntaxspec, /* Matches any character whose syntax is specified.
+ followed by a byte which contains a syntax
+ code, e.g., Sword. */
+ notsyntaxspec /* Matches any character whose syntax differs from
+ that specified. */
};
-
-/* Number of failure points to allocate space for initially,
- when matching. If this number is exceeded, more space is allocated,
- so it is not a hard limit. */
-#ifndef NFAILURES
-#define NFAILURES 80
-#endif
-#ifdef CHAR_UNSIGNED
-#define SIGN_EXTEND_CHAR(c) ((c)>(char)127?(c)-256:(c)) /* for IBM RT */
+#ifdef CHAR_UNSIGNED /* for, e.g., IBM RT */
+#define SIGN_EXTEND_CHAR(c) (((c)^128) - 128) /* As in Harbison and Steele. */
#endif
#ifndef SIGN_EXTEND_CHAR
-#define SIGN_EXTEND_CHAR(x) (x)
+#define SIGN_EXTEND_CHAR /* As nothing. */
#endif
-
+
+
/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
+
#define STORE_NUMBER(destination, number) \
- { (destination)[0] = (number) & 0377; \
- (destination)[1] = (number) >> 8; }
-
+ do {(destination)[0] = (number) & 0377; \
+ (destination)[1] = (number) >> 8; \
+ } while (0)
+
+
/* Same as STORE_NUMBER, except increment the destination pointer to
the byte after where the number is stored. Watch out that values for
DESTINATION such as p + 1 won't work, whereas p will. */
+
#define STORE_NUMBER_AND_INCR(destination, number) \
- { STORE_NUMBER(destination, number); \
- (destination) += 2; }
+ do { STORE_NUMBER(destination, number); \
+ (destination) += 2; \
+ } while (0)
+
+
+
+
/* Put into DESTINATION a number stored in two contingous bytes starting
at SOURCE. */
+
#define EXTRACT_NUMBER(destination, source) \
- { (destination) = *(source) & 0377; \
- (destination) += SIGN_EXTEND_CHAR (*(char *)((source) + 1)) << 8; }
+ do { (destination) = *(source) & 0377; \
+ (destination) += SIGN_EXTEND_CHAR (*(char *)((source) + 1)) << 8; \
+ } while (0)
+
+int
+extract_number (source)
+ unsigned char *source;
+{
+ int answer;
+ int i_temp = * (char *) (source + 1);
+ char c_temp = * (char *) (source + 1);
+
+ i_temp = SIGN_EXTEND_CHAR (i_temp);
+ c_temp = SIGN_EXTEND_CHAR (c_temp);
+
+ i_temp <<= 8;
+ c_temp <<= 8;
+
+ answer = *source & 0377;
+ answer += (SIGN_EXTEND_CHAR (*(char *)((source) + 1))) << 8;
+
+ return answer;
+}
+
/* Same as EXTRACT_NUMBER, except increment the pointer for source to
point to second byte of SOURCE. Note that SOURCE has to be a value
such as p, not, e.g., p + 1. */
+
#define EXTRACT_NUMBER_AND_INCR(destination, source) \
- { EXTRACT_NUMBER (destination, source); \
- (source) += 2; }
+ do { EXTRACT_NUMBER (destination, source); \
+ (source) += 2; \
+ } while (0)
+
+
+void
+extract_number_and_incr (destination, source)
+ int *destination;
+ unsigned char **source;
+{
+ *destination = extract_number (*source);
+ *source += 2;
+}
+
+
+
+typedef enum { false = 0, true = 1 } boolean;
+
+/* Number of failure points for which to initially allocate space
+ when matching. If this number is exceeded, we allocate more space---
+ so it is not a hard limit. */
+
+#ifndef INIT_FAILURE_ALLOC
+#define INIT_FAILURE_ALLOC 5
+#endif
/* Specify the precise syntax of regexps for compilation. This provides
for compatibility for various utilities which historically have
different, incompatible syntaxes.
-
+
The argument SYNTAX is a bit-mask comprised of the various bits
defined in regex.h. */
@@ -296,134 +405,380 @@ re_set_syntax (syntax)
int obscure_syntax = 0;
+
+/* Routine used by re_compile_pattern, re_comp and regcomp. */
+
+#ifdef __STDC__
+static char *regex_compile (const char *pattern, const int size,
+ const int syntax, struct re_pattern_buffer *bufp);
+#else
+static char *regex_compile ();
+#endif
+
+
-/* Macros for re_compile_pattern, which is found below these definitions. */
+/* re_compile_pattern takes a regular-expression string and converts it
+ into a buffer full of byte commands for matching.
+
+ PATTERN is the address of the pattern string.
+ SIZE is the length of it.
+
+ BUFP is a struct re_pattern_buffer * whose pertinent fields are
+ mentioned below:
+
+ It has a char * field called BUFFER which points to the
+ space where this routine will put the compiled pattern; the
+ user can either allocate this using malloc (whereupon they
+ should set the long field ALLOCATED to the number of bytes
+ malloced) or set ALLOCATED to 0 and let the routine
+ allocate it. The routine may use realloc to enlarge the
+ buffer space.
+
+ If the user wants to translate all ordinary elements in the
+ compiled pattern, they should set the char * field
+ TRANSLATE to a translate table, otherwise, they should set
+ it to 0.
+
+ The routine sets the int field SYNTAX to the value of the
+ global variable `obscure_syntax'.
+
+ It returns in the long field USED how many bytes long the
+ compiled pattern is.
+
+ It returns 0 in the char field FASTMAP_ACCURATE, on
+ the assumption that the user usually doesn't compile the
+ same pattern twice and that consequently any fastmap in the
+ pattern buffer is inaccurate.
+
+ In the size_t field RE_NSUB, it returns the number of
+ subexpressions it found in PATTERN.
+
+ Returns 0 if the pattern was valid and an error string if it wasn't. */
+
+
+char *
+re_compile_pattern (pattern, size, bufp)
+ const char *pattern;
+ const int size;
+ struct re_pattern_buffer *bufp;
+{
+ bufp->return_default_num_regs = (obscure_syntax & RE_ALLOCATE_REGISTERS) > 0;
+
+ return regex_compile (pattern, size, obscure_syntax, bufp);
+}
+
+
+
+/* Macros for regex_compile. */
#define CHAR_CLASS_MAX_LENGTH 6
-/* Fetch the next character in the uncompiled pattern, translating it if
- necessary. */
+
+/* Fetch the next character in the uncompiled pattern---translating it
+ if necessary. */
+
#define PATFETCH(c) \
- {if (p == pend) goto end_of_pattern; \
- c = * (unsigned char *) p++; \
- if (translate) c = translate[c]; }
+ do {if (p == pend) goto end_of_pattern; \
+ c = * (unsigned char *) p++; \
+ if (translate) \
+ c = translate[c]; \
+ } while (0)
/* Fetch the next character in the uncompiled pattern, with no
translation. */
+
#define PATFETCH_RAW(c) \
- {if (p == pend) goto end_of_pattern; \
- c = * (unsigned char *) p++; }
+ do {if (p == pend) goto end_of_pattern; \
+ c = * (unsigned char *) p++; \
+ } while (0)
#define PATUNFETCH p--
+/* Pattern offset stuff. */
+
+#define INIT_PATTERN_OFFSETS_LIST_SIZE 32
+
+typedef short pattern_offset_type;
+
+typedef struct {
+ pattern_offset_type *offsets;
+ unsigned size;
+ unsigned avail;
+} pattern_offsets_list_type;
+
+#define PATTERN_OFFSETS_LIST_PTR_FULL(pattern_offsets_list_ptr) \
+ (pattern_offsets_list_ptr->avail == pattern_offsets_list_ptr->size)
+
+
+/* Anchor and op list stuff. */
+
+typedef pattern_offsets_list_type anchor_list_type;
+typedef pattern_offsets_list_type op_list_type;
+
+
+
+/* Bits list declaration. An arbitrarily long string of bits. */
+
+typedef struct {
+ unsigned *bits;
+ unsigned size;
+} bits_list_type;
+
+
+/* Bits list macros. See below for routines. */
+
+#define BITS_BLOCK_SIZE (sizeof (unsigned) * BYTEWIDTH)
+#define BITS_BLOCK(position) ((position) / BITS_BLOCK_SIZE)
+#define BITS_MASK(position) (1 << ((position) % BITS_BLOCK_SIZE))
+
+
+/* Initialize BITS_LIST (of type bits_list_type) to have one bits
+ block. Mostly analogous to routine init_bits_list, but, if
+ REGEX_MALLOC is not defined, uses `alloca' instead of `malloc'. This
+ is because using malloc in re_search* or re_match* could cause core
+ leaks when C-g is used in Emacs, plus malloc's slower and causes
+ storage fragmentation. This has to be a macro because the results of
+ `alloca' disappear at the end of the routine it's in. (If for some
+ reason you delete this explanation, please put it in the comment for
+ the failure stack.)
+
+ Return 1 if there's enough memory to do so and 0 if there isn't. */
+
+#define INIT_BITS_LIST(bits_list) \
+ (bits_list.bits = (unsigned *) REGEX_ALLOCATE (sizeof (unsigned)), \
+ bits_list.bits == NULL \
+ ? 0 \
+ : (bits_list.size = BITS_BLOCK_SIZE, \
+ bits_list.bits[0] = 0, \
+ 1))
+
+
+/* Extend BITS_LIST_PTR (of type bits_list_type) by one bits block.
+ Return 1 if there's enough memory to do so and 0 if there isn't.
+ Analogous to routine extend_bits_list, but uses alloca instead of
+ realloc, for reasons stated above in INIT_BITS_LIST's comment.
+
+ Because REGEX_REALLOCATE requires a declaration of `void
+ *destination', so does this. */
+
+
+#define EXTEND_BITS_LIST(bits_list) \
+ (bits_list.bits \
+ = (unsigned *) REGEX_REALLOCATE (bits_list.bits, \
+ bits_list.size / BYTEWIDTH \
+ + BITS_BLOCK_SIZE / BYTEWIDTH), \
+ bits_list.bits == NULL \
+ ? 0 \
+ : (bits_list.size += BITS_BLOCK_SIZE, \
+ bits_list.bits[(bits_list.size/BITS_BLOCK_SIZE) - 1] = 0, \
+ 1))
+
+
+/* Set the bit for a positive POSITION in BITS_LIST_PTR to VALUE, which,
+ in turn, can only be 0 or 1.
+
+ Returns 1 if can set the bit.
+ 0 if ran out of memory allocating (if necessary) room for it.
+ value if the value is invalid (i.e., not 0 or 1).
+
+ Because EXTENT_BITS_LIST requires a declaration of `void
+ *destination', so does this. */
+
+#define SET_BIT_TO_VALUE(bits_list, position, value) \
+ (position > bits_list.size - 1 \
+ && !EXTEND_BITS_LIST (bits_list) \
+ ? 0 \
+ : (value == 1 \
+ ? (bits_list.bits[BITS_BLOCK (position)] \
+ |= BITS_MASK (position), 1) \
+ : (value == 0 \
+ ? (bits_list.bits[BITS_BLOCK (position)] \
+ &= ~(BITS_MASK (position)), 1) \
+ : value) \
+ ))
+
+
+
+/* Compile stack stuff. */
+
+typedef struct {
+ pattern_offset_type laststart_offset;
+ pattern_offset_type fixup_alt_jump;
+ pattern_offset_type regnum;
+ pattern_offset_type begalt_offset;
+} compile_stack_element;
+
+
+typedef struct {
+ compile_stack_element *stack;
+ unsigned size;
+ unsigned avail; /* Offset of next open position. */
+ } compile_stack_type;
+
+
+#define INIT_COMPILE_STACK_SIZE 32
+
+#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
+#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
+
+
/* If the buffer isn't allocated when it comes in, use this. */
-#define INIT_BUF_SIZE 28
+#define INIT_BUF_SIZE 32
/* Make sure we have at least N more bytes of space in buffer. */
#define GET_BUFFER_SPACE(n) \
{ \
- while (b - bufp->buffer + (n) >= bufp->allocated) \
- EXTEND_BUFFER; \
+ while (b - bufp->buffer + (n) > bufp->allocated) \
+ EXTEND_BUFFER \
}
-/* Make sure we have one more byte of buffer space and then add CH to it. */
-#define BUFPUSH(ch) \
- { \
+/* Make sure we have one more byte of buffer space and then add C to it. */
+#define BUF_PUSH(c) \
+ do { \
GET_BUFFER_SPACE (1); \
- *b++ = (char) (ch); \
- }
-
-/* Extend the buffer by twice its current size via reallociation and
- reset the pointers that pointed into the old allocation to point to
- the correct places in the new allocation. If extending the buffer
- results in it being larger than 1 << 16, then flag memory exhausted. */
+ *b++ = (char) (c); \
+ } while (0)
+
+/* Make sure we have two more bytes of buffer space and then add C1 and
+ C2 to it. */
+#define BUF_PUSH_2(c1, c2) \
+ do { \
+ GET_BUFFER_SPACE (2); \
+ *b++ = (char) (c1); \
+ *b++ = (char) (c2); \
+ } while (0)
+
+
+
+#define MAX_BUF_SIZE (1L << 16)
+
+/* Extend the buffer by twice its current size via realloc and
+ reset the pointers that pointed into the old block to point to the
+ correct places in the new one. If extending the buffer results in it
+ being larger than MAX_BUF_SIZE, then flag memory exhausted. */
#define EXTEND_BUFFER \
- { char *old_buffer = bufp->buffer; \
- if (bufp->allocated == (1L<<16)) goto too_big; \
- bufp->allocated *= 2; \
- if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \
+ { \
+ char *old_buffer = bufp->buffer; \
+ if (bufp->allocated == MAX_BUF_SIZE) \
+ goto too_big; \
+ bufp->allocated <<= 1; \
+ if (bufp->allocated > MAX_BUF_SIZE) \
+ bufp->allocated = MAX_BUF_SIZE; \
bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated); \
- if (bufp->buffer == 0) \
+ if (bufp->buffer == NULL) \
goto memory_exhausted; \
b = (b - old_buffer) + bufp->buffer; \
- if (fixup_jump) \
- fixup_jump = (fixup_jump - old_buffer) + bufp->buffer; \
+ begalt = (begalt - old_buffer) + bufp->buffer; \
+ beg_interval = (beg_interval - old_buffer) + bufp->buffer; \
+ if (fixup_alt_jump) \
+ fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer; \
if (laststart) \
laststart = (laststart - old_buffer) + bufp->buffer; \
- begalt = (begalt - old_buffer) + bufp->buffer; \
if (pending_exact) \
pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
}
-/* Set the bit for character C in a character set list. */
+/* Set the bit for character C in a list. */
#define SET_LIST_BIT(c) (b[(c) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
/* Get the next unsigned number in the uncompiled pattern. */
#define GET_UNSIGNED_NUMBER(num) \
- { if (p != pend) \
- { \
- PATFETCH (c); \
- while (isdigit (c)) \
- { \
- if (num < 0) \
- num = 0; \
- num = num * 10 + c - '0'; \
- if (p == pend) \
- break; \
- PATFETCH (c); \
- } \
- } \
- }
+ { if (p != pend) \
+ { \
+ PATFETCH (c); \
+ while (isdigit (c)) \
+ { \
+ if (num < 0) \
+ num = 0; \
+ num = num * 10 + c - '0'; \
+ if (p == pend) \
+ break; \
+ PATFETCH (c); \
+ } \
+ } \
+ }
+
+
+#define DO_RANGE \
+ { \
+ /* Get untranslated range start and end characters. */ \
+ char this_char = p[-2]; \
+ char end; \
+ \
+ if (p == pend) \
+ goto invalid_range_end; \
+ PATFETCH_RAW (end); \
+ if ((syntax & RE_NO_EMPTY_RANGES) && this_char > end) \
+ goto invalid_range_end; \
+ while (this_char <= end) \
+ { \
+ SET_LIST_BIT (translate ? translate[this_char] : this_char); \
+ this_char++; \
+ } \
+ }
+
+
+#define IS_CHAR_CLASS(string) \
+ (strcmp (string, "alpha") == 0 || strcmp (string, "upper") == 0 \
+ || strcmp (string, "lower") == 0 || strcmp (string, "digit") == 0 \
+ || strcmp (string, "alnum") == 0 || strcmp (string, "xdigit") == 0 \
+ || strcmp (string, "space") == 0 || strcmp (string, "print") == 0 \
+ || strcmp (string, "punct") == 0 || strcmp (string, "graph") == 0 \
+ || strcmp (string, "cntrl") == 0) \
+
+
+
+/* Subroutines for regex_compile. */
-/* Subroutines for re_compile_pattern. */
static void store_jump (), insert_jump (), store_jump_n (),
- insert_jump_n (), insert_op_2 ();
+ insert_jump_n (), insert_op_2 (), remove_intervening_anchors (),
+ clear_this_and_higher_levels (), increase_level (),
+ decrease_level (), adjust_pattern_offsets_list ();
-/* re_compile_pattern takes a regular-expression string
- and converts it into a buffer full of byte commands for matching.
+static unsigned record_anchor_position (), init_bits_list (),
+ get_level_match_status (),
+ set_this_level (), set_next_lower_level (),
+ make_group_active (), make_group_inactive (),
+ set_match_status_of_active_groups (),
+ get_group_match_status (), add_op (),
+ init_pattern_offsets_list ();
- PATTERN is the address of the pattern string
- SIZE is the length of it.
- BUFP is a struct re_pattern_buffer * which points to the info
- on where to store the byte commands.
- This structure contains a char * which points to the
- actual space, which should have been obtained with malloc.
- re_compile_pattern may use realloc to grow the buffer space.
+static boolean is_in_compile_stack (), lower_levels_match_nothing (),
+ no_levels_match_anything (), verify_and_adjust_endlines ();
- The number of bytes of commands can be found out by looking in
- the `struct re_pattern_buffer' that bufp pointed to, after
- re_compile_pattern returns. */
-char *
-re_compile_pattern (pattern, size, bufp)
- char *pattern;
- size_t size;
+static char *
+regex_compile (pattern, size, syntax, bufp)
+ const char *pattern;
+ const int size;
+ const int syntax;
struct re_pattern_buffer *bufp;
{
register char *b = bufp->buffer;
- register char *p = pattern;
- char *pend = pattern + size;
+ const char *p = pattern;
+ const char *pend = pattern + size;
register unsigned c, c1;
- char *p1;
+ const char *p1;
unsigned char *translate = (unsigned char *) bufp->translate;
+ boolean enough_memory;
/* Address of the count-byte of the most recently inserted `exactn'
command. This makes it possible to tell whether a new exact-match
character can be added to that command or requires a new `exactn'
command. */
-
+
char *pending_exact = 0;
- /* Address of the place where a forward-jump should go to the end of
+ /* Address of the place where a forward jump should go to the end of
the containing expression. Each alternative of an `or', except the
- last, ends with a forward-jump of this sort. */
+ last, ends with a forward jump of this sort. */
- char *fixup_jump = 0;
+ char *fixup_alt_jump = 0;
/* Address of start of the most recently finished expression.
- This tells postfix * where to find the start of its operand. */
+ This tells, e. g., postfix * where to find the start of its operand. */
char *laststart = 0;
@@ -435,10 +790,10 @@ re_compile_pattern (pattern, size, bufp)
char many_times_ok;
- /* Address of beginning of regexp, or inside of last \(. */
+ /* Address of beginning of regexp, or inside of last group. */
char *begalt = b;
-
+
/* In processing an interval, at least this many matches must be made. */
int lower_bound;
@@ -447,19 +802,8 @@ re_compile_pattern (pattern, size, bufp)
/* Place in pattern (i.e., the {) to which to go back if the interval
is invalid. */
- char *beg_interval = 0;
-
- /* Stack of information saved by \( and restored by \).
- Four stack elements are pushed by each \(:
- First, the value of b.
- Second, the value of fixup_jump.
- Third, the value of regnum.
- Fourth, the value of begalt. */
-
- int stackb[40];
- int *stackp = stackb;
- int *stacke = stackb + 40;
- int *stackt;
+ const char *beg_interval = 0;
+ const char *following_left_brace = 0;
/* Counts \('s as they are encountered. Remembered for the matching \),
where it becomes the register number to put in the stop_memory
@@ -467,7 +811,69 @@ re_compile_pattern (pattern, size, bufp)
int regnum = 1;
+ compile_stack_type compile_stack;
+ anchor_list_type anchor_list;
+
+ /* Keeps track of whether or not the pattern at a given grouping level
+ matches the empty string so far. Each bit in the `bits' field of
+ this variable corresponds to a level, starting at level zero (i.e.,
+ the whole pattern) at the rightmost bit of list[0]. Level 1 is the
+ bit to the left of that, etc. Additional bits that won't fit in
+ bits[0] are in bits[2], bits[3], etc. */
+
+ bits_list_type level_match_status;
+ unsigned current_level = 0;
+
+ /* Does a similar thing for groups that the above variable does for
+ levels. */
+ bits_list_type group_match_status;
+
+ /* Keeps track of whether or not a given group is active. Accessed as
+ is group_match_status. */
+ bits_list_type group_active_status;
+
+ /* Keeps track of operations relevant to detecting valid position of '$'. */
+ op_list_type op_list;
+
+ /* Keeps track of whether or not hit a `$' since the the beginning of
+ the pattern or the last (if any) alternative; if so, then `^' is an
+ ordinary character. */
+
+ boolean had_an_endline = false;
+
+
+ compile_stack.stack
+ = (compile_stack_element *) malloc (INIT_COMPILE_STACK_SIZE
+ * sizeof (compile_stack_element));
+
+ if (compile_stack.stack == NULL)
+ goto memory_exhausted;
+
+ compile_stack.size = INIT_COMPILE_STACK_SIZE;
+ compile_stack.avail = 0;
+
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ if (!init_pattern_offsets_list (&anchor_list,
+ INIT_COMPILE_STACK_SIZE << 1))
+ goto memory_exhausted;
+
+ if (!(init_bits_list (&level_match_status)
+ && init_bits_list (&group_match_status)
+ && init_bits_list (&group_active_status)))
+ goto memory_exhausted;
+
+
+ if (!init_pattern_offsets_list (&op_list, INIT_PATTERN_OFFSETS_LIST_SIZE))
+ goto memory_exhausted;
+
+
+ bufp->syntax = syntax;
bufp->fastmap_accurate = 0;
+ bufp->not_bol = bufp->not_eol = 0;
+
+ /* Always count groups, whether or not bufp->no_sub is set. */
+ bufp->re_nsub = 0;
#ifndef emacs
#ifndef SYNTAX_TABLE
@@ -476,15 +882,21 @@ re_compile_pattern (pattern, size, bufp)
#endif
#endif
+
if (bufp->allocated == 0)
{
bufp->allocated = INIT_BUF_SIZE;
if (bufp->buffer)
- /* EXTEND_BUFFER loses when bufp->allocated is 0. */
- bufp->buffer = (char *) realloc (bufp->buffer, INIT_BUF_SIZE);
+ {
+ /* EXTEND_BUFFER loses when bufp->allocated is 0. This loses if
+ buffer's address is bogus. */
+ bufp->buffer = (char *) realloc (bufp->buffer, INIT_BUF_SIZE);
+ }
else
- /* Caller did not allocate a buffer. Do it for them. */
- bufp->buffer = (char *) malloc (INIT_BUF_SIZE);
+ {
+ /* Caller did not allocate a buffer. Do it for them. */
+ bufp->buffer = (char *) malloc (INIT_BUF_SIZE);
+ }
if (!bufp->buffer) goto memory_exhausted;
begalt = b = bufp->buffer;
}
@@ -494,431 +906,669 @@ re_compile_pattern (pattern, size, bufp)
PATFETCH (c);
switch (c)
- {
- case '$':
- {
- char *p1 = p;
- /* When testing what follows the $,
- look past the \-constructs that don't consume anything. */
- if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
- while (p1 != pend)
- {
- if (*p1 == '\\' && p1 + 1 != pend
- && (p1[1] == '<' || p1[1] == '>'
- || p1[1] == '`' || p1[1] == '\''
-#ifdef emacs
- || p1[1] == '='
-#endif
- || p1[1] == 'b' || p1[1] == 'B'))
- p1 += 2;
- else
- break;
- }
- if (obscure_syntax & RE_TIGHT_VBAR)
- {
- if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p1 != pend)
- goto normal_char;
- /* Make operand of last vbar end before this `$'. */
- if (fixup_jump)
- store_jump (fixup_jump, jump, b);
- fixup_jump = 0;
- BUFPUSH (endline);
- break;
- }
- /* $ means succeed if at end of line, but only in special contexts.
- If validly in the middle of a pattern, it is a normal character. */
-
- if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && p1 != pend)
- goto invalid_pattern;
- if (p1 == pend || *p1 == '\n'
- || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
- || (obscure_syntax & RE_NO_BK_PARENS
- ? *p1 == ')'
- : *p1 == '\\' && p1[1] == ')')
- || (obscure_syntax & RE_NO_BK_VBAR
- ? *p1 == '|'
- : *p1 == '\\' && p1[1] == '|'))
+ {
+ case '$':
+ {
+ if ((syntax & RE_ANCHORS_ONLY_AT_ENDS) && p != pend
+ && (syntax & RE_CONTEXT_INVALID_ANCHORS))
+ goto invalid_pattern;
+
+ if (syntax & RE_TIGHT_ALT)
+ {
+ /* Make operand of last alternation jump to this endline. */
+
+ if (fixup_alt_jump)
+ store_jump (fixup_alt_jump, jump_past_next_alt, b);
+
+ fixup_alt_jump = 0;
+ }
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ if (!record_anchor_position (!COMPILE_STACK_EMPTY,
+ b - bufp->buffer, &anchor_list))
+ goto memory_exhausted;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH ((p != pend && *p == '\n')
+ ? (int) endline_before_newline
+ : (int) endline);
+
+ /* If there's a chance this endline would have to turn into
+ `exactn 1 '$',' have to push dummy ops to make room;
+ can't insert later because would mess up any surrounding
+ jumps. */
+
+ if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
+ && !((syntax & RE_ANCHORS_ONLY_AT_ENDS) && p == pend))
{
- BUFPUSH (endline);
- break;
- }
- goto normal_char;
+ laststart = b - 1;
+ BUF_PUSH_2 (no_op, no_op);
+ }
+
+ had_an_endline = true;
+ break;
}
- case '^':
- /* ^ means succeed if at beg of line, but only if no preceding
- pattern. */
+
+ case '^':
+ /* If change anything in this case, have to change analogous
+ code in *endline* (yes, endline---because the routine goes
+ backwards through the pattern) case of the routine
+ verify_and_adjust_endlines. */
- if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && laststart)
- goto invalid_pattern;
- if (laststart && p - 2 >= pattern && p[-2] != '\n'
- && !(obscure_syntax & RE_CONTEXT_INDEP_OPS))
- goto normal_char;
- if (obscure_syntax & RE_TIGHT_VBAR)
- {
- if (p != pattern + 1
- && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
- goto normal_char;
- BUFPUSH (begline);
- begalt = b;
- }
- else
- BUFPUSH (begline);
- break;
+ /* ^ means match the beginning of a string. If
+ RE_CONTEXT_INDEP_ANCHORS is set, then it represents the
+ match-beginning-of-line operator anywhere in the regular
+ expression.
+
+ If that bit isn't set, then it represents the
+ match-beginning-of-line operator in leading positions and
+ matches itself in other positions (unless it's invalid
+ there). */
+
+ /* If the '^' must be at the pattern's beginning or else is
+ in a leading position. */
+
+ if (((syntax & RE_ANCHORS_ONLY_AT_ENDS)
+ || (syntax & RE_TIGHT_ALT))
+ ? p - 1 == pattern
+
+ /* If just after a newline, or... */
+ : ((p - 2 >= pattern && p[-2] == '\n')
+
+ /* ...no levels match anything, then in a leading position. */
+
+ || no_levels_match_anything (level_match_status)))
+ {
+ if (had_an_endline)
+ goto normal_char;
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ if (!record_anchor_position (!COMPILE_STACK_EMPTY,
+ b - bufp->buffer, &anchor_list))
+ goto memory_exhausted;
+
+ }
+
+ else if (syntax & RE_CONTEXT_INVALID_ANCHORS)
+ goto invalid_pattern;
+
+ /* If not just after a newline and not always supposed to be
+ an anchor, consider it a ordinary character. */
+
+ else if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
+ && ((syntax & RE_ANCHORS_ONLY_AT_ENDS)
+ /* To make, e.g., `^(^a)' match `^a'. */
+ ? p - 1 != pattern
+ : (int)laststart))
+ goto normal_char;
- case '+':
- case '?':
- if ((obscure_syntax & RE_BK_PLUS_QM)
- || (obscure_syntax & RE_LIMITED_OPS))
- goto normal_char;
- handle_plus:
- case '*':
- /* If there is no previous pattern, char not special. */
- if (!laststart)
+ if (syntax & RE_TIGHT_ALT)
+ {
+ if (p != pattern + 1 && !(syntax & RE_CONTEXT_INDEP_ANCHORS))
+ goto normal_char;
+
+ BUF_PUSH (begline);
+ begalt = b; /* Make alternative begin after the '^'. */
+ }
+ else
+ BUF_PUSH (begline);
+
+ break;
+
+ case '+':
+ case '?':
+ if ((syntax & RE_BK_PLUS_QM)
+ || (syntax & RE_LIMITED_OPS))
+ goto normal_char;
+ handle_plus:
+ case '*':
+ /* If there is no previous pattern... */
+ if (!laststart)
{
- if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
- goto invalid_pattern;
- else if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
- goto normal_char;
+ if (syntax & RE_CONTEXT_INVALID_OPS)
+ goto missing_preceding_re;
+ else if (!(syntax & RE_CONTEXT_INDEP_OPS))
+ goto normal_char;
}
- /* If there is a sequence of repetition chars,
- collapse it down to just one. */
- zero_times_ok = 0;
- many_times_ok = 0;
- while (1)
- {
- zero_times_ok |= c != '+';
- many_times_ok |= c != '?';
- if (p == pend)
- break;
- PATFETCH (c);
- if (c == '*')
- ;
- else if (!(obscure_syntax & RE_BK_PLUS_QM)
+
+ if ((syntax & RE_REPEATED_ANCHORS_AWAY)
+ && (enum regexpcode) *laststart == start_memory)
+ remove_intervening_anchors (laststart, b, anchor_list, bufp);
+
+ /* If there is a sequence of repetition chars, collapse it
+ down to just one. We can't combine interval operators with
+ these because we'd incorrect behavior for, e.g., `a{2}*',
+ which should only match an even number of `a's. */
+
+ zero_times_ok = 0;
+ many_times_ok = 0;
+
+ while (1)
+ {
+ zero_times_ok |= c != '+';
+ many_times_ok |= c != '?';
+
+ if (p == pend)
+ break;
+
+ PATFETCH (c);
+
+ if (c == '*')
+ {
+ if (syntax & RE_NO_CONSECUTIVE_REPEATS)
+ goto invalid_preceding_re;
+ }
+ else if (!(syntax & RE_BK_PLUS_QM)
&& (c == '+' || c == '?'))
- ;
- else if ((obscure_syntax & RE_BK_PLUS_QM)
+ {
+ if (syntax & RE_NO_CONSECUTIVE_REPEATS)
+ goto invalid_preceding_re;
+ }
+ else if ((syntax & RE_BK_PLUS_QM)
&& c == '\\')
{
- int c1;
- PATFETCH (c1);
- if (!(c1 == '+' || c1 == '?'))
+ if (p == pend)
+ goto trailing_backslash;
+
+ PATFETCH (c1);
+
+ if (!(c1 == '+' || c1 == '?'))
{
PATUNFETCH;
PATUNFETCH;
break;
}
- c = c1;
+
+ if (syntax & RE_NO_CONSECUTIVE_REPEATS)
+ goto invalid_preceding_re;
+
+ c = c1;
}
else
{
PATUNFETCH;
break;
}
- }
+ }
/* Star, etc. applied to an empty pattern is equivalent
to an empty pattern. */
- if (!laststart)
+ if (!laststart)
break;
- /* Now we know whether or not zero matches is allowed
+ /* Now we know whether or not zero matches is allowed
and also whether or not two or more matches is allowed. */
- if (many_times_ok)
- {
- /* If more than one repetition is allowed, put in at the
- end a backward relative jump from b to before the next
- jump we're going to put in below (which jumps from
- laststart to after this jump). */
- GET_BUFFER_SPACE (3);
- store_jump (b, maybe_finalize_jump, laststart - 3);
- b += 3; /* Because store_jump put stuff here. */
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ /* If more than one repetition is allowed, put in at the
+ end a backward relative jump from b to before the next jump
+ we're going to put in below (which jumps from laststart to
+ after this jump). */
+
+ if (many_times_ok)
+ {
+ GET_BUFFER_SPACE (3);
+ store_jump (b, maybe_pop_jump, laststart - 3);
+ b += 3; /* Because store_jump puts stuff here. */
}
+ /* Otherwise, put in a no_op so verify_and_adjust_endlines can
+ detect that, e.g., a preceding `$' is not an anchor. */
+ else
+ BUF_PUSH (no_op);
+
+
/* On failure, jump from laststart to b + 3, which will be the
end of the buffer after this jump is inserted. */
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ adjust_pattern_offsets_list (3, laststart - bufp->buffer,
+ &anchor_list);
+
+ adjust_pattern_offsets_list (3, laststart - bufp->buffer, &op_list);
GET_BUFFER_SPACE (3);
- insert_jump (on_failure_jump, laststart, b + 3, b);
- pending_exact = 0;
+ insert_jump (on_failure_jump, laststart, b + 3, b);
+ pending_exact = 0;
b += 3;
+
if (!zero_times_ok)
{
/* At least one repetition is required, so insert a
- dummy-failure before the initial on-failure-jump
+ dummy_failure before the initial on_failure_jump
instruction of the loop. This effects a skip over that
instruction the first time we hit that loop. */
- GET_BUFFER_SPACE (6);
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ adjust_pattern_offsets_list (3, laststart - bufp->buffer,
+ &anchor_list);
+
+ adjust_pattern_offsets_list (3, laststart - bufp->buffer,
+ &op_list);
+ GET_BUFFER_SPACE (3);
insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
b += 3;
- }
+ }
break;
case '.':
laststart = b;
- BUFPUSH (anychar);
- break;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH (anychar);
+
+ if (!set_this_level (&level_match_status, current_level)
+ || !set_match_status_of_active_groups (group_active_status,
+ &group_match_status))
+ goto memory_exhausted;
+
+ break;
case '[':
- if (p == pend)
- goto invalid_pattern;
- while (b - bufp->buffer
- > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
- EXTEND_BUFFER;
+ {
+ unsigned just_had_a_char_class = 0;
+
+ if (p == pend)
+ goto unmatched_left_bracket;
- laststart = b;
- if (*p == '^')
- {
- BUFPUSH (charset_not);
- p++;
- }
- else
- BUFPUSH (charset);
- p1 = p;
+ while (b - bufp->buffer
+ > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
+ EXTEND_BUFFER;
- BUFPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
- /* Clear the whole map */
- memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
-
- if ((obscure_syntax & RE_HAT_NOT_NEWLINE) && b[-2] == charset_not)
- SET_LIST_BIT ('\n');
+ laststart = b;
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
- /* Read in characters and ranges, setting map bits. */
- while (1)
- {
- /* Don't translate while fetching, in case it's a range bound.
- When we set the bit for the character, we translate it. */
- PATFETCH_RAW (c);
+ if (*p == '^')
+ {
+ BUF_PUSH (charset_not);
+ p++;
+ }
+ else
+ BUF_PUSH (charset);
- /* If set, \ escapes characters when inside [...]. */
- if ((obscure_syntax & RE_AWK_CLASS_HACK) && c == '\\')
- {
- PATFETCH(c1);
- SET_LIST_BIT (c1);
- continue;
- }
- if (c == ']')
- {
- if (p == p1 + 1)
- {
- /* If this is an empty bracket expression. */
- if ((obscure_syntax & RE_NO_EMPTY_BRACKETS)
- && p == pend)
- goto invalid_pattern;
- }
- else
- /* Stop if this isn't merely a ] inside a bracket
- expression, but rather the end of a bracket
- expression. */
- break;
- }
- /* Get a range. */
- if (p[0] == '-' && p[1] != ']')
- {
- PATFETCH (c1);
- /* Don't translate the range bounds while fetching them. */
- PATFETCH_RAW (c1);
+ /* Remember the first position in the bracket expression. */
+ p1 = p;
+
+ BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
+ /* Clear the whole map */
+ bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
+
+ if ((syntax & RE_HAT_LISTS_NOT_NEWLINE)
+ && (enum regexpcode) b[-2] == charset_not)
+ SET_LIST_BIT ('\n');
+
+
+ /* Read in characters and ranges, setting map bits. */
+ while (1)
+ {
+ if (p == pend)
+ goto unmatched_left_bracket;
+
+ PATFETCH (c);
+
+
+ /* If set, \ escapes characters when inside [...]. */
+ if ((syntax & RE_AWK_CLASS_HACK) && c == '\\')
+ {
+ if (p == pend)
+ goto trailing_backslash;
+
+ PATFETCH(c1);
+ SET_LIST_BIT (c1);
+ continue;
+ }
+ /* Could be the end of the bracket expression. If it's
+ not (i.e., when the bracket expression is `[]' so
+ far), the ']' character bit gets set way below. */
+
+ if (c == ']' && p != p1 + 1)
+ break;
+
+
+ /* Look ahead to see if it's a range when the last thing
+ was a character class. */
+
+ if (just_had_a_char_class && c == '-' && *p != ']')
+ goto invalid_range_end;
+
+ /* Look ahead to see if it's a range when the last thing
+ was a character: if this is a hyphen not at the
+ beginning or the end of a list, then it's the range
+ operator. */
+
+ if (c == '-'
+ && !(p - 2 >= pattern && p[-2] == '[')
+ && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
+ && *p != ']')
+ {
+ DO_RANGE;
+ }
- if ((obscure_syntax & RE_NO_EMPTY_RANGES) && c > c1)
- goto invalid_pattern;
+ else if (p[0] == '-' && p[1] != ']')
+ {
+ /* This handles ranges made up of characters only. */
+ PATFETCH (c1); /* The `-'. */
+ DO_RANGE;
+ }
+
+ /* See if we're at the beginning of a possible character
+ class. */
+
+ else if ((syntax & RE_CHAR_CLASSES)
+ && c == '[' && p[0] == ':')
+ {
+ /* Longest valid character class word has six chars. */
+ char str[CHAR_CLASS_MAX_LENGTH];
- if ((obscure_syntax & RE_NO_HYPHEN_RANGE_END)
- && c1 == '-' && *p != ']')
- goto invalid_pattern;
+ PATFETCH (c);
+ c1 = 0;
+
+ /* If pattern is `[[:'. */
+ if (p == pend)
+ goto unmatched_left_bracket;
+
+ while (1)
+ {
+ /* Don't translate the ``character class''
+ characters. */
+ PATFETCH_RAW (c);
+ if (c == ':' || c == ']' || p == pend
+ || c1 == CHAR_CLASS_MAX_LENGTH)
+ break;
+ str[c1++] = c;
+ }
+ str[c1] = '\0';
- while (c <= c1)
- {
- /* Translate each char that's in the range. */
- if (translate)
- SET_LIST_BIT (translate[c]);
- else
- SET_LIST_BIT (c);
- c++;
- }
- }
- else if ((obscure_syntax & RE_CHAR_CLASSES)
- && c == '[' && p[0] == ':')
- {
- /* Longest valid character class word has six characters. */
- char str[CHAR_CLASS_MAX_LENGTH];
- PATFETCH (c);
- c1 = 0;
- /* If no ] at end. */
- if (p == pend)
- goto invalid_pattern;
- while (1)
- {
- /* Don't translate the ``character class'' characters. */
- PATFETCH_RAW (c);
- if (c == ':' || c == ']' || p == pend
- || c1 == CHAR_CLASS_MAX_LENGTH)
- break;
- str[c1++] = c;
- }
- str[c1] = '\0';
- if (p == pend
- || c == ']' /* End of the bracket expression. */
- || p[0] != ']'
- || p + 1 == pend
- || (strcmp (str, "alpha") != 0
- && strcmp (str, "upper") != 0
- && strcmp (str, "lower") != 0
- && strcmp (str, "digit") != 0
- && strcmp (str, "alnum") != 0
- && strcmp (str, "xdigit") != 0
- && strcmp (str, "space") != 0
- && strcmp (str, "print") != 0
- && strcmp (str, "punct") != 0
- && strcmp (str, "graph") != 0
- && strcmp (str, "cntrl") != 0))
- {
- /* Undo the ending character, the letters, and leave
- the leading : and [ (but set bits for them). */
- c1++;
- while (c1--)
- PATUNFETCH;
- SET_LIST_BIT ('[');
- SET_LIST_BIT (':');
- }
- else
- {
- /* The ] at the end of the character class. */
- PATFETCH (c);
- if (c != ']')
- goto invalid_pattern;
- for (c = 0; c < (1 << BYTEWIDTH); c++)
- {
- if ((strcmp (str, "alpha") == 0 && isalpha (c))
- || (strcmp (str, "upper") == 0 && isupper (c))
- || (strcmp (str, "lower") == 0 && islower (c))
- || (strcmp (str, "digit") == 0 && isdigit (c))
- || (strcmp (str, "alnum") == 0 && isalnum (c))
- || (strcmp (str, "xdigit") == 0 && isxdigit (c))
- || (strcmp (str, "space") == 0 && isspace (c))
- || (strcmp (str, "print") == 0 && isprint (c))
- || (strcmp (str, "punct") == 0 && ispunct (c))
- || (strcmp (str, "graph") == 0 && isgraph (c))
- || (strcmp (str, "cntrl") == 0 && iscntrl (c)))
- SET_LIST_BIT (c);
- }
- }
- }
- else if (translate)
- SET_LIST_BIT (translate[c]);
- else
- SET_LIST_BIT (c);
- }
+ /* If isn't a word bracketed by `[:' and:`]':
+ undo the ending character, the letters, and leave
+ the leading `:' and `[' (but set bits for them). */
+
+ if (c == ':' && p[0] == ']')
+ {
+ if (!IS_CHAR_CLASS (str))
+ goto invalid_char_class;
+
+ /* The ] at the end of the character class. */
+ PATFETCH (c);
+
+ if (p == pend)
+ goto unmatched_left_bracket;
+
+ for (c = 0; c < (1 << BYTEWIDTH); c++)
+ {
+ if ((strcmp (str, "alpha") == 0 && isalpha (c))
+ || (strcmp (str, "upper") == 0 && isupper (c))
+ || (strcmp (str, "lower") == 0 && islower (c))
+ || (strcmp (str, "digit") == 0 && isdigit (c))
+ || (strcmp (str, "alnum") == 0 && isalnum (c))
+ || (strcmp (str, "xdigit") == 0 && isxdigit (c))
+ || (strcmp (str, "space") == 0 && isspace (c))
+ || (strcmp (str, "print") == 0 && isprint (c))
+ || (strcmp (str, "punct") == 0 && ispunct (c))
+ || (strcmp (str, "graph") == 0 && isgraph (c))
+ || (strcmp (str, "cntrl") == 0 && iscntrl (c)))
+ SET_LIST_BIT (c);
+ }
+ just_had_a_char_class = 1;
+ }
+ else
+ {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ SET_LIST_BIT ('[');
+ SET_LIST_BIT (':');
+ just_had_a_char_class = 0;
+ }
+ }
+ else
+ {
+ just_had_a_char_class = 0;
+ SET_LIST_BIT (c);
+ }
+ }
+
+ /* Discard any (non)matching list bytes that are all 0 at the
+ end of the map. Decrement the map-length byte too. */
+
+ while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
+ b[-1]--;
+ b += b[-1];
+ }
+
+ if (!set_this_level (&level_match_status, current_level)
+ || !set_match_status_of_active_groups (group_active_status,
+ &group_match_status))
+ goto memory_exhausted;
- /* Discard any character set/class bitmap bytes that are all
- 0 at the end of the map. Decrement the map-length byte too. */
- while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
- b[-1]--;
- b += b[-1];
break;
- case '(':
- if (! (obscure_syntax & RE_NO_BK_PARENS))
+
+ case '(':
+ if (!(syntax & RE_NO_BK_PARENS))
goto normal_char;
else
goto handle_open;
case ')':
- if (! (obscure_syntax & RE_NO_BK_PARENS))
+ if (! (syntax & RE_NO_BK_PARENS))
goto normal_char;
else
goto handle_close;
case '\n':
- if (! (obscure_syntax & RE_NEWLINE_OR))
+ if (! (syntax & RE_NEWLINE_ALT))
goto normal_char;
else
goto handle_bar;
case '|':
- if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
- && (! laststart || p == pend))
- goto invalid_pattern;
- else if (! (obscure_syntax & RE_NO_BK_VBAR))
+ if (!(syntax & RE_NO_BK_VBAR))
goto normal_char;
else
goto handle_bar;
case '{':
- if (! ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
- && (obscure_syntax & RE_INTERVALS)))
- goto normal_char;
- else
+ if ((syntax & RE_NO_BK_BRACES)
+ && (syntax & RE_INTERVALS))
goto handle_interval;
+ else
+ goto normal_char;
case '\\':
- if (p == pend) goto invalid_pattern;
- PATFETCH_RAW (c);
+ if (p == pend)
+ goto trailing_backslash;
+
+ PATFETCH_RAW (c);
switch (c)
{
- case '(':
- if (obscure_syntax & RE_NO_BK_PARENS)
+ case '(':
+ if (syntax & RE_NO_BK_PARENS)
goto normal_backsl;
- handle_open:
- if (stackp == stacke) goto nesting_too_deep;
+ handle_open:
+ bufp->re_nsub++;
+ increase_level (&current_level);
- /* Laststart should point to the start_memory that we are about
- to push (unless the pattern has RE_NREGS or more ('s). */
- *stackp++ = b - bufp->buffer;
- if (regnum < RE_NREGS)
- {
- BUFPUSH (start_memory);
- BUFPUSH (regnum);
- }
- *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
- *stackp++ = regnum++;
- *stackp++ = begalt - bufp->buffer;
- fixup_jump = 0;
+ if (!make_group_active (&group_active_status, regnum))
+ goto memory_exhausted;
+
+ if (syntax & RE_NO_EMPTY_GROUPS)
+ {
+ p1 = p;
+ if (*p1 == '^') p1++;
+ if (*p1 == '$') p1++;
+ if (!(syntax & RE_NO_BK_PARENS) && *p1 == '\\') p1++;
+
+ /* If found an empty group... */
+ if (*p1 == ')')
+ goto invalid_pattern;
+ }
+
+ /* Value to restore in laststart when hit end of this
+ group; should point to the start_memory that we are
+ about to push. */
+
+ if (COMPILE_STACK_FULL)
+ {
+ compile_stack.stack = (compile_stack_element *)
+ realloc (compile_stack.stack,
+ (compile_stack.size << 1)
+ * sizeof (compile_stack_element));
+
+ if (compile_stack.stack == NULL)
+ goto memory_exhausted;
+
+ compile_stack.size <<= 1;
+ }
+
+ compile_stack.stack[compile_stack.avail].laststart_offset
+ = b - bufp->buffer;
+ compile_stack.stack[compile_stack.avail].fixup_alt_jump
+ = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
+ compile_stack.stack[compile_stack.avail].regnum = regnum;
+ compile_stack.stack[compile_stack.avail].begalt_offset
+ = begalt - bufp->buffer;
+ compile_stack.avail++;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH_2 (start_memory, regnum);
+ regnum++;
+ fixup_alt_jump = 0;
laststart = 0;
begalt = b;
- break;
+ break;
case ')':
- if (obscure_syntax & RE_NO_BK_PARENS)
+ if (syntax & RE_NO_BK_PARENS)
goto normal_backsl;
- handle_close:
- if (stackp == stackb) goto unmatched_close;
- begalt = *--stackp + bufp->buffer;
- if (fixup_jump)
- store_jump (fixup_jump, jump, b);
- if (stackp[-1] < RE_NREGS)
- {
- BUFPUSH (stop_memory);
- BUFPUSH (stackp[-1]);
- }
- stackp -= 2;
- fixup_jump = *stackp ? *stackp + bufp->buffer - 1 : 0;
- laststart = *--stackp + bufp->buffer;
+
+ if (COMPILE_STACK_EMPTY)
+ if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
+ goto normal_backsl;
+ else
+ goto unmatched_close;
+
+ handle_close:
+ if (fixup_alt_jump)
+ store_jump (fixup_alt_jump, jump_past_next_alt, b);
+
+ /* See similar code for backslashed parens above. */
+
+ if (COMPILE_STACK_EMPTY)
+ if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
+ goto normal_char;
+ else
+ goto unmatched_close;
+
+ if (get_level_match_status (level_match_status, current_level))
+ if (!set_next_lower_level (&level_match_status, current_level))
+ goto memory_exhausted;
+
+ /* Only call these if know you have a matched close. */
+ decrease_level (&current_level);
+ make_group_inactive (&group_active_status, regnum);
+
+ compile_stack.avail--;
+ begalt
+ = compile_stack.stack[compile_stack.avail].begalt_offset
+ + bufp->buffer;
+ laststart
+ = (compile_stack.stack[compile_stack.avail].laststart_offset
+ + bufp->buffer);
+
+ fixup_alt_jump = compile_stack.stack[compile_stack.avail].fixup_alt_jump
+ ? compile_stack.stack[compile_stack.avail]
+ .fixup_alt_jump + bufp->buffer - 1
+ : 0;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH_2 (stop_memory,
+ compile_stack.stack[compile_stack.avail].regnum);
break;
- case '|':
- if ((obscure_syntax & RE_LIMITED_OPS)
- || (obscure_syntax & RE_NO_BK_VBAR))
+ case '|': /* `\|'. */
+ if ((syntax & RE_LIMITED_OPS)
+ || (syntax & RE_NO_BK_VBAR))
goto normal_backsl;
handle_bar:
- if (obscure_syntax & RE_LIMITED_OPS)
+ if (syntax & RE_LIMITED_OPS)
goto normal_char;
- /* Insert before the previous alternative a jump which
+
+ /* Disallow empty alternatives if RE_NO_EMPTY_ALTS is set.
+ Caveat: can't detect if the vbar is followed by a
+ trailing '$' yet, unless it's the last thing in a
+ pattern; the routine for verifying endlines has to do
+ the rest. */
+
+ if ((syntax & RE_NO_EMPTY_ALTS)
+ && (!laststart || p == pend
+ || (*p == '$' && p + 1 == pend)
+ || ((syntax & RE_NO_BK_PARENS)
+ ? (p < pend && *p == ')')
+ : (p + 1 < pend && p[0] == '\\' && p[1] == ')'))))
+ goto invalid_pattern;
+
+
+ /* Clear some variables. */
+
+ if (lower_levels_match_nothing (level_match_status,
+ current_level))
+ clear_this_and_higher_levels (&level_match_status,
+ current_level);
+ had_an_endline = false;
+
+
+ /* Insert before the previous alternative a jump which
jumps to this alternative if the former fails. */
- GET_BUFFER_SPACE (6);
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ adjust_pattern_offsets_list (3, begalt - bufp->buffer,
+ &anchor_list);
+
+ adjust_pattern_offsets_list (3, begalt - bufp->buffer, &op_list);
+ GET_BUFFER_SPACE (3);
insert_jump (on_failure_jump, begalt, b + 6, b);
pending_exact = 0;
b += 3;
- /* The alternative before the previous alternative has a
- jump after it which gets executed if it gets matched.
- Adjust that jump so it will jump to the previous
- alternative's analogous jump (put in below, which in
- turn will jump to the next (if any) alternative's such
- jump, etc.). The last such jump jumps to the correct
- final destination. */
- if (fixup_jump)
- store_jump (fixup_jump, jump, b);
+
+ /* The alternative before this one has a jump after it
+ which gets executed if it gets matched. Adjust that
+ jump so it will jump to this alternative's analogous
+ jump (put in below, which in turn will jump to the next
+ (if any) alternative's such jump, etc.). The last such
+ jump jumps to the correct final destination. A picture:
+ _____ _____
+ | | | |
+ | v | v
+ a | b | c
+
+ If we are at `b,' then fixup_alt_jump right now points to a
+ three-byte space after `a.' We'll put in the jump, set
+ fixup_alt_jump to right after `b,' and leave behind three
+ bytes which we'll fill in when we get to after `c.' */
+
+ if (fixup_alt_jump)
+ store_jump (fixup_alt_jump, jump_past_next_alt, b);
- /* Leave space for a jump after previous alternative---to be
- filled in later. */
- fixup_jump = b;
+ /* Mark and leave space for a jump after this alternative
+ ---to be filled in later either by next alternative or
+ when know we're at the end of a series of alternatives. */
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ fixup_alt_jump = b;
+ GET_BUFFER_SPACE (3);
b += 3;
laststart = 0;
@@ -926,93 +1576,167 @@ re_compile_pattern (pattern, size, bufp)
break;
case '{':
- if (! (obscure_syntax & RE_INTERVALS)
- /* Let \{ be a literal. */
- || ((obscure_syntax & RE_INTERVALS)
- && (obscure_syntax & RE_NO_BK_CURLY_BRACES))
- /* If it's the string "\{". */
- || (p - 2 == pattern && p == pend))
+ /* If \{ is a literal. */
+ if (!(syntax & RE_INTERVALS)
+ /* If we're at a "\{" and it's not the open-interval
+ operator. */
+ || ((syntax & RE_INTERVALS)
+ && (syntax & RE_NO_BK_BRACES))
+ || (p - 2 == pattern && p == pend))
goto normal_backsl;
+
handle_interval:
- beg_interval = p - 1; /* The {. */
- /* If there is no previous pattern, this isn't an interval. */
- if (!laststart)
- {
- if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
- goto invalid_pattern;
- else
- goto normal_backsl;
- }
- /* It also isn't an interval if not preceded by an re
- matching a single character or subexpression, or if
- the current type of intervals can't handle back
- references and the previous thing is a back reference. */
- if (! (*laststart == anychar
- || *laststart == charset
- || *laststart == charset_not
- || *laststart == start_memory
- || (*laststart == exactn && laststart[1] == 1)
- || (! (obscure_syntax & RE_NO_BK_REFS)
- && *laststart == duplicate)))
- {
- if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
- goto normal_char;
-
- /* Posix extended syntax is handled in previous
- statement; this is for Posix basic syntax. */
- if (obscure_syntax & RE_INTERVALS)
- goto invalid_pattern;
-
- goto normal_backsl;
- }
+ /* If got here, then intervals must be allowed. */
+
+ beg_interval = p - 1; /* The `{'. */
+ following_left_brace = 0;
lower_bound = -1; /* So can see if are set. */
upper_bound = -1;
+
+ if (p == pend)
+ {
+ if (syntax & RE_NO_BK_BRACES)
+ goto unfetch_interval;
+ else
+ goto unmatched_left_curly_brace;
+ }
+
GET_UNSIGNED_NUMBER (lower_bound);
- if (c == ',')
+
+ if (c == ',')
{
- GET_UNSIGNED_NUMBER (upper_bound);
+ GET_UNSIGNED_NUMBER (upper_bound);
if (upper_bound < 0)
upper_bound = RE_DUP_MAX;
}
+
if (upper_bound < 0)
upper_bound = lower_bound;
- if (! (obscure_syntax & RE_NO_BK_CURLY_BRACES))
+
+ if (lower_bound < 0 || upper_bound > RE_DUP_MAX
+ || lower_bound > upper_bound)
+ {
+ if (syntax & RE_NO_BK_BRACES)
+ goto unfetch_interval;
+ else
+ goto invalid_braces_content;
+ }
+
+ if (!(syntax & RE_NO_BK_BRACES))
{
if (c != '\\')
- goto invalid_pattern;
+ goto unmatched_left_curly_brace;
+
PATFETCH (c);
}
- if (c != '}' || lower_bound < 0 || upper_bound > RE_DUP_MAX
- || lower_bound > upper_bound
- || ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
- && p != pend && *p == '{'))
- {
- if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
+
+ if (c != '}')
+ {
+ if (syntax & RE_NO_BK_BRACES)
goto unfetch_interval;
- else
- goto invalid_pattern;
- }
+ else
+ goto invalid_braces_content;
+ }
+
- /* If upper_bound is zero, don't want to succeed at all;
+ /* Parsed a valid interval, but if an interval can't
+ operate on another repetition operator, check that what
+ follows isn't one. */
+
+ if ((syntax & RE_NO_CONSECUTIVE_REPEATS) && p != pend)
+ {
+ if (*p == '*' || *p == '+' || *p == '?')
+ goto invalid_preceding_re;
+
+ if (syntax & RE_NO_BK_BRACES)
+ {
+ if (*p == '{')
+ {
+ /* Close but not exactly as above. */
+
+ int lower_bound = -1;
+ int upper_bound = -1;
+
+ following_left_brace = p++;
+ GET_UNSIGNED_NUMBER (lower_bound);
+
+ if (c == ',')
+ {
+ GET_UNSIGNED_NUMBER (upper_bound);
+ if (upper_bound < 0)
+ upper_bound = RE_DUP_MAX;
+ }
+
+ if (upper_bound < 0)
+ upper_bound = lower_bound;
+
+ /* If not a valid interval, then we don't have
+ an interval operating on another one; what
+ we have instead is a series match-self ops
+ starting with a '{'. */
+
+ if (lower_bound < 0 || upper_bound > RE_DUP_MAX
+ || lower_bound > upper_bound || c != '}')
+ {
+ /* Back up to '{' so can use again
+ put it in C, as the normal_char label
+ code expects that; will go to that
+ label after putting the preceding valid
+ interval in the buffer. */
+
+ p = following_left_brace;
+ PATFETCH (c);
+ }
+ else
+ goto invalid_preceding_re;
+ }
+ }
+ else if (p[0] == '\\' && p[1] == '{')
+ goto invalid_preceding_re;
+ }
+
+
+ /* We just parsed a valid interval. */
+
+ /* If it's invalid to have no preceding re. */
+ if (!laststart)
+ {
+ if (syntax & RE_CONTEXT_INVALID_OPS)
+ goto missing_preceding_re;
+ else if (syntax & RE_CONTEXT_INDEP_OPS)
+ laststart = b;
+ else
+ goto unfetch_interval;
+ }
+ else if ((syntax & RE_REPEATED_ANCHORS_AWAY)
+ && (enum regexpcode) *laststart == start_memory)
+ remove_intervening_anchors (laststart, b, anchor_list, bufp);
+
+ /* If upper_bound is zero, don't want to succeed at all;
jump from laststart to b + 3, which will be the end of
the buffer after this jump is inserted. */
if (upper_bound == 0)
{
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ adjust_pattern_offsets_list (3, laststart - bufp->buffer,
+ &anchor_list);
+
+ adjust_pattern_offsets_list (3, laststart - bufp->buffer,
+ &op_list);
GET_BUFFER_SPACE (3);
- insert_jump (jump, laststart, b + 3, b);
+ insert_jump (no_pop_jump, laststart, b + 3, b);
b += 3;
}
/* Otherwise, after lower_bound number of succeeds, jump
- to after the jump_n which will be inserted at the end
- of the buffer, and insert that jump_n. */
+ to after the no_pop_jump_n which will be inserted at the end
+ of the buffer, and insert that no_pop_jump_n. */
else
{ /* Set to 5 if only one repetition is allowed and
- hence no jump_n is inserted at the current end of
- the buffer; then only space for the succeed_n is
- needed. Otherwise, need space for both the
- succeed_n and the jump_n. */
+ hence no no_pop_jump_n is inserted at the current
+ end of the buffer. Otherwise, need 10 bytes total
+ for the succeed_n and the no_pop_jump_n. */
unsigned slots_needed = upper_bound == 1 ? 5 : 10;
@@ -1021,37 +1745,69 @@ re_compile_pattern (pattern, size, bufp)
be set by its attendant set_number_at, because
re_compile_fastmap will need to know it. Jump to
what the end of buffer will be after inserting
- this succeed_n and possibly appending a jump_n. */
+ this succeed_n and possibly appending a
+ no_pop_jump_n. */
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ adjust_pattern_offsets_list (5, laststart - bufp->buffer,
+ &anchor_list);
+
+ adjust_pattern_offsets_list (5, laststart - bufp->buffer,
+ &op_list);
insert_jump_n (succeed_n, laststart, b + slots_needed,
b, lower_bound);
b += 5; /* Just increment for the succeed_n here. */
- /* More than one repetition is allowed, so put in at
+
+ /* More than one repetition is allowed, so put in at
the end of the buffer a backward jump from b to the
succeed_n we put in above. By the time we've gotten
to this jump when matching, we'll have matched once
already, so jump back only upper_bound - 1 times. */
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
if (upper_bound > 1)
{
- store_jump_n (b, jump_n, laststart, upper_bound - 1);
+ store_jump_n (b, no_pop_jump_n, laststart,
+ upper_bound - 1);
b += 5;
/* When hit this when matching, reset the
- preceding jump_n's n to upper_bound - 1. */
- BUFPUSH (set_number_at);
- GET_BUFFER_SPACE (2);
+ preceding no_pop_jump_n's n to upper_bound - 1. */
+
+ BUF_PUSH (set_number_at);
+
+ /* Only need to get space for the numbers. */
+ GET_BUFFER_SPACE (4);
STORE_NUMBER_AND_INCR (b, -5);
STORE_NUMBER_AND_INCR (b, upper_bound - 1);
}
- /* When hit this when matching, set the succeed_n's n. */
+ /* Otherwise, put in a no_op, so verify_and_adjust_endlines
+ can detect, e.g., a preceding `$' is not an anchor. */
+ else
+ BUF_PUSH (no_op);
+
+
+ /* When hit this when matching, set the succeed_n's n. */
+
+ if (syntax & RE_REPEATED_ANCHORS_AWAY)
+ adjust_pattern_offsets_list (5, laststart - bufp->buffer,
+ &anchor_list);
+
+ adjust_pattern_offsets_list (5, laststart - bufp->buffer,
+ &op_list);
GET_BUFFER_SPACE (5);
insert_op_2 (set_number_at, laststart, b, 5, lower_bound);
b += 5;
}
pending_exact = 0;
beg_interval = 0;
- break;
+
+ if (following_left_brace)
+ goto normal_char;
+ break;
unfetch_interval:
/* If an invalid interval, match the characters as literals. */
@@ -1063,64 +1819,88 @@ re_compile_pattern (pattern, size, bufp)
"regex: no interval beginning to which to backtrack.\n");
exit (1);
}
-
beg_interval = 0;
- PATFETCH (c); /* normal_char expects char in `c'. */
- goto normal_char;
- break;
+
+ /* normal_char and normal_backsl expect a character in `c'. */
+ PATFETCH (c);
+
+ if (!(syntax & RE_NO_BK_BRACES))
+ {
+ if (p > pattern && p[-1] == '\\')
+ goto normal_backsl;
+ }
+ goto normal_char;
#ifdef emacs
case '=':
- BUFPUSH (at_dot);
+ BUF_PUSH (at_dot);
break;
case 's':
laststart = b;
- BUFPUSH (syntaxspec);
PATFETCH (c);
- BUFPUSH (syntax_spec_code[c]);
+ BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
break;
case 'S':
laststart = b;
- BUFPUSH (notsyntaxspec);
PATFETCH (c);
- BUFPUSH (syntax_spec_code[c]);
+ BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
break;
#endif /* emacs */
case 'w':
laststart = b;
- BUFPUSH (wordchar);
- break;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH (wordchar);
+
+ if (!set_this_level (&level_match_status, current_level)
+ || !set_match_status_of_active_groups (group_active_status,
+ &group_match_status))
+ goto memory_exhausted;
+
+ break;
case 'W':
laststart = b;
- BUFPUSH (notwordchar);
- break;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH (notwordchar);
+
+ if (!set_this_level (&level_match_status, current_level)
+ || !set_match_status_of_active_groups (group_active_status,
+ &group_match_status))
+ goto memory_exhausted;
+
+ break;
case '<':
- BUFPUSH (wordbeg);
+ BUF_PUSH (wordbeg);
break;
case '>':
- BUFPUSH (wordend);
+ BUF_PUSH (wordend);
break;
case 'b':
- BUFPUSH (wordbound);
+ BUF_PUSH (wordbound);
break;
case 'B':
- BUFPUSH (notwordbound);
+ BUF_PUSH (notwordbound);
break;
case '`':
- BUFPUSH (begbuf);
+ BUF_PUSH (begbuf);
break;
case '\'':
- BUFPUSH (endbuf);
+ BUF_PUSH (endbuf);
break;
case '1':
@@ -1132,28 +1912,39 @@ re_compile_pattern (pattern, size, bufp)
case '7':
case '8':
case '9':
- if (obscure_syntax & RE_NO_BK_REFS)
+ if (syntax & RE_NO_BK_REFS)
goto normal_char;
+
c1 = c - '0';
- if (c1 >= regnum)
+
+ if (c1 >= regnum)
{
- if (obscure_syntax & RE_NO_EMPTY_BK_REF)
- goto invalid_pattern;
+ if (syntax & RE_NO_MISSING_BK_REF)
+ goto invalid_back_reference;
else
goto normal_char;
}
+
/* Can't back reference to a subexpression if inside of it. */
- for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
- if (*stackt == c1)
- goto normal_char;
- laststart = b;
- BUFPUSH (duplicate);
- BUFPUSH (c1);
- break;
+ if (is_in_compile_stack (compile_stack, c1))
+ goto normal_char;
+
+ laststart = b;
+
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
+
+ BUF_PUSH_2 (duplicate, c1);
+
+ if (get_group_match_status (group_match_status, c1))
+ if (!set_this_level (&level_match_status, current_level))
+ goto memory_exhausted;
+
+ break;
case '+':
case '?':
- if (obscure_syntax & RE_BK_PLUS_QM)
+ if (syntax & RE_BK_PLUS_QM)
goto handle_plus;
else
goto normal_backsl;
@@ -1164,61 +1955,141 @@ re_compile_pattern (pattern, size, bufp)
/* You might think it would be useful for \ to mean
not to translate; but if we don't translate it
it will never match anything. */
- if (translate) c = translate[c];
- goto normal_char;
+
+ if (translate)
+ c = translate[c];
+
+ goto normal_char;
}
break;
- default:
- normal_char: /* Expects the character in `c'. */
- if (!pending_exact || pending_exact + *pending_exact + 1 != b
- || *pending_exact == 0177 || *p == '*' || *p == '^'
- || ((obscure_syntax & RE_BK_PLUS_QM)
+ default:
+
+ /* Expects the character in `c'! */
+ normal_char:
+ /* If no exactn currently being built. */
+ if (!pending_exact
+
+ /* If last exactn not at current position. */
+ || pending_exact + *pending_exact + 1 != b
+
+ || *pending_exact == 0177
+
+ /* If followed by a repetition operator. */
+ || *p == '*' || *p == '^'
+ || ((syntax & RE_BK_PLUS_QM)
? *p == '\\' && (p[1] == '+' || p[1] == '?')
: (*p == '+' || *p == '?'))
- || ((obscure_syntax & RE_INTERVALS)
- && ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
+ || ((syntax & RE_INTERVALS)
+ && ((syntax & RE_NO_BK_BRACES)
? *p == '{'
: (p[0] == '\\' && p[1] == '{'))))
{
- laststart = b;
- BUFPUSH (exactn);
- pending_exact = b;
- BUFPUSH (0);
- }
- BUFPUSH (c);
- (*pending_exact)++;
- }
- }
+ /* Start building a new exactn. */
+
+ laststart = b;
- if (fixup_jump)
- store_jump (fixup_jump, jump, b);
+ if (!add_op (&op_list, b - bufp->buffer))
+ goto memory_exhausted;
- if (stackp != stackb) goto unmatched_open;
+ BUF_PUSH_2 (exactn, 0);
+ pending_exact = b - 1;
+
+ if (!set_this_level (&level_match_status, current_level))
+ goto memory_exhausted;
+ }
+ BUF_PUSH (c);
+ (*pending_exact)++;
+ break;
+
+ } /* end switch (c). */
+ } /* end while p!= pend. */
+
+ /* Through the pattern now. */
+
+ if (fixup_alt_jump)
+ store_jump (fixup_alt_jump, jump_past_next_alt, b);
+
+ if (!COMPILE_STACK_EMPTY)
+ goto unmatched_open;
+
+ /* Have to set this before calling the next routine. */
bufp->used = b - bufp->buffer;
+
+ if (!verify_and_adjust_endlines (op_list, group_match_status, bufp,
+ &enough_memory))
+ goto invalid_pattern;
+
+ if (!enough_memory)
+ goto memory_exhausted;
+
+
+ /* Normal return. */
return 0;
+
+ /* Abnormal return. */
+
invalid_pattern:
- return "Invalid regular expression";
+ bufp->used = b - bufp->buffer;
+ return "Invalid regular expression";
unmatched_open:
- return "Unmatched \\(";
+ bufp->used = b - bufp->buffer;
+ return "Unmatched ( or \\(";
unmatched_close:
- return "Unmatched \\)";
+ bufp->used = b - bufp->buffer;
+ return "Unmatched ) or \\)";
end_of_pattern:
- return "Premature end of regular expression";
-
- nesting_too_deep:
- return "Nesting too deep";
+ bufp->used = b - bufp->buffer;
+ return "Premature end of regular expression";
too_big:
- return "Regular expression too big";
+ bufp->used = b - bufp->buffer;
+ return "Regular expression too big";
memory_exhausted:
- return "Memory exhausted";
+ bufp->used = b - bufp->buffer;
+ return "Memory exhausted";
+
+ invalid_char_class:
+ bufp->used = b - bufp->buffer;
+ return "Invalid character class name";
+
+ unmatched_left_bracket:
+ bufp->used = b - bufp->buffer;
+ return "Unmatched [ or [^";
+
+ invalid_range_end:
+ bufp->used = b - bufp->buffer;
+ return "Invalid range end";
+
+ trailing_backslash:
+ bufp->used = b - bufp->buffer;
+ return "Trailing backslash";
+
+ unmatched_left_curly_brace:
+ bufp->used = b - bufp->buffer;
+ return "Unmatched \\{";
+
+ invalid_braces_content:
+ bufp->used = b - bufp->buffer;
+ return "Invalid content of \\{\\}";
+
+ missing_preceding_re:
+ bufp->used = b - bufp->buffer;
+ return "Missing preceding regular expression";
+
+ invalid_preceding_re:
+ bufp->used = b - bufp->buffer;
+ return "Invalid preceding regular expression";
+
+ invalid_back_reference:
+ bufp->used = b - bufp->buffer;
+ return "Invalid back reference";
}
@@ -1229,11 +2100,7 @@ re_compile_pattern (pattern, size, bufp)
static void
store_jump (from, opcode, to)
char *from, *to;
-#ifndef MSDOS
char opcode;
-#else
- int opcode;
-#endif /* MSDOS */
{
from[0] = opcode;
STORE_NUMBER(from + 1, to - (from + 3));
@@ -1248,11 +2115,7 @@ store_jump (from, opcode, to)
static void
insert_jump (op, from, to, current_end)
-#ifndef MSDOS
char op;
-#else
- int op;
-#endif /* MSDOS */
char *from, *to, *current_end;
{
register char *pfrom = current_end; /* Copy from here... */
@@ -1275,11 +2138,7 @@ insert_jump (op, from, to, current_end)
static void
store_jump_n (from, opcode, to, n)
char *from, *to;
-#ifndef MSDOS
char opcode;
-#else
- int opcode;
-#endif /* MSDOS */
unsigned n;
{
from[0] = opcode;
@@ -1298,11 +2157,7 @@ store_jump_n (from, opcode, to, n)
static void
insert_jump_n (op, from, to, current_end, n)
-#ifndef MSDOS
char op;
-#else
- int op;
-#endif /* MSDOS */
char *from, *to, *current_end;
unsigned n;
{
@@ -1323,11 +2178,7 @@ insert_jump_n (op, from, to, current_end, n)
static void
insert_op_2 (op, there, current_end, num_1, num_2)
-#ifndef MSDOS
char op;
-#else
- int op;
-#endif /* MSDOS */
char *there, *current_end;
int num_1, num_2;
{
@@ -1343,52 +2194,849 @@ insert_op_2 (op, there, current_end, num_1, num_2)
}
+/* Compile stack routine for regex_compile. */
+
+/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
+ false if it's not. */
+
+static boolean
+is_in_compile_stack (compile_stack, regnum)
+ compile_stack_type compile_stack;
+ int regnum;
+{
+ int this_element;
+
+ if (COMPILE_STACK_EMPTY)
+ return false;
+
+ for (this_element = compile_stack.avail - 1;
+ this_element >= 0;
+ this_element--)
+ if (compile_stack.stack[this_element].regnum == regnum)
+ return true;
+
+ return false;
+}
+
+
+/* Pattern offsets list stuff. */
+
+/* Initializes a pattern offsets list PATTERN_OFFSETS_LIST_PTR to be
+ INIT_SIZE large.
+
+ Returns 1 if it can allocate the space and 0 if it can't. */
+
+static unsigned
+init_pattern_offsets_list (pattern_offsets_list_ptr, init_size)
+ pattern_offsets_list_type *pattern_offsets_list_ptr;
+ int init_size;
+{
+ if (init_size < 0)
+ {
+ printf ("Can't initialize a pattern offsets list with a negative \
+or zero init_size %d.\n", init_size);
+ exit (1);
+ }
+ else
+ {
+ pattern_offsets_list_ptr->offsets
+ = (pattern_offset_type *) malloc (init_size
+ * sizeof (pattern_offset_type));
+
+ if (pattern_offsets_list_ptr->offsets == NULL)
+ return 0;
+
+ pattern_offsets_list_ptr->size = init_size;
+ pattern_offsets_list_ptr->avail = 0;
+ }
+ return 1;
+}
+
+
+/* Doubles the size of a pattern offsets list PATTERN_OFFSETS_LIST_PTR.
+
+ Returns 1 if it can allocate the space and 0 if it can't. */
+
+static unsigned
+double_pattern_offsets_list (pattern_offsets_list_ptr)
+ pattern_offsets_list_type *pattern_offsets_list_ptr;
+{
+ pattern_offsets_list_ptr->offsets
+ = (pattern_offset_type *) realloc (pattern_offsets_list_ptr->offsets,
+ (pattern_offsets_list_ptr->size << 1) * sizeof (pattern_offset_type));
+
+ if (pattern_offsets_list_ptr->offsets == NULL)
+ return 0;
+
+ pattern_offsets_list_ptr->size <<= 1;
+ return 1;
+}
+
+
+/* Adds OFFSET to PATTERN_OFFSETS_LIST_PTR.
+
+ Returns 1 if it can add the offset and 0 if it needs to allocate
+ space for it and can't. */
+
+static unsigned
+add_pattern_offset (pattern_offsets_list_ptr, offset)
+ pattern_offsets_list_type *pattern_offsets_list_ptr;
+ pattern_offset_type offset;
+{
+ if (PATTERN_OFFSETS_LIST_PTR_FULL (pattern_offsets_list_ptr))
+ if (!double_pattern_offsets_list (pattern_offsets_list_ptr))
+ return 0;
+
+ pattern_offsets_list_ptr->offsets[pattern_offsets_list_ptr->avail] = offset;
+ pattern_offsets_list_ptr->avail++;
+
+ return 1;
+}
+
+
+/* Adjust each offset in PATTERN_OFFSETS_LIST_PTR by INCREMENT. */
+
+static void
+adjust_pattern_offsets_list (increment, start_position,
+ pattern_offsets_list_ptr)
+ unsigned increment;
+ unsigned start_position;
+ pattern_offsets_list_type *pattern_offsets_list_ptr;
+{
+ unsigned this_pattern_offset = 0;
+
+ while (this_pattern_offset < pattern_offsets_list_ptr->avail
+ && pattern_offsets_list_ptr->offsets[this_pattern_offset]
+ < start_position)
+ this_pattern_offset++;
+
+ for (; this_pattern_offset < pattern_offsets_list_ptr->avail;
+ this_pattern_offset++)
+ pattern_offsets_list_ptr->offsets[this_pattern_offset] += increment;
+}
+
+
+/* Anchor routines for regex_compile. */
+
+/* If it's in a group, record in ANCHOR_LIST_PTR an anchor offset that's
+ at OFFSET.
+
+ Returns 1 if can put the offset in ANCHOR_LIST_PTR.
+ Returns 0 if runs out of memory allocating space for it. */
+
+static unsigned
+record_anchor_position (in_a_group, offset, anchor_list_ptr)
+ unsigned in_a_group;
+ pattern_offset_type offset;
+ anchor_list_type *anchor_list_ptr;
+{
+ if (in_a_group)
+ if (!add_pattern_offset (anchor_list_ptr, offset))
+ return 0;
+
+ return 1;
+}
+
+
+/* Set all `begline's between START and END in BUFP to `no_op's.
+ Set all such `endline's to either `endline_in_repeat's and all such
+ `endline_before_newline's to `repeated_endline_before_repeat's. */
+
+static void
+remove_intervening_anchors (start, end, anchor_list, bufp)
+ char *start, *end;
+ anchor_list_type anchor_list;
+ struct re_pattern_buffer *bufp;
+{
+ unsigned this_anchor = 0;
+
+ while (this_anchor < anchor_list.avail
+ && start - bufp->buffer <= anchor_list.offsets[this_anchor]
+ && anchor_list.offsets[this_anchor] <= end - bufp->buffer)
+ {
+ char *this_anchor_ptr
+ = bufp->buffer + anchor_list.offsets[this_anchor++];
+
+ *this_anchor_ptr = *this_anchor_ptr == endline
+ ? (char)endline_in_repeat
+ : *this_anchor_ptr == endline_before_newline
+ ? (char)repeated_endline_before_newline
+ : *this_anchor_ptr == begline
+ ? (char)no_op
+ : *this_anchor_ptr;
+ }
+}
+
+
+/* Op list stuff. */
+
+/* Add OP_OFFSET to OP_LIST_PTR.
+ Return 1 if can add it and 0 if can't allocate the space to do so. */
+
+static unsigned
+add_op (op_list_ptr, op_offset)
+ op_list_type *op_list_ptr;
+ pattern_offset_type op_offset;
+{
+ return add_pattern_offset (op_list_ptr, op_offset);
+}
+
+
+/* Verify that all `$'s in an entire pattern buffer BUFP are valid
+ anchors or ordinary characters. Either leave or change intermediate
+ forms of `$' anchor ops into `endline' or `exactn ...' where
+ appropriate.
+
+ Return true in ENOUGH_MEMORY if don't run out of space allocating
+ internal data structures.
+
+ Return from the routine true if the pattern is valid and false
+ if it isn't. */
+
+static boolean
+verify_and_adjust_endlines (op_list, group_forward_match_status,
+ bufp, enough_memory)
+ op_list_type op_list;
+ /* `duplicate' case needs this: which groups matched something;
+ set when went fowards through the pattern. */
+ bits_list_type group_forward_match_status;
+ struct re_pattern_buffer *bufp;
+ boolean *enough_memory;
+{
+ int this_op_offset; /* Has to be type int because decrementing it. */
+ /* See comments for analogous variables used for '^' in regex_compile. */
+
+ bits_list_type level_match_status;
+ unsigned current_level = 0;
+ bits_list_type group_match_status;
+ bits_list_type group_active_status;
+ char *bend = bufp->buffer + bufp->used;
+ char *previous_p = NULL;
+
+
+ if (!(init_bits_list (&level_match_status)
+ && init_bits_list (&group_match_status)
+ && init_bits_list (&group_active_status)))
+ {
+ *enough_memory = false;
+ return true;
+ }
+ else
+ *enough_memory = true;
+
+ for (this_op_offset = op_list.avail - 1; this_op_offset >= 0;
+ this_op_offset--)
+ {
+ char *p = bufp->buffer + op_list.offsets[this_op_offset];
+
+ if (!enough_memory)
+ break;
+
+ switch ((enum regexpcode) *p)
+ {
+ case endline:
+ case endline_in_repeat:
+ case endline_before_newline:
+ case repeated_endline_before_newline:
+
+ /* If the '$' must be at the pattern's end or else is
+ in a trailing position. */
+
+ if ((bufp->syntax & RE_ANCHORS_ONLY_AT_ENDS)
+ ? p + 1 == bend
+ : ((bufp->syntax & RE_TIGHT_ALT)
+ ? p + 3 == bend /* Would have two following no_ops. */
+ : (*p == endline_before_newline
+ || *p == repeated_endline_before_newline
+ || no_levels_match_anything (level_match_status))))
+ {
+ if ((enum regexpcode) *p == endline_in_repeat
+ || (enum regexpcode) *p == repeated_endline_before_newline)
+ if (bufp->syntax & RE_REPEATED_ANCHORS_AWAY)
+ *p = no_op;
+ else
+ *p = endline;
+
+
+ /* If this is a trailing '$' in an empty alternative. */
+
+ if ((bufp->syntax & RE_NO_EMPTY_ALTS)
+
+ /* If there's an alternation op right before this `$'. */
+ && ((this_op_offset > 0
+ && *(bufp->buffer
+ + op_list.offsets[this_op_offset - 1])
+ == jump_past_next_alt)
+
+ /* Or this `$' is the only thing in the first
+ alternative of more than one of them. */
+
+ || ((this_op_offset == 0 /* It's first. */
+ /* Or it's right after an open-group op. */
+ || (this_op_offset > 0
+ && *(bufp->buffer
+ + op_list.offsets[this_op_offset - 1])
+ == start_memory))
+
+ /* And it's right before an alternation op. */
+ && previous_p != NULL
+ && *previous_p == jump_past_next_alt)))
+ return false;
+ }
+
+ else if (bufp->syntax & RE_CONTEXT_INVALID_ANCHORS)
+ return false;
+
+ else if (!(bufp->syntax & RE_CONTEXT_INDEP_ANCHORS))
+ {
+ p[0] = (char)exactn;
+ p[1] = (char)1;
+ p[2] = '$';
+ }
+
+ break;
+
+
+ /* Yes, start and stop_memory are switched because we're going
+ backwards through the pattern! */
+
+ case stop_memory:
+ increase_level (&current_level);
+
+ if (!make_group_active (&group_active_status, p[1]))
+ enough_memory = false;
+
+ break;
+
+ case start_memory:
+ if (get_level_match_status (level_match_status, current_level))
+ if (!set_next_lower_level (&level_match_status, current_level))
+ enough_memory = false;
+ else
+ {
+ decrease_level (&current_level);
+ make_group_inactive (&group_active_status, p[1]);
+ }
+
+ break;
+
+
+ /* Hit an alternative. */
+
+ case jump_past_next_alt:
+ if (lower_levels_match_nothing (level_match_status, current_level))
+ clear_this_and_higher_levels (&level_match_status,current_level);
+
+ break;
+
+ /* These below mean was followed by a repetition operator. */
+ case no_op:
+ case maybe_pop_jump:
+ case no_pop_jump_n:
+ if (bufp->syntax & RE_REPEATED_ANCHORS_AWAY)
+ break;
+ case charset:
+ case charset_not:
+ case wordchar:
+ case notwordchar:
+ case exactn:
+ case anychar:
+ if (!set_this_level (&level_match_status, current_level)
+ || !set_match_status_of_active_groups (group_active_status,
+ &group_match_status))
+ enough_memory = false;;
+
+ break;
+
+ case duplicate:
+ /* Only set level_match_status if this back reference
+ refers to a nonempty group. */
+
+ if (get_group_match_status (group_forward_match_status, p[1]))
+ if (!set_this_level (&level_match_status, current_level))
+ enough_memory = false;
+
+ break;
+
+ default:
+ printf ("Found an unknown operator %u in compiled pattern.\n", *p);
+ }
+ previous_p = p;
+ }
+ return true;
+}
+
+
+
+/* Bits list routines. (See above for macros.) */
+
+/* Initialize BITS_LIST_PTR to have one bits block.
+ Return 1 if there's enough memory to do so and 0 if there isn't. */
+
+static unsigned
+init_bits_list (bits_list_ptr)
+ bits_list_type *bits_list_ptr;
+{
+ bits_list_ptr->bits = (unsigned *) malloc (sizeof (unsigned));
+
+ if (bits_list_ptr->bits == NULL)
+ return 0;
+
+ bits_list_ptr->size = BITS_BLOCK_SIZE;
+ bits_list_ptr->bits[0] = 0;
+
+ return 1;
+}
+
+
+/* Extend BITS_LIST_PTR by one bits block.
+ Return 1 if there's enough memory to do so and 0 if there isn't. */
+
+static unsigned
+extend_bits_list (bits_list_ptr)
+ bits_list_type *bits_list_ptr;
+{
+ bits_list_ptr->bits
+ = (unsigned *) realloc (bits_list_ptr->bits,
+ bits_list_ptr->size + sizeof (unsigned));
+
+ if (bits_list_ptr->bits == NULL)
+ return 0;
+
+ bits_list_ptr->size += BITS_BLOCK_SIZE;
+ bits_list_ptr->bits[(bits_list_ptr->size/BITS_BLOCK_SIZE) - 1] = 0;
+
+ return 1;
+}
+
+
+/* Get the bit value at a positive POSITION in BITS_LIST. */
+
+static unsigned
+get_bit (bits_list, position)
+ bits_list_type bits_list;
+ unsigned position;
+{
+ if (position < 0)
+ {
+ printf ("Tried to get a bit at position less than zero.\n");
+ exit (1);
+ }
+
+ if (position > bits_list.size - 1)
+ {
+ printf ("Getting bit value: position %d exceeds bits list size %d.\n",
+ position, bits_list.size);
+ exit (1);
+ }
+
+ return bits_list.bits[BITS_BLOCK (position)] & BITS_MASK (position);
+}
+
+
+/* Set the bit for a positive POSITION in BITS_LIST_PTR to VALUE, which,
+ in turn, can only be 0 or 1.
+
+ Returns 1 if can set the bit and 0 if ran out of memory allocating
+ (if necessary) room for it. */
+
+static unsigned
+set_bit_to_value (bits_list_ptr, position, value)
+ bits_list_type *bits_list_ptr;
+ unsigned position;
+ unsigned value;
+{
+ if (position < 0)
+ {
+ printf ("Tried to set a bit at position less than zero.\n");
+ exit (1);
+ }
+
+ if (position > bits_list_ptr->size - 1
+ && !extend_bits_list (bits_list_ptr))
+ return 0;
+
+ if (value == 1)
+ bits_list_ptr->bits[BITS_BLOCK (position)] |= BITS_MASK (position);
+ else if (value == 0)
+ bits_list_ptr->bits[BITS_BLOCK (position)] &= ~(BITS_MASK (position));
+ else
+ {
+ printf ("Invalid value %d to set a bit.\n");
+ exit (1);
+ }
+ return 1;
+}
+
+
+/* Level stuff. */
+
+
+/* Return 1 if LEVEL in LEVEL_MATCH_STATUS matches something and
+ 0 if it doesn't. Assumes LEVEL is positive. */
+
+static unsigned
+get_level_match_status (level_match_status, level)
+ bits_list_type level_match_status;
+ unsigned level;
+{
+ return get_bit (level_match_status, level);
+}
+
+
+/* Mark as matching something the level LEVEL in LEVEL_MATCH_STATUS_PTR.
+ Assumes LEVEL is positive.
+
+ Return 1 if can mark the level and 0 if need to allocate space for it
+ but can't. */
+
+static unsigned
+set_this_level (level_match_status_ptr, level)
+ bits_list_type *level_match_status_ptr;
+ unsigned level;
+{
+ return set_bit_to_value (level_match_status_ptr, level, 1);
+}
+
+
+/* Mark as matching something the level below the LEVEL recorded in
+ LEVEL_MATCH_STATUS_PTR. Assumes LEVEL is greater than zero.
+
+ Return 1 if can mark the level and 0 ran out of memory trying to do so. */
+
+static unsigned
+set_next_lower_level (level_match_status_ptr, level)
+ bits_list_type *level_match_status_ptr;
+ unsigned level;
+{
+ unsigned this_level;
+
+ return set_bit_to_value (level_match_status_ptr, level - 1, 1);
+}
+
+
+/* Mark as matching something the level LEVEL and all levels higher than
+ it currently in LEVEL_MATCH_STATUS_PTR. Assumes LEVEL is positive.
+
+ Return 1 if can mark the levels and 0 ran out of memory trying to do so. */
+
+static void
+clear_this_and_higher_levels (level_match_status_ptr, level)
+ bits_list_type *level_match_status_ptr;
+ unsigned level;
+{
+ unsigned this_level;
+
+ for (this_level = level;
+ this_level < level_match_status_ptr->size;
+ this_level++)
+ set_bit_to_value (level_match_status_ptr, this_level, 0);
+}
+
+
+/* Returns true if none of the levels in LEVEL_MATCH_STATUS less than a
+ positive LEVEL match anything, and false otherwise. */
+
+static boolean
+lower_levels_match_nothing (level_match_status, level)
+ bits_list_type level_match_status;
+ unsigned level;
+{
+ unsigned this_level;
+
+ for (this_level = 0; this_level < level; this_level++)
+ if (get_bit (level_match_status, this_level))
+ return false;
+
+ return true;
+}
+
+/* Returns true if none of the levels in LEVEL_MATCH_STATUS match
+ anything, and false otherwise. */
+
+static boolean
+no_levels_match_anything (level_match_status)
+ bits_list_type level_match_status;
+{
+ unsigned this_bits_block;
+
+ for (this_bits_block = 0;
+ this_bits_block < level_match_status.size/BITS_BLOCK_SIZE;
+ this_bits_block++)
+ if (level_match_status.bits[this_bits_block] != 0)
+ return false;
+
+ return true;
+}
+
+
+/* Increase CURRENT_LEVEL_PTR. */
+
+static void
+increase_level (current_level_ptr)
+ unsigned *current_level_ptr;
+{
+ (*current_level_ptr)++;
+}
+
+
+/* Decrease CURRENT_LEVEL_PTR, but exit on error if try to decrease
+ below zero. */
+
+static void
+decrease_level (current_level_ptr)
+ unsigned *current_level_ptr;
+{
+ if (*current_level_ptr == 0)
+ {
+ printf ("Tried to decrease current level below zero.\n");
+ exit (1);
+ }
+ (*current_level_ptr)--;
+}
+
+
+/* Group stuff. */
+
+
+/* Mark a positive GROUP in GROUP_ACTIVE_STATUS_PTR as active.
+ Return 1 if can mark the group and 0 ran out of memory trying to do so. */
+
+static unsigned
+make_group_active (group_active_status_ptr, group)
+ bits_list_type *group_active_status_ptr;
+ unsigned group;
+{
+ return set_bit_to_value (group_active_status_ptr, group, 1);
+}
+
+
+/* Mark a positive GROUP in GROUP_ACTIVE_STATUS_PTR as inactive.
+ Return 1 if can mark the group and 0 ran out of memory trying to do so. */
+
+static unsigned
+make_group_inactive (group_active_status_ptr, group)
+ bits_list_type *group_active_status_ptr;
+ unsigned group;
+{
+ return set_bit_to_value (group_active_status_ptr, group, 0);
+}
+
+
+/* Mark as active in GROUP_MATCH_STATUS_PTR those active groups recorded
+ in GROUP_ACTIVE_STATUS_PTR.
+
+ Return 1 if can mark the groups and 0 ran out of memory trying to do so. */
+
+static unsigned
+set_match_status_of_active_groups (group_active_status, group_match_status_ptr)
+ bits_list_type group_active_status;
+ bits_list_type *group_match_status_ptr;
+{
+ unsigned this_bit_block;
+
+ if (group_active_status.size > group_match_status_ptr->size
+ && !extend_bits_list (group_match_status_ptr))
+ return 0;
+
+ for (this_bit_block = 0;
+ this_bit_block < group_active_status.size/BITS_BLOCK_SIZE;
+ this_bit_block++)
+ group_match_status_ptr->bits[this_bit_block]
+ |= group_active_status.bits[this_bit_block];
+
+ return 1;
+}
+
+
+/* Return 1 if GROUP in GROUP_MATCH_STATUS matches something and
+ 0 if it doesn't. Assumes GROUP is positive. */
+
+static unsigned
+get_group_match_status (group_match_status, group)
+ bits_list_type group_match_status;
+ unsigned group;
+{
+ return get_bit (group_match_status, group);
+}
+
+
+
+
+/* Failure stack declarations and macros for both re_compile_fastmap and
+ re_match_2. Have to use `alloca' for reasons stated in INIT_BITS_LIST's
+ comment. */
+
+
+/* Roughly the maximum number of failure points on the stack. Would be
+ exactly that if always used MAX_FAILURE_SPACE each time we failed. */
+
+int re_max_failures = 2000;
+
+
+typedef unsigned char *failure_stack_element;
+
+typedef struct {
+ failure_stack_element *stack;
+ unsigned size;
+ unsigned avail; /* Offset of next open position. */
+ } failure_stack_type;
+
+
+#define FAILURE_STACK_EMPTY (failure_stack.avail == 0)
+#define FAILURE_STACK_PTR_EMPTY (failure_stack_ptr->avail == 0)
+#define FAILURE_STACK_FULL (failure_stack.avail == failure_stack.size)
+
+
+/* Initialize a failure stack.
+
+ Return 1 if was able to allocate the space for (FAILURE_STACK) and
+ 0 if not. */
+
+#define INIT_FAILURE_STACK(failure_stack) \
+ ((failure_stack).stack = (failure_stack_element *) \
+ REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (failure_stack_element)),\
+ \
+ (failure_stack).stack == NULL \
+ ? 0 \
+ : ((failure_stack).size = INIT_FAILURE_ALLOC, \
+ (failure_stack).avail = 0, \
+ 1))
+
+
+/* Double the size of FAILURE_STACK, up to MAX_SIZE.
+
+ Return 1 if was able to double it, and 0 if either ran out of memory
+ allocating space for it or it was already MAX_SIZE large.
+
+ REGEX_REALLOCATE requires `void *destination' be declared. */
+
+#define DOUBLE_FAILURE_STACK(failure_stack, max_size) \
+ ((failure_stack).size > max_size \
+ ? 0 \
+ : ((failure_stack).stack = (failure_stack_element *) \
+ REGEX_REALLOCATE ((failure_stack).stack, \
+ ((failure_stack).size << 1) * sizeof (failure_stack_element)),\
+ \
+ (failure_stack).stack == NULL \
+ ? 0 \
+ : ((failure_stack).size <<= 1, \
+ 1)))
+
+
+/* Push PATTERN_OP on (FAILURE_STACK).
+
+ Return 1 if was able to do so and 0 if ran out of memory allocating
+ space to do so.
+
+ DOUBLE_FAILURE_STACK requires `void *destination' be declared. */
+
+#define PUSH_PATTERN_OP(pattern_op, failure_stack) \
+ ((FAILURE_STACK_FULL \
+ && !DOUBLE_FAILURE_STACK (failure_stack, re_max_failures)) \
+ ? 0 \
+ : ((failure_stack).stack[(failure_stack).avail++] = pattern_op, \
+ 1))
+
+
+/* Push most of the information about the state we will want
+ if we ever fail back to it.
+
+ Requires regstart, regend, reg_info, and num_internal_regs be declared.
+ DOUBLE_FAILURE_STACK requires `void *destination' be declared.
+
+ Does a `return FAILURE_CODE' if runs out of memory. */
+
+#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_stack, failure_code) \
+ do { \
+ long highest_used_reg, this_reg; \
+ void *destination; \
+ \
+ /* Find out how many registers are active or have been matched. \
+ (Aside from register zero, which is only set at the end.) */ \
+ \
+ for (highest_used_reg = num_internal_regs - 1; highest_used_reg > 0;\
+ highest_used_reg--) \
+ if (regstart[highest_used_reg] != (unsigned char *) -1) \
+ break; \
+ \
+ while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
+ if (!DOUBLE_FAILURE_STACK (failure_stack, \
+ re_max_failures * MAX_FAILURE_ITEMS)) \
+ return failure_code; \
+ \
+ /* Now push the info for each of those registers. */ \
+ \
+ for (this_reg = 1; this_reg <= highest_used_reg; this_reg++) \
+ { \
+ (failure_stack).stack[(failure_stack).avail++] \
+ = regstart[this_reg]; \
+ \
+ (failure_stack).stack[(failure_stack).avail++] = regend[this_reg];\
+ \
+ (failure_stack).stack[(failure_stack).avail++] \
+ = (unsigned char *) &reg_info[this_reg]; \
+ } \
+ \
+ /* Push how many registers we saved. */ \
+ (failure_stack).stack[(failure_stack).avail++] \
+ = (unsigned char *) highest_used_reg; \
+ \
+ (failure_stack).stack[(failure_stack).avail++] = pattern_place; \
+ (failure_stack).stack[(failure_stack).avail++] = string_place; \
+ } while (0)
+
+
+
+
/* Given a pattern, compute a fastmap from it. The fastmap records
which of the (1 << BYTEWIDTH) possible characters can start a string
that matches the pattern. This fastmap is used by re_search to skip
- quickly over totally implausible text.
+ quickly over totally impossible text.
The caller must supply the address of a (1 << BYTEWIDTH)-byte data
area as bufp->fastmap.
- The other components of bufp describe the pattern to be used. */
+ The other components of bufp describe the pattern to be used.
+
+ Returns 0 if it can compile a fastmap.
+ Returns -2 if there is an internal error. */
-void
+int
re_compile_fastmap (bufp)
struct re_pattern_buffer *bufp;
{
unsigned char *pattern = (unsigned char *) bufp->buffer;
int size = bufp->used;
register char *fastmap = bufp->fastmap;
- register unsigned char *p = pattern;
+ unsigned char *p = pattern;
register unsigned char *pend = pattern + size;
- register int j, k;
+ int j, k;
unsigned char *translate = (unsigned char *) bufp->translate;
- unsigned is_a_succeed_n;
+ failure_stack_type failure_stack;
+ void *destination;
-#ifndef NO_ALLOCA
- unsigned char *stackb[NFAILURES];
- unsigned char **stackp = stackb;
-#else
- unsigned char **stackb;
- unsigned char **stackp;
- stackb = (unsigned char **) malloc (NFAILURES * sizeof (unsigned char *));
- stackp = stackb;
+ INIT_FAILURE_STACK (failure_stack);
-#endif /* NO_ALLOCA */
- memset (fastmap, 0, (1 << BYTEWIDTH));
+ bzero (fastmap, (1 << BYTEWIDTH));
bufp->fastmap_accurate = 1;
bufp->can_be_null = 0;
while (p)
{
- is_a_succeed_n = 0;
+ boolean is_a_succeed_n = false;
+
if (p == pend)
- {
- bufp->can_be_null = 1;
- break;
- }
+ if (FAILURE_STACK_EMPTY)
+ {
+ bufp->can_be_null = 1;
+ break;
+ }
+ else
+ p = failure_stack.stack[--failure_stack.avail];
+
+
#ifdef SWITCH_ENUM_BUG
switch ((int) ((enum regexpcode) *p++))
#else
@@ -1396,10 +3044,7 @@ re_compile_fastmap (bufp)
#endif
{
case exactn:
- if (translate)
- fastmap[translate[p[1]]] = 1;
- else
- fastmap[p[1]] = 1;
+ fastmap[translate ? translate[p[1]] : p[1]] = 1;
break;
case begline:
@@ -1415,55 +3060,63 @@ re_compile_fastmap (bufp)
continue;
case endline:
- if (translate)
- fastmap[translate['\n']] = 1;
- else
- fastmap['\n'] = 1;
+ fastmap[translate ? translate['\n'] : '\n'] = 1;
- if (bufp->can_be_null != 1)
+ if (! bufp->can_be_null)
bufp->can_be_null = 2;
break;
- case jump_n:
- case finalize_jump:
- case maybe_finalize_jump:
- case jump:
+ case no_pop_jump_n:
+ case pop_failure_jump:
+ case maybe_pop_jump:
+ case no_pop_jump:
+ case jump_past_next_alt:
case dummy_failure_jump:
- EXTRACT_NUMBER_AND_INCR (j, p);
+ extract_number_and_incr (&j, &p);
p += j;
if (j > 0)
continue;
+
/* Jump backward reached implies we just went through
- the body of a loop and matched nothing.
- Opcode jumped to should be an on_failure_jump.
- Just treat it like an ordinary jump.
- For a * loop, it has pushed its failure point already;
- If so, discard that as redundant. */
+ the body of a loop and matched nothing. Opcode jumped to
+ should be an on_failure_jump or succeed_n. Just treat it
+ like an ordinary jump. For a * loop, it has pushed its
+ failure point already; If so, discard that as redundant. */
if ((enum regexpcode) *p != on_failure_jump
&& (enum regexpcode) *p != succeed_n)
continue;
+
p++;
- EXTRACT_NUMBER_AND_INCR (j, p);
+ extract_number_and_incr (&j, &p);
p += j;
- if (stackp != stackb && *stackp == p)
- stackp--;
+
+ /* If what's on the stack is where we are now, pop it. */
+
+ if (!FAILURE_STACK_EMPTY
+ && failure_stack.stack[failure_stack.avail - 1] == p)
+ failure_stack.avail--;
+
continue;
case on_failure_jump:
handle_on_failure_jump:
- EXTRACT_NUMBER_AND_INCR (j, p);
- *++stackp = p + j;
- if (is_a_succeed_n)
- EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
- continue;
+ extract_number_and_incr (&j, &p);
+
+ if (!PUSH_PATTERN_OP (p + j, failure_stack))
+ return -2;
+
+ if (is_a_succeed_n)
+ extract_number_and_incr (&k, &p); /* Skip the n. */
+
+ continue;
case succeed_n:
- is_a_succeed_n = 1;
+ is_a_succeed_n = true;
/* Get to the number of times to succeed. */
p += 2;
/* Increment p past the n for when k != 0. */
- EXTRACT_NUMBER_AND_INCR (k, p);
+ extract_number_and_incr (&k, &p);
if (k == 0)
{
p -= 4;
@@ -1488,9 +3141,7 @@ re_compile_fastmap (bufp)
if (j != '\n')
fastmap[j] = 1;
if (bufp->can_be_null)
- {
- FREE_AND_RETURN_VOID(stackb);
- }
+ return 0;
/* Don't return; check the alternative paths
so we can set can_be_null if appropriate. */
break;
@@ -1523,47 +3174,37 @@ re_compile_fastmap (bufp)
break;
#endif /* not emacs */
- case charset:
- for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
+ case charset:
+ for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
- {
- if (translate)
- fastmap[translate[j]] = 1;
- else
- fastmap[j] = 1;
- }
+ fastmap[translate ? translate[j] : j] = 1;
break;
case charset_not:
- /* Chars beyond end of map must be allowed */
+ /* Chars beyond end of map must be allowed. */
for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
- if (translate)
- fastmap[translate[j]] = 1;
- else
- fastmap[j] = 1;
+ fastmap[translate ? translate[j] : j] = 1;
for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
- {
- if (translate)
- fastmap[translate[j]] = 1;
- else
- fastmap[j] = 1;
- }
- break;
- }
+ fastmap[translate ? translate[j] : j] = 1;
- /* Get here means we have successfully found the possible starting
+ break;
+ } /* End switch *p++. */
+
+ /* Getting here means we have successfully found the possible starting
characters of one path of the pattern. We need not follow this
path any farther. Instead, look at the next alternative
remembered in the stack. */
- if (stackp != stackb)
- p = *stackp--;
+
+ if (!FAILURE_STACK_EMPTY)
+ p = failure_stack.stack[--failure_stack.avail];
else
break;
}
- FREE_AND_RETURN_VOID(stackb);
-}
+ return 0;
+} /* re_compile_fastmap */
+
@@ -1571,110 +3212,123 @@ re_compile_fastmap (bufp)
doesn't let you say where to stop matching. */
int
-re_search (pbufp, string, size, startpos, range, regs)
- struct re_pattern_buffer *pbufp;
- char *string;
- int size, startpos, range;
+re_search (bufp, string, size, startpos, range, regs)
+ struct re_pattern_buffer *bufp;
+ const char *string;
+ const int size, startpos, range;
struct re_registers *regs;
{
- return re_search_2 (pbufp, (char *) 0, 0, string, size, startpos, range,
+ return re_search_2 (bufp, (char *) 0, 0, string, size, startpos, range,
regs, size);
}
-/* Using the compiled pattern in PBUFP->buffer, first tries to match the
+/* Using the compiled pattern in BUFP->buffer, first tries to match the
virtual concatenation of STRING1 and STRING2, starting first at index
STARTPOS, then at STARTPOS + 1, and so on. RANGE is the number of
places to try before giving up. If RANGE is negative, it searches
- backwards, i.e., the starting positions tried are STARTPOS, STARTPOS
- - 1, etc. STRING1 and STRING2 are of SIZE1 and SIZE2, respectively.
+ backwards, i.e., the starting positions tried are STARTPOS, STARTPOS - 1,
+ etc. STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
In REGS, return the indices of the virtual concatenation of STRING1
- and STRING2 that matched the entire PBUFP->buffer and its contained
+ and STRING2 that matched the entire BUFP->buffer and its contained
subexpressions. Do not consider matching one past the index MSTOP in
the virtual concatenation of STRING1 and STRING2.
The value returned is the position in the strings at which the match
- was found, or -1 if no match was found, or -2 if error (such as
+ was found, -1 if no match was found, or -2 if error (such as
failure stack overflow). */
int
-re_search_2 (pbufp, string1, size1, string2, size2, startpos, range,
- regs, mstop)
- struct re_pattern_buffer *pbufp;
- char *string1, *string2;
- int size1, size2;
- int startpos;
- register int range;
+re_search_2 (bufp, string1, size1, string2, size2, startpos, range,
+ regs, stop)
+ struct re_pattern_buffer *bufp;
+ const char *string1, *string2;
+ const int size1, size2;
+ const int startpos;
+ const int range;
struct re_registers *regs;
- int mstop;
+ const int stop;
{
- register char *fastmap = pbufp->fastmap;
- register unsigned char *translate = (unsigned char *) pbufp->translate;
+ register char *fastmap = bufp->fastmap;
+ register unsigned char *translate = (unsigned char *) bufp->translate;
int total_size = size1 + size2;
- int endpos = startpos + range;
+ int private_startpos = startpos;
+ int private_endpos = startpos + range;
+ int private_range = range;
int val;
+ const struct re_pattern_buffer *private_bufp;
/* Check for out-of-range starting position. */
- if (startpos < 0 || startpos > total_size)
+ if (private_startpos < 0 || private_startpos > total_size)
return -1;
- /* Fix up range if it would eventually take startpos outside of the
- virtual concatenation of string1 and string2. */
- if (endpos < -1)
- range = -1 - startpos;
- else if (endpos > total_size)
- range = total_size - startpos;
-
- /* Update the fastmap now if not correct already. */
- if (fastmap && !pbufp->fastmap_accurate)
- re_compile_fastmap (pbufp);
+ /* Fix up range if it would eventually take private_startpos outside
+ of the virtual concatenation of string1 and string2. */
+
+ if (private_endpos < -1)
+ private_range = -1 - private_startpos;
+
+ else if (private_endpos > total_size)
+ private_range = total_size - private_startpos;
+
+
+/* Update the fastmap now if not correct already. */
+ if (fastmap && !bufp->fastmap_accurate)
+ if (re_compile_fastmap (bufp) == -2)
+ return -2;
/* If the search isn't to be a backwards one, don't waste time in a
long search for a pattern that says it is anchored. */
- if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
- && range > 0)
+ if (bufp->used > 0 && (enum regexpcode) bufp->buffer[0] == begbuf
+ && private_range > 0)
{
- if (startpos > 0)
+ if (private_startpos > 0)
return -1;
else
- range = 1;
+ private_range = 1;
}
+ private_bufp = bufp;
+
while (1)
{
/* If a fastmap is supplied, skip quickly over characters that
cannot possibly be the start of a match. Note, however, that
- if the pattern can possibly match the null string, we must
- test it at each starting point so that we take the first null
- string we get. */
+ if the pattern can possibly match the null string, we don't
+ want to skip over characters; we want the first null string we
+ can match. */
- if (fastmap && startpos < total_size && pbufp->can_be_null != 1)
+ if (fastmap && private_startpos < total_size && !bufp->can_be_null)
{
- if (range > 0) /* Searching forwards. */
+ if (private_range > 0) /* Searching forwards. */
{
register int lim = 0;
register unsigned char *p;
- int irange = range;
- if (startpos < size1 && startpos + range >= size1)
- lim = range - (size1 - startpos);
+ int irange = private_range;
+
+ if (private_startpos < size1
+ && private_startpos + private_range >= size1)
+ lim = private_range - (size1 - private_startpos);
p = ((unsigned char *)
- &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
+ &(private_startpos >= size1
+ ? string2 - size1
+ : string1)[private_startpos]);
- while (range > lim && !fastmap[translate
+ while (private_range > lim && !fastmap[translate
? translate[*p++]
: *p++])
- range--;
- startpos += irange - range;
+ private_range--;
+ private_startpos += irange - private_range;
}
else /* Searching backwards. */
{
register unsigned char c;
- if (string1 == 0 || startpos >= size1)
- c = string2[startpos - size1];
+ if (size1 == 0 || private_startpos >= size1)
+ c = string2[private_startpos - size1];
else
- c = string1[startpos];
+ c = string1[private_startpos];
c &= 0xff;
if (translate ? !fastmap[translate[c]] : !fastmap[c])
@@ -1682,35 +3336,30 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range,
}
}
- if (range >= 0 && startpos == total_size
- && fastmap && pbufp->can_be_null == 0)
+ if (private_range >= 0 && private_startpos == total_size
+ && fastmap && bufp->can_be_null == 0)
return -1;
- val = re_match_2 (pbufp, string1, size1, string2, size2, startpos,
- regs, mstop);
+ val = re_match_2 (private_bufp, string1, size1, string2, size2,
+ private_startpos, regs, stop);
if (val >= 0)
- return startpos;
+ return private_startpos;
+
if (val == -2)
return -2;
-#ifndef NO_ALLOCA
-#ifdef C_ALLOCA
- alloca (0);
-#endif /* C_ALLOCA */
-
-#endif /* NO_ALLOCA */
advance:
- if (!range)
+ if (!private_range)
break;
- else if (range > 0)
+ else if (private_range > 0)
{
- range--;
- startpos++;
+ private_range--;
+ private_startpos++;
}
else
{
- range++;
- startpos--;
+ private_range++;
+ private_startpos--;
}
}
return -1;
@@ -1720,115 +3369,118 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range,
#ifndef emacs /* emacs never uses this. */
int
-re_match (pbufp, string, size, pos, regs)
- struct re_pattern_buffer *pbufp;
- char *string;
- int size, pos;
+re_match (bufp, string, size, pos, regs)
+ const struct re_pattern_buffer *bufp;
+ const char *string;
+ const int size, pos;
struct re_registers *regs;
{
- return re_match_2 (pbufp, (char *) 0, 0, string, size, pos, regs, size);
+ return re_match_2 (bufp, (char *) 0, 0, string, size, pos, regs, size);
}
#endif /* not emacs */
-/* The following are used for re_match_2, defined below: */
+
+/* Routines for re_match_2, defined below. */
-/* Roughly the maximum number of failure points on the stack. Would be
- exactly that if always pushed MAX_NUM_FAILURE_ITEMS each time we failed. */
-
-int re_max_failures = 2000;
+static boolean group_can_match_nothing ();
+static int bcmp_translate ();
-/* Routine used by re_match_2. */
-static int memcmp_translate ();
+/* Macros used by re_match_2, defined below: */
/* Structure and accessing macros used in re_match_2: */
-struct register_info
+typedef struct register_info
{
+ bits_list_type inner_groups; /* Which groups are inside this one. */
+ int can_match_nothing; /* Set if this group can match nothing;
+ -1 if not ever set. */
unsigned is_active : 1;
unsigned matched_something : 1;
-};
+ unsigned ever_matched_something : 1;
+} reg_info_type;
+
+/* Macros used by re_match_2: */
+/* I.e., regstart, regend, and reg_info. */
+
+#define INNER_GROUPS(R) ((R).inner_groups)
+#define CAN_MATCH_NOTHING(R) ((R).can_match_nothing)
#define IS_ACTIVE(R) ((R).is_active)
#define MATCHED_SOMETHING(R) ((R).matched_something)
+#define EVER_MATCHED_SOMETHING(R) ((R).ever_matched_something)
-/* Macros used by re_match_2: */
+/* Record that group INNER is inside of all currently active groups. */
+#define NOTE_INNER_GROUP(inner) \
+ do { unsigned this_reg; \
+ for (this_reg = 0; this_reg < num_internal_regs; this_reg++) \
+ { \
+ void *destination; /* For SET_BIT_TO_VALUE. */ \
+ int ret = SET_BIT_TO_VALUE (INNER_GROUPS (reg_info[this_reg]), \
+ inner, \
+ IS_ACTIVE(reg_info[this_reg])); \
+ if (ret == 0) \
+ { \
+ printf ("Ran out of memory in re_match_2 (NOTE_INNER_GROUP).\n");\
+ exit (1); \
+ } \
+ if (ret != 1) \
+ { \
+ printf ("Invalid value %d to set a bit.\n", ret); \
+ exit (1); \
+ } \
+ } \
+ } while (0)
-/* I.e., regstart, regend, and reg_info. */
-#define NUM_REG_ITEMS 3
+/* Call this when have matched something; it sets `matched' flags for the
+ registers corresponding to the group of which we currently are inside.
+ Also records whether this group ever matched something. */
-/* We push at most this many things on the stack whenever we
- fail. The `+ 2' refers to PATTERN_PLACE and STRING_PLACE, which are
- arguments to the PUSH_FAILURE_POINT macro. */
+#define SET_REGS_MATCHED \
+ do { unsigned this_reg; \
+ for (this_reg = 0; this_reg < num_internal_regs; this_reg++) \
+ { \
+ MATCHED_SOMETHING (reg_info[this_reg]) = \
+ EVER_MATCHED_SOMETHING (reg_info[this_reg]) = \
+ (IS_ACTIVE (reg_info[this_reg])) ? 1 : 0; \
+ } \
+ } while (0)
-#define MAX_NUM_FAILURE_ITEMS (RE_NREGS * NUM_REG_ITEMS + 2)
-/* We push this many things on the stack whenever we fail. */
+/* Failure stack macros for re_match_2. */
-#define NUM_FAILURE_ITEMS (last_used_reg * NUM_REG_ITEMS + 2)
+/* This is the number of items that are pushed and popped on the stack
+ for each register, i.e., its REGSTART, REGEND and REG_INFO. */
+#define NUM_REG_ITEMS 3
-/* This pushes most of the information about the current state we will want
- if we ever fail back to it. */
+/* Refers to highest_used_reg (which we calculate), PATTERN_PLACE and
+ STRING_PLACE, which are arguments to the PUSH_FAILURE_POINT macro. */
+
+#define NUM_OTHER_ITEMS 3
-#define PUSH_FAILURE_POINT(pattern_place, string_place) \
- { \
- short last_used_reg, this_reg; \
- \
- /* Find out how many registers are active or have been matched. \
- (Aside from register zero, which is only set at the end.) */ \
- for (last_used_reg = RE_NREGS - 1; last_used_reg > 0; last_used_reg--)\
- if (regstart[last_used_reg] != (unsigned char *) -1) \
- break; \
- \
- if (stacke - stackp < NUM_FAILURE_ITEMS) \
- { \
- unsigned char **stackx; \
- unsigned int len = stacke - stackb; \
- if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \
- { \
- FREE_AND_RETURN(stackb,(-2)); \
- } \
- \
- /* Roughly double the size of the stack. */ \
- stackx = DOUBLE_STACK(stackx,stackb,len); \
- /* Rearrange the pointers. */ \
- stackp = stackx + (stackp - stackb); \
- stackb = stackx; \
- stacke = stackb + 2 * len; \
- } \
- \
- /* Now push the info for each of those registers. */ \
- for (this_reg = 1; this_reg <= last_used_reg; this_reg++) \
- { \
- *stackp++ = regstart[this_reg]; \
- *stackp++ = regend[this_reg]; \
- *stackp++ = (unsigned char *) &reg_info[this_reg]; \
- } \
- \
- /* Push how many registers we saved. */ \
- *stackp++ = (unsigned char *) last_used_reg; \
- \
- *stackp++ = pattern_place; \
- *stackp++ = string_place; \
- }
-
+/* We put at most these many items on the stack whenever we push a
+ failure point . */
-/* This pops what PUSH_FAILURE_POINT pushes. */
+#define MAX_FAILURE_ITEMS \
+ (num_internal_regs * NUM_REG_ITEMS + NUM_OTHER_ITEMS)
+
+
+/* We really push this many items when pushing a failure point. We
+ calculate highest_used_reg each time. */
+
+#define NUM_FAILURE_ITEMS \
+ (highest_used_reg * NUM_REG_ITEMS + NUM_OTHER_ITEMS)
+
+/* How many items can still be added to the stack without overflowing it. */
+#define REMAINING_AVAIL_SLOTS \
+ (failure_stack.size - failure_stack.avail)
-#define POP_FAILURE_POINT() \
- { \
- int temp; \
- stackp -= 2; /* Remove failure points. */ \
- temp = (int) *--stackp; /* How many regs pushed. */ \
- temp *= NUM_REG_ITEMS; /* How much to take off the stack. */ \
- stackp -= temp; /* Remove the register info. */ \
- }
#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
@@ -1854,19 +3506,6 @@ struct register_info
}
-/* Call this when have matched something; it sets `matched' flags for the
- registers corresponding to the subexpressions of which we currently
- are inside. */
-#define SET_REGS_MATCHED \
- { unsigned this_reg; \
- for (this_reg = 0; this_reg < RE_NREGS; this_reg++) \
- { \
- if (IS_ACTIVE(reg_info[this_reg])) \
- MATCHED_SOMETHING(reg_info[this_reg]) = 1; \
- else \
- MATCHED_SOMETHING(reg_info[this_reg]) = 0; \
- } \
- }
/* Test if at very beginning or at very end of the virtual concatenation
of string1 and string2. If there is only one string, we've put it in
@@ -1884,58 +3523,151 @@ struct register_info
2) if we're before the beginning of string2, we have to look at the
last character in string1; we assume there is a string1, so use
this in conjunction with AT_STRINGS_BEG. */
+
#define IS_A_LETTER(d) \
(SYNTAX ((d) == end1 ? *string2 : (d) == string2 - 1 ? *(end1 - 1) : *(d))\
== Sword)
+#ifdef REGEX_MALLOC
+#define FREE_VARIABLES \
+ do { \
+ free (failure_stack.stack); \
+ free (regstart); \
+ free (regend); \
+ free (old_regstart); \
+ free (old_regend); \
+ free (reg_info); \
+ free (best_regstart); \
+ free (best_regend); \
+ reg_info = NULL; \
+ failure_stack.stack = NULL; \
+ regstart = regend = old_regstart = old_regend \
+ = best_regstart = best_regend = NULL; \
+ } while (0)
+#endif
+
+
+
+/* The main matching routine, re_match_2. */
+
+static void pop_failure_point();
+
-/* Match the pattern described by PBUFP against the virtual
- concatenation of STRING1 and STRING2, which are of SIZE1 and SIZE2,
- respectively. Start the match at index POS in the virtual
- concatenation of STRING1 and STRING2. In REGS, return the indices of
- the virtual concatenation of STRING1 and STRING2 that matched the
- entire PBUFP->buffer and its contained subexpressions. Do not
- consider matching one past the index MSTOP in the virtual
- concatenation of STRING1 and STRING2.
+/* re_match_2 matches a buffer full of byte commands for matching (gotten
+ from compiling a regular expression) and matches it against the
+ the virtual concatenation of its two string arguments.
+
+ BUFP is a struct re_pattern_buffer * whose pertinent fields are
+ mentioned below:
+
+ It has a char * field BUFFER which points to the byte
+ commands which make up the compiled pattern.
+
+ Its char * field TRANSLATE, if not 0, translates all
+ ordinary elements in the compiled pattern.
- If pbufp->fastmap is nonzero, then it had better be up to date.
+ Its int field SYNTAX is the syntax with which the pattern
+ was compiled and hence should be matched with.
+
+ The long field USED is how many bytes long the compiled
+ pattern is.
+
+ Its size_t field RE_NSUB contains how many subexpressions
+ the pattern has.
+
+ It ignores its NO_SUB bit.
+
+ If its RETURN_DEFAULT_NUM_REGS bit is set, then if REGS is
+ nonzero, re_match_2 reports in REGS->start[i] and
+ REGS->end[i], for i = 1 to BUFP->RE_NSUB + 1, which
+ substring of the virtual concatenation of STRING1 and
+ STRING2 matched the i-th subexpression of the regular
+ expression compiled in BUFFER; it records in REGS->start[0]
+ and REGS->end[0] information about all of that
+ concatenation. If RETURN_DEFAULT_NUM_REGS isn't set,
+ re_match_2 returns in REGS similar information about i
+ things for i = 1 to REGS->num_regs. If REGS is zero,
+ re_match_2 ignores it. See the comment for `struct
+ re_registers' for more details.
+
+ STRING1 and STRING2
+ are the addresses of the strings of which re_match_2 tries
+ to match the virtual concatenation. Because of this
+ concatenation, this function can be used on an Emacs
+ buffer's contents.
+
+ SIZE1 is the size of STRING1.
- The reason that the data to match are specified as two components
- which are to be regarded as concatenated is so this function can be
- used directly on the contents of an Emacs buffer.
+ SIZE2 is the size of STRING2.
+
+ POS is the index in the virtual concatenation of STRING1 and
+ STRING2 at which re_match_2 tries to start the match.
+
+ REGS is a struct re_registers *. If it's not zero, then
+ re_match_2 will fill its fields START and END with
+ information about what substrings of the virtual
+ concatenation of STRING1 and STRING2 were matched by the
+ groups represented in BUFP's BUFFER field. You must have
+ allocated the correct amount of space in the `start' and
+ `end' fields of REGS to accommodate `num_regs' (the other
+ field) registers. See the comment for `struct re_registers'
+ in regex.h for more details.
+
+ STOP is the index in the virtual concatenation of STRING1 and
+ STRING2 beyond which re_match_2 won't consider matching.
- -1 is returned if there is no match. -2 is returned if there is an
- error (such as match stack overflow). Otherwise the value is the
- length of the substring which was matched. */
+ It returns -1 if there is no match, -2 if there is an internal error
+ (such as its stack overflowing). Otherwise, it returns the length of
+ the substring it matched. */
int
-re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
- struct re_pattern_buffer *pbufp;
- char *string1_arg, *string2_arg;
- int size1, size2;
- int pos;
+re_match_2 (bufp, string1_arg, size1_arg, string2_arg, size2_arg, pos,
+ regs, stop)
+ const struct re_pattern_buffer *bufp;
+ const char *string1_arg;
+ const int size1_arg;
+ const char *string2_arg;
+ const int size2_arg;
+ const int pos;
struct re_registers *regs;
- int mstop;
+ const int stop;
{
- register unsigned char *p = (unsigned char *) pbufp->buffer;
+ unsigned char *p = (unsigned char *) bufp->buffer;
+ unsigned char *p1;
/* Pointer to beyond end of buffer. */
- register unsigned char *pend = p + pbufp->used;
+ register unsigned char *pend = p + bufp->used;
unsigned char *string1 = (unsigned char *) string1_arg;
unsigned char *string2 = (unsigned char *) string2_arg;
+ int size1 = size1_arg;
+ int size2 = size2_arg;
unsigned char *end1; /* Just past end of first string. */
unsigned char *end2; /* Just past end of second string. */
+
/* Pointers into string1 and string2, just past the last characters in
each to consider matching. */
unsigned char *end_match_1, *end_match_2;
register unsigned char *d, *dend;
- register int mcnt; /* Multipurpose. */
- unsigned char *translate = (unsigned char *) pbufp->translate;
+ int mcnt, mcnt2; /* Multipurpose. */
+ unsigned char *translate = (unsigned char *) bufp->translate;
unsigned is_a_jump_n = 0;
+ /* This is how many registers the caller wants. */
+ unsigned num_regs_wanted = regs
+ ? bufp->return_default_num_regs
+ ? bufp->re_nsub + 1
+ : regs->num_regs
+ : 0;
+
+ /* Want to fill *all* the registers internally. */
+ unsigned num_internal_regs = bufp->re_nsub + 1;
+
+ void *destination; /* For REGEX_REALLOCATE. */
+
+
/* Failure point stack. Each place that can handle a failure further
down the line pushes a failure point on this stack. It consists of
restart, regend, and reg_info for all registers corresponding to the
@@ -1946,13 +3678,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
``dummy''; if a failure happens and the failure point is a dummy, it
gets discarded and the next next one is tried. */
-#ifndef NO_ALLOCA
- unsigned char *initial_stack[MAX_NUM_FAILURE_ITEMS * NFAILURES];
-#endif
- unsigned char **stackb;
- unsigned char **stackp;
- unsigned char **stacke;
-
+ failure_stack_type failure_stack;
/* Information on the contents of registers. These are pointers into
the input strings; they record just what was matched (on this
@@ -1962,8 +3688,21 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
stopped matching the regnum-th subexpression. (The zeroth register
keeps track of what the whole pattern matches.) */
- unsigned char *regstart[RE_NREGS];
- unsigned char *regend[RE_NREGS];
+ unsigned char **regstart = (unsigned char **)
+ REGEX_ALLOCATE (num_internal_regs * sizeof (unsigned char *));
+ unsigned char **regend = (unsigned char **)
+ REGEX_ALLOCATE (num_internal_regs * sizeof (unsigned char *));
+
+ /* If a group that's operated upon by a repetition operator fails to
+ match anything, then the register for its start will need to be
+ restored because it will have been set to wherever in the string we
+ are when we last see its open-group operator. The argument is
+ similar for a register's end. */
+
+ unsigned char **old_regstart
+ = (unsigned char **) REGEX_ALLOCATE (num_internal_regs * sizeof (unsigned char *));
+ unsigned char **old_regend
+ = (unsigned char **) REGEX_ALLOCATE (num_internal_regs * sizeof (unsigned char *));
/* The is_active field of reg_info helps us keep track of which (possibly
nested) subexpressions we are currently in. The matched_something
@@ -1972,7 +3711,8 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
subexpression. These two fields get reset each time through any
loop their register is in. */
- struct register_info reg_info[RE_NREGS];
+ struct register_info *reg_info = (struct register_info *)
+ REGEX_ALLOCATE (num_internal_regs * sizeof (struct register_info));
/* The following record the register info as found in the above
@@ -1981,36 +3721,92 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
turn happens only if we have not yet matched the entire string. */
unsigned best_regs_set = 0;
- unsigned char *best_regstart[RE_NREGS];
- unsigned char *best_regend[RE_NREGS];
- /* Initialize the stack. */
-#ifdef NO_ALLOCA
- stackb = (unsigned char **) malloc (MAX_NUM_FAILURE_ITEMS * NFAILURES * sizeof (char *));
-#else
- stackb = initial_stack;
-#endif
- stackp = stackb;
- stacke = &stackb[MAX_NUM_FAILURE_ITEMS * NFAILURES];
+ unsigned char **best_regstart
+ = (unsigned char **) REGEX_ALLOCATE (num_internal_regs * sizeof (unsigned char *));
+
+ unsigned char **best_regend
+ = (unsigned char **) REGEX_ALLOCATE (num_internal_regs * sizeof (unsigned char *));
-#ifdef DEBUG_REGEX
- fprintf (stderr, "Entering re_match_2(%s%s)\n", string1_arg, string2_arg);
+ unsigned current_reg = 0;
+
+ /* End of declarations. */
+
+
+ if (!INIT_FAILURE_STACK (failure_stack))
+ return -2;
+
+ if (!(regstart && regend && old_regstart && old_regend && reg_info
+ && best_regstart && best_regend))
+ {
+#ifdef REGEX_MALLOC
+ FREE_VARIABLES;
#endif
+ return -2;
+ }
+ /* The starting position is bogus. */
+ if (pos < 0 || pos > size1 + size2)
+ {
+#ifdef REGEX_MALLOC
+ FREE_VARIABLES;
+#endif
+ return -1;
+ }
+
+
/* Initialize subexpression text positions to -1 to mark ones that no
\( or ( and \) or ) has been seen for. Also set all registers to
- inactive and mark them as not having matched anything or ever
- failed. */
- for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
+ inactive and mark them as not having any inner groups, able to
+ match the empty string, matched anything so far, or ever failed. */
+
+ for (mcnt = 0; mcnt < num_internal_regs; mcnt++)
{
- regstart[mcnt] = regend[mcnt] = (unsigned char *) -1;
+ regstart[mcnt] = regend[mcnt]
+ = old_regstart[mcnt] = old_regend[mcnt] = (unsigned char *) -1;
+
+ if (!INIT_BITS_LIST (INNER_GROUPS (reg_info[mcnt])))
+ {
+#ifdef REGEX_MALLOC
+ FREE_VARIABLES;
+#endif
+ return -2;
+ }
+
+ CAN_MATCH_NOTHING (reg_info[mcnt]) = -1; /* I.e., unset. */
+ /* The bit fields. */
IS_ACTIVE (reg_info[mcnt]) = 0;
MATCHED_SOMETHING (reg_info[mcnt]) = 0;
+ EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
}
- if (regs)
- for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
- regs->start[mcnt] = regs->end[mcnt] = -1;
+ IS_ACTIVE (reg_info[0]) = 1;
+
+
+ if (regs && num_regs_wanted > 0)
+ {
+ if (bufp->syntax & RE_ALLOCATE_REGISTERS)
+ {
+ regs->num_regs = num_regs_wanted;
+ regs->start = (int *) malloc (regs->num_regs * sizeof (int));
+
+ if (regs->start == NULL)
+ return -2;
+
+ regs->end = (int *) malloc (regs->num_regs * sizeof (int));
+
+ if (regs->end == NULL)
+ return -2;
+ }
+
+ for (mcnt = 0; mcnt < regs->num_regs; mcnt++)
+ {
+ regs->start[mcnt] = -1;
+ regs->end[mcnt] = -1;
+ }
+ }
+
+
/* Set up pointers to ends of strings.
Don't allow the second string to be empty unless both are empty. */
@@ -2024,16 +3820,17 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
end1 = string1 + size1;
end2 = string2 + size2;
+
/* Compute where to stop matching, within the two strings. */
- if (mstop <= size1)
+ if (stop <= size1)
{
- end_match_1 = string1 + mstop;
+ end_match_1 = string1 + stop;
end_match_2 = string2;
}
else
{
end_match_1 = end1;
- end_match_2 = string2 + mstop - size1;
+ end_match_2 = string2 + stop - size1;
}
/* `p' scans through the pattern as `d' scans through the data. `dend'
@@ -2041,12 +3838,18 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
advanced into the following input string whenever necessary, but
this happens before fetching; therefore, at the beginning of the
loop, `d' can be pointing at the end of a string, but it cannot
- equal string2. */
+ equal `string2'. */
if (size1 != 0 && pos <= size1)
- d = string1 + pos, dend = end_match_1;
+ {
+ d = string1 + pos;
+ dend = end_match_1;
+ }
else
- d = string2 + pos - size1, dend = end_match_2;
+ {
+ d = string2 + pos - size1;
+ dend = end_match_2;
+ }
/* This loops over pattern commands. It exits by returning from the
@@ -2055,12 +3858,6 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
while (1)
{
-#ifdef DEBUG_REGEX
- fprintf (stderr,
- "regex loop(%d): matching 0x%02d\n",
- p - (unsigned char *) pbufp->buffer,
- *p);
-#endif
is_a_jump_n = 0;
/* End of pattern means we might have succeeded. */
if (p == pend)
@@ -2068,7 +3865,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
/* If not end of string, try backtracking. Otherwise done. */
if (d != end_match_2)
{
- if (stackp != stackb)
+ if (!FAILURE_STACK_EMPTY)
{
/* More failure points to try. */
@@ -2084,7 +3881,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
best_regs_set = 1;
best_regend[0] = d; /* Never use regstart[0]. */
- for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
+ for (mcnt = 1; mcnt < num_internal_regs; mcnt++)
{
best_regstart[mcnt] = regstart[mcnt];
best_regend[mcnt] = regend[mcnt];
@@ -2099,46 +3896,54 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
/* Restore best match. */
d = best_regend[0];
- for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
+ if (d >= string1 && d <= end1)
+ dend = end_match_1;
+
+ for (mcnt = 0; mcnt < num_internal_regs; mcnt++)
{
regstart[mcnt] = best_regstart[mcnt];
regend[mcnt] = best_regend[mcnt];
}
}
- }
+ } /* if (d != end_match_2) */
/* If caller wants register contents data back, convert it
to indices. */
- if (regs)
+ if (regs && regs->num_regs > 0)
{
regs->start[0] = pos;
- if (MATCHING_IN_FIRST_STRING)
- regs->end[0] = d - string1;
- else
- regs->end[0] = d - string2 + size1;
- for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
+
+ regs->end[0] = MATCHING_IN_FIRST_STRING
+ ? d - string1
+ : d - string2 + size1;
+
+ for (mcnt = 1; mcnt < regs->num_regs; mcnt++)
{
- if (regend[mcnt] == (unsigned char *) -1)
+ if (mcnt >= num_internal_regs
+ || regstart[mcnt] == (unsigned char *) -1
+ || regend[mcnt] == (unsigned char *) -1)
{
regs->start[mcnt] = -1;
regs->end[mcnt] = -1;
continue;
}
- if (IS_IN_FIRST_STRING (regstart[mcnt]))
- regs->start[mcnt] = regstart[mcnt] - string1;
- else
- regs->start[mcnt] = regstart[mcnt] - string2 + size1;
+
+ regs->start[mcnt] = IS_IN_FIRST_STRING (regstart[mcnt])
+ ? regstart[mcnt] - string1
+ : regstart[mcnt] - string2 + size1;
- if (IS_IN_FIRST_STRING (regend[mcnt]))
- regs->end[mcnt] = regend[mcnt] - string1;
- else
- regs->end[mcnt] = regend[mcnt] - string2 + size1;
+ regs->end[mcnt] = IS_IN_FIRST_STRING (regend[mcnt])
+ ? regend[mcnt] - string1
+ : regend[mcnt] - string2 + size1;
}
}
- FREE_AND_RETURN(stackb,
- (d - pos - (MATCHING_IN_FIRST_STRING ?
- string1 :
- string2 - size1)));
+
+#ifdef REGEX_MALLOC
+ FREE_VARIABLES;
+#endif
+ return d - pos - (MATCHING_IN_FIRST_STRING
+ ? string1
+ : string2 - size1);
}
/* Otherwise match next pattern command. */
@@ -2150,51 +3955,135 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
{
/* \( [or `(', as appropriate] is represented by start_memory,
- \) by stop_memory. Both of those commands are followed by
- a register number in the next byte. The text matched
- within the \( and \) is recorded under that number. */
- case start_memory:
+ \) by stop_memory. Both of those commands are followed by a
+ register number in the next byte. The text matched within
+ the \( and \) is recorded (in the internal registers data
+ structure) under that number. */
+
+ case start_memory:
+ /* Find out if this group can match the empty string. */
+ p1 = p; /* To send to group_can_match_nothing. */
+
+ if (CAN_MATCH_NOTHING (reg_info[*p]) == -1)
+ CAN_MATCH_NOTHING (reg_info[*p])
+ = group_can_match_nothing (&p1, pend, reg_info);
+
+ /* Save the position in the string where we were the last time
+ we were at this open-group operator in case the group is
+ operated upon by a repetition operator, e.g., with `(a*)*b'
+ against `ab'; then we want to ignore where we are now in
+ the string in case this attempt to match fails. */
+
+ old_regstart[*p] = CAN_MATCH_NOTHING (reg_info[*p])
+ ? ((regstart[*p] == (unsigned char *) -1)
+ ? d : regstart[*p])
+ : regstart[*p];
regstart[*p] = d;
+
IS_ACTIVE (reg_info[*p]) = 1;
MATCHED_SOMETHING (reg_info[*p]) = 0;
p++;
break;
case stop_memory:
+ /* Save the position we were in the string the last time we
+ were at this close-group operator in case the group is
+ operated upon by a repetition operator, e.g., with
+ `((a*)*(b*)*)*' against `aba'; then we want to ignore where
+ we are now in the string in case this attempt to match
+ fails. */
+
+ old_regend[*p] = CAN_MATCH_NOTHING (reg_info[*p])
+ ? ((regend[*p] == (unsigned char *) -1)
+ ? d : regend[*p])
+ : regend[*p];
regend[*p] = d;
IS_ACTIVE (reg_info[*p]) = 0;
-
- /* If just failed to match something this time around with a sub-
- expression that's in a loop, try to force exit from the loop. */
- if ((! MATCHED_SOMETHING (reg_info[*p])
- || (enum regexpcode) p[-3] == start_memory)
+
+ /* Record that this group is inside of all currently active
+ groups; makes no sense for group 1. */
+ if (*p != 1)
+ NOTE_INNER_GROUP (*p);
+
+
+ /* If just failed to match something this time around with a
+ group that's operated on by a repetition operator, try to
+ force exit from the ``loop,'' and restore the register
+ information for this group that we had before trying this
+ last match. */
+
+ if ((!MATCHED_SOMETHING (reg_info[*p])
+ || (enum regexpcode) p[-3] == start_memory)
&& (p + 1) != pend)
{
- register unsigned char *p2 = p + 1;
+ p1 = p + 1;
mcnt = 0;
- switch (*p2++)
+ switch ((enum regexcode) *p1++)
{
- case jump_n:
+ case no_pop_jump_n:
is_a_jump_n = 1;
- case finalize_jump:
- case maybe_finalize_jump:
- case jump:
+ case pop_failure_jump:
+ case maybe_pop_jump:
+ case no_pop_jump:
case dummy_failure_jump:
- EXTRACT_NUMBER_AND_INCR (mcnt, p2);
+ extract_number_and_incr (&mcnt, &p1);
if (is_a_jump_n)
- p2 += 2;
+ p1 += 2;
break;
}
- p2 += mcnt;
+ p1 += mcnt;
/* If the next operation is a jump backwards in the pattern
- to an on_failure_jump, exit from the loop by forcing a
- failure after pushing on the stack the on_failure_jump's
- jump in the pattern, and d. */
- if (mcnt < 0 && (enum regexpcode) *p2++ == on_failure_jump)
+ to an on_failure_jump right before the start_memory
+ corresponding to this stop_memory, exit from the loop
+ by forcing a failure after pushing on the stack the
+ on_failure_jump's jump in the pattern, and d. */
+
+ if (mcnt < 0 && (enum regexpcode) *p1 == on_failure_jump
+ && (enum regexpcode) p1[3] == start_memory && p1[4] == *p)
{
- EXTRACT_NUMBER_AND_INCR (mcnt, p2);
- PUSH_FAILURE_POINT (p2 + mcnt, d);
+ /* If this group ever matched anything, then
+ restore what its registers were before trying
+ this last failed match, e.g., with `(a*)*b' against
+ `ab' for regstart[1], and, e.g., with `((a*)*(b*)*)*'
+ against `aba' for regend[3].
+
+ Restore the registers for inner groups, too, e.g.,
+ for `((a*)(b*))*' against `aba' (register 2 gets
+ trashed). */
+
+ if (EVER_MATCHED_SOMETHING (reg_info[*p]))
+ {
+ unsigned this_reg;
+ unsigned bits_mask;
+
+ EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
+
+ /* Restore this group's registers. */
+
+ regstart[*p] = old_regstart[*p];
+ regend[*p] = old_regend[*p];
+
+ /* Restore the inner groups' (if any) registers. */
+
+ for (this_reg = 0;
+ this_reg < INNER_GROUPS (reg_info[*p]).size;
+ this_reg++)
+ {
+ if (get_bit (INNER_GROUPS (reg_info[*p]), this_reg))
+ {
+ regstart[this_reg] = old_regstart[this_reg];
+
+ if ((int)old_regend[this_reg]
+ >= (int)regstart[this_reg])
+ regend[this_reg] = old_regend[this_reg];
+ }
+ }
+ }
+ p1++;
+ extract_number_and_incr (&mcnt, &p1);
+ PUSH_FAILURE_POINT (p1 + mcnt, d, failure_stack, -2);
+
goto fail;
}
}
@@ -2205,10 +4094,16 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
followed by the numeric value of <digit> as the register number. */
case duplicate:
{
- int regno = *p++; /* Get which register to match against */
register unsigned char *d2, *dend2;
-
- /* Where in input to try to start matching. */
+ int regno = *p++; /* Get which register to match against. */
+
+ /* Can't back reference a group which we've never matched. */
+ if ((regstart[regno] == (unsigned char *) -1
+ || regend[regno] == (unsigned char *) -1)
+ && ! bufp->can_be_null)
+ goto really_fail;
+
+ /* Where in input to try to start matching. */
d2 = regstart[regno];
/* Where to stop matching; if both the place to start and
@@ -2227,7 +4122,10 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
{
if (dend2 == end_match_2) break;
if (dend2 == regend[regno]) break;
- d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
+
+ /* end of string1 => advance to string2. */
+ d2 = string2;
+ dend2 = regend[regno];
}
/* At end of register contents => success */
if (d2 == dend2) break;
@@ -2246,8 +4144,8 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
/* Compare that many; failure if mismatch, else move
past them. */
if (translate
- ? memcmp_translate (d, d2, mcnt, translate)
- : memcmp ((char *)d, (char *)d2, mcnt))
+ ? bcmp_translate (d, d2, mcnt, translate)
+ : bcmp (d, d2, mcnt))
goto fail;
d += mcnt, d2 += mcnt;
}
@@ -2256,12 +4154,14 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
case anychar:
PREFETCH; /* Fetch a data character. */
- /* Match anything but a newline, maybe even a null. */
- if ((translate ? translate[*d] : *d) == '\n'
- || ((obscure_syntax & RE_DOT_NOT_NULL)
+ /* Match anything but possibly a newline or a null. */
+ if ((!(bufp->syntax & RE_DOT_NEWLINE)
+ && (translate ? translate[*d] : *d) == '\n')
+ || ((bufp->syntax & RE_DOT_NOT_NULL)
&& (translate ? translate[*d] : *d) == '\000'))
goto fail;
- SET_REGS_MATCHED;
+
+ SET_REGS_MATCHED;
d++;
break;
@@ -2275,10 +4175,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
PREFETCH; /* Fetch a data character. */
- if (translate)
- c = translate[*d];
- else
- c = *d;
+ c = translate ? translate[*d] : *d;
if (c < *p * BYTEWIDTH
&& p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
@@ -2293,66 +4190,105 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
}
case begline:
+ if (bufp->not_bol == 1)
+ goto fail;
+
+ if (d && (*d == '\n' || d[-1] == '\n'))
+ {
+ if (*d == '\n')
+ d++;
+
+ if (bufp->syntax & RE_NO_ANCHOR_AT_NEWLINE)
+ goto fail;
+ else
+ break;
+ }
+
if ((size1 != 0 && d == string1)
|| (size1 == 0 && size2 != 0 && d == string2)
- || (d && d[-1] == '\n')
|| (size1 == 0 && size2 == 0))
break;
else
goto fail;
case endline:
+ if (bufp->not_eol == 1)
+ goto fail;
+
if (d == end2
- || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
+ || (d == end1 && size2 == 0))
break;
- goto fail;
- /* `or' constructs are handled by starting each alternative with
- an on_failure_jump that points to the start of the next
- alternative. Each alternative except the last ends with a
- jump to the joining point. (Actually, each jump except for
- the last one really jumps to the following jump, because
- tensioning the jumps is a hassle.) */
+ if (*d == '\n' || (d == end1 && *string2 == '\n'))
+ {
+ PREFETCH;
+
+ if (*d == '\n')
+ d++;
+
+ if (bufp->syntax & RE_NO_ANCHOR_AT_NEWLINE)
+ goto fail;
+ else
+ break;
+ }
+ goto fail;
- /* The start of a stupid repeat has an on_failure_jump that points
- past the end of the repeat text. This makes a failure point so
- that on failure to match a repetition, matching restarts past
- as many repetitions have been found with no way to fail and
- look for another one. */
+ /* Uses of on_failure_jump:
+
+ Each alternative starts with an on_failure_jump that points
+ to the beginning of the next alternative. Each alternative
+ except the last ends with a jump that in effect jumps past
+ the rest of the alternatives. (They really jump to the
+ ending jump of the following alternative, because tensioning
+ these jumps is a hassle.)
- /* A smart repeat is similar but loops back to the on_failure_jump
- so that each repetition makes another failure point. */
+ Repeats start with an on_failure_jump that points past both
+ the repetition text and the following jump or
+ pop_failure_jump back to this on_failure_jump. */
case on_failure_jump:
on_failure:
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
- PUSH_FAILURE_POINT (p + mcnt, d);
+ extract_number_and_incr (&mcnt, &p);
+ PUSH_FAILURE_POINT (p + mcnt, d, failure_stack, -2);
+
break;
- /* The end of a smart repeat has a maybe_finalize_jump back.
- Change it either to a finalize_jump or an ordinary jump. */
- case maybe_finalize_jump:
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
+
+ /* A smart repeat ends with a maybe_pop_jump.
+ We change it either to a pop_failure_jump or a no_pop_jump. */
+
+ case maybe_pop_jump:
+ extract_number_and_incr (&mcnt, &p);
{
register unsigned char *p2 = p;
- /* Compare what follows with the beginning of the repeat.
- If we can establish that there is nothing that they would
- both match, we can change to finalize_jump. */
+
+ /* Compare the beginning of the repeat with what in the
+ pattern follows its end. If we can establish that there
+ is nothing that they would both match, i.e., that we
+ would have to backtrack because of (as would in, e.g.,
+ `a*a') then we can change to pop_failure_jump, because
+ we'll never have to backtrack. */
+
+ /* Skip over parentheses. */
while (p2 + 1 != pend
&& (*p2 == (unsigned char) stop_memory
|| *p2 == (unsigned char) start_memory))
- p2 += 2; /* Skip over reg number. */
- if (p2 == pend)
- p[-3] = (unsigned char) finalize_jump;
- else if (*p2 == (unsigned char) exactn
+ p2 += 2; /* Skip over reg number, too. */
+
+ if (p2 == pend)
+ p[-3] = (unsigned char) pop_failure_jump;
+ else if (*p2 == (unsigned char) exactn
|| *p2 == (unsigned char) endline)
{
register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
register unsigned char *p1 = p + mcnt;
- /* p1[0] ... p1[2] are an on_failure_jump.
- Examine what follows that. */
- if (p1[3] == (unsigned char) exactn && p1[5] != c)
- p[-3] = (unsigned char) finalize_jump;
+
+ /* p1[0] ... p1[2] are the on_failure_jump corresponding
+ to the maybe_finalize_jump of this case. Examine what
+ follows it. */
+
+ if (p1[3] == (unsigned char) exactn && p1[5] != c)
+ p[-3] = (unsigned char) pop_failure_jump;
else if (p1[3] == (unsigned char) charset
|| p1[3] == (unsigned char) charset_not)
{
@@ -2360,53 +4296,83 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
if (c < p1[4] * BYTEWIDTH
&& p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
not = !not;
- /* `not' is 1 if c would match. */
- /* That means it is not safe to finalize. */
+ /* `not' is equal to 1 if c would match, which means
+ that we can't change to pop_failure_jump. */
if (!not)
- p[-3] = (unsigned char) finalize_jump;
+ p[-3] = (unsigned char) pop_failure_jump;
}
}
}
p -= 2; /* Point at relative address again. */
- if (p[-1] != (unsigned char) finalize_jump)
+ if (p[-1] != (unsigned char) pop_failure_jump)
{
- p[-1] = (unsigned char) jump;
- goto nofinalize;
+ p[-1] = (unsigned char) no_pop_jump;
+ goto no_pop;
}
/* Note fall through. */
- /* The end of a stupid repeat has a finalize_jump back to the
- start, where another failure point will be made which will
- point to after all the repetitions found so far. */
+ /* The end of a simple repeat has a pop_failure_jump back to
+ its matching on_failure_jump, where the latter will push a
+ failure point point. The pop_failure_jump takes off failure
+ points put on by this pop_failure_jump's matching
+ on_failure_jump; we got through the pattern to here from the
+ matching on_failure_jump, so didn't fail. Also remove the
+ register information put on by the matching on_failure_jump. */
+
+ case pop_failure_jump:
+ pop:
+ pop_failure_point (&failure_stack);
+ /* Note fall through. */
+
+ /* Jump without taking off any failure points. */
- /* Take off failure points put on by matching on_failure_jump
- because didn't fail. Also remove the register information
- put on by the on_failure_jump. */
- case finalize_jump:
- POP_FAILURE_POINT ();
- /* Note fall through. */
-
- /* Jump without taking off any failure points. */
- case jump:
- nofinalize:
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
- p += mcnt;
+ case no_pop_jump:
+ no_pop:
+ extract_number_and_incr (&mcnt, &p); /* Get the amount to jump. */
+ p += mcnt; /* Do the jump. */
break;
+
+ /* If the last alternative didn't match anything and empty
+ alternatives aren't allowed, then don't skip over the next
+ one. */
+
+ case jump_past_next_alt:
+ {
+ int this_reg; /* Counting down. */
+
+ /* The current register is the innermost (the one with the
+ highest number) active one. */
+
+ for (this_reg = num_internal_regs - 1;
+ this_reg >= 0; this_reg--)
+ if (IS_ACTIVE (reg_info[this_reg]))
+ break;
+
+ if (!(bufp->syntax & RE_NO_EMPTY_ALTS)
+ || MATCHED_SOMETHING (reg_info[this_reg]))
+ goto no_pop;
+
+ p += 2; /* Skip past the jump's number. */
+ break;
+ }
+
case dummy_failure_jump:
/* Normally, the on_failure_jump pushes a failure point, which
- then gets popped at finalize_jump. We will end up at
- finalize_jump, also, and with a pattern of, say, `a+', we
+ then gets popped at pop_failure_jump. We will end up at
+ pop_failure_jump, also, and with a pattern of, say, `a+', we
are skipping over the on_failure_jump, so we have to push
- something meaningless for finalize_jump to pop. */
- PUSH_FAILURE_POINT (0, 0);
- goto nofinalize;
+ something meaningless for pop_failure_jump to pop. */
+
+ PUSH_FAILURE_POINT (0, 0, failure_stack, -2);
+
+ goto no_pop;
/* Have to succeed matching what follows at least n times. Then
just handle like an on_failure_jump. */
case succeed_n:
- EXTRACT_NUMBER (mcnt, p + 2);
+ mcnt = extract_number (p + 2);
/* Originally, this is how many times we HAVE to succeed. */
if (mcnt)
{
@@ -2416,8 +4382,8 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
}
else if (mcnt == 0)
{
- p[2] = unused;
- p[3] = unused;
+ p[2] = (char) no_op;
+ p[3] = (char) no_op;
goto on_failure;
}
else
@@ -2427,15 +4393,14 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
}
break;
- case jump_n:
- EXTRACT_NUMBER (mcnt, p + 2);
+ case no_pop_jump_n:
+ mcnt = extract_number (p + 2);
/* Originally, this is how many times we CAN jump. */
if (mcnt)
{
mcnt--;
STORE_NUMBER(p + 2, mcnt);
- goto nofinalize; /* Do the jump without taking off
- any failure points. */
+ goto no_pop;
}
/* If don't have to jump any more, skip over the rest of command. */
else
@@ -2446,16 +4411,16 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
{
register unsigned char *p1;
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ extract_number_and_incr (&mcnt, &p);
p1 = p + mcnt;
- EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ extract_number_and_incr (&mcnt, &p);
STORE_NUMBER (p1, mcnt);
break;
}
/* Ignore these. Used to ignore the n of succeed_n's which
currently have n == 0. */
- case unused:
+ case no_op:
break;
case wordbound:
@@ -2469,32 +4434,56 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
break;
case wordbeg:
- if (IS_A_LETTER (d) && (!IS_A_LETTER (d - 1) || AT_STRINGS_BEG))
+ /* Have to check if AT_STRINGS_BEG before looking at d - 1. */
+ if (IS_A_LETTER (d) && (AT_STRINGS_BEG || !IS_A_LETTER (d - 1)))
break;
goto fail;
case wordend:
/* Have to check if AT_STRINGS_BEG before looking at d - 1. */
- if (!AT_STRINGS_BEG && IS_A_LETTER (d - 1)
+ if (!AT_STRINGS_BEG && IS_A_LETTER (d - 1)
&& (!IS_A_LETTER (d) || AT_STRINGS_END))
break;
goto fail;
#ifdef emacs
- case before_dot:
- if (PTR_CHAR_POS (d) >= point)
+#ifdef emacs19
+ case before_dot:
+ if (PTR_CHAR_POS (d) >= point)
+ goto fail;
+ break;
+
+ case at_dot:
+ if (PTR_CHAR_POS (d) != point)
+ goto fail;
+ break;
+
+ case after_dot:
+ if (PTR_CHAR_POS (d) <= point)
+ goto fail;
+ break;
+#else /* not emacs19 */
+ case before_dot:
+ if (((d - string2 <= (unsigned) size2)
+ ? d - bf_p2 : d - bf_p1)
+ <= point)
goto fail;
break;
case at_dot:
- if (PTR_CHAR_POS (d) != point)
+ if (((d - string2 <= (unsigned) size2)
+ ? d - bf_p2 : d - bf_p1)
+ == point)
goto fail;
break;
case after_dot:
- if (PTR_CHAR_POS (d) <= point)
+ if (((d - string2 <= (unsigned) size2)
+ ? d - bf_p2 : d - bf_p1)
+ >= point)
goto fail;
break;
+#endif /* not emacs19 */
case wordchar:
mcnt = (int) Sword;
@@ -2579,28 +4568,64 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
/* Jump here if any matching operation fails. */
fail:
- if (stackp != stackb)
+ if (!FAILURE_STACK_EMPTY)
/* A restart point is known. Restart there and pop it. */
{
- short last_used_reg, this_reg;
-
- /* If this failure point is from a dummy_failure_point, just
- skip it. */
- if (!stackp[-2])
+ short highest_used_reg, this_reg;
+ boolean is_a_jump_n = false;
+
+ /* If this failure point is from a dummy_failure_point,
+ just skip it. */
+
+ if (!failure_stack.stack[failure_stack.avail - 2])
{
- POP_FAILURE_POINT ();
+ pop_failure_point (&failure_stack);
goto fail;
}
- d = *--stackp;
- p = *--stackp;
+ /* Among other things, undo the last failure point push. */
+
+ d = failure_stack.stack[--failure_stack.avail];
+ p = failure_stack.stack[--failure_stack.avail];
+
+
+ /* If failed to a backwards jump that's part of a repetition
+ loop, need to pop this failure point and use the next one. */
+
+ switch ((enum regexpcode) *p)
+ {
+ case no_pop_jump_n:
+ is_a_jump_n = true;
+ case maybe_pop_jump:
+ case pop_failure_jump:
+ case no_pop_jump:
+ p1 = p + 1;
+ extract_number_and_incr (&mcnt, &p1);
+ p1 += mcnt;
+
+ if ((is_a_jump_n && *p1 == succeed_n)
+ || (!is_a_jump_n && *p1 == on_failure_jump))
+ {
+ /* Put p and d back on the stack again... */
+ failure_stack.avail += 2;
+
+ /* ...and pop the whole failure point. */
+ pop_failure_point (&failure_stack);
+ goto fail;
+ }
+ break;
+ }
+
if (d >= string1 && d <= end1)
dend = end_match_1;
+
/* Restore register info. */
- last_used_reg = (short) *--stackp;
+ highest_used_reg
+ = (short) failure_stack.stack[--failure_stack.avail];
/* Make the ones that weren't saved -1 or 0 again. */
- for (this_reg = RE_NREGS - 1; this_reg > last_used_reg; this_reg--)
+ for (this_reg = num_internal_regs - 1; this_reg > highest_used_reg;
+ this_reg--)
{
regend[this_reg] = (unsigned char *) -1;
regstart[this_reg] = (unsigned char *) -1;
@@ -2611,24 +4636,342 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
/* And restore the rest from the stack. */
for ( ; this_reg > 0; this_reg--)
{
- reg_info[this_reg] = *(struct register_info *) *--stackp;
- regend[this_reg] = *--stackp;
- regstart[this_reg] = *--stackp;
+ reg_info[this_reg] = *(struct register_info *)
+ failure_stack.stack[--failure_stack.avail];
+
+ regend[this_reg]
+ = failure_stack.stack[--failure_stack.avail];
+
+ regstart[this_reg]
+ = failure_stack.stack[--failure_stack.avail];
}
- }
+ }
else
break; /* Matching at this starting point really fails. */
- }
+ } /* while (1) */
+ really_fail:
if (best_regs_set)
goto restore_best_regs;
- FREE_AND_RETURN(stackb,(-1)); /* Failure to match. */
+#ifdef REGEX_MALLOC
+ FREE_VARIABLES;
+#endif
+ return -1; /* Failure to match. */
+}
+
+
+
+
+/* Subroutine definitions for re_match_2. */
+
+
+
+/* Failure stack stuff. */
+
+/* Pops what PUSH_FAILURE_STACK pushes. */
+
+static void
+pop_failure_point(failure_stack_ptr)
+ failure_stack_type *failure_stack_ptr;
+{
+ int temp;
+
+ if (FAILURE_STACK_PTR_EMPTY)
+ {
+ printf ("Tried to pop empty failure point in re_match_2.\n");
+ exit (1);
+ }
+
+ /* Remove failure points and point to how many regs pushed. */
+ else
+ {
+ if (failure_stack_ptr->avail < 3)
+ {
+ printf ("Aren't enough items to pop on re_match_2 failure stack: \
+there's only %d on it.\n", failure_stack_ptr->avail);
+ exit (1);
+ }
+ failure_stack_ptr->avail -= 3;
+ temp = (int) failure_stack_ptr->stack[failure_stack_ptr->avail];
+ temp *= NUM_REG_ITEMS; /* How much to take off the stack. */
+
+ if (failure_stack_ptr->avail < temp)
+ {
+ printf ("Can't pop %d items off re_match_2 failure stack: \
+there's only %d on it.\n", temp, failure_stack_ptr->avail);
+ exit (1);
+ }
+ failure_stack_ptr->avail -= temp; /* Remove the register info. */
+ }
+}
+
+
+/* Other things. */
+
+static boolean common_op_can_match_nothing ();
+static boolean alternative_can_match_nothing ();
+
+
+/* We are given P pointing to a register number after a start_memory.
+
+ Return true if the pattern up to the corresponding stop_memory can
+ match the empty string, and false otherwise.
+
+ If we find the matching stop_memory, sets P to point to one past its number.
+ Otherwise, sets P to an undefined byte less than or equal to END.
+
+ We don't handle duplicates properly (yet). */
+
+static boolean
+group_can_match_nothing (p, end, reg_info)
+ unsigned char **p, *end;
+ struct register_info *reg_info;
+{
+ int mcnt;
+ unsigned char *p1 = *p + 1; /* Point to after this register number. */
+
+ while (p1 < end)
+ {
+ /* Skip over opcodes that can match nothing, and return true or
+ false, as appropriate, when we get to one that can't, or to the
+ matching stop_memory. */
+
+ switch ((enum regexpcode) *p1)
+ {
+ /* Could be either a loop or a series of alternatives. */
+ case on_failure_jump:
+ p1++;
+ extract_number_and_incr (&mcnt, &p1);
+
+ /* If the next operation is not a jump backwards in the
+ pattern. */
+
+ if (mcnt >= 0)
+ {
+ /* Go through the on_failure_jumps of the alternatives,
+ seeing if any of the alternatives cannot match nothing.
+ The last alternative starts with only a no_pop_jump,
+ whereas the rest start with on_failure_jump and end
+ with a no_pop_jump, e.g., here is the pattern for `a|b|c':
+
+ /on_failure_jump/0/6/exactn/1/a/jump_past_next_alt/0/6
+ /on_failure_jump/0/6/exactn/1/b/jump_past_next_alt/0/3
+ /exactn/1/c
+
+ So, we have to first go through the first (n-1)
+ alternatives and then deal with the last one separately. */
+
+
+ /* Deal with the first (n-1) alternatives, which start
+ with an on_failure_jump (see above) that jumps to right
+ past a jump_past_next_alt. */
+
+ while ((enum regexpcode) p1[mcnt-3] == jump_past_next_alt)
+ {
+ /* MCNT holds how many bytes long the alternative
+ is, including the ending `jump_past_next_alt' and its number. */
+
+ if (!alternative_can_match_nothing (p1, p1 + mcnt - 3,
+ reg_info))
+ return false;
+
+ /* Move to right after this alternative, including the
+ jump_past_next_alt. */
+
+ p1 += mcnt;
+
+ /* Break if it's the beginning of an n-th alternative
+ that doesn't begin with an on_failure_jump. */
+
+ if ((enum regexpcode) *p1 != on_failure_jump)
+ break;
+
+ /* Still have to check that it's not an n-th
+ alternative that starts with an on_failure_jump. */
+ p1++;
+ extract_number_and_incr (&mcnt, &p1);
+ if ((enum regexpcode) p1[mcnt-3] != jump_past_next_alt)
+ {
+ /* Get to the beginning of the n-th alternative. */
+ p1 -= 3;
+ break;
+ }
+ }
+
+ /* Deal with the last alternative: go back and get number
+ of the jump_past_next_alt just before it. MCNT contains how
+ many bytes long the alternative is. */
+
+ mcnt = extract_number (p1 - 2);
+
+ if (!alternative_can_match_nothing (p1, p1 + mcnt, reg_info))
+ return false;
+
+ p1 += mcnt; /* Get past the n-th alternative. */
+
+ } /* if mcnt > 0 */
+
+ break;
+
+ case stop_memory:
+ if (p1[1] == **p)
+ {
+ *p = p1 + 2;
+ return true;
+ }
+ else
+ {
+ printf ("Error: encountered an unmatched (%d) stop_memory in \
+group_can_match_nothing.\n", **p);
+ exit (1);
+ }
+ break;
+
+ default:
+ if (!common_op_can_match_nothing (&p1, end, reg_info))
+ return false;
+ }
+ } /* While p1 < end. */
+
+ return false;
+}
+
+
+/* Similar to group_can_match_nothing, but doesn't deal with alternatives:
+ It expects P to be the first byte of a single alternative and END one
+ byte past the last. The alternative can contain groups. */
+
+
+static boolean
+alternative_can_match_nothing (p, end, reg_info)
+ unsigned char *p, *end;
+ struct register_info *reg_info;
+{
+ int mcnt;
+ unsigned char *p1 = p;
+
+ while (p1 < end)
+ {
+ /* Skip over opcodes that can match nothing, and break when we get
+ to one that can't. */
+
+ switch ((enum regexpcode) *p1)
+ {
+ /* It's a loop. */
+ case on_failure_jump:
+ p1++;
+ extract_number_and_incr (&mcnt, &p1);
+ p1 += mcnt;
+ break;
+
+ default:
+ if (!common_op_can_match_nothing (&p1, end, reg_info))
+ return false;
+ }
+ } /* While not at the end of the alternative. */
+
+ return true;
+}
+
+
+/* Deals with the ops common to group_can_match_nothing and
+ alternative_can_match_nothing.
+
+ Sets P to one after the op and its arguments, if any. */
+
+static boolean
+common_op_can_match_nothing (p, end, reg_info)
+ unsigned char **p, *end;
+ struct register_info *reg_info;
+{
+ int mcnt;
+ unsigned char *p1 = *p;
+ boolean ret;
+ int reg_no;
+
+ switch ((enum regexp1code) *p1++)
+ {
+ case no_op:
+ case begline:
+ case endline:
+ case endline_in_repeat:
+ case endline_before_newline:
+ break;
+
+ case start_memory:
+ reg_no = *p1;
+ ret = group_can_match_nothing (&p1, end, reg_info);
+
+ /* Have to set this here in case we're checking a group which
+ contains a group and a back reference to it. */
+
+ if (CAN_MATCH_NOTHING (reg_info[reg_no]) == -1)
+ CAN_MATCH_NOTHING (reg_info[reg_no]) = ret;
+
+ if (!ret)
+ return false;
+ break;
+
+ /* If this is an optimized succeed_n for zero times, make the jump. */
+ case no_pop_jump:
+ extract_number_and_incr (&mcnt, &p1);
+
+ if (mcnt >= 0)
+ p1 += mcnt;
+ else
+ return false;
+ break;
+
+ case succeed_n:
+ /* Get to the number of times to succeed. */
+ p1 += 2;
+ extract_number_and_incr (&mcnt, &p1);
+
+ if (mcnt == 0)
+ {
+ p1 -= 4;
+ extract_number_and_incr (&mcnt, &p1);
+ p1 += mcnt;
+ }
+ else
+ return false;
+ break;
+
+ case duplicate:
+ if (!CAN_MATCH_NOTHING (reg_info[*p1]))
+ return false;
+ break;
+
+ case set_number_at:
+ p1 += 4;
+ case before_dot:
+ case at_dot:
+ case after_dot:
+ case begbuf:
+ case endbuf:
+ case wordbeg:
+ case wordend:
+ case wordbound:
+ case notwordbound:
+ break;
+
+ default:
+ /* All other opcodes mean we cannot match the empty string. */
+ return false;
+ }
+
+ *p = p1;
+ return true;
}
+
+/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
+ bytes; nonzero otherwise. */
+
static int
-memcmp_translate (s1, s2, len, translate)
+bcmp_translate (s1, s2, len, translate)
unsigned char *s1, *s2;
register int len;
unsigned char *translate;
@@ -2643,17 +4986,21 @@ memcmp_translate (s1, s2, len, translate)
}
+
-/* Entry points compatible with 4.2 BSD regex library. */
+/* Entry points compatible with 4.2 BSD regex library. We don't define
+ them if this is an Emacs or POSIX compilation. */
-#ifndef emacs
+#if !defined(GAWK) && !defined (emacs) && !defined (_POSIX_SOURCE)
static struct re_pattern_buffer re_comp_buf;
char *
re_comp (s)
- char *s;
+ const char *s;
{
+ char *return_value;
+
if (!s)
{
if (!re_comp_buf.buffer)
@@ -2663,32 +5010,403 @@ re_comp (s)
if (!re_comp_buf.buffer)
{
- if (!(re_comp_buf.buffer = (char *) malloc (200)))
- return "Memory exhausted";
+ re_comp_buf.buffer = (char *) malloc (200);
+
+ if (re_comp_buf.buffer == NULL)
+ return "Memory exhausted";
+
re_comp_buf.allocated = 200;
- if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
+
+ re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
+
+ if (re_comp_buf.fastmap == NULL)
return "Memory exhausted";
}
- return re_compile_pattern (s, strlen (s), &re_comp_buf);
+ return regex_compile (s, strlen (s), obscure_syntax, &re_comp_buf);
}
int
re_exec (s)
- char *s;
+ const char *s;
{
- int len = strlen (s);
- return 0 <= re_search (&re_comp_buf, s, len, 0, len,
- (struct re_registers *) 0);
+ const int len = strlen (s);
+ return 0 <= re_search (&re_comp_buf, s, len, 0, len,
+ (struct re_registers *) 0);
}
-#endif /* not emacs */
+
+#endif /* not emacs and not _POSIX_SOURCE */
+
+
+
+/* Entry points compatible with POSIX regex library. Only define these
+ when this is a POSIX compilation (and it's not Emacs). */
+
+#if !defined(emacs) && !defined(GAWK)
+
+/* regcomp takes a regular-expression string and converts it into a
+ buffer full of byte commands for matching.
+
+ PREG is a regex_t * whose pertinent fields are mentioned in below:
+
+ It has a char * field called BUFFER which points to the
+ space where this routine will put the compiled pattern; the
+ user can either allocate this using malloc (whereupon they
+ should set the long field ALLOCATED to the number of bytes
+ malloced) or set ALLOCATED to 0 and let the routine
+ allocate it. The routine may use realloc to enlarge the
+ buffer space.
+
+ If the user wants to translate all ordinary elements in the
+ compiled pattern, they should set the char * field
+ TRANSLATE to a translate table (and not set the REG_ICASE
+ bit of CFLAGS, which would override this translate table
+ with one that ignores case); otherwise, they should set
+ TRANSLATE to 0.
+
+ The routine sets the int field SYNTAX to RE_SYNTAX_POSIX_EXTENDED
+ if the REG_EXTENDED bit in CFLAGS is set; otherwise, it sets it
+ to RE_SYNTAX_POSIX_BASIC.
+
+ It returns in the long field USED how many bytes long the
+ compiled pattern is.
+
+ It returns 0 in the char field FASTMAP_ACCURATE, on
+ the assumption that the user usually doesn't compile the
+ same pattern twice and that consequently any fastmap in the
+ pattern buffer is inaccurate.
+
+ In the size_t field RE_NSUB, it returns the number of
+ subexpressions it found in PATTERN.
+
+ PATTERN is the address of the pattern string.
+
+ CFLAGS is a series of bits ORed together which affect compilation.
+ If the bit REG_EXTENDED is set, regcomp compiles the
+ pattern as an extended regular expression, otherwise it
+ compiles it as a basic one. If the bit REG_NEWLINE is set,
+ then dot and nonmatching lists won't match a newline, but
+ pattern anchors will match at them. If the bit REG_ICASE
+ is set, then it considers upper- and lowercase versions of
+ letters to be equal when matching. If the bit REG_NOSUB is
+ set, then when PREG is passed to regexec, that routine will
+ only report success or failure.
+
+
+ It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
+ POSIX return codes and their meanings.) */
+
+
+int
+regcomp (preg, pattern, cflags)
+ regex_t *preg;
+ const char *pattern;
+ int cflags;
+{
+ char *return_value;
+
+ int syntax = cflags & REG_EXTENDED
+ ? RE_SYNTAX_POSIX_EXTENDED
+ : RE_SYNTAX_POSIX_BASIC;
+
+ if (cflags & REG_NEWLINE)
+ {
+ syntax &= ~RE_DOT_NEWLINE;
+ syntax |= RE_HAT_LISTS_NOT_NEWLINE;
+ syntax &= ~RE_NO_ANCHOR_AT_NEWLINE;
+ }
+
+ if (cflags & REG_ICASE)
+ {
+ unsigned i;
+
+ preg->translate = (char *) malloc (CHAR_SET_SIZE);
+
+ if (preg->translate == NULL)
+ return REG_ESPACE;
+
+ /* Map any uppercase characters into corresponding lowercase ones. */
+ for (i = 0; i < CHAR_SET_SIZE; i++)
+ preg->translate[i] = isupper (i) ? tolower (i) : i;
+ }
+ else
+ preg->translate = 0;
+
+ preg->no_sub = cflags & REG_NOSUB;
+
+ return_value = regex_compile (pattern, strlen (pattern), syntax, preg);
+
+
+ if (return_value == 0)
+ return 0;
+ else if (strcmp (return_value, "Invalid regular expression") == 0)
+ return REG_BADPAT;
+ else if (strcmp (return_value, "Invalid character class name") == 0)
+ return REG_ECTYPE;
+ else if (strcmp (return_value, "Trailing backslash") == 0)
+ return REG_EESCAPE;
+ else if (strcmp (return_value, "Invalid back reference") == 0)
+ return REG_ESUBREG;
+ else if (strcmp (return_value, "Unmatched [ or [^") == 0)
+ return REG_EBRACK;
+ else if (strcmp (return_value, "Unmatched ( or \\(") == 0
+ || strcmp (return_value, "Unmatched ) or \\)") == 0)
+ return REG_EPAREN;
+ else if (strcmp (return_value, "Unmatched \\{") == 0)
+ return REG_EBRACE;
+ else if (strcmp (return_value, "Invalid content of \\{\\}") == 0)
+ return REG_BADBR;
+ else if (strcmp (return_value, "Invalid range end") == 0)
+ return REG_ERANGE;
+ else if (strcmp (return_value, "Memory exhausted") == 0)
+ return REG_ESPACE;
+ else if (strcmp (return_value, "Invalid preceding regular expression") == 0
+ || strcmp (return_value,
+ "Missing preceding regular expression") == 0)
+ return REG_BADRPT;
+
+ /* Codes added by GNU. */
+
+ else if (strcmp (return_value, "Premature end of regular expression") == 0)
+ return REG_EEND;
+ else if (strcmp (return_value, "Regular expression too big") == 0)
+ return REG_ESIZE;
+else
+ return REG_BADPAT;
+}
+
+
+/* regexex matches a buffer full of byte commands for matching (gotten
+ from compiling a regular expression) and matches it against a string.
+
+ PREG is a regex_t * whose pertinent fields are mentioned below:
+
+ It has a char * field called BUFFER which points to
+ the byte commands which make up the compiled pattern.
+
+ Its char * field TRANSLATE, if not 0, translates all
+ ordinary elements in the compiled pattern.
+
+ Its int field SYNTAX is the syntax with which the pattern
+ was compiled and hence should be matched with.
+
+ The long field USED is how many bytes long the compiled
+ pattern is.
+
+ Its size_t field RE_NSUB contains how many subexpressions
+ the pattern has. (This may be useful for choosing a value
+ for NMATCH).
+
+ If its unsigned NO_SUB bit is set, then regexec will not
+ return anything in PMATCH, but only report whether or not
+ BUFFER matched STRING.
+
+ Regardless of how its unsigned RETURN_DEFAULT_NUM_REGS bit
+ is set, regexec only returns in PMATCH information about
+ the whole pattern and NMATCH - 1 of its subexpressions.
+
+ STRING is the address of the string to be matched.
+
+ NMATCH is how many elements of PMATCH regex should fill.
+
+ PMATCH is an array of struct regex_t's. If PREG's NO_SUB field
+ isn't set, then regexec records in PMATCH[i], for i = 1 to
+ PMATCH - 1, which substring of STRING matched the i-th
+ subexpression of the regular expression compiled in BUFFER;
+ it records in PMATCH[0] that information about all of
+ STRING. See the comment for `typedef struct...regmatch_t'
+ in regex.h for more details.
+
+ The caller must allocate PMATCH to have at least NMATCH
+ elements.
+
+ EFLAGS is two bits OR-ed together which affect execution. If the
+ bit REG_NOTBOL is set, then STRING's first character is not
+ the beginning of a line; that means any beginning-of-line
+ byte command in BUFFER won't match that first character.
+ If the bit REG_NOTEOL is set, then a similar things holds
+ for STRING's last character: it isn't the end of a line and
+ any end-of-line byte command in BUFFER won't match it.
+
+
+ It returns 0 if it matches and REG_NOMATCH if it doesn't. */
+
+int
+regexec (preg, string, nmatch, pmatch, eflags)
+ const regex_t *preg;
+ const char *string;
+ size_t nmatch;
+ regmatch_t pmatch[];
+ int eflags;
+{
+ int return_value;
+ unsigned this_op;
+ struct re_registers regs;
+ regex_t private_preg;
+
+ private_preg = *preg;
+ private_preg.not_bol = eflags & REG_NOTBOL;
+ private_preg.not_eol = eflags & REG_NOTEOL;
+
+ private_preg.return_default_num_regs = 0;
+
+ if (!private_preg.no_sub && nmatch > 0)
+ {
+ regs.num_regs = nmatch;
+ regs.start = malloc (nmatch * sizeof (int));
+ regs.end = malloc (nmatch * sizeof (int));
+ }
+ else
+ {
+ regs.num_regs = 0;
+ regs.start = NULL;
+ regs.end = NULL;
+ }
+
+ return_value = re_match (&private_preg, string, strlen (string), 0,
+ !private_preg.no_sub && nmatch > 0 ? &regs : 0);
+
+ if (return_value == strlen (string))
+ {
+ if (!private_preg.no_sub && nmatch > 0)
+ {
+ unsigned this_reg;
+
+ for (this_reg = 0; this_reg < nmatch; this_reg++)
+ {
+ pmatch[this_reg].rm_so = regs.start[this_reg];
+ pmatch[this_reg].rm_eo = regs.end[this_reg];
+ }
+ }
+ }
+ if (regs.start != NULL)
+ free (regs.start);
+
+ if (regs.end != NULL)
+ free (regs.end);
+
+ return return_value == strlen (string) ? 0 : REG_NOMATCH;
+}
+
+
+/* Puts the first BUFFER_SIZE - 1 characters in BUFFER (if BUFFER isn't null)
+ and terminates it with a null.
+
+ Returns one more than the size of MESSAGE. */
+
+static size_t
+put_in_buffer (message, buffer, buffer_size)
+ char *message;
+ char *buffer;
+ size_t buffer_size;
+{
+ unsigned this_char;
+
+ if (buffer != NULL && buffer_size > 0)
+ {
+ strncpy (buffer, message, buffer_size - 1);
+ buffer[buffer_size - 1] = 0;
+ }
+
+ return strlen (message) + 1;
+}
+
+
+/* Returns a message corresponding to an error code, ERRCODE, returned
+ from either regcomp or regexec. */
+
+size_t
+re_gerror (errcode, preg, errbuf, errbuf_size)
+ int errcode;
+ const regex_t *preg;
+ char *errbuf;
+ size_t errbuf_size;
+{
+ switch (errcode)
+ {
+ case REG_NOERROR:
+ return put_in_buffer ("Regex message: no error.", errbuf, errbuf_size);
+
+ case REG_NOMATCH:
+ return put_in_buffer ("Regex error: regexec didn't find a match.",
+ errbuf, errbuf_size);
+ case REG_BADPAT:
+ return put_in_buffer ("Regex error: Invalid regular expression.",
+ errbuf, errbuf_size);
+ case REG_ECOLLATE:
+ return put_in_buffer ("Regex error: (not implemented) Invalid \
+collating character.", errbuf, errbuf_size);
+
+ case REG_ECTYPE:
+ return put_in_buffer ("Regex error: Invalid character class name.",
+ errbuf, errbuf_size);
+ case REG_EESCAPE:
+ return put_in_buffer ("Regex error: Trailing backslash.",
+ errbuf, errbuf_size);
+ case REG_ESUBREG:
+ return put_in_buffer("Regex error: Invalid back reference.",
+ errbuf, errbuf_size);
+ case REG_EBRACK:
+ return put_in_buffer ("Regex error: Unmatched [ or [^.",
+ errbuf, errbuf_size);
+ case REG_EPAREN:
+ return put_in_buffer ("Regex error: Unmatched parenthesis.",
+ errbuf, errbuf_size);
+ case REG_EBRACE:
+ return put_in_buffer ("Regex error: Unmatched \\{.",
+ errbuf, errbuf_size);
+ case REG_BADBR:
+ return put_in_buffer ("Regex error: Invalid content of \\{\\}.",
+ errbuf, errbuf_size);
+ case REG_ERANGE:
+ return put_in_buffer ("Regex error: Invalid range end.",
+ errbuf, errbuf_size);
+ case REG_ESPACE:
+ return put_in_buffer ("Regex error: Ran out of memory.",
+ errbuf, errbuf_size);
+ case REG_BADRPT:
+ return put_in_buffer ("Regex error: Preceding regular expression \
+either missing or not simple.", errbuf, errbuf_size);
+
+ case REG_EEND:
+ return put_in_buffer ("Regex error: Regular expression ended \
+prematurely.", errbuf, errbuf_size);
+
+ case REG_ESIZE:
+ return put_in_buffer ("Regex error: Excessively large regular \
+expression.", errbuf, errbuf_size);
+ }
+}
+
+
+void
+re_gfree (preg)
+ regex_t *preg;
+{
+ if (preg->buffer != NULL)
+ free (preg->buffer);
+ preg->buffer = NULL;
+
+ preg->allocated = 0;
+ preg->used = 0;
+
+ if (preg->fastmap != NULL)
+ free (preg->fastmap);
+ preg->fastmap = NULL;
+
+ preg->fastmap_accurate = 0;
+
+ if (preg->translate != NULL)
+ free (preg->translate);
+ preg->translate = NULL;
+}
+
+#endif /* not emacs */
+
#ifdef test
-#ifdef atarist
-long _stksize = 2L; /* reserve memory for stack */
-#endif
#include <stdio.h>
/* Indexed by a character, gives the upper case equivalent of the
@@ -2733,25 +5451,25 @@ char upcase[0400] =
#include "tests.h"
-typedef enum { extended_test, basic_test } test_type;
+typedef enum {extended_test, basic_test, other_test, interface_test} test_type;
/* Use this to run the tests we've thought of. */
void
main ()
{
- test_type t = extended_test;
-
+ test_type t = interface_test;
+
if (t == basic_test)
- {
- printf ("Running basic tests:\n\n");
- test_posix_basic ();
- }
+ test_posix_basic ();
else if (t == extended_test)
- {
- printf ("Running extended tests:\n\n");
- test_posix_extended ();
- }
+ test_posix_extended ();
+ else if (t == other_test)
+ test_others ();
+ else if (t == interface_test)
+ test_posix_c_interface ();
+
+ exit (0);
}
#else /* not canned */
@@ -2771,7 +5489,7 @@ main (argc, argv)
/* Allow a command argument to specify the style of syntax. */
if (argc > 1)
- obscure_syntax = atoi (argv[1]);
+ re_set_syntax (atoi (argv[1]));
buf.allocated = 40;
buf.buffer = (char *) malloc (buf.allocated);
@@ -2810,7 +5528,7 @@ main (argc, argv)
#endif
-#ifdef NOTDEF
+#if 0
print_buf (bufp)
struct re_pattern_buffer *bufp;
{
@@ -2834,7 +5552,8 @@ print_buf (bufp)
printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
}
-#endif /* NOTDEF */
+#endif /* 0 */
+
printchar (c)
char c;
@@ -2857,3 +5576,13 @@ error (string)
exit (1);
}
#endif /* test */
+
+
+
+/*
+Local variables:
+make-backup-files: t
+version-control: t
+trim-versions-without-asking: nil
+End:
+*/