aboutsummaryrefslogtreecommitdiffstats
path: root/regex.h
diff options
context:
space:
mode:
Diffstat (limited to 'regex.h')
-rw-r--r-- regex.h 418
1 files changed, 200 insertions, 218 deletions
diff --git a/regex.h b/regex.h
index 7ad5da24..145b6d13 100644
--- a/regex.h
+++ b/regex.h
@@ -1,107 +1,28 @@
/* Definitions for data structures callers pass the regex library.
- Copyright (C) 1985 Free Software Foundation, Inc.
-
- NO WARRANTY
-
- BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY
-NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT
-WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC,
-RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS"
-WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
-BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY
-AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
-DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
-CORRECTION.
-
- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M.
-STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY
-WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE
-LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR
-OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR
-DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR
-A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS
-PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY.
-
- GENERAL PUBLIC LICENSE TO COPY
-
- 1. You may copy and distribute verbatim copies of this source file
-as you receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy a valid copyright notice "Copyright
-(C) 1985 Free Software Foundation, Inc."; and include following the
-copyright notice a verbatim copy of the above disclaimer of warranty
-and of this License. You may charge a distribution fee for the
-physical act of transferring a copy.
-
- 2. You may modify your copy or copies of this source file or
-any portion of it, and copy and distribute such modifications under
-the terms of Paragraph 1 above, provided that you also do the following:
-
- a) cause the modified files to carry prominent notices stating
- that you changed the files and the date of any change; and
-
- b) cause the whole of any work that you distribute or publish,
- that in whole or in part contains or is a derivative of this
- program or any part thereof, to be licensed at no charge to all
- third parties on terms identical to those contained in this
- License Agreement (except that you may choose to grant more extensive
- warranty protection to some or all third parties, at your option).
-
- c) You may charge a distribution fee for the physical act of
- transferring a copy, and you may at your option offer warranty
- protection in exchange for a fee.
-
-Mere aggregation of another unrelated program with this program (or its
-derivative) on a volume of a storage or distribution medium does not bring
-the other program under the scope of these terms.
-
- 3. You may copy and distribute this program (or a portion or derivative
-of it, under Paragraph 2) in object code or executable form under the terms
-of Paragraphs 1 and 2 above provided that you also do one of the following:
-
- a) accompany it with the complete corresponding machine-readable
- source code, which must be distributed under the terms of
- Paragraphs 1 and 2 above; or,
-
- b) accompany it with a written offer, valid for at least three
- years, to give any third party free (except for a nominal
- shipping charge) a complete machine-readable copy of the
- corresponding source code, to be distributed under the terms of
- Paragraphs 1 and 2 above; or,
-
- c) accompany it with the information you received as to where the
- corresponding source code may be obtained. (This alternative is
- allowed only for noncommercial distribution and only if you
- received the program in object code or executable form alone.)
-
-For an executable file, complete source code means all the source code for
-all modules it contains; but, as a special exception, it need not include
-source code for modules which are standard libraries that accompany the
-operating system on which the executable file runs.
-
- 4. You may not copy, sublicense, distribute or transfer this program
-except as expressly provided under this License Agreement. Any attempt
-otherwise to copy, sublicense, distribute or transfer this program is void and
-your rights to use the program under this License agreement shall be
-automatically terminated. However, parties who have received computer
-software programs from you with this License Agreement will not have
-their licenses terminated so long as such parties remain in full compliance.
-
- 5. If you wish to incorporate parts of this program into other free
-programs whose distribution conditions are different, write to the Free
-Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet
-worked out a simple rule that can be stated here, but we will often permit
-this. We will be guided by the two goals of preserving the free status of
-all derivatives of our free software and of promoting the sharing and reuse of
-software.
-
-
-In other words, you are welcome to use, share and improve this program.
-You are forbidden to forbid anyone else to use, share and improve
-what you give them. Help stamp out software-hoarding! */
+ Copyright (C) 1985, 1989-90 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 1, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+
+#ifdef __GNUC__
+ #pragma once
+#endif
+
+#ifndef __REGEXP_LIBRARY
+#define __REGEXP_LIBRARY
/* Define number of parens for which we record the beginnings and ends.
This affects how much space the `struct re_registers' type takes up. */
@@ -109,70 +30,158 @@ what you give them. Help stamp out software-hoarding! */
#define RE_NREGS 10
#endif
-/* These bits are used in the obscure_syntax variable to choose among
+#define BYTEWIDTH 8
+
+
+/* Maximum number of duplicates an interval can allow. */
+#define RE_DUP_MAX ((1 << 15) - 1)
+
+
+/* This defines the various regexp syntaxes. */
+extern int obscure_syntax;
+
+
+/* The following bits are used in the obscure_syntax variable to choose among
alternative regexp syntaxes. */
-/* 1 means plain parentheses serve as grouping, and backslash
+/* If this bit is set, plain parentheses serve as grouping, and backslash
parentheses are needed for literal searching.
- 0 means backslash-parentheses are grouping, and plain parentheses
+ If not set, backslash-parentheses are grouping, and plain parentheses
are for literal searching. */
-#define RE_NO_BK_PARENS 1
-
-/* 1 means plain | serves as the "or"-operator, and \| is a literal.
- 0 means \| serves as the "or"-operator, and | is a literal. */
-#define RE_NO_BK_VBAR 2
-
-/* 0 means plain + or ? serves as an operator, and \+, \? are literals.
- 1 means \+, \? are operators and plain +, ? are literals. */
-#define RE_BK_PLUS_QM 4
-
-/* 1 means | binds tighter than ^ or $.
- 0 means the contrary. */
-#define RE_TIGHT_VBAR 8
-
-/* 1 means treat \n as an _OR operator
- 0 means treat it as a normal character */
-#define RE_NEWLINE_OR 16
-
-/* 0 means that a special characters (such as *, ^, and $) always have
- their special meaning regardless of the surrounding context.
- 1 means that special characters may act as normal characters in some
- contexts. Specifically, this applies to:
- ^ - only special at the beginning, or after ( or |
- $ - only special at the end, or before ) or |
- *, +, ? - only special when not after the beginning, (, or | */
-#define RE_CONTEXT_INDEP_OPS 32
-
-/* 0 means that \ before anything inside [ and ] is taken as a real \.
- 1 means that such a \ escapes the following character This is a
- special case for AWK. */
-#define RE_AWK_CLASS_HACK 64
-
-/* Now define combinations of bits for the standard possibilities. */
-#define RE_SYNTAX_POSIX_EGREP (RE_NO_BK_PARENS | RE_NO_BK_VBAR \
+#define RE_NO_BK_PARENS 1
+
+/* If this bit is set, plain | serves as the `or'-operator, and \| is a
+ literal.
+ If not set, \| serves as the `or'-operator, and | is a literal. */
+#define RE_NO_BK_VBAR (1 << 1)
+
+/* If this bit is not set, plain + or ? serves as an operator, and \+, \? are
+ literals.
+ If set, \+, \? are operators and plain +, ? are literals. */
+#define RE_BK_PLUS_QM (1 << 2)
+
+/* If this bit is set, | binds tighter than ^ or $.
+ If not set, the contrary. */
+#define RE_TIGHT_VBAR (1 << 3)
+
+/* If this bit is set, then treat newline as an OR operator.
+ If not set, treat it as a normal character. */
+#define RE_NEWLINE_OR (1 << 4)
+
+/* If this bit is set, then special characters may act as normal
+ characters in some contexts. Specifically, this applies to:
+ ^ -- only special at the beginning, or after ( or |;
+ $ -- only special at the end, or before ) or |;
+ *, +, ? -- only special when not after the beginning, (, or |.
+ If this bit is not set, special characters (such as *, ^, and $)
+ always have their special meaning regardless of the surrounding
+ context. */
+#define RE_CONTEXT_INDEP_OPS (1 << 5)
+
+/* If this bit is not set, then \ before anything inside [ and ] is taken as
+ a real \.
+ If set, then such a \ escapes the following character. This is a
+ special case for awk. */
+#define RE_AWK_CLASS_HACK (1 << 6)
+
+/* If this bit is set, then \{ and \} or { and } serve as interval operators.
+ If not set, then \{ and \} and { and } are treated as literals. */
+#define RE_INTERVALS (1 << 7)
+
+/* If this bit is not set, then \{ and \} serve as interval operators and
+ { and } are literals.
+ If set, then { and } serve as interval operators and \{ and \} are
+ literals. */
+#define RE_NO_BK_CURLY_BRACES (1 << 8)
+
+/* If this bit is set, then character classes are supported; they are:
+ [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:],
+ [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
+ If not set, then character classes are not supported. */
+#define RE_CHAR_CLASSES (1 << 9)
+
+/* If this bit is set, then the dot re doesn't match a null byte.
+ If not set, it does. */
+#define RE_DOT_NOT_NULL (1 << 10)
+
+/* If this bit is set, then [^...] doesn't match a newline.
+ If not set, it does. */
+#define RE_HAT_NOT_NEWLINE (1 << 11)
+
+/* If this bit is set, back references are not recognized.
+ If not set, they are. */
+#define RE_NO_BK_REFS (1 << 12)
+
+/* If this bit is set, back references must refer to a preceding
+ subexpression. If not set, a back reference to a nonexistent
+ subexpression is treated as literal characters. */
+#define RE_NO_EMPTY_BK_REF (1 << 13)
+
+/* If this bit is set, bracket expressions can't be empty.
+ If it is not set, they can be empty. */
+#define RE_NO_EMPTY_BRACKETS (1 << 14)
+
+/* If this bit is set, then *, +, ? and { cannot be first in an re or
+ immediately after a |, or a (. Furthermore, a | cannot be first or
+ last in an re, or immediately follow another | or a (. Also, a ^
+ cannot appear in a nonleading position and a $ cannot appear in a
+ nontrailing position (outside of bracket expressions, that is). */
+#define RE_CONTEXTUAL_INVALID_OPS (1 << 15)
+
+/* If this bit is set, then +, ? and | aren't recognized as operators.
+ If it's not, they are. */
+#define RE_LIMITED_OPS (1 << 16)
+
+/* If this bit is set, then an ending range point has to collate higher
+ or equal to the starting range point.
+ If it's not set, then when the ending range point collates lower
+ than the starting range point, the range is just considered empty. */
+#define RE_NO_EMPTY_RANGES (1 << 17)
+
+/* If this bit is set, then a hyphen (-) can't be an ending range point.
+ If it isn't, then it can. */
+#define RE_NO_HYPHEN_RANGE_END (1 << 18)
+
+
+/* Define combinations of bits for the standard possibilities. */
+#define RE_SYNTAX_POSIX_AWK (RE_NO_BK_PARENS | RE_NO_BK_VBAR \
| RE_CONTEXT_INDEP_OPS)
-#define RE_SYNTAX_AWK (RE_SYNTAX_POSIX_EGREP | RE_AWK_CLASS_HACK)
-#define RE_SYNTAX_EGREP (RE_SYNTAX_POSIX_EGREP | RE_NEWLINE_OR)
+#define RE_SYNTAX_AWK (RE_NO_BK_PARENS | RE_NO_BK_VBAR | RE_AWK_CLASS_HACK)
+#define RE_SYNTAX_EGREP (RE_NO_BK_PARENS | RE_NO_BK_VBAR \
+ | RE_CONTEXT_INDEP_OPS | RE_NEWLINE_OR)
#define RE_SYNTAX_GREP (RE_BK_PLUS_QM | RE_NEWLINE_OR)
#define RE_SYNTAX_EMACS 0
-
-/* This data structure is used to represent a compiled pattern. */
+#define RE_SYNTAX_POSIX_BASIC (RE_INTERVALS | RE_BK_PLUS_QM \
+ | RE_CHAR_CLASSES | RE_DOT_NOT_NULL \
+ | RE_HAT_NOT_NEWLINE | RE_NO_EMPTY_BK_REF \
+ | RE_NO_EMPTY_BRACKETS | RE_LIMITED_OPS \
+ | RE_NO_EMPTY_RANGES | RE_NO_HYPHEN_RANGE_END)
+
+#define RE_SYNTAX_POSIX_EXTENDED (RE_INTERVALS | RE_NO_BK_CURLY_BRACES \
+ | RE_NO_BK_VBAR | RE_NO_BK_PARENS \
+ | RE_HAT_NOT_NEWLINE | RE_CHAR_CLASSES \
+ | RE_NO_EMPTY_BRACKETS | RE_CONTEXTUAL_INVALID_OPS \
+ | RE_NO_BK_REFS | RE_NO_EMPTY_RANGES \
+ | RE_NO_HYPHEN_RANGE_END)
+
+
+/* This data structure is used to represent a compiled pattern. */
struct re_pattern_buffer
{
- char *buffer; /* Space holding the compiled pattern commands. */
- int allocated; /* Size of space that buffer points to */
- int used; /* Length of portion of buffer actually occupied */
- char *fastmap; /* Pointer to fastmap, if any, or zero if none. */
+ char *buffer; /* Space holding the compiled pattern commands. */
+ long allocated; /* Size of space that `buffer' points to. */
+ long used; /* Length of portion of buffer actually occupied */
+ char *fastmap; /* Pointer to fastmap, if any, or zero if none. */
/* re_search uses the fastmap, if there is one,
- to skip quickly over totally implausible characters */
- char *translate; /* Translate table to apply to all characters before comparing.
- Or zero for no translation.
- The translation is applied to a pattern when it is compiled
- and to data when it is matched. */
+ to skip over totally implausible characters. */
+ char *translate; /* Translate table to apply to all characters before
+ comparing, or zero for no translation.
+ The translation is applied to a pattern when it is
+ compiled and to data when it is matched. */
char fastmap_accurate;
/* Set to zero when a new pattern is stored,
- set to one when the fastmap is updated from it. */
+ set to one when the fastmap is updated from it. */
char can_be_null; /* Set to one by compiling fastmap
if this pattern might match the null string.
It does not necessarily match the null string
@@ -182,14 +191,21 @@ struct re_pattern_buffer
listed in the fastmap. */
};
-/* Structure to store "register" contents data in.
+
+/* search.c (search_buffer) needs this one value. It is defined both in
+ regex.c and here. */
+#define RE_EXACTN_VALUE 1
+
+
+/* Structure to store register contents data in.
Pass the address of such a structure as an argument to re_match, etc.,
if you want this information back.
- start[i] and end[i] record the string matched by \( ... \) grouping i,
- for i from 1 to RE_NREGS - 1.
- start[0] and end[0] record the entire string matched. */
+ For i from 1 to RE_NREGS - 1, start[i] records the starting index in
+ the string of where the ith subexpression matched, and end[i] records
+ one after the ending index. start[0] and end[0] are analogous, for
+ the entire pattern. */
struct re_registers
{
@@ -197,78 +213,44 @@ struct re_registers
int end[RE_NREGS];
};
-/* These are the command codes that appear in compiled regular expressions, one per byte.
- Some command codes are followed by argument bytes.
- A command code can specify any interpretation whatever for its arguments.
- Zero-bytes may appear in the compiled regular expression. */
-enum regexpcode
- {
- unused,
- exactn, /* followed by one byte giving n, and then by n literal bytes */
- begline, /* fails unless at beginning of line */
- endline, /* fails unless at end of line */
- jump, /* followed by two bytes giving relative address to jump to */
- on_failure_jump, /* followed by two bytes giving relative address of place
- to resume at in case of failure. */
- finalize_jump, /* Throw away latest failure point and then jump to address. */
- maybe_finalize_jump, /* Like jump but finalize if safe to do so.
- This is used to jump back to the beginning
- of a repeat. If the command that follows
- this jump is clearly incompatible with the
- one at the beginning of the repeat, such that
- we can be sure that there is no use backtracking
- out of repetitions already completed,
- then we finalize. */
- dummy_failure_jump, /* jump, and push a dummy failure point.
- This failure point will be thrown away
- if an attempt is made to use it for a failure.
- A + construct makes this before the first repeat. */
- anychar, /* matches any one character */
- charset, /* matches any one char belonging to specified set.
- First following byte is # bitmap bytes.
- Then come bytes for a bit-map saying which chars are in.
- Bits in each byte are ordered low-bit-first.
- A character is in the set if its bit is 1.
- A character too large to have a bit in the map
- is automatically not in the set */
- charset_not, /* similar but match any character that is NOT one of those specified */
- start_memory, /* starts remembering the text that is matched
- and stores it in a memory register.
- followed by one byte containing the register number.
- Register numbers must be in the range 0 through NREGS. */
- stop_memory, /* stops remembering the text that is matched
- and stores it in a memory register.
- followed by one byte containing the register number.
- Register numbers must be in the range 0 through NREGS. */
- duplicate, /* match a duplicate of something remembered.
- Followed by one byte containing the index of the memory register. */
- before_dot, /* Succeeds if before dot */
- at_dot, /* Succeeds if at dot */
- after_dot, /* Succeeds if after dot */
- begbuf, /* Succeeds if at beginning of buffer */
- endbuf, /* Succeeds if at end of buffer */
- wordchar, /* Matches any word-constituent character */
- notwordchar, /* Matches any char that is not a word-constituent */
- wordbeg, /* Succeeds if at word beginning */
- wordend, /* Succeeds if at word end */
- wordbound, /* Succeeds if at a word boundary */
- notwordbound, /* Succeeds if not at a word boundary */
- syntaxspec, /* Matches any character whose syntax is specified.
- followed by a byte which contains a syntax code, Sword or such like */
- notsyntaxspec /* Matches any character whose syntax differs from the specified. */
- };
+#ifdef __STDC__
+
+extern char *re_compile_pattern (char *, size_t, struct re_pattern_buffer *);
+/* Is this really advertised? */
+extern void re_compile_fastmap (struct re_pattern_buffer *);
+extern int re_search (struct re_pattern_buffer *, char*, int, int, int,
+ struct re_registers *);
+extern int re_search_2 (struct re_pattern_buffer *, char *, int,
+ char *, int, int, int,
+ struct re_registers *, int);
+extern int re_match (struct re_pattern_buffer *, char *, int, int,
+ struct re_registers *);
+extern int re_match_2 (struct re_pattern_buffer *, char *, int,
+ char *, int, int, struct re_registers *, int);
+
+/* 4.2 bsd compatibility. */
+extern char *re_comp (char *);
+extern int re_exec (char *);
+
+#else /* !__STDC__ */
+
extern char *re_compile_pattern ();
/* Is this really advertised? */
extern void re_compile_fastmap ();
extern int re_search (), re_search_2 ();
extern int re_match (), re_match_2 ();
-/* 4.2 bsd compatibility (yuck) */
+/* 4.2 bsd compatibility. */
extern char *re_comp ();
extern int re_exec ();
+#endif /* __STDC__ */
+
+
#ifdef SYNTAX_TABLE
extern char *re_syntax_table;
#endif
+
+#endif /* !__REGEXP_LIBRARY */