aboutsummaryrefslogtreecommitdiffstats
path: root/dfa.c
diff options
context:
space:
mode:
Diffstat (limited to 'dfa.c')
-rw-r--r--dfa.c1033
1 files changed, 448 insertions, 585 deletions
diff --git a/dfa.c b/dfa.c
index 4d1e1ab0..62bb6435 100644
--- a/dfa.c
+++ b/dfa.c
@@ -59,7 +59,6 @@
#define _(str) gettext (str)
#include <wchar.h>
-#include <wctype.h>
#include "xalloc.h"
@@ -69,6 +68,8 @@
#include "dfa.h"
+#include "localeinfo.h"
+
#ifdef GAWK
static int
is_blank (int c)
@@ -77,14 +78,6 @@ is_blank (int c)
}
#endif /* GAWK */
-#ifdef LIBC_IS_BORKED
-extern int gawk_mb_cur_max;
-#undef MB_CUR_MAX
-#define MB_CUR_MAX gawk_mb_cur_max
-#undef mbrtowc
-#define mbrtowc(a, b, c, d) (-1)
-#endif
-
/* HPUX defines these as macros in sys/param.h. */
#ifdef setbit
# undef setbit
@@ -337,9 +330,6 @@ typedef struct
size_t hash; /* Hash of the positions of this state. */
position_set elems; /* Positions this state could match. */
unsigned char context; /* Context from previous state. */
- bool curr_dependent; /* True if the follows of any positions with
- ANYCHAR depends on the next character's
- context. */
unsigned short constraint; /* Constraint for this state to accept. */
token first_end; /* Token value of the first END in elems. */
position_set mbps; /* Positions which can match multibyte
@@ -372,6 +362,10 @@ struct regex_syntax
/* Flag for case-folding letters into sets. */
bool case_fold;
+ /* True if ^ and $ match only the start and end of data, and do not match
+ end-of-line within data. */
+ bool anchor;
+
/* End-of-line byte in data. */
unsigned char eolbyte;
@@ -395,8 +389,8 @@ struct regex_syntax
meaning of the @#%!@#%^!@ syntax bits. */
struct lexer_state
{
- char const *lexptr; /* Pointer to next input character. */
- size_t lexleft; /* Number of characters remaining. */
+ char const *ptr; /* Pointer to next input character. */
+ size_t left; /* Number of characters remaining. */
token lasttok; /* Previous token returned; initially END. */
size_t parens; /* Count of outstanding left parens. */
int minrep, maxrep; /* Repeat counts for {m,n}. */
@@ -435,12 +429,13 @@ struct dfa
charclass *charclasses; /* Array of character sets for CSET tokens. */
size_t cindex; /* Index for adding new charclasses. */
size_t calloc; /* Number of charclasses allocated. */
+ size_t canychar; /* Index of anychar class, or (size_t) -1. */
/* Scanner state */
- struct lexer_state lexstate;
+ struct lexer_state lex;
/* Parser state */
- struct parser_state parsestate;
+ struct parser_state parse;
/* Fields filled by the parser. */
token *tokens; /* Postfix parse array. */
@@ -453,14 +448,9 @@ struct dfa
size_t nregexps; /* Count of parallel regexps being built
with dfaparse. */
bool fast; /* The DFA is fast. */
- bool multibyte; /* MB_CUR_MAX > 1. */
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
mbstate_t mbs; /* Multibyte conversion state. */
- /* dfaexec implementation. */
- char *(*dfaexec) (struct dfa *, char const *, char *,
- bool, size_t *, bool *);
-
/* The following are valid only if MB_CUR_MAX > 1. */
/* The value of multibyte_prop[i] is defined by following rule.
@@ -546,6 +536,21 @@ struct dfa
state_num **mb_trans; /* Transition tables for states with ANYCHAR. */
state_num mb_trcount; /* Number of transition tables for states with
ANYCHAR that have actually been built. */
+
+ /* Information derived from the locale. This is at the end so that
+ a quick memset need not clear it specially. */
+
+ /* dfaexec implementation. */
+ char *(*dfaexec) (struct dfa *, char const *, char *,
+ bool, size_t *, bool *);
+
+ /* The locale is simple, like the C locale. These locales can be
+ processed more efficiently, e.g., the relationship between lower-
+ and upper-case letters is 1-1. */
+ bool simple_locale;
+
+ /* Other cached information derived from the locale. */
+ struct localeinfo localeinfo;
};
/* Some macros for user access to dfa internals. */
@@ -559,13 +564,8 @@ struct dfa
static void regexp (struct dfa *dfa);
-/* A table indexed by byte values that contains the corresponding wide
- character (if any) for that byte. WEOF means the byte is not a
- valid single-byte character. */
-static wint_t mbrtowc_cache[NOTCHAR];
-
/* Store into *PWC the result of converting the leading bytes of the
- multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
+ multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
and updating the conversion state in *D. On conversion error,
convert just a single byte, to WEOF. Return the number of bytes
converted.
@@ -574,7 +574,7 @@ static wint_t mbrtowc_cache[NOTCHAR];
* PWC points to wint_t, not to wchar_t.
* The last arg is a dfa *D instead of merely a multibyte conversion
- state D->mbs. D also contains an mbrtowc_cache for speed.
+ state D->mbs.
* N must be at least 1.
* S[N - 1] must be a sentinel byte.
* Shift encodings are not supported.
@@ -585,7 +585,7 @@ static size_t
mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
{
unsigned char uc = s[0];
- wint_t wc = mbrtowc_cache[uc];
+ wint_t wc = d->localeinfo.sbctowc[uc];
if (wc == WEOF)
{
@@ -762,7 +762,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
/* In DFA D, find the index of charclass S, or allocate a new one. */
static size_t
-dfa_charclass_index (struct dfa *d, charclass const s)
+charclass_index (struct dfa *d, charclass const s)
{
size_t i;
@@ -777,78 +777,21 @@ dfa_charclass_index (struct dfa *d, charclass const s)
}
static bool
-unibyte_word_constituent (unsigned char c)
+unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
{
- return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_');
+ return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
}
static int
char_context (struct dfa const *dfa, unsigned char c)
{
- if (c == dfa->syntax.eolbyte)
+ if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
return CTX_NEWLINE;
- if (unibyte_word_constituent (c))
+ if (unibyte_word_constituent (dfa, c))
return CTX_LETTER;
return CTX_NONE;
}
-/* UTF-8 encoding allows some optimizations that we can't otherwise
- assume in a multibyte encoding. */
-static bool using_utf8;
-
-bool
-dfa_using_utf8 (void)
-{
- return using_utf8;
-}
-
-static void
-init_mbrtowc_cache (void)
-{
- int i;
- for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
- {
- char c = i;
- unsigned char uc = i;
- mbstate_t s = { 0 };
- wchar_t wc;
- mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
- }
-}
-
-/* Entry point to set syntax options. */
-void
-dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol)
-{
- int i;
- dfa->syntax.syntax_bits_set = true;
- dfa->syntax.syntax_bits = bits;
- dfa->syntax.case_fold = fold;
- dfa->syntax.eolbyte = eol;
-
- for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
- {
- unsigned char uc = i;
-
- /* Use mbrtowc_cache to calculate sbit. */
- dfa->syntax.sbit[uc] = char_context (dfa, uc);
- switch (dfa->syntax.sbit[uc])
- {
- case CTX_LETTER:
- setbit (uc, dfa->syntax.letters);
- break;
- case CTX_NEWLINE:
- setbit (uc, dfa->syntax.newline);
- break;
- }
-
- /* POSIX requires that the five bytes in "\n\r./" (including the
- terminating NUL) cannot occur inside a multibyte character. */
- dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80
- : strchr ("\n\r./", uc) != NULL);
- }
-}
-
/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
this may happen when folding case in weird Turkish locales where
@@ -877,30 +820,10 @@ setbit_case_fold_c (int b, charclass c)
setbit (i, c);
}
-static void check_utf8 (void)
-{
- wchar_t wc;
- mbstate_t mbs = { 0 };
- using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
-}
-
-static bool unibyte_c;
-
-static void check_unibyte_c (void)
-{
- char const *locale = setlocale (LC_ALL, NULL);
- unibyte_c = (!locale
- || STREQ (locale, "C")
- || STREQ (locale, "POSIX"));
-}
-
-/* The current locale is known to be a unibyte locale
- without multicharacter collating sequences and where range
- comparisons simply use the native encoding. These locales can be
- processed more efficiently. */
+/* Return true if the locale is unibyte and compatible with the C locale. */
static bool
-using_simple_locale (struct dfa const *dfa)
+using_simple_locale (bool multibyte)
{
/* The native character set is known to be compatible with
the C locale. The following test isn't perfect, but it's good
@@ -918,7 +841,15 @@ using_simple_locale (struct dfa const *dfa)
&& '}' == 125 && '~' == 126)
};
- return (!native_c_charset || dfa->multibyte) ? false : unibyte_c;
+ if (!native_c_charset || multibyte)
+ return false;
+ else
+ {
+ /* Treat C and POSIX locales as being compatible. Also, treat
+ errors as compatible, as these are invariably from stubs. */
+ char const *loc = setlocale (LC_ALL, NULL);
+ return !loc || STREQ (loc, "C") || STREQ (loc, "POSIX");
+ }
}
/* Fetch the next lexical input character. Set C (of type int) to the
@@ -930,23 +861,23 @@ using_simple_locale (struct dfa const *dfa)
otherwise. */
# define FETCH_WC(dfa, c, wc, eoferr) \
do { \
- if (! dfa->lexstate.lexleft) \
+ if (! (dfa)->lex.left) \
{ \
if ((eoferr) != 0) \
dfaerror (eoferr); \
else \
- return dfa->lexstate.lasttok = END; \
+ return (dfa)->lex.lasttok = END; \
} \
else \
{ \
wint_t _wc; \
- size_t nbytes = mbs_to_wchar (&_wc, dfa->lexstate.lexptr, \
- dfa->lexstate.lexleft, dfa); \
- dfa->lexstate.cur_mb_len = nbytes; \
+ size_t nbytes = mbs_to_wchar (&_wc, (dfa)->lex.ptr, \
+ (dfa)->lex.left, dfa); \
+ (dfa)->lex.cur_mb_len = nbytes; \
(wc) = _wc; \
- (c) = nbytes == 1 ? to_uchar (*dfa->lexstate.lexptr) : EOF; \
- dfa->lexstate.lexptr += nbytes; \
- dfa->lexstate.lexleft -= nbytes; \
+ (c) = nbytes == 1 ? to_uchar ((dfa)->lex.ptr[0]) : EOF; \
+ (dfa)->lex.ptr += nbytes; \
+ (dfa)->lex.left -= nbytes; \
} \
} while (false)
@@ -954,53 +885,6 @@ using_simple_locale (struct dfa const *dfa)
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
-/* The set of wchar_t values C such that there's a useful locale
- somewhere where C != towupper (C) && C != towlower (towupper (C)).
- For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
- towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
- towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
-static short const lonesome_lower[] =
- {
- 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
- 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
- /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
- counterpart in locales predating Unicode 4.0.0 (April 2003). */
- 0x03F2,
-
- 0x03F5, 0x1E9B, 0x1FBE,
- };
-
-/* Maximum number of characters that can be the case-folded
- counterparts of a single character, not counting the character
- itself. This is 1 for towupper, 1 for towlower, and 1 for each
- entry in LONESOME_LOWER. */
-enum
-{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
-
-/* Find the characters equal to C after case-folding, other than C
- itself, and store them into FOLDED. Return the number of characters
- stored. */
-static unsigned int
-case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
-{
- unsigned int i;
- unsigned int n = 0;
- wint_t uc = towupper (c);
- wint_t lc = towlower (uc);
- if (uc != c)
- folded[n++] = uc;
- if (lc != uc && lc != c && towupper (lc) == uc)
- folded[n++] = lc;
- for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
- {
- wint_t li = lonesome_lower[i];
- if (li != lc && li != uc && li != c && towupper (li) == uc)
- folded[n++] = li;
- }
- return n;
-}
-
typedef int predicate (int);
/* The following list maps the names of the Posix named character classes
@@ -1069,7 +953,7 @@ parse_bracket_exp (struct dfa *dfa)
size_t chars_al;
chars_al = 0;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
&dfa->mbcsets_alloc,
@@ -1092,7 +976,7 @@ parse_bracket_exp (struct dfa *dfa)
{
FETCH_WC (dfa, c, wc, _("unbalanced ["));
invert = true;
- known_bracket_exp = using_simple_locale (dfa);
+ known_bracket_exp = dfa->simple_locale;
}
else
invert = false;
@@ -1120,8 +1004,8 @@ parse_bracket_exp (struct dfa *dfa)
for (;;)
{
FETCH_WC (dfa, c, wc, _("unbalanced ["));
- if ((c == c1 && *dfa->lexstate.lexptr == ']')
- || dfa->lexstate.lexleft == 0)
+ if (dfa->lex.left == 0
+ || (c == c1 && dfa->lex.ptr[0] == ']'))
break;
if (len < MAX_BRACKET_STRING_LEN)
str[len++] = c;
@@ -1141,13 +1025,13 @@ parse_bracket_exp (struct dfa *dfa)
{
char const *class
= (dfa->syntax.case_fold && (STREQ (str, "upper")
- || STREQ (str, "lower")) ?
- "alpha" : str);
+ || STREQ (str, "lower"))
+ ? "alpha" : str);
const struct dfa_ctype *pred = find_pred (class);
if (!pred)
dfaerror (_("invalid character class"));
- if (dfa->multibyte && !pred->single_byte_only)
+ if (dfa->localeinfo.multibyte && !pred->single_byte_only)
known_bracket_exp = false;
else
for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1182,7 +1066,7 @@ parse_bracket_exp (struct dfa *dfa)
/* A bracket expression like [a-[.aa.]] matches an unknown set.
Treat it like [-a[.aa.]] while parsing it, and
remember that the set is unknown. */
- if (c2 == '[' && *dfa->lexstate.lexptr == '.')
+ if (c2 == '[' && dfa->lex.ptr[0] == '.')
{
known_bracket_exp = false;
c2 = ']';
@@ -1192,8 +1076,8 @@ parse_bracket_exp (struct dfa *dfa)
{
/* In the case [x-], the - is an ordinary hyphen,
which is left in c1, the lookahead character. */
- dfa->lexstate.lexptr -= dfa->lexstate.cur_mb_len;
- dfa->lexstate.lexleft += dfa->lexstate.cur_mb_len;
+ dfa->lex.ptr -= dfa->lex.cur_mb_len;
+ dfa->lex.left += dfa->lex.cur_mb_len;
}
else
{
@@ -1207,9 +1091,9 @@ parse_bracket_exp (struct dfa *dfa)
/* Treat [x-y] as a range if x != y. */
if (wc != wc2 || wc == WEOF)
{
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
known_bracket_exp = false;
- else if (using_simple_locale (dfa))
+ else if (dfa->simple_locale)
{
int ci;
for (ci = c; ci <= c2; ci++)
@@ -1236,7 +1120,7 @@ parse_bracket_exp (struct dfa *dfa)
colon_warning_state |= (c == ':') ? 2 : 4;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
if (dfa->syntax.case_fold)
setbit_case_fold_c (c, ccl);
@@ -1273,37 +1157,45 @@ parse_bracket_exp (struct dfa *dfa)
if (! known_bracket_exp)
return BACKREF;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
work_mbc->invert = invert;
- work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl);
+ work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl);
return MBCSET;
}
if (invert)
{
- assert (!dfa->multibyte);
+ assert (!dfa->localeinfo.multibyte);
notset (ccl);
if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
clrbit ('\n', ccl);
}
- return CSET + dfa_charclass_index (dfa, ccl);
+ return CSET + charclass_index (dfa, ccl);
}
-#define PUSH_LEX_STATE(s) \
- do \
- { \
- char const *lexptr_saved = dfa->lexstate.lexptr; \
- size_t lexleft_saved = dfa->lexstate.lexleft; \
- dfa->lexstate.lexptr = (s); \
- dfa->lexstate.lexleft = strlen (dfa->lexstate.lexptr)
+struct lexptr
+{
+ char const *ptr; /* Saved copy of the lexer's ptr. */
+ size_t left; /* Saved copy of the lexer's left. */
+};
-#define POP_LEX_STATE() \
- dfa->lexstate.lexptr = lexptr_saved; \
- dfa->lexstate.lexleft = lexleft_saved; \
- } \
- while (false)
+static void
+push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
+{
+ ls->ptr = dfa->lex.ptr;
+ ls->left = dfa->lex.left;
+ dfa->lex.ptr = s;
+ dfa->lex.left = strlen (s);
+}
+
+static void
+pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
+{
+ dfa->lex.ptr = ls->ptr;
+ dfa->lex.left = ls->left;
+}
static token
lex (struct dfa *dfa)
@@ -1321,14 +1213,14 @@ lex (struct dfa *dfa)
"if (backslash) ...". */
for (i = 0; i < 2; ++i)
{
- FETCH_WC (dfa, c, dfa->lexstate.wctok, NULL);
+ FETCH_WC (dfa, c, dfa->lex.wctok, NULL);
switch (c)
{
case '\\':
if (backslash)
goto normal_char;
- if (dfa->lexstate.lexleft == 0)
+ if (dfa->lex.left == 0)
dfaerror (_("unfinished \\ escape"));
backslash = true;
break;
@@ -1337,28 +1229,29 @@ lex (struct dfa *dfa)
if (backslash)
goto normal_char;
if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
- || dfa->lexstate.lasttok == END || dfa->lexstate.lasttok == LPAREN
- || dfa->lexstate.lasttok == OR)
- return dfa->lexstate.lasttok = BEGLINE;
+ || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
+ || dfa->lex.lasttok == OR)
+ return dfa->lex.lasttok = BEGLINE;
goto normal_char;
case '$':
if (backslash)
goto normal_char;
if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
- || dfa->lexstate.lexleft == 0
- || (dfa->syntax.syntax_bits & RE_NO_BK_PARENS
- ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == ')'
- : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\'
- && dfa->lexstate.lexptr[1] == ')')
- || (dfa->syntax.syntax_bits & RE_NO_BK_VBAR
- ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == '|'
- : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\'
- && dfa->lexstate.lexptr[1] == '|')
+ || dfa->lex.left == 0
+ || ((dfa->lex.left
+ > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
+ && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
+ & (dfa->lex.ptr[0] == '\\')]
+ == ')'))
+ || ((dfa->lex.left
+ > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
+ && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
+ & (dfa->lex.ptr[0] == '\\')]
+ == '|'))
|| ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
- && dfa->lexstate.lexleft > 0
- && *dfa->lexstate.lexptr == '\n'))
- return dfa->lexstate.lasttok = ENDLINE;
+ && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
+ return dfa->lex.lasttok = ENDLINE;
goto normal_char;
case '1':
@@ -1372,8 +1265,8 @@ lex (struct dfa *dfa)
case '9':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
{
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = BACKREF;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = BACKREF;
}
goto normal_char;
@@ -1381,7 +1274,7 @@ lex (struct dfa *dfa)
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
{
/* FIXME: should be beginning of string */
- return dfa->lexstate.lasttok = BEGLINE;
+ return dfa->lex.lasttok = BEGLINE;
}
goto normal_char;
@@ -1389,28 +1282,28 @@ lex (struct dfa *dfa)
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
{
/* FIXME: should be end of string */
- return dfa->lexstate.lasttok = ENDLINE;
+ return dfa->lex.lasttok = ENDLINE;
}
goto normal_char;
case '<':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lexstate.lasttok = BEGWORD;
+ return dfa->lex.lasttok = BEGWORD;
goto normal_char;
case '>':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lexstate.lasttok = ENDWORD;
+ return dfa->lex.lasttok = ENDWORD;
goto normal_char;
case 'b':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lexstate.lasttok = LIMWORD;
+ return dfa->lex.lasttok = LIMWORD;
goto normal_char;
case 'B':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lexstate.lasttok = NOTLIMWORD;
+ return dfa->lex.lasttok = NOTLIMWORD;
goto normal_char;
case '?':
@@ -1419,17 +1312,17 @@ lex (struct dfa *dfa)
if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
- && dfa->lexstate.laststart)
+ && dfa->lex.laststart)
goto normal_char;
- return dfa->lexstate.lasttok = QMARK;
+ return dfa->lex.lasttok = QMARK;
case '*':
if (backslash)
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
- && dfa->lexstate.laststart)
+ && dfa->lex.laststart)
goto normal_char;
- return dfa->lexstate.lasttok = STAR;
+ return dfa->lex.lasttok = STAR;
case '+':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
@@ -1437,9 +1330,9 @@ lex (struct dfa *dfa)
if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
- && dfa->lexstate.laststart)
+ && dfa->lex.laststart)
goto normal_char;
- return dfa->lexstate.lasttok = PLUS;
+ return dfa->lex.lasttok = PLUS;
case '{':
if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
@@ -1447,7 +1340,7 @@ lex (struct dfa *dfa)
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
- && dfa->lexstate.laststart)
+ && dfa->lex.laststart)
goto normal_char;
/* Cases:
@@ -1457,111 +1350,107 @@ lex (struct dfa *dfa)
{,} - 0 to infinity (same as '*')
{M,N} - M through N */
{
- char const *p = dfa->lexstate.lexptr;
- char const *lim = p + dfa->lexstate.lexleft;
- dfa->lexstate.minrep = dfa->lexstate.maxrep = -1;
+ char const *p = dfa->lex.ptr;
+ char const *lim = p + dfa->lex.left;
+ dfa->lex.minrep = dfa->lex.maxrep = -1;
for (; p != lim && ISASCIIDIGIT (*p); p++)
- {
- if (dfa->lexstate.minrep < 0)
- dfa->lexstate.minrep = *p - '0';
- else
- dfa->lexstate.minrep = MIN (RE_DUP_MAX + 1,
- (dfa->lexstate.minrep
- * 10 + *p - '0'));
- }
+ dfa->lex.minrep = (dfa->lex.minrep < 0
+ ? *p - '0'
+ : MIN (RE_DUP_MAX + 1,
+ dfa->lex.minrep * 10 + *p - '0'));
if (p != lim)
{
if (*p != ',')
- dfa->lexstate.maxrep = dfa->lexstate.minrep;
+ dfa->lex.maxrep = dfa->lex.minrep;
else
{
- if (dfa->lexstate.minrep < 0)
- dfa->lexstate.minrep = 0;
+ if (dfa->lex.minrep < 0)
+ dfa->lex.minrep = 0;
while (++p != lim && ISASCIIDIGIT (*p))
- {
- if (dfa->lexstate.maxrep < 0)
- dfa->lexstate.maxrep = *p - '0';
- else
- dfa->lexstate.maxrep = MIN (RE_DUP_MAX + 1,
- (dfa->lexstate.maxrep
- * 10 + *p - '0'));
- }
+ dfa->lex.maxrep
+ = (dfa->lex.maxrep < 0
+ ? *p - '0'
+ : MIN (RE_DUP_MAX + 1,
+ dfa->lex.maxrep * 10 + *p - '0'));
}
}
if (! ((! backslash || (p != lim && *p++ == '\\'))
&& p != lim && *p++ == '}'
- && 0 <= dfa->lexstate.minrep
- && (dfa->lexstate.maxrep < 0
- || dfa->lexstate.minrep <= dfa->lexstate.maxrep)))
+ && 0 <= dfa->lex.minrep
+ && (dfa->lex.maxrep < 0
+ || dfa->lex.minrep <= dfa->lex.maxrep)))
{
if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
goto normal_char;
dfaerror (_("invalid content of \\{\\}"));
}
- if (RE_DUP_MAX < dfa->lexstate.maxrep)
+ if (RE_DUP_MAX < dfa->lex.maxrep)
dfaerror (_("regular expression too big"));
- dfa->lexstate.lexptr = p;
- dfa->lexstate.lexleft = lim - p;
+ dfa->lex.ptr = p;
+ dfa->lex.left = lim - p;
}
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = REPMN;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = REPMN;
case '|':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
goto normal_char;
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
goto normal_char;
- dfa->lexstate.laststart = true;
- return dfa->lexstate.lasttok = OR;
+ dfa->lex.laststart = true;
+ return dfa->lex.lasttok = OR;
case '\n':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
|| backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
goto normal_char;
- dfa->lexstate.laststart = true;
- return dfa->lexstate.lasttok = OR;
+ dfa->lex.laststart = true;
+ return dfa->lex.lasttok = OR;
case '(':
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
goto normal_char;
- ++dfa->lexstate.parens;
- dfa->lexstate.laststart = true;
- return dfa->lexstate.lasttok = LPAREN;
+ dfa->lex.parens++;
+ dfa->lex.laststart = true;
+ return dfa->lex.lasttok = LPAREN;
case ')':
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
goto normal_char;
- if (dfa->lexstate.parens == 0
+ if (dfa->lex.parens == 0
&& dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
goto normal_char;
- --dfa->lexstate.parens;
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = RPAREN;
+ dfa->lex.parens--;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = RPAREN;
case '.':
if (backslash)
goto normal_char;
- if (dfa->multibyte)
+ if (dfa->canychar == (size_t) -1)
{
- /* In multibyte environment period must match with a single
- character not a byte. So we use ANYCHAR. */
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = ANYCHAR;
+ zeroset (ccl);
+ notset (ccl);
+ if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
+ clrbit ('\n', ccl);
+ if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
+ clrbit ('\0', ccl);
+ if (dfa->localeinfo.multibyte)
+ for (c2 = 0; c2 < NOTCHAR; c2++)
+ if (dfa->localeinfo.sbctowc[c2] == WEOF)
+ clrbit (c2, ccl);
+ dfa->canychar = charclass_index (dfa, ccl);
}
- zeroset (ccl);
- notset (ccl);
- if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
- clrbit ('\n', ccl);
- if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
- clrbit ('\0', ccl);
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = (dfa->localeinfo.multibyte
+ ? ANYCHAR
+ : CSET + dfa->canychar);
case 's':
case 'S':
if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
zeroset (ccl);
for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1569,9 +1458,8 @@ lex (struct dfa *dfa)
setbit (c2, ccl);
if (c == 'S')
notset (ccl);
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa,
- ccl);
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1580,31 +1468,31 @@ lex (struct dfa *dfa)
/* \s and \S are documented to be equivalent to [[:space:]] and
[^[:space:]] respectively, so tell the lexer to process those
strings, each minus its "already processed" '['. */
- PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]");
-
- dfa->lexstate.lasttok = parse_bracket_exp (dfa);
-
- POP_LEX_STATE ();
+ {
+ struct lexptr ls;
+ push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
+ dfa->lex.lasttok = parse_bracket_exp (dfa);
+ pop_lex_state (dfa, &ls);
+ }
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok;
case 'w':
case 'W':
if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
zeroset (ccl);
for (c2 = 0; c2 < NOTCHAR; ++c2)
- if (unibyte_word_constituent (c2))
+ if (dfa->syntax.sbit[c2] == CTX_LETTER)
setbit (c2, ccl);
if (c == 'W')
notset (ccl);
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa,
- ccl);
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1613,38 +1501,38 @@ lex (struct dfa *dfa)
/* \w and \W are documented to be equivalent to [_[:alnum:]] and
[^_[:alnum:]] respectively, so tell the lexer to process those
strings, each minus its "already processed" '['. */
- PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]");
-
- dfa->lexstate.lasttok = parse_bracket_exp (dfa);
-
- POP_LEX_STATE ();
+ {
+ struct lexptr ls;
+ push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
+ dfa->lex.lasttok = parse_bracket_exp (dfa);
+ pop_lex_state (dfa, &ls);
+ }
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok;
case '[':
if (backslash)
goto normal_char;
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = parse_bracket_exp (dfa);
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = parse_bracket_exp (dfa);
default:
normal_char:
- dfa->lexstate.laststart = false;
+ dfa->lex.laststart = false;
/* For multibyte character sets, folding is done in atom. Always
return WCHAR. */
- if (dfa->multibyte)
- return dfa->lexstate.lasttok = WCHAR;
+ if (dfa->localeinfo.multibyte)
+ return dfa->lex.lasttok = WCHAR;
if (dfa->syntax.case_fold && isalpha (c))
{
zeroset (ccl);
setbit_case_fold_c (c, ccl);
- return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa,
- ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
- return dfa->lexstate.lasttok = c;
+ return dfa->lex.lasttok = c;
}
}
@@ -1661,11 +1549,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop)
{
dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
sizeof *dfa->tokens);
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
sizeof *dfa->multibyte_prop);
}
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
dfa->multibyte_prop[dfa->tindex] = mbprop;
dfa->tokens[dfa->tindex++] = t;
@@ -1678,21 +1566,21 @@ addtok_mb (struct dfa *dfa, token t, int mbprop)
case CAT:
case OR:
- --dfa->parsestate.depth;
+ dfa->parse.depth--;
break;
case BACKREF:
dfa->fast = false;
/* fallthrough */
default:
- ++dfa->nleaves;
+ dfa->nleaves++;
/* fallthrough */
case EMPTY:
- ++dfa->parsestate.depth;
+ dfa->parse.depth++;
break;
}
- if (dfa->parsestate.depth > dfa->depth)
- dfa->depth = dfa->parsestate.depth;
+ if (dfa->parse.depth > dfa->depth)
+ dfa->depth = dfa->parse.depth;
}
static void addtok_wc (struct dfa *dfa, wint_t wc);
@@ -1702,7 +1590,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc);
static void
addtok (struct dfa *dfa, token t)
{
- if (dfa->multibyte && t == MBCSET)
+ if (dfa->localeinfo.multibyte && t == MBCSET)
{
bool need_or = false;
struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
@@ -1749,19 +1637,19 @@ addtok_wc (struct dfa *dfa, wint_t wc)
size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
if (stored_bytes != (size_t) -1)
- dfa->lexstate.cur_mb_len = stored_bytes;
+ dfa->lex.cur_mb_len = stored_bytes;
else
{
/* This is merely stop-gap. buf[0] is undefined, yet skipping
the addtok_mb call altogether can corrupt the heap. */
- dfa->lexstate.cur_mb_len = 1;
+ dfa->lex.cur_mb_len = 1;
buf[0] = 0;
}
- addtok_mb (dfa, buf[0], dfa->lexstate.cur_mb_len == 1 ? 3 : 1);
- for (i = 1; i < dfa->lexstate.cur_mb_len; i++)
+ addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1);
+ for (i = 1; i < dfa->lex.cur_mb_len; i++)
{
- addtok_mb (dfa, buf[i], i == dfa->lexstate.cur_mb_len - 1 ? 2 : 0);
+ addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0);
addtok (dfa, CAT);
}
}
@@ -1801,7 +1689,7 @@ add_utf8_anychar (struct dfa *dfa)
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', c);
}
- dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c);
+ dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c);
}
/* A valid UTF-8 character is
@@ -1862,18 +1750,18 @@ add_utf8_anychar (struct dfa *dfa)
static void
atom (struct dfa *dfa)
{
- if (dfa->parsestate.tok == WCHAR)
+ if (dfa->parse.tok == WCHAR)
{
- if (dfa->lexstate.wctok == WEOF)
+ if (dfa->lex.wctok == WEOF)
addtok (dfa, BACKREF);
else
{
- addtok_wc (dfa, dfa->lexstate.wctok);
+ addtok_wc (dfa, dfa->lex.wctok);
if (dfa->syntax.case_fold)
{
wchar_t folded[CASE_FOLDED_BUFSIZE];
- unsigned int i, n = case_folded_counterparts (dfa->lexstate.wctok,
+ unsigned int i, n = case_folded_counterparts (dfa->lex.wctok,
folded);
for (i = 0; i < n; i++)
{
@@ -1883,9 +1771,9 @@ atom (struct dfa *dfa)
}
}
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
}
- else if (dfa->parsestate.tok == ANYCHAR && using_utf8)
+ else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
{
/* For UTF-8 expand the period to a series of CSETs that define a valid
UTF-8 character. This avoids using the slow multibyte path. I'm
@@ -1895,26 +1783,25 @@ atom (struct dfa *dfa)
UTF-8: it is the most used, and the structure of the encoding
makes the correctness more obvious. */
add_utf8_anychar (dfa);
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
}
- else if ((dfa->parsestate.tok >= 0 && dfa->parsestate.tok < NOTCHAR)
- || dfa->parsestate.tok >= CSET || dfa->parsestate.tok == BACKREF
- || dfa->parsestate.tok == BEGLINE || dfa->parsestate.tok == ENDLINE
- || dfa->parsestate.tok == BEGWORD || dfa->parsestate.tok == ANYCHAR
- || dfa->parsestate.tok == MBCSET || dfa->parsestate.tok == ENDWORD
- || dfa->parsestate.tok == LIMWORD
- || dfa->parsestate.tok == NOTLIMWORD)
+ else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
+ || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF
+ || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
+ || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR
+ || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD
+ || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)
{
- addtok (dfa, dfa->parsestate.tok);
- dfa->parsestate.tok = lex (dfa);
+ addtok (dfa, dfa->parse.tok);
+ dfa->parse.tok = lex (dfa);
}
- else if (dfa->parsestate.tok == LPAREN)
+ else if (dfa->parse.tok == LPAREN)
{
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
regexp (dfa);
- if (dfa->parsestate.tok != RPAREN)
+ if (dfa->parse.tok != RPAREN)
dfaerror (_("unbalanced ("));
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
}
else
addtok (dfa, EMPTY);
@@ -1947,7 +1834,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
{
size_t i;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
for (i = 0; i < ntokens; ++i)
addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
else
@@ -1962,40 +1849,39 @@ closure (struct dfa *dfa)
size_t tindex, ntokens;
atom (dfa);
- while (dfa->parsestate.tok == QMARK || dfa->parsestate.tok == STAR
- || dfa->parsestate.tok == PLUS || dfa->parsestate.tok == REPMN)
- if (dfa->parsestate.tok == REPMN
- && (dfa->lexstate.minrep || dfa->lexstate.maxrep))
+ while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
+ || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
+ if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
{
ntokens = nsubtoks (dfa, dfa->tindex);
tindex = dfa->tindex - ntokens;
- if (dfa->lexstate.maxrep < 0)
+ if (dfa->lex.maxrep < 0)
addtok (dfa, PLUS);
- if (dfa->lexstate.minrep == 0)
+ if (dfa->lex.minrep == 0)
addtok (dfa, QMARK);
- for (i = 1; i < dfa->lexstate.minrep; ++i)
+ for (i = 1; i < dfa->lex.minrep; i++)
{
copytoks (dfa, tindex, ntokens);
addtok (dfa, CAT);
}
- for (; i < dfa->lexstate.maxrep; ++i)
+ for (; i < dfa->lex.maxrep; i++)
{
copytoks (dfa, tindex, ntokens);
addtok (dfa, QMARK);
addtok (dfa, CAT);
}
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
}
- else if (dfa->parsestate.tok == REPMN)
+ else if (dfa->parse.tok == REPMN)
{
dfa->tindex -= nsubtoks (dfa, dfa->tindex);
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
closure (dfa);
}
else
{
- addtok (dfa, dfa->parsestate.tok);
- dfa->parsestate.tok = lex (dfa);
+ addtok (dfa, dfa->parse.tok);
+ dfa->parse.tok = lex (dfa);
}
}
@@ -2003,8 +1889,8 @@ static void
branch (struct dfa* dfa)
{
closure (dfa);
- while (dfa->parsestate.tok != RPAREN && dfa->parsestate.tok != OR
- && dfa->parsestate.tok >= 0)
+ while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
+ && dfa->parse.tok >= 0)
{
closure (dfa);
addtok (dfa, CAT);
@@ -2015,9 +1901,9 @@ static void
regexp (struct dfa *dfa)
{
branch (dfa);
- while (dfa->parsestate.tok == OR)
+ while (dfa->parse.tok == OR)
{
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
branch (dfa);
addtok (dfa, OR);
}
@@ -2029,26 +1915,26 @@ regexp (struct dfa *dfa)
static void
dfaparse (char const *s, size_t len, struct dfa *d)
{
- d->lexstate.lexptr = s;
- d->lexstate.lexleft = len;
- d->lexstate.lasttok = END;
- d->lexstate.laststart = true;
- d->lexstate.parens = 0;
- if (d->multibyte)
+ d->lex.ptr = s;
+ d->lex.left = len;
+ d->lex.lasttok = END;
+ d->lex.laststart = true;
+ d->lex.parens = 0;
+ if (d->localeinfo.multibyte)
{
- d->lexstate.cur_mb_len = 0;
+ d->lex.cur_mb_len = 0;
memset (&d->mbs, 0, sizeof d->mbs);
}
if (!d->syntax.syntax_bits_set)
dfaerror (_("no syntax specified"));
- d->parsestate.tok = lex (d);
- d->parsestate.depth = d->depth;
+ d->parse.tok = lex (d);
+ d->parse.depth = d->depth;
regexp (d);
- if (d->parsestate.tok != END)
+ if (d->parse.tok != END)
dfaerror (_("unbalanced )"));
addtok (d, END - d->nregexps);
@@ -2169,7 +2055,6 @@ state_index (struct dfa *d, position_set const *s, int context)
size_t hash = 0;
int constraint = 0;
state_num i, j;
- bool curr_dependent = false;
token first_end = 0;
for (i = 0; i < s->nelem; ++i)
@@ -2223,17 +2108,6 @@ state_index (struct dfa *d, position_set const *s, int context)
}
else if (d->tokens[s->elems[j].index] == BACKREF)
constraint = NO_CONSTRAINT;
- if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
- {
- int acceptable
- = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
- ? CTX_NEWLINE : 0)
- | (SUCCEEDS_IN_CONTEXT (c, context, CTX_LETTER)
- ? CTX_LETTER : 0)
- | (SUCCEEDS_IN_CONTEXT (c, context, CTX_NONE)
- ? CTX_NONE : 0));
- curr_dependent |= acceptable && (context & ~acceptable);
- }
}
@@ -2244,7 +2118,6 @@ state_index (struct dfa *d, position_set const *s, int context)
alloc_position_set (&d->states[i].elems, s->nelem);
copy (s, &d->states[i].elems);
d->states[i].context = context;
- d->states[i].curr_dependent = curr_dependent;
d->states[i].constraint = constraint;
d->states[i].first_end = first_end;
d->states[i].mbps.nelem = 0;
@@ -2331,11 +2204,10 @@ charclass_context (struct dfa const *dfa, charclass c)
int context = 0;
unsigned int j;
- if (tstbit (dfa->syntax.eolbyte, c))
- context |= CTX_NEWLINE;
-
for (j = 0; j < CHARCLASS_WORDS; ++j)
{
+ if (c[j] & dfa->syntax.newline[j])
+ context |= CTX_NEWLINE;
if (c[j] & dfa->syntax.letters[j])
context |= CTX_LETTER;
if (c[j] & ~(dfa->syntax.letters[j] | dfa->syntax.newline[j]))
@@ -2700,25 +2572,22 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
setbit (d->tokens[pos.index], matches);
else if (d->tokens[pos.index] >= CSET)
copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
- else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
+ else if (d->tokens[pos.index] == ANYCHAR)
{
- /* ANYCHAR must match a single character, so put it to
- D->states[s].mbps which contains the positions which can
- match with a single character not a byte. If all
- positions with ANYCHAR do not depend on the context of
- the next character, put its follows instead to
+ copyset (d->charclasses[d->canychar], matches);
+
+ /* ANYCHAR must match a single character, so it belongs in
+ D->states[s].mbps, which contains the positions that can
+ match a single character rather than a single byte. If no
+ position with ANYCHAR depends on the context of the next
+ character, we put its follows instead of the position itself into
D->states[s].mbps to optimize. */
- if (d->states[s].curr_dependent)
- {
- if (d->states[s].mbps.nelem == 0)
- alloc_position_set (&d->states[s].mbps, 1);
- insert (pos, &d->states[s].mbps);
- }
- else if (SUCCEEDS_IN_CONTEXT (pos.constraint,
- d->states[s].context, CTX_ANY))
+ if (SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context,
+ CTX_NONE))
{
if (d->states[s].mbps.nelem == 0)
- alloc_position_set (&d->states[s].mbps, 1);
+ alloc_position_set (&d->states[s].mbps,
+ d->follows[pos.index].nelem);
for (j = 0; j < d->follows[pos.index].nelem; j++)
insert (d->follows[pos.index].elems[j], &d->states[s].mbps);
}
@@ -2832,22 +2701,27 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
is to fail miserably. */
if (d->searchflag)
{
- /* Find the state(s) corresponding to the positions of state 0. */
- copy (&d->states[0].elems, &follows);
- separate_contexts = state_separate_contexts (&follows);
- state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
- if (separate_contexts & CTX_NEWLINE)
- state_newline = state_index (d, &follows, CTX_NEWLINE);
- else
- state_newline = state;
- if (separate_contexts & CTX_LETTER)
- state_letter = state_index (d, &follows, CTX_LETTER);
- else
- state_letter = state;
+ int c;
+
+ state_newline = 0;
+ state_letter = d->min_trcount - 1;
+ state = d->initstate_notbol;
- for (i = 0; i < NOTCHAR; ++i)
- trans[i] = unibyte_word_constituent (i) ? state_letter : state;
- trans[d->syntax.eolbyte] = state_newline;
+ for (c = 0; c < NOTCHAR; ++c)
+ {
+ switch (d->syntax.sbit[c])
+ {
+ case CTX_NEWLINE:
+ trans[c] = state_newline;
+ break;
+ case CTX_LETTER:
+ trans[c] = state_letter;
+ break;
+ default:
+ trans[c] = state;
+ break;
+ }
+ }
}
else
for (i = 0; i < NOTCHAR; ++i)
@@ -2863,7 +2737,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
insert (d->follows[grps[i].elems[j]].elems[k], &follows);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
/* If a token in follows.elems is not 1st byte of a multibyte
character, or the states of follows must accept the bytes
@@ -2896,7 +2770,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
/* If we are building a searching matcher, throw in the positions
of state 0 as well. */
- if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte))
+ if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
{
merge (&d->states[0].elems, &follows, &tmp);
copy (&tmp, &follows);
@@ -2950,12 +2824,18 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
{
int c = j * CHARCLASS_WORD_BITS + k;
- if (c == d->syntax.eolbyte)
- trans[c] = state_newline;
- else if (unibyte_word_constituent (c))
- trans[c] = state_letter;
- else if (c < NOTCHAR)
- trans[c] = state;
+ switch (d->syntax.sbit[c])
+ {
+ case CTX_NEWLINE:
+ trans[c] = state_newline;
+ break;
+ case CTX_LETTER:
+ trans[c] = state_letter;
+ break;
+ default:
+ trans[c] = state;
+ break;
+ }
}
}
@@ -2993,7 +2873,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
@@ -3005,7 +2885,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
{
d->trans[oldalloc] = NULL;
d->fails[oldalloc] = NULL;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
d->mb_trans[oldalloc] = NULL;
}
}
@@ -3039,7 +2919,7 @@ build_state (state_num s, struct dfa *d)
}
d->trcount = d->min_trcount;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
for (i = d->min_trcount; i < d->tralloc; i++)
{
@@ -3096,16 +2976,6 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
{
state_num *t;
- if (**pp == d->syntax.eolbyte)
- {
- /* S is always an initial state in transit_state, so the
- transition table for the state must have been built already. */
- assert (d->trans[s] || d->fails[s]);
-
- ++*pp;
- return d->newlines[s];
- }
-
if (d->trans[s])
t = d->trans[s];
else if (d->fails[s])
@@ -3132,15 +3002,12 @@ static state_num
transit_state (struct dfa *d, state_num s, unsigned char const **pp,
unsigned char const *end)
{
- state_num s1;
+ state_num s1, s2;
wint_t wc;
int separate_contexts;
- state_num state, state_newline, mb_index;
- size_t i, j;
+ size_t i;
int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
- int context = wc == d->syntax.eolbyte ? CTX_NEWLINE : CTX_NONE;
- bool context_newline = context == CTX_NEWLINE;
/* This state has some operators which can match a multibyte character. */
d->mb_follows.nelem = 0;
@@ -3152,31 +3019,9 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
s = transit_state_singlebyte (d, s, pp);
*pp += mbclen - i;
- if (d->states[s1].curr_dependent)
+ if (wc == WEOF)
{
- if (s < 0)
- d->mb_follows.nelem = 0;
- else
- copy (&d->states[s].elems, &d->mb_follows);
-
- for (i = 0; i < d->states[s1].mbps.nelem; i++)
- {
- if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint,
- d->states[s1].context, context))
- continue;
- for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem;
- j++)
- insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
- &d->mb_follows);
- }
-
- separate_contexts = state_separate_contexts (&d->mb_follows);
- if (context_newline && separate_contexts & CTX_NEWLINE)
- s = state_index (d, &d->mb_follows, CTX_NEWLINE);
- else
- s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
- realloc_trans_if_necessary (d, s);
-
+ /* It is an invalid character, so ANYCHAR is not accepted. */
return s;
}
@@ -3187,11 +3032,11 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
{
if (MAX_TRCOUNT <= d->mb_trcount)
{
- state_num s2;
- for (s2 = -1; s2 < d->tralloc; s2++)
+ state_num s3;
+ for (s3 = -1; s3 < d->tralloc; s3++)
{
- free (d->mb_trans[s2]);
- d->mb_trans[s2] = NULL;
+ free (d->mb_trans[s3]);
+ d->mb_trans[s3] = NULL;
}
for (i = 0; i < d->sindex; i++)
@@ -3201,22 +3046,16 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
d->states[s1].mb_trindex = d->mb_trcount++;
}
- mb_index = d->states[s1].mb_trindex * 2;
-
if (! d->mb_trans[s])
{
enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
- enum { TRANSALLOC_SIZE = 2 * MAX_TRCOUNT * TRANSPTR_SIZE };
+ enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
- for (i = 0; i < 2 * MAX_TRCOUNT; i++)
+ for (i = 0; i < MAX_TRCOUNT; i++)
d->mb_trans[s][i] = -1;
}
- else
- {
- state = d->mb_trans[s][mb_index + context_newline];
- if (0 <= state)
- return state;
- }
+ else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
+ return d->mb_trans[s][d->states[s1].mb_trindex];
if (s < 0)
copy (&d->states[s1].mbps, &d->mb_follows);
@@ -3224,17 +3063,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
separate_contexts = state_separate_contexts (&d->mb_follows);
- state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
- if (separate_contexts & CTX_NEWLINE)
- state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE);
- else
- state_newline = state;
- realloc_trans_if_necessary (d, state_newline);
+ s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+ realloc_trans_if_necessary (d, s2);
- d->mb_trans[s][mb_index] = state;
- d->mb_trans[s][mb_index + 1] = state_newline;
+ d->mb_trans[s][d->states[s1].mb_trindex] = s2;
- return context_newline ? state_newline : state;
+ return s2;
}
/* The initial state may encounter a byte which is not a single byte character
@@ -3254,16 +3088,14 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
Both P and MBP must be no larger than END. */
static unsigned char const *
skip_remains_mb (struct dfa *d, unsigned char const *p,
- unsigned char const *mbp, char const *end, wint_t *wcp)
+ unsigned char const *mbp, char const *end)
{
- wint_t wc = WEOF;
+ wint_t wc;
if (d->syntax.never_trail[*p])
return p;
while (mbp < p)
mbp += mbs_to_wchar (&wc, (char const *) mbp,
end - (char const *) mbp, d);
- if (wcp != NULL)
- *wcp = wc;
return mbp;
}
@@ -3299,6 +3131,39 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
unsigned char saved_end;
size_t nlcount = 0;
+ if (MAX_TRCOUNT <= d->sindex)
+ {
+ for (s = d->min_trcount; s < d->sindex; s++)
+ {
+ free (d->states[s].elems.elems);
+ free (d->states[s].mbps.elems);
+ }
+ d->sindex = d->min_trcount;
+
+ if (d->trans)
+ {
+ for (s = 0; s < d->tralloc; s++)
+ {
+ free (d->trans[s]);
+ free (d->fails[s]);
+ d->trans[s] = d->fails[s] = NULL;
+ }
+ d->trcount = 0;
+ }
+
+ if (d->localeinfo.multibyte && d->mb_trans)
+ {
+ for (s = -1; s < d->tralloc; s++)
+ {
+ free (d->mb_trans[s]);
+ d->mb_trans[s] = NULL;
+ }
+ for (s = 0; s < d->min_trcount; s++)
+ d->states[s].mb_trindex = -1;
+ d->mb_trcount = 0;
+ }
+ }
+
if (!d->tralloc)
{
realloc_trans_if_necessary (d, 1);
@@ -3320,51 +3185,25 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
for (;;)
{
- if (multibyte)
+ while ((t = trans[s]) != NULL)
{
- while ((t = trans[s]) != NULL)
+ if (s < d->min_trcount)
{
- s1 = s;
-
- if (s < d->min_trcount)
+ if (!multibyte || d->states[s].mbps.nelem == 0)
{
- if (d->min_trcount == 1)
- {
- if (d->states[s].mbps.nelem == 0)
- {
- do
- {
- while (t[*p] == 0)
- p++;
- p = mbp = skip_remains_mb (d, p, mbp, end, NULL);
- }
- while (t[*p] == 0);
- }
- else
- p = mbp = skip_remains_mb (d, p, mbp, end, NULL);
- }
- else
- {
- wint_t wc;
- mbp = skip_remains_mb (d, p, mbp, end, &wc);
-
- /* If d->min_trcount is greater than 1, maybe
- transit to another initial state after skip. */
- if (p < mbp)
- {
- /* It's CTX_LETTER or CTX_NONE. CTX_NEWLINE
- cannot happen, as we assume that a newline
- is always a single byte character. */
- s1 = s = d->initstate_notbol;
- p = mbp;
- }
- }
+ while (t[*p] == s)
+ p++;
}
+ if (multibyte)
+ p = mbp = skip_remains_mb (d, p, mbp, end);
+ }
+
+ if (multibyte)
+ {
+ s1 = s;
- if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl)
- || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE))
- || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL))
- || (char *) p >= end)
+ if (d->states[s].mbps.nelem == 0
+ || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
{
/* If an input character does not match ANYCHAR, do it
like a single-byte character. */
@@ -3373,28 +3212,11 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
else
{
s = transit_state (d, s, &p, (unsigned char *) end);
- if (s >= 0 && p[-1] == eol)
- nlcount++;
mbp = p;
trans = d->trans;
}
}
- }
- else
- {
- if (s == 0)
- {
- t = trans[s];
- if (t)
- {
- while (t[*p] == 0)
- p++;
- s1 = 0;
- s = t[*p++];
- }
- }
-
- while ((t = trans[s]) != NULL)
+ else
{
s1 = t[*p++];
t = trans[s1];
@@ -3405,6 +3227,11 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
s1 = tmp; /* swap */
break;
}
+ if (s < d->min_trcount)
+ {
+ while (t[*p] == s1)
+ p++;
+ }
s = t[*p++];
}
}
@@ -3422,19 +3249,25 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
nlcount++;
mbp = p;
- s = allow_nl ? d->newlines[s1] : 0;
+ s = (allow_nl ? d->newlines[s1]
+ : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
+ : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
+ : d->initstate_notbol);
}
else if (d->fails[s])
{
- if (d->success[s] & d->syntax.sbit[*p])
+ if ((d->success[s] & d->syntax.sbit[*p])
+ || ((char *) p == end
+ && ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s,
+ *d)))
goto done;
+ if (multibyte && s < d->min_trcount)
+ p = mbp = skip_remains_mb (d, p, mbp, end);
+
s1 = s;
if (!multibyte || d->states[s].mbps.nelem == 0
- || (*p == eol && !allow_nl)
- || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE))
- || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL))
- || (char *) p >= end)
+ || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
{
/* If a input character does not match ANYCHAR, do it
like a single-byte character. */
@@ -3443,8 +3276,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
else
{
s = transit_state (d, s, &p, (unsigned char *) end);
- if (s >= 0 && p[-1] == eol)
- nlcount++;
mbp = p;
trans = d->trans;
}
@@ -3490,7 +3321,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end,
return (char *) begin;
}
-/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte),
+/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
but faster and set *BACKREF if the DFA code does not support this
regexp usage. */
@@ -3548,7 +3379,7 @@ dfa_supported (struct dfa const *d)
case ENDWORD:
case LIMWORD:
case NOTLIMWORD:
- if (!d->multibyte)
+ if (!d->localeinfo.multibyte)
continue;
/* fallthrough */
@@ -3566,7 +3397,7 @@ dfaoptimize (struct dfa *d)
size_t i;
bool have_backref = false;
- if (!using_utf8)
+ if (!d->localeinfo.using_utf8)
return;
for (i = 0; i < d->tindex; ++i)
@@ -3596,7 +3427,7 @@ dfaoptimize (struct dfa *d)
}
free_mbdata (d);
- d->multibyte = false;
+ d->localeinfo.multibyte = false;
d->dfaexec = dfaexec_sb;
d->fast = true;
}
@@ -3611,7 +3442,7 @@ dfassbuild (struct dfa *d)
struct dfa *sup = dfaalloc ();
*sup = *d;
- sup->multibyte = false;
+ sup->localeinfo.multibyte = false;
sup->dfaexec = dfaexec_sb;
sup->multibyte_prop = NULL;
sup->mbcsets = NULL;
@@ -3644,7 +3475,7 @@ dfassbuild (struct dfa *d)
case BACKREF:
zeroset (ccl);
notset (ccl);
- sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl);
+ sup->tokens[j++] = CSET + charclass_index (sup, ccl);
sup->tokens[j++] = STAR;
if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
|| d->tokens[i + 1] == PLUS)
@@ -3655,13 +3486,14 @@ dfassbuild (struct dfa *d)
case ENDWORD:
case LIMWORD:
case NOTLIMWORD:
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
/* These constraints aren't supported in a multibyte locale.
Ignore them in the superset DFA. */
sup->tokens[j++] = EMPTY;
break;
}
+ /* fallthrough */
default:
sup->tokens[j++] = d->tokens[i];
if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
@@ -3672,7 +3504,7 @@ dfassbuild (struct dfa *d)
}
sup->tindex = j;
- if (have_nchar && (have_achar || d->multibyte))
+ if (have_nchar && (have_achar || d->localeinfo.multibyte))
d->superset = sup;
else
{
@@ -3714,7 +3546,7 @@ dfafree (struct dfa *d)
free (d->charclasses);
free (d->tokens);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
free_mbdata (d);
for (i = 0; i < d->sindex; ++i)
@@ -4238,20 +4070,51 @@ dfamustfree (struct dfamust *dm)
struct dfa *
dfaalloc (void)
{
- struct dfa *d = xcalloc (1, sizeof (struct dfa));
- d->multibyte = MB_CUR_MAX > 1;
- d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
- d->fast = !d->multibyte;
- d->lexstate.cur_mb_len = 1;
- return d;
+ return xmalloc (sizeof (struct dfa));
}
+/* Initialize DFA. */
void
-dfa_init (void)
+dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
+ reg_syntax_t bits, int dfaopts)
{
- check_utf8 ();
- check_unibyte_c ();
- init_mbrtowc_cache ();
+ int i;
+ memset (dfa, 0, offsetof (struct dfa, dfaexec));
+ dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
+ dfa->simple_locale = using_simple_locale (linfo->multibyte);
+ dfa->localeinfo = *linfo;
+
+ dfa->fast = !dfa->localeinfo.multibyte;
+
+ dfa->canychar = -1;
+ dfa->lex.cur_mb_len = 1;
+ dfa->syntax.syntax_bits_set = true;
+ dfa->syntax.case_fold = (dfaopts & DFA_CASE_FOLD) != 0;
+ dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
+ dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
+ dfa->syntax.syntax_bits = bits;
+
+ for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+ {
+ unsigned char uc = i;
+
+ dfa->syntax.sbit[uc] = char_context (dfa, uc);
+ switch (dfa->syntax.sbit[uc])
+ {
+ case CTX_LETTER:
+ setbit (uc, dfa->syntax.letters);
+ break;
+ case CTX_NEWLINE:
+ setbit (uc, dfa->syntax.newline);
+ break;
+ }
+
+ /* POSIX requires that the five bytes in "\n\r./" (including the
+ terminating NUL) cannot occur inside a multibyte character. */
+ dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
+ ? (uc & 0xc0) != 0x80
+ : strchr ("\n\r./", uc) != NULL);
+ }
}
/* vim:set shiftwidth=2: */