aboutsummaryrefslogtreecommitdiffstats
path: root/dfa.c
diff options
context:
space:
mode:
Diffstat (limited to 'dfa.c')
-rw-r--r--dfa.c117
1 files changed, 32 insertions, 85 deletions
diff --git a/dfa.c b/dfa.c
index ac43b30b..6d63acf8 100644
--- a/dfa.c
+++ b/dfa.c
@@ -58,15 +58,15 @@
#include "gettext.h"
#define _(str) gettext (str)
-#include "mbsupport.h" /* Define MBS_SUPPORT to 1 or 0, as appropriate. */
-#if MBS_SUPPORT
-/* We can handle multibyte strings. */
-# include <wchar.h>
-# include <wctype.h>
-#endif
+#include <wchar.h>
+#include <wctype.h>
#include "xalloc.h"
+#if defined(__DJGPP__)
+#include "mbsupport.h"
+#endif
+
#include "dfa.h"
#ifdef GAWK
@@ -391,12 +391,10 @@ struct dfa
*/
int *multibyte_prop;
-#if MBS_SUPPORT
/* A table indexed by byte values that contains the corresponding wide
character (if any) for that byte. WEOF means the byte is not a
valid single-byte character. */
wint_t mbrtowc_cache[NOTCHAR];
-#endif
/* Array of the bracket expression in the DFA. */
struct mb_char_classes *mbcsets;
@@ -481,7 +479,6 @@ static void regexp (void);
static void
dfambcache (struct dfa *d)
{
-#if MBS_SUPPORT
int i;
for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
{
@@ -491,10 +488,8 @@ dfambcache (struct dfa *d)
wchar_t wc;
d->mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
}
-#endif
}
-#if MBS_SUPPORT
/* Store into *PWC the result of converting the leading bytes of the
multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
and updating the conversion state in *D. On conversion error,
@@ -533,9 +528,6 @@ mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
*pwc = wc;
return 1;
}
-#else
-#define mbs_to_wchar(pwc, s, n, d) (WEOF)
-#endif
#ifdef DEBUG
@@ -730,7 +722,7 @@ static charclass newline;
#ifdef __GLIBC__
# define is_valid_unibyte_character(c) 1
#else
-# define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF))
+# define is_valid_unibyte_character(c) (btowc (c) != WEOF)
#endif
/* C is a "word-constituent" byte. */
@@ -791,17 +783,12 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
static bool
setbit_wc (wint_t wc, charclass c)
{
-#if MBS_SUPPORT
int b = wctob (wc);
if (b == EOF)
return false;
setbit (b, c);
return true;
-#else
- abort ();
- /*NOTREACHED*/ return false;
-#endif
}
/* Set a bit for B and its case variants in the charclass C.
@@ -895,7 +882,6 @@ static wint_t wctok; /* Wide character representation of the current
MB_CUR_MAX > 1. */
-#if MBS_SUPPORT
/* Fetch the next lexical input character. Set C (of type int) to the
next input byte, except set C to EOF if the input is a multibyte
character of length greater than 1. Set WC (of type wint_t) to the
@@ -924,23 +910,6 @@ static wint_t wctok; /* Wide character representation of the current
} \
} while (0)
-#else
-/* Note that characters become unsigned here. */
-# define FETCH_WC(c, unused, eoferr) \
- do { \
- if (! lexleft) \
- { \
- if ((eoferr) != 0) \
- dfaerror (eoferr); \
- else \
- return lasttok = END; \
- } \
- (c) = to_uchar (*lexptr++); \
- --lexleft; \
- } while (0)
-
-#endif /* MBS_SUPPORT */
-
#ifndef MIN
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
@@ -1752,7 +1721,6 @@ addtok (token t)
}
}
-#if MBS_SUPPORT
/* We treat a multibyte character as a single atom, so that DFA
can treat a multibyte character as a single expression.
@@ -1784,17 +1752,10 @@ addtok_wc (wint_t wc)
addtok (CAT);
}
}
-#else
-static void
-addtok_wc (wint_t wc)
-{
-}
-#endif
static void
add_utf8_anychar (void)
{
-#if MBS_SUPPORT
static const charclass utf8_classes[5] = {
/* 80-bf: non-leading bytes. */
{0, 0, 0, 0, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, 0, 0},
@@ -1849,7 +1810,6 @@ add_utf8_anychar (void)
addtok (CAT);
addtok (OR);
}
-#endif
}
/* The grammar understood by the parser is as follows.
@@ -1890,7 +1850,7 @@ add_utf8_anychar (void)
static void
atom (void)
{
- if (MBS_SUPPORT && tok == WCHAR)
+ if (tok == WCHAR)
{
if (wctok == WEOF)
addtok (BACKREF);
@@ -1912,7 +1872,7 @@ atom (void)
tok = lex ();
}
- else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ())
+ else if (tok == ANYCHAR && using_utf8 ())
{
/* For UTF-8 expand the period to a series of CSETs that define a valid
UTF-8 character. This avoids using the slow multibyte path. I'm
@@ -1926,9 +1886,7 @@ atom (void)
}
else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
|| tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
-#if MBS_SUPPORT
|| tok == ANYCHAR || tok == MBCSET
-#endif /* MBS_SUPPORT */
|| tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
{
addtok (tok);
@@ -2261,10 +2219,8 @@ epsclosure (position_set *s, struct dfa const *d, char *visited)
for (i = 0; i < s->nelem; ++i)
if (d->tokens[s->elems[i].index] >= NOTCHAR
&& d->tokens[s->elems[i].index] != BACKREF
-#if MBS_SUPPORT
&& d->tokens[s->elems[i].index] != ANYCHAR
&& d->tokens[s->elems[i].index] != MBCSET
-#endif
&& d->tokens[s->elems[i].index] < CSET)
{
if (!initialized)
@@ -2583,9 +2539,7 @@ dfaanalyze (struct dfa *d, int searchflag)
it with its epsilon closure. */
for (i = 0; i < d->tindex; ++i)
if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
-#if MBS_SUPPORT
|| d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
-#endif
|| d->tokens[i] >= CSET)
{
#ifdef DEBUG
@@ -2695,9 +2649,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
else
{
- if (MBS_SUPPORT
- && (d->tokens[pos.index] == MBCSET
- || d->tokens[pos.index] == ANYCHAR))
+ if (d->tokens[pos.index] == MBCSET
+ || d->tokens[pos.index] == ANYCHAR)
{
/* MB_CUR_MAX > 1 */
if (d->tokens[pos.index] == MBCSET)
@@ -3531,13 +3484,23 @@ dfaexec_main (struct dfa *d, char const *begin, char *end,
}
}
- if ((char *) p > end)
+ if (s < 0)
{
- p = NULL;
- goto done;
+ if ((char *) p > end || p[-1] != eol || d->newlines[s1] < 0)
+ {
+ p = NULL;
+ goto done;
+ }
+
+ /* The previous character was a newline, count it, and skip
+ checking of multibyte character boundary until here. */
+ nlcount++;
+ mbp = p;
+
+ s = allow_nl ? d->newlines[s1] : 0;
}
- if (s >= 0 && d->fails[s])
+ if (d->fails[s])
{
if (d->success[s] & sbit[*p])
{
@@ -3551,32 +3514,13 @@ dfaexec_main (struct dfa *d, char const *begin, char *end,
State_transition();
else
s = d->fails[s][*p++];
- continue;
}
-
- /* If the previous character was a newline, count it, and skip
- checking of multibyte character boundary until here. */
- if (p[-1] == eol)
- {
- nlcount++;
- mbp = p;
- }
-
- if (s >= 0)
+ else
{
if (!d->trans[s])
build_state (s, d);
trans = d->trans;
- continue;
}
-
- if (p[-1] == eol && allow_nl)
- {
- s = d->newlines[s1];
- continue;
- }
-
- s = 0;
}
done:
@@ -3672,7 +3616,7 @@ dfaoptimize (struct dfa *d)
size_t i;
bool have_backref = false;
- if (!MBS_SUPPORT || !using_utf8 ())
+ if (!using_utf8 ())
return;
for (i = 0; i < d->tindex; ++i)
@@ -3732,8 +3676,11 @@ dfassbuild (struct dfa *d)
sup->musts = NULL;
sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
- memcpy (sup->charclasses, d->charclasses,
- d->cindex * sizeof *sup->charclasses);
+ if (d->cindex)
+ {
+ memcpy (sup->charclasses, d->charclasses,
+ d->cindex * sizeof *sup->charclasses);
+ }
sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
sup->talloc = d->tindex * 2;