diff options
Diffstat (limited to 'dfa.c')
-rw-r--r-- | dfa.c | 117 |
1 files changed, 32 insertions, 85 deletions
@@ -58,15 +58,15 @@ #include "gettext.h" #define _(str) gettext (str) -#include "mbsupport.h" /* Define MBS_SUPPORT to 1 or 0, as appropriate. */ -#if MBS_SUPPORT -/* We can handle multibyte strings. */ -# include <wchar.h> -# include <wctype.h> -#endif +#include <wchar.h> +#include <wctype.h> #include "xalloc.h" +#if defined(__DJGPP__) +#include "mbsupport.h" +#endif + #include "dfa.h" #ifdef GAWK @@ -391,12 +391,10 @@ struct dfa */ int *multibyte_prop; -#if MBS_SUPPORT /* A table indexed by byte values that contains the corresponding wide character (if any) for that byte. WEOF means the byte is not a valid single-byte character. */ wint_t mbrtowc_cache[NOTCHAR]; -#endif /* Array of the bracket expression in the DFA. */ struct mb_char_classes *mbcsets; @@ -481,7 +479,6 @@ static void regexp (void); static void dfambcache (struct dfa *d) { -#if MBS_SUPPORT int i; for (i = CHAR_MIN; i <= CHAR_MAX; ++i) { @@ -491,10 +488,8 @@ dfambcache (struct dfa *d) wchar_t wc; d->mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; } -#endif } -#if MBS_SUPPORT /* Store into *PWC the result of converting the leading bytes of the multibyte buffer S of length N bytes, using the mbrtowc_cache in *D and updating the conversion state in *D. On conversion error, @@ -533,9 +528,6 @@ mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d) *pwc = wc; return 1; } -#else -#define mbs_to_wchar(pwc, s, n, d) (WEOF) -#endif #ifdef DEBUG @@ -730,7 +722,7 @@ static charclass newline; #ifdef __GLIBC__ # define is_valid_unibyte_character(c) 1 #else -# define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF)) +# define is_valid_unibyte_character(c) (btowc (c) != WEOF) #endif /* C is a "word-constituent" byte. */ @@ -791,17 +783,12 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) static bool setbit_wc (wint_t wc, charclass c) { -#if MBS_SUPPORT int b = wctob (wc); if (b == EOF) return false; setbit (b, c); return true; -#else - abort (); - /*NOTREACHED*/ return false; -#endif } /* Set a bit for B and its case variants in the charclass C. @@ -895,7 +882,6 @@ static wint_t wctok; /* Wide character representation of the current MB_CUR_MAX > 1. */ -#if MBS_SUPPORT /* Fetch the next lexical input character. Set C (of type int) to the next input byte, except set C to EOF if the input is a multibyte character of length greater than 1. Set WC (of type wint_t) to the @@ -924,23 +910,6 @@ static wint_t wctok; /* Wide character representation of the current } \ } while (0) -#else -/* Note that characters become unsigned here. */ -# define FETCH_WC(c, unused, eoferr) \ - do { \ - if (! lexleft) \ - { \ - if ((eoferr) != 0) \ - dfaerror (eoferr); \ - else \ - return lasttok = END; \ - } \ - (c) = to_uchar (*lexptr++); \ - --lexleft; \ - } while (0) - -#endif /* MBS_SUPPORT */ - #ifndef MIN # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif @@ -1752,7 +1721,6 @@ addtok (token t) } } -#if MBS_SUPPORT /* We treat a multibyte character as a single atom, so that DFA can treat a multibyte character as a single expression. @@ -1784,17 +1752,10 @@ addtok_wc (wint_t wc) addtok (CAT); } } -#else -static void -addtok_wc (wint_t wc) -{ -} -#endif static void add_utf8_anychar (void) { -#if MBS_SUPPORT static const charclass utf8_classes[5] = { /* 80-bf: non-leading bytes. */ {0, 0, 0, 0, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, 0, 0}, @@ -1849,7 +1810,6 @@ add_utf8_anychar (void) addtok (CAT); addtok (OR); } -#endif } /* The grammar understood by the parser is as follows. @@ -1890,7 +1850,7 @@ add_utf8_anychar (void) static void atom (void) { - if (MBS_SUPPORT && tok == WCHAR) + if (tok == WCHAR) { if (wctok == WEOF) addtok (BACKREF); @@ -1912,7 +1872,7 @@ atom (void) tok = lex (); } - else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ()) + else if (tok == ANYCHAR && using_utf8 ()) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm @@ -1926,9 +1886,7 @@ atom (void) } else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD -#if MBS_SUPPORT || tok == ANYCHAR || tok == MBCSET -#endif /* MBS_SUPPORT */ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) { addtok (tok); @@ -2261,10 +2219,8 @@ epsclosure (position_set *s, struct dfa const *d, char *visited) for (i = 0; i < s->nelem; ++i) if (d->tokens[s->elems[i].index] >= NOTCHAR && d->tokens[s->elems[i].index] != BACKREF -#if MBS_SUPPORT && d->tokens[s->elems[i].index] != ANYCHAR && d->tokens[s->elems[i].index] != MBCSET -#endif && d->tokens[s->elems[i].index] < CSET) { if (!initialized) @@ -2583,9 +2539,7 @@ dfaanalyze (struct dfa *d, int searchflag) it with its epsilon closure. */ for (i = 0; i < d->tindex; ++i) if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF -#if MBS_SUPPORT || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET -#endif || d->tokens[i] >= CSET) { #ifdef DEBUG @@ -2695,9 +2649,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); else { - if (MBS_SUPPORT - && (d->tokens[pos.index] == MBCSET - || d->tokens[pos.index] == ANYCHAR)) + if (d->tokens[pos.index] == MBCSET + || d->tokens[pos.index] == ANYCHAR) { /* MB_CUR_MAX > 1 */ if (d->tokens[pos.index] == MBCSET) @@ -3531,13 +3484,23 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, } } - if ((char *) p > end) + if (s < 0) { - p = NULL; - goto done; + if ((char *) p > end || p[-1] != eol || d->newlines[s1] < 0) + { + p = NULL; + goto done; + } + + /* The previous character was a newline, count it, and skip + checking of multibyte character boundary until here. */ + nlcount++; + mbp = p; + + s = allow_nl ? d->newlines[s1] : 0; } - if (s >= 0 && d->fails[s]) + if (d->fails[s]) { if (d->success[s] & sbit[*p]) { @@ -3551,32 +3514,13 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, State_transition(); else s = d->fails[s][*p++]; - continue; } - - /* If the previous character was a newline, count it, and skip - checking of multibyte character boundary until here. */ - if (p[-1] == eol) - { - nlcount++; - mbp = p; - } - - if (s >= 0) + else { if (!d->trans[s]) build_state (s, d); trans = d->trans; - continue; } - - if (p[-1] == eol && allow_nl) - { - s = d->newlines[s1]; - continue; - } - - s = 0; } done: @@ -3672,7 +3616,7 @@ dfaoptimize (struct dfa *d) size_t i; bool have_backref = false; - if (!MBS_SUPPORT || !using_utf8 ()) + if (!using_utf8 ()) return; for (i = 0; i < d->tindex; ++i) @@ -3732,8 +3676,11 @@ dfassbuild (struct dfa *d) sup->musts = NULL; sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses); - memcpy (sup->charclasses, d->charclasses, - d->cindex * sizeof *sup->charclasses); + if (d->cindex) + { + memcpy (sup->charclasses, d->charclasses, + d->cindex * sizeof *sup->charclasses); + } sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens); sup->talloc = d->tindex * 2; |