diff options
Diffstat (limited to 'dfa.c')
-rw-r--r-- | dfa.c | 1076 |
1 files changed, 1064 insertions, 12 deletions
@@ -1,5 +1,5 @@ /* dfa.c - deterministic extended regexp routines for GNU - Copyright 1988, 1998, 2000 Free Software Foundation, Inc. + Copyright 1988, 1998, 2000, 2002 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -46,6 +46,16 @@ extern void free(); #include <strings.h> #endif +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC +/* We can handle multibyte string. */ +# define MBS_SUPPORT +#endif + +#ifdef MBS_SUPPORT +# include <wchar.h> +# include <wctype.h> +#endif + #ifndef DEBUG /* use the same approach as regex.c */ #undef assert #define assert(e) @@ -91,17 +101,22 @@ extern void free(); host does not conform to Posix. */ #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) +/* Don't use gettext if ENABLE_NLS is not defined */ /* If we (don't) have I18N. */ /* glibc defines _ */ -#ifndef _ -# ifdef HAVE_LIBINTL_H -# include <libintl.h> -# ifndef _ -# define _(Str) gettext (Str) +#ifdef ENABLE_NLS +# ifndef _ +# ifdef HAVE_LIBINTL_H +# include <libintl.h> +# ifndef _ +# define _(Str) gettext (Str) +# endif +# else +# define _(Str) (Str) # endif -# else -# define _(Str) (Str) # endif +#else +# define _(Str) (Str) #endif #include "regex.h" @@ -234,6 +249,10 @@ prtok (token t) case ORTOP: s = "ORTOP"; break; case LPAREN: s = "LPAREN"; break; case RPAREN: s = "RPAREN"; break; +#ifdef MBS_SUPPORT + case ANYCHAR: s = "ANYCHAR"; break; + case MBCSET: s = "MBCSET"; break; +#endif /* MBS_SUPPORT */ default: s = "CSET"; break; } fprintf(stderr, "%s", s); @@ -349,8 +368,117 @@ static int laststart; /* True if we're separated from beginning or (, | static int parens; /* Count of outstanding left parens. */ static int minrep, maxrep; /* Repeat counts for {m,n}. */ +#ifdef MBS_SUPPORT +/* These variables are used only if (MB_CUR_MAX > 1). */ +static mbstate_t mbs; /* Mbstate for mbrlen(). */ +static int cur_mb_len; /* Byte length of the current scanning + multibyte character. */ +static int cur_mb_index; /* Byte index of the current scanning multibyte + character. + + singlebyte character : cur_mb_index = 0 + multibyte character + 1st byte : cur_mb_index = 1 + 2nd byte : cur_mb_index = 2 + ... + nth byte : cur_mb_index = n */ +static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec(). + Each element store the amount of remain + byte of corresponding multibyte character + in the input string. A element's value + is 0 if corresponding character is a + singlebyte chracter. + e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)> + mblen_buf : 0, 3, 2, 1 + */ + +static wchar_t *inputwcs; /* Wide character representation of input + string in dfaexec(). + The length of this array is same as + the length of input string(char array). + inputstring[i] is a single-byte char, + or 1st byte of a multibyte char. + And inputwcs[i] is the codepoint. */ +static unsigned char const *buf_begin;/* refference to begin in dfaexec(). */ +static unsigned char const *buf_end; /* refference to end in dfaexec(). */ +#endif /* MBS_SUPPORT */ + +#ifdef MBS_SUPPORT +/* This function update cur_mb_len, and cur_mb_index. + p points current lexptr, len is the remaining buffer length. */ +static void +update_mb_len_index (unsigned char const *p, int len) +{ + /* If last character is a part of a multibyte character, + we update cur_mb_index. */ + if (cur_mb_index) + cur_mb_index = (cur_mb_index >= cur_mb_len)? 0 + : cur_mb_index + 1; + + /* If last character is a single byte character, or the + last portion of a multibyte character, we check whether + next character is a multibyte character or not. */ + if (! cur_mb_index) + { + cur_mb_len = mbrlen(p, len, &mbs); + if (cur_mb_len > 1) + /* It is a multibyte character. + cur_mb_len was already set by mbrlen(). */ + cur_mb_index = 1; + else if (cur_mb_len < 1) + /* Invalid sequence. We treat it as a singlebyte character. + cur_mb_index is aleady 0. */ + cur_mb_len = 1; + /* Otherwise, cur_mb_len == 1, it is a singlebyte character. + cur_mb_index is aleady 0. */ + } +} +#endif /* MBS_SUPPORT */ + +#ifdef MBS_SUPPORT /* Note that characters become unsigned here. */ -#define FETCH(c, eoferr) \ +# define FETCH(c, eoferr) \ + { \ + if (! lexleft) \ + { \ + if (eoferr != 0) \ + dfaerror (eoferr); \ + else \ + return lasttok = END; \ + } \ + if (MB_CUR_MAX > 1) \ + update_mb_len_index(lexptr, lexleft); \ + (c) = (unsigned char) *lexptr++; \ + --lexleft; \ + } + +/* This function fetch a wide character, and update cur_mb_len, + used only if the current locale is a multibyte environment. */ +static wint_t +fetch_wc (char const *eoferr) +{ + wchar_t wc; + if (! lexleft) + { + if (eoferr != 0) + dfaerror (eoferr); + else + return WEOF; + } + + cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); + if (cur_mb_len <= 0) + { + cur_mb_len = 1; + wc = *lexptr; + } + lexptr += cur_mb_len; + lexleft -= cur_mb_len; + return wc; +} +#else +/* Note that characters become unsigned here. */ +# define FETCH(c, eoferr) \ { \ if (! lexleft) \ { \ @@ -362,6 +490,202 @@ static int minrep, maxrep; /* Repeat counts for {m,n}. */ (c) = (unsigned char) *lexptr++; \ --lexleft; \ } +#endif /* MBS_SUPPORT */ + +#ifdef MBS_SUPPORT +/* Multibyte character handling sub-routin for lex. + This function parse a bracket expression and build a struct + mb_char_classes. */ +static void +parse_bracket_exp_mb () +{ + wint_t wc, wc1, wc2; + + /* Work area to build a mb_char_classes. */ + struct mb_char_classes *work_mbc; + int chars_al, range_sts_al, range_ends_al, ch_classes_al, + equivs_al, coll_elems_al; + + REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes, + dfa->mbcsets_alloc, dfa->nmbcsets + 1); + /* dfa->multibyteb_prop[] hold the index of dfa->mbcsets. + We will update dfa->multibyte_prop[] in addtok(), because we can't + decide the index in dfa->tokens[]. */ + + /* Initialize work are */ + work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]); + + chars_al = 1; + range_sts_al = range_ends_al = 0; + ch_classes_al = equivs_al = coll_elems_al = 0; + MALLOC(work_mbc->chars, wchar_t, chars_al); + + work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0; + work_mbc->nequivs = work_mbc->ncoll_elems = 0; + work_mbc->chars = work_mbc->ch_classes = NULL; + work_mbc->range_sts = work_mbc->range_ends = NULL; + work_mbc->equivs = work_mbc->coll_elems = NULL; + + wc = fetch_wc(_("Unbalanced [")); + if (wc == L'^') + { + wc = fetch_wc(_("Unbalanced [")); + work_mbc->invert = 1; + } + else + work_mbc->invert = 0; + do + { + wc1 = WEOF; /* mark wc1 is not initialized". */ + + /* Note that if we're looking at some other [:...:] construct, + we just treat it as a bunch of ordinary characters. We can do + this because we assume regex has checked for syntax errors before + dfa is ever called. */ + if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES)) + { +#define BRACKET_BUFFER_SIZE 128 + char str[BRACKET_BUFFER_SIZE]; + wc1 = wc; + wc = fetch_wc(_("Unbalanced [")); + + /* If pattern contains `[[:', `[[.', or `[[='. */ + if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'=')) + { + unsigned char c; + unsigned char delim = (unsigned char)wc; + int len = 0; + for (;;) + { + if (! lexleft) + dfaerror (_("Unbalanced [")); + c = (unsigned char) *lexptr++; + --lexleft; + + if ((c == delim && *lexptr == ']') || lexleft == 0) + break; + if (len < BRACKET_BUFFER_SIZE) + str[len++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[len] = '\0'; + + if (lexleft == 0) + { + REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, + work_mbc->nchars + 2); + work_mbc->chars[work_mbc->nchars++] = L'['; + work_mbc->chars[work_mbc->nchars++] = delim; + break; + } + + if (--lexleft, *lexptr++ != ']') + dfaerror (_("Unbalanced [")); + if (delim == ':') + /* build character class. */ + { + wctype_t wt; + /* Query the character class as wctype_t. */ + wt = wctype (str); + + if (ch_classes_al == 0) + MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al); + REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t, + ch_classes_al, + work_mbc->nch_classes + 1); + work_mbc->ch_classes[work_mbc->nch_classes++] = wt; + + } + else if (delim == '=' || delim == '.') + { + char *elem; + MALLOC(elem, char, len + 1); + strncpy(elem, str, len + 1); + + if (delim == '=') + /* build equivalent class. */ + { + if (equivs_al == 0) + MALLOC(work_mbc->equivs, char*, ++equivs_al); + REALLOC_IF_NECESSARY(work_mbc->equivs, char*, + equivs_al, + work_mbc->nequivs + 1); + work_mbc->equivs[work_mbc->nequivs++] = elem; + } + + if (delim == '.') + /* build collating element. */ + { + if (coll_elems_al == 0) + MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al); + REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*, + coll_elems_al, + work_mbc->ncoll_elems + 1); + work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; + } + } + wc = WEOF; + } + else + /* We treat '[' as a normal character here. */ + { + wc2 = wc1; wc1 = wc; wc = wc2; /* swap */ + } + } + else + { + if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) + wc = fetch_wc(("Unbalanced [")); + } + + if (wc1 == WEOF) + wc1 = fetch_wc(_("Unbalanced [")); + + if (wc1 == L'-') + /* build range characters. */ + { + wc2 = fetch_wc(_("Unbalanced [")); + if (wc2 == L']') + { + /* In the case [x-], the - is an ordinary hyphen, + which is left in c1, the lookahead character. */ + lexptr -= cur_mb_len; + lexleft += cur_mb_len; + wc2 = wc; + } + else + { + if (wc2 == L'\\' + && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) + wc2 = fetch_wc(_("Unbalanced [")); + wc1 = fetch_wc(_("Unbalanced [")); + } + + if (range_sts_al == 0) + { + MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al); + MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al); + } + REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, + range_sts_al, work_mbc->nranges + 1); + work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc; + REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, + range_ends_al, work_mbc->nranges + 1); + work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2; + } + else if (wc != WEOF) + /* build normal characters. */ + { + REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, + work_mbc->nchars + 1); + work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc; + } + } + while ((wc = wc1) != L']'); +} +#endif /* MBS_SUPPORT */ #ifdef __STDC__ #define FUNC(F, P) static int F(int c) { return P(c); } @@ -442,6 +766,14 @@ lex (void) for (i = 0; i < 2; ++i) { FETCH(c, 0); +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && cur_mb_index) + /* If this is a part of a multi-byte character, we must treat + this byte data as a normal character. + e.g. In case of SJIS encoding, some character contains '\', + but they must not be backslash. */ + goto normal_char; +#endif /* MBS_SUPPORT */ switch (c) { case '\\': @@ -662,6 +994,15 @@ lex (void) case '.': if (backslash) goto normal_char; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + /* In multibyte environment period must match with a single + character not a byte. So we use ANYCHAR. */ + laststart = 0; + return lasttok = ANYCHAR; + } +#endif /* MBS_SUPPORT */ zeroset(ccl); notset(ccl); if (!(syntax_bits & RE_DOT_NEWLINE)) @@ -687,6 +1028,17 @@ lex (void) case '[': if (backslash) goto normal_char; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + /* In multibyte environment a bracket expression may contain + multibyte characters, which must be treated as characters + (not bytes). So we parse it by parse_bracket_exp_mb(). */ + laststart = 0; + parse_bracket_exp_mb(); + return lasttok = MBCSET; + } +#endif zeroset(ccl); FETCH(c, _("Unbalanced [")); if (c == '^') @@ -815,6 +1167,26 @@ static int depth; /* Current depth of a hypothetical stack static void addtok (token t) { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop, + dfa->tindex); + /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */ + if (t == MBCSET) + dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3; + else if (t < NOTCHAR) + dfa->multibyte_prop[dfa->tindex] + = (cur_mb_len == 1)? 3 /* single-byte char */ + : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */ + + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */ + else + /* It may be unnecessary, but it is safer to treat other symbols + as single byte characters. */ + dfa->multibyte_prop[dfa->tindex] = 3; + } +#endif + REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); dfa->tokens[dfa->tindex++] = t; @@ -858,7 +1230,10 @@ addtok (token t) atom atom: + <multibyte character> <normal character> + ANYCHAR + MBCSET CSET BACKREF BEGLINE @@ -876,10 +1251,31 @@ atom (void) { if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD +#ifdef MBS_SUPPORT + || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */ +#endif /* MBS_SUPPORT */ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) { addtok(tok); tok = lex(); +#ifdef MBS_SUPPORT + /* We treat a multibyte character as a single atom, so that DFA + can treat a multibyte character as a single expression. + + e.g. We construct following tree from "<mb1><mb2>". + <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT> + <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> + */ + if (MB_CUR_MAX > 1) + { + while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR) + { + addtok(tok); + addtok(CAT); + tok = lex(); + } + } +#endif /* MBS_SUPPORT */ } else if (tok == LPAREN) { @@ -998,6 +1394,14 @@ dfaparse (char *s, size_t len, struct dfa *d) lasttok = END; laststart = 1; parens = 0; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + cur_mb_index = 0; + cur_mb_len = 0; + memset(&mbs, 0, sizeof(mbstate_t)); + } +#endif /* MBS_SUPPORT */ if (! syntax_bits_set) dfaerror(_("No regexp syntax bits specified")); @@ -1139,6 +1543,10 @@ state_index (struct dfa *d, position_set *s, int newline, int letter) d->states[i].backref = 0; d->states[i].constraint = 0; d->states[i].first_end = 0; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + d->states[i].mbps.nelem = 0; +#endif for (j = 0; j < s->nelem; ++j) if (d->tokens[s->elems[j].index] < 0) { @@ -1181,6 +1589,10 @@ epsclosure (position_set *s, struct dfa *d) for (i = 0; i < s->nelem; ++i) if (d->tokens[s->elems[i].index] >= NOTCHAR && d->tokens[s->elems[i].index] != BACKREF +#ifdef MBS_SUPPORT + && d->tokens[s->elems[i].index] != ANYCHAR + && d->tokens[s->elems[i].index] != MBCSET +#endif && d->tokens[s->elems[i].index] < CSET) { old = s->elems[i]; @@ -1462,6 +1874,10 @@ dfaanalyze (struct dfa *d, int searchflag) it with its epsilon closure. */ for (i = 0; i < d->tindex; ++i) if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF +#ifdef MBS_SUPPORT + || d->tokens[i] == ANYCHAR + || d->tokens[i] == MBCSET +#endif || d->tokens[i] >= CSET) { #ifdef DEBUG @@ -1563,6 +1979,9 @@ dfastate (int s, struct dfa *d, int trans[]) int wants_letter; /* New state wants to know letter context. */ int state_letter; /* New state on a letter transition. */ static int initialized; /* Flag for static initialization. */ +#ifdef MBS_SUPPORT + int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */ +#endif int i, j, k; /* Initialize the set of letters, if necessary. */ @@ -1584,6 +2003,23 @@ dfastate (int s, struct dfa *d, int trans[]) setbit(d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); +#ifdef MBS_SUPPORT + else if (d->tokens[pos.index] == ANYCHAR + || d->tokens[pos.index] == MBCSET) + /* MB_CUR_MAX > 1 */ + { + /* ANYCHAR and MBCSET must match with a single character, so we + must put it to d->states[s].mbps, which contains the positions + which can match with a single character not a byte. */ + if (d->states[s].mbps.nelem == 0) + { + MALLOC(d->states[s].mbps.elems, position, + d->states[s].elems.nelem); + } + insert(pos, &(d->states[s].mbps)); + continue; + } +#endif /* MBS_SUPPORT */ else continue; @@ -1720,9 +2156,46 @@ dfastate (int s, struct dfa *d, int trans[]) for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) insert(d->follows[grps[i].elems[j].index].elems[k], &follows); +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + /* If a token in follows.elems is not 1st byte of a multibyte + character, or the states of follows must accept the bytes + which are not 1st byte of the multibyte character. + Then, if a state of follows encounter a byte, it must not be + a 1st byte of a multibyte character nor singlebyte character. + We cansel to add state[0].follows to next state, because + state[0] must accept 1st-byte + + For example, we assume <sb a> is a certain singlebyte + character, <mb A> is a certain multibyte character, and the + codepoint of <sb a> equals the 2nd byte of the codepoint of + <mb A>. + When state[0] accepts <sb a>, state[i] transit to state[i+1] + by accepting accepts 1st byte of <mb A>, and state[i+1] + accepts 2nd byte of <mb A>, if state[i+1] encounter the + codepoint of <sb a>, it must not be <sb a> but 2nd byte of + <mb A>, so we can not add state[0]. */ + + next_isnt_1st_byte = 0; + for (j = 0; j < follows.nelem; ++j) + { + if (!(d->multibyte_prop[follows.elems[j].index] & 1)) + { + next_isnt_1st_byte = 1; + break; + } + } + } +#endif + /* If we are building a searching matcher, throw in the positions of state 0 as well. */ +#ifdef MBS_SUPPORT + if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte)) +#else if (d->searchflag) +#endif for (j = 0; j < d->states[0].elems.nelem; ++j) insert(d->states[0].elems.elems[j], &follows); @@ -1871,6 +2344,434 @@ build_state_zero (struct dfa *d) build_state(0, d); } +#ifdef MBS_SUPPORT +/* Multibyte character handling sub-routins for dfaexec. */ + +/* Initial state may encounter the byte which is not a singlebyte character + nor 1st byte of a multibyte character. But it is incorrect for initial + state to accept such a byte. + For example, in sjis encoding the regular expression like "\\" accepts + the codepoint 0x5c, but should not accept the 2nd byte of the codepoint + 0x815c. Then Initial state must skip the bytes which are not a singlebyte + character nor 1st byte of a multibyte character. */ +#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \ + if (s == 0) \ + { \ + while (inputwcs[p - buf_begin] == 0 \ + && mblen_buf[p - buf_begin] > 0 \ + && (unsigned char const *)p < buf_end) \ + ++p; \ + if ((char*)p >= end) \ + { \ + free(mblen_buf); \ + free(inputwcs); \ + return NULL; \ + } \ + } + +static void +realloc_trans_if_necessary(struct dfa *d, int new_state) +{ + /* Make sure that the trans and fail arrays are allocated large enough + to hold a pointer for the new state. */ + if (new_state >= d->tralloc) + { + int oldalloc = d->tralloc; + + while (new_state >= d->tralloc) + d->tralloc *= 2; + REALLOC(d->realtrans, int *, d->tralloc + 1); + d->trans = d->realtrans + 1; + REALLOC(d->fails, int *, d->tralloc); + REALLOC(d->success, int, d->tralloc); + REALLOC(d->newlines, int, d->tralloc); + while (oldalloc < d->tralloc) + { + d->trans[oldalloc] = NULL; + d->fails[oldalloc++] = NULL; + } + } +} + +/* Return values of transit_state_singlebyte(), and + transit_state_consume_1char. */ +typedef enum +{ + TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */ + TRANSIT_STATE_DONE, /* State transition has finished. */ + TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */ +} status_transit_state; + +/* Consume a single byte and transit state from 's' to '*next_state'. + This function is almost same as the state transition routin in dfaexec(). + But state transition is done just once, otherwise matching succeed or + reach the end of the buffer. */ +static status_transit_state +transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p, + int *next_state) +{ + int *t; + int works = s; + + status_transit_state rval = TRANSIT_STATE_IN_PROGRESS; + + while (rval == TRANSIT_STATE_IN_PROGRESS) + { + if ((t = d->trans[works]) != NULL) + { + works = t[*p]; + rval = TRANSIT_STATE_DONE; + if (works < 0) + works = 0; + } + else + { + if (works >= 0 && d->fails[works]) + { + works = d->fails[works][*p]; + rval = TRANSIT_STATE_DONE; + } + + else if (works >= 0) + { + build_state(works, d); + } + else + { + works = 0; + } + } + } + *next_state = works; + return rval; +} + +/* Check whether period can match or not in the current context. If it can, + return the amount of the bytes with which period can match, otherwise + return 0. + `pos' is the position of the period. `index' is the index from the + buf_begin, and it is the current position in the buffer. */ +static int +match_anychar (struct dfa *d, int s, position pos, int index) +{ + int newline = 0; + int letter = 0; + wchar_t wc; + int mbclen; + + wc = inputwcs[index]; + mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; + + /* Check context. */ + if (wc == (wchar_t)eolbyte) + { + if (!(syntax_bits & RE_DOT_NEWLINE)) + return 0; + newline = 1; + } + else if (wc == (wchar_t)'\0') + { + if (syntax_bits & RE_DOT_NOT_NULL) + return 0; + newline = 1; + } + + if (iswalnum(wc) || wc == L'_') + letter = 1; + + if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, + newline, d->states[s].letter, letter)) + return 0; + + return mbclen; +} + +/* Check whether bracket expression can match or not in the current context. + If it can, return the amount of the bytes with which expression can match, + otherwise return 0. + `pos' is the position of the bracket expression. `index' is the index + from the buf_begin, and it is the current position in the buffer. */ +int +match_mb_charset (struct dfa *d, int s, position pos, int index) +{ + int i; + int match; /* Flag which represent that matching succeed. */ + int match_len; /* Length of the character (or collating element) + with which this operator match. */ + int op_len; /* Length of the operator. */ + char buffer[128]; + wchar_t wcbuf[6]; + + /* Pointer to the structure to which we are currently reffering. */ + struct mb_char_classes *work_mbc; + + int newline = 0; + int letter = 0; + wchar_t wc; /* Current reffering character. */ + + wc = inputwcs[index]; + + /* Check context. */ + if (wc == (wchar_t)eolbyte) + { + if (!(syntax_bits & RE_DOT_NEWLINE)) + return 0; + newline = 1; + } + else if (wc == (wchar_t)'\0') + { + if (syntax_bits & RE_DOT_NOT_NULL) + return 0; + newline = 1; + } + if (iswalnum(wc) || wc == L'_') + letter = 1; + if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, + newline, d->states[s].letter, letter)) + return 0; + + /* Assign the current reffering operator to work_mbc. */ + work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]); + match = !work_mbc->invert; + match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; + + /* match with a character class? */ + for (i = 0; i<work_mbc->nch_classes; i++) + { + if (iswctype((wint_t)wc, work_mbc->ch_classes[i])) + goto charset_matched; + } + + strncpy(buffer, buf_begin + index, match_len); + buffer[match_len] = '\0'; + + /* match with an equivalent class? */ + for (i = 0; i<work_mbc->nequivs; i++) + { + op_len = strlen(work_mbc->equivs[i]); + strncpy(buffer, buf_begin + index, op_len); + buffer[op_len] = '\0'; + if (strcoll(work_mbc->equivs[i], buffer) == 0) + { + match_len = op_len; + goto charset_matched; + } + } + + /* match with a collating element? */ + for (i = 0; i<work_mbc->ncoll_elems; i++) + { + op_len = strlen(work_mbc->coll_elems[i]); + strncpy(buffer, buf_begin + index, op_len); + buffer[op_len] = '\0'; + + if (strcoll(work_mbc->coll_elems[i], buffer) == 0) + { + match_len = op_len; + goto charset_matched; + } + } + + wcbuf[0] = wc; + wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0'; + + /* match with a range? */ + for (i = 0; i<work_mbc->nranges; i++) + { + wcbuf[2] = work_mbc->range_sts[i]; + wcbuf[4] = work_mbc->range_ends[i]; + + if (wcscoll(wcbuf, wcbuf+2) >= 0 && + wcscoll(wcbuf+4, wcbuf) >= 0) + goto charset_matched; + } + + /* match with a character? */ + for (i = 0; i<work_mbc->nchars; i++) + { + if (wc == work_mbc->chars[i]) + goto charset_matched; + } + + match = !match; + + charset_matched: + return match ? match_len : 0; +} + +/* Check each of `d->states[s].mbps.elem' can match or not. Then return the + array which corresponds to `d->states[s].mbps.elem' and each element of + the array contains the amount of the bytes with which the element can + match. + `index' is the index from the buf_begin, and it is the current position + in the buffer. + Caller MUST free the array which this function return. */ +static int* +check_matching_with_multibyte_ops (struct dfa *d, int s, int index) +{ + int i; + int* rarray; + + MALLOC(rarray, int, d->states[s].mbps.nelem); + for (i = 0; i < d->states[s].mbps.nelem; ++i) + { + position pos = d->states[s].mbps.elems[i]; + switch(d->tokens[pos.index]) + { + case ANYCHAR: + rarray[i] = match_anychar(d, s, pos, index); + break; + case MBCSET: + rarray[i] = match_mb_charset(d, s, pos, index); + break; + default: + break; /* can not happen. */ + } + } + return rarray; +} + +/* Consume a single character and enumerate all of the positions which can + be next position from the state `s'. + `match_lens' is the input. It can be NULL, but it can also be the output + of check_matching_with_multibyte_ops() for optimization. + `mbclen' and `pps' are the output. `mbclen' is the length of the + character consumed, and `pps' is the set this function enumerate. */ +static status_transit_state +transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp, + int *match_lens, int *mbclen, position_set *pps) +{ + int i, j; + int s1, s2; + int* work_mbls; + status_transit_state rs = TRANSIT_STATE_DONE; + + /* Calculate the length of the (single/multi byte) character + to which p points. */ + *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1 + : mblen_buf[*pp - buf_begin]; + + /* Calculate the state which can be reached from the state `s' by + consuming `*mbclen' single bytes from the buffer. */ + s1 = s; + for (i = 0; i < *mbclen; i++) + { + s2 = s1; + rs = transit_state_singlebyte(d, s2, (*pp)++, &s1); + } + /* Copy the positions contained by `s1' to the set `pps'. */ + copy(&(d->states[s1].elems), pps); + + /* Check (inputed)match_lens, and initialize if it is NULL. */ + if (match_lens == NULL && d->states[s].mbps.nelem != 0) + work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); + else + work_mbls = match_lens; + + /* Add all of the positions which can be reached from `s' by consuming + a single character. */ + for (i = 0; i < d->states[s].mbps.nelem ; i++) + { + if (work_mbls[i] == *mbclen) + for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; + j++) + insert(d->follows[d->states[s].mbps.elems[i].index].elems[j], + pps); + } + + if (match_lens == NULL && work_mbls != NULL) + free(work_mbls); + return rs; +} + +/* Transit state from s, then return new state and update the pointer of the + buffer. This function is for some operator which can match with a multi- + byte character or a collating element(which may be multi characters). */ +static int +transit_state (struct dfa *d, int s, unsigned char const **pp) +{ + int s1; + int mbclen; /* The length of current input multibyte character. */ + int maxlen = 0; + int i, j; + int *match_lens = NULL; + int nelem = d->states[s].mbps.nelem; /* Just a alias. */ + position_set follows; + unsigned char const *p1 = *pp; + status_transit_state rs; + wchar_t wc; + + if (nelem > 0) + /* This state has (a) multibyte operator(s). + We check whether each of them can match or not. */ + { + /* Note: caller must free the return value of this function. */ + match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); + + for (i = 0; i < nelem; i++) + /* Search the operator which match the longest string, + in this state. */ + { + if (match_lens[i] > maxlen) + maxlen = match_lens[i]; + } + } + + if (nelem == 0 || maxlen == 0) + /* This state has no multibyte operator which can match. + We need to check only one singlebyte character. */ + { + status_transit_state rs; + rs = transit_state_singlebyte(d, s, *pp, &s1); + + /* We must update the pointer if state transition succeeded. */ + if (rs == TRANSIT_STATE_DONE) + ++*pp; + + if (match_lens != NULL) + free(match_lens); + return s1; + } + + /* This state has some operators which can match a multibyte character. */ + follows.nelem = 0; + MALLOC(follows.elems, position, d->nleaves); + + /* `maxlen' may be longer than the length of a character, because it may + not be a character but a (multi character) collating element. + We enumerate all of the positions which `s' can reach by consuming + `maxlen' bytes. */ + rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows); + + wc = inputwcs[*pp - mbclen - buf_begin]; + s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); + realloc_trans_if_necessary(d, s1); + + while (*pp - p1 < maxlen) + { + follows.nelem = 0; + rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows); + + for (i = 0; i < nelem ; i++) + { + if (match_lens[i] == *pp - p1) + for (j = 0; + j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) + insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j], + &follows); + } + + wc = inputwcs[*pp - mbclen - buf_begin]; + s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); + realloc_trans_if_necessary(d, s1); + } + free(match_lens); + free(follows.elems); + return s1; +} + +#endif + /* Search through a buffer looking for a match to the given struct dfa. Find the first occurrence of a string matching the regexp in the buffer, and the shortest possible version thereof. Return a pointer to the first @@ -1914,8 +2815,80 @@ dfaexec (struct dfa *d, char *begin, char *end, trans = d->trans; *end = eol; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + int remain_bytes, i; + buf_begin = begin; + buf_end = end; + + /* initialize mblen_buf, and inputwcs. */ + MALLOC(mblen_buf, unsigned char, end - begin + 2); + MALLOC(inputwcs, wchar_t, end - begin + 2); + memset(&mbs, 0, sizeof(mbstate_t)); + remain_bytes = 0; + for (i = 0; i < end - begin + 1; i++) + { + if (remain_bytes == 0) + { + remain_bytes + = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs); + if (remain_bytes <= 1) + { + remain_bytes = 0; + inputwcs[i] = (wchar_t)begin[i]; + mblen_buf[i] = 0; + } + else + { + mblen_buf[i] = remain_bytes; + remain_bytes--; + } + } + else + { + mblen_buf[i] = remain_bytes; + inputwcs[i] = 0; + remain_bytes--; + } + } + mblen_buf[i] = 0; + inputwcs[i] = 0; /* sentinel */ + } +#endif /* MBS_SUPPORT */ + for (;;) { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + while ((t = trans[s])) + { + if ((char *) p > end) + break; + s1 = s; + if (d->states[s].mbps.nelem != 0) + { + /* Can match with a multibyte character( and multi character + collating element). */ + unsigned char const *nextp; + + SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); + + nextp = p; + s = transit_state(d, s, &nextp); + p = (unsigned char *)nextp; + + /* Trans table might be updated. */ + trans = d->trans; + } + else + { + SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); + s = t[*p++]; + } + } + else +#endif /* MBS_SUPPORT */ while ((t = trans[s]) != 0) { /* hand-optimized loop */ s1 = t[*p++]; if ((t = trans[s1]) == 0) { @@ -1931,10 +2904,30 @@ dfaexec (struct dfa *d, char *begin, char *end, { if (backref) *backref = (d->states[s].backref != 0); - return (char *) p; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + free(mblen_buf); + free(inputwcs); + } +#endif /* MBS_SUPPORT */ + return (char *) p; } s1 = s; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + unsigned char const *nextp; + nextp = p; + s = transit_state(d, s, &nextp); + p = (unsigned char *)nextp; + + /* Trans table might be updated. */ + trans = d->trans; + } + else +#endif /* MBS_SUPPORT */ s = d->fails[s][*p++]; continue; } @@ -1945,7 +2938,16 @@ dfaexec (struct dfa *d, char *begin, char *end, /* Check if we've run off the end of the buffer. */ if ((char *) p > end) - return NULL; + { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + free(mblen_buf); + free(inputwcs); + } +#endif /* MBS_SUPPORT */ + return NULL; + } if (s >= 0) { @@ -1976,6 +2978,16 @@ dfainit (struct dfa *d) d->talloc = 1; MALLOC(d->tokens, token, d->talloc); d->tindex = d->depth = d->nleaves = d->nregexps = 0; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + d->nmultibyte_prop = 1; + MALLOC(d->multibyte_prop, int, d->nmultibyte_prop); + d->nmbcsets = 0; + d->mbcsets_alloc = 1; + MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc); + } +#endif d->searchflag = 0; d->tralloc = 0; @@ -2036,6 +3048,37 @@ dfafree (struct dfa *d) free((ptr_t) d->charclasses); free((ptr_t) d->tokens); +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + free((ptr_t) d->multibyte_prop); + for (i = 0; i < d->nmbcsets; ++i) + { + int j; + struct mb_char_classes *p = &(d->mbcsets[i]); + if (p->chars != NULL) + free(p->chars); + if (p->ch_classes != NULL) + free(p->ch_classes); + if (p->range_sts != NULL) + free(p->range_sts); + if (p->range_ends != NULL) + free(p->range_ends); + + for (j = 0; j < p->nequivs; ++j) + free(p->equivs[j]); + if (p->equivs != NULL) + free(p->equivs); + + for (j = 0; j < p->ncoll_elems; ++j) + free(p->coll_elems[j]); + if (p->coll_elems != NULL) + free(p->coll_elems); + } + free((ptr_t) d->mbcsets); + } +#endif /* MBS_SUPPORT */ + for (i = 0; i < d->sindex; ++i) free((ptr_t) d->states[i].elems.elems); free((ptr_t) d->states); @@ -2091,6 +3134,10 @@ dfafree (struct dfa *d) ---- ---- ----- -- -- char c # c # c # c # c + ANYCHAR ZERO ZERO ZERO ZERO + + MBCSET ZERO ZERO ZERO ZERO + CSET ZERO ZERO ZERO ZERO STAR ZERO ZERO ZERO ZERO @@ -2536,7 +3583,12 @@ dfamust (struct dfa *dfa) /* not on *my* shift */ goto done; } - else if (t >= CSET) + else if (t >= CSET +#ifdef MBS_SUPPORT + || t == ANYCHAR + || t == MBCSET +#endif /* MBS_SUPPORT */ + ) { /* easy enough */ resetmust(mp); |