diff options
-rw-r--r-- | support/ChangeLog | 5 | ||||
-rw-r--r-- | support/intprops.h | 15 | ||||
-rw-r--r-- | support/regcomp.c | 813 | ||||
-rw-r--r-- | support/regex_internal.c | 40 | ||||
-rw-r--r-- | support/regex_internal.h | 49 | ||||
-rw-r--r-- | support/regexec.c | 77 |
6 files changed, 385 insertions, 614 deletions
diff --git a/support/ChangeLog b/support/ChangeLog index 22e2d694..36555936 100644 --- a/support/ChangeLog +++ b/support/ChangeLog @@ -1,3 +1,8 @@ +2021-12-10 Arnold D. Robbins <arnold@skeeve.com> + + * intprops.h, regcomp.c, regex_internal.c, regex_internal.h, + regexec.c: Sync with GNULIB. + 2021-10-27 Arnold D. Robbins <arnold@skeeve.com> * 5.1.1: Release tar ball made. diff --git a/support/intprops.h b/support/intprops.h index 3fe64e82..7f20f09f 100644 --- a/support/intprops.h +++ b/support/intprops.h @@ -229,18 +229,18 @@ /* True if __builtin_add_overflow (A, B, P) and __builtin_sub_overflow (A, B, P) work when P is non-null. */ +#if defined __has_builtin +# define _GL_HAS_BUILTIN_ADD_OVERFLOW __has_builtin (__builtin_add_overflow) /* __builtin_{add,sub}_overflow exists but is not reliable in GCC 5.x and 6.x, see <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98269>. */ -#if 7 <= __GNUC__ && !defined __ICC +#elif 7 <= __GNUC__ && !defined __EDG__ # define _GL_HAS_BUILTIN_ADD_OVERFLOW 1 -#elif defined __has_builtin -# define _GL_HAS_BUILTIN_ADD_OVERFLOW __has_builtin (__builtin_add_overflow) #else # define _GL_HAS_BUILTIN_ADD_OVERFLOW 0 #endif /* True if __builtin_mul_overflow (A, B, P) works when P is non-null. */ -#ifdef __clang__ +#if defined __clang_major_ && __clang_major__ < 14 /* Work around Clang bug <https://bugs.llvm.org/show_bug.cgi?id=16404>. */ # define _GL_HAS_BUILTIN_MUL_OVERFLOW 0 #else @@ -249,9 +249,8 @@ /* True if __builtin_add_overflow_p (A, B, C) works, and similarly for __builtin_sub_overflow_p and __builtin_mul_overflow_p. */ -#if defined __clang__ || defined __ICC -/* Clang 11 lacks __builtin_mul_overflow_p, and even if it did it - would presumably run afoul of Clang bug 16404. ICC 2021.1's +#ifdef __EDG__ +/* In EDG-based compilers like ICC 2021.3 and earlier, __builtin_add_overflow_p etc. are not treated as integral constant expressions even when all arguments are. */ # define _GL_HAS_BUILTIN_OVERFLOW_P 0 @@ -400,7 +399,7 @@ #if _GL_HAS_BUILTIN_MUL_OVERFLOW # if ((9 < __GNUC__ + (3 <= __GNUC_MINOR__) \ || (__GNUC__ == 8 && 4 <= __GNUC_MINOR__)) \ - && !defined __ICC) + && !defined __EDG__) # define INT_MULTIPLY_WRAPV(a, b, r) __builtin_mul_overflow (a, b, r) # else /* Work around GCC bug 91450. */ diff --git a/support/regcomp.c b/support/regcomp.c index 887e5b50..6a97fdee 100644 --- a/support/regcomp.c +++ b/support/regcomp.c @@ -27,14 +27,10 @@ static void re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, char *fastmap); static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); -#ifdef RE_ENABLE_I18N static void free_charset (re_charset_t *cset); -#endif /* RE_ENABLE_I18N */ static void free_workarea_compile (regex_t *preg); static reg_errcode_t create_initial_state (re_dfa_t *dfa); -#ifdef RE_ENABLE_I18N static void optimize_utf8 (re_dfa_t *dfa); -#endif static reg_errcode_t analyze (regex_t *preg); static reg_errcode_t preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), @@ -89,7 +85,6 @@ static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, re_token_t *token); -#ifdef RE_ENABLE_I18N static reg_errcode_t build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, Idx *equiv_class_alloc, @@ -100,14 +95,6 @@ static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, Idx *char_class_alloc, const char *class_name, reg_syntax_t syntax); -#else /* not RE_ENABLE_I18N */ -static reg_errcode_t build_equiv_class (bitset_t sbcset, - const unsigned char *name); -static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, - bitset_t sbcset, - const char *class_name, - reg_syntax_t syntax); -#endif /* not RE_ENABLE_I18N */ static bin_tree_t *build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, const char *class_name, @@ -279,8 +266,7 @@ re_compile_fastmap (struct re_pattern_buffer *bufp) } weak_alias (__re_compile_fastmap, re_compile_fastmap) -static inline void -__attribute__ ((always_inline)) +static __always_inline void re_set_fastmap (char *fastmap, bool icase, int ch) { fastmap[ch] = 1; @@ -306,7 +292,6 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, if (type == CHARACTER) { re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); -#ifdef RE_ENABLE_I18N if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) { unsigned char buf[MB_LEN_MAX]; @@ -327,7 +312,6 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, != (size_t) -1)) re_set_fastmap (fastmap, false, buf[0]); } -#endif } else if (type == SIMPLE_BRACKET) { @@ -341,13 +325,12 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, re_set_fastmap (fastmap, icase, ch); } } -#ifdef RE_ENABLE_I18N else if (type == COMPLEX_BRACKET) { re_charset_t *cset = dfa->nodes[node].opr.mbcset; Idx i; -# ifdef _LIBC +#ifdef _LIBC /* See if we have to try all bytes which start multiple collation elements. e.g. In da_DK, we want to catch 'a' since "aa" is a valid @@ -363,7 +346,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, if (table[i] < 0) re_set_fastmap (fastmap, icase, i); } -# endif /* _LIBC */ +#endif /* _LIBC */ /* See if we have to start the match at all multibyte characters, i.e. where we would not find an invalid sequence. This only @@ -371,9 +354,9 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, sets, the SIMPLE_BRACKET again suffices. */ if (dfa->mb_cur_max > 1 && (cset->nchar_classes || cset->non_match || cset->nranges -# ifdef _LIBC +#ifdef _LIBC || cset->nequiv_classes -# endif /* _LIBC */ +#endif /* _LIBC */ )) { unsigned char c = 0; @@ -406,12 +389,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, } } } -#endif /* RE_ENABLE_I18N */ - else if (type == OP_PERIOD -#ifdef RE_ENABLE_I18N - || type == OP_UTF8_PERIOD -#endif /* RE_ENABLE_I18N */ - || type == END_OF_RE) + else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE) { memset (fastmap, '\1', sizeof (char) * SBC_MAX); if (type == END_OF_RE) @@ -550,7 +528,6 @@ regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf, weak_alias (__regerror, regerror) -#ifdef RE_ENABLE_I18N /* This static array is used for the map to single-byte characters when UTF-8 is used. Otherwise we would allocate memory just to initialize it the same all the time. UTF-8 is the preferred encoding so this is @@ -558,25 +535,24 @@ weak_alias (__regerror, regerror) static const bitset_t utf8_sb_map = { /* Set the first 128 bits. */ -# if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__ +#if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__ [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX -# else -# if 4 * BITSET_WORD_BITS < ASCII_CHARS -# error "bitset_word_t is narrower than 32 bits" -# elif 3 * BITSET_WORD_BITS < ASCII_CHARS +#else +# if 4 * BITSET_WORD_BITS < ASCII_CHARS +# error "bitset_word_t is narrower than 32 bits" +# elif 3 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, -# elif 2 * BITSET_WORD_BITS < ASCII_CHARS +# elif 2 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, BITSET_WORD_MAX, -# elif 1 * BITSET_WORD_BITS < ASCII_CHARS +# elif 1 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, -# endif +# endif (BITSET_WORD_MAX >> (SBC_MAX % BITSET_WORD_BITS == 0 ? 0 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) -# endif -}; #endif +}; static void @@ -614,10 +590,8 @@ free_dfa_content (re_dfa_t *dfa) re_free (entry->array); } re_free (dfa->state_table); -#ifdef RE_ENABLE_I18N if (dfa->sb_char != utf8_sb_map) re_free (dfa->sb_char); -#endif re_free (dfa->subexp_map); #ifdef DEBUG re_free (dfa->re_str); @@ -796,11 +770,9 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, if (__glibc_unlikely (err != REG_NOERROR)) goto re_compile_internal_free_return; -#ifdef RE_ENABLE_I18N /* If possible, do searching in single byte encoding to speed things up. */ if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) optimize_utf8 (dfa); -#endif /* Then create the initial state of the dfa. */ err = create_initial_state (dfa); @@ -830,11 +802,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) #ifndef _LIBC const char *codeset_name; #endif -#ifdef RE_ENABLE_I18N size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); -#else - size_t max_i18n_object_size = 0; -#endif size_t max_object_size = MAX (sizeof (struct re_state_table_entry), MAX (sizeof (re_token_t), @@ -886,7 +854,6 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) dfa->map_notascii = 0; #endif -#ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) { if (dfa->is_utf8) @@ -906,14 +873,13 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) wint_t wch = __btowc (ch); if (wch != WEOF) dfa->sb_char[i] |= (bitset_word_t) 1 << j; -# ifndef _LIBC +#ifndef _LIBC if (isascii (ch) && wch != ch) dfa->map_notascii = 1; -# endif +#endif } } } -#endif if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL)) return REG_ESPACE; @@ -933,8 +899,6 @@ init_word_char (re_dfa_t *dfa) dfa->word_ops_used = 1; if (__glibc_likely (dfa->map_notascii == 0)) { - /* Avoid uint32_t and uint64_t as some non-GCC platforms lack - them, an issue when this code is used in Gnulib. */ bitset_word_t bits0 = 0x00000000; bitset_word_t bits1 = 0x03ff0000; bitset_word_t bits2 = 0x87fffffe; @@ -1074,7 +1038,6 @@ create_initial_state (re_dfa_t *dfa) return REG_NOERROR; } -#ifdef RE_ENABLE_I18N /* If it is possible to do searching in single byte encoding instead of UTF-8 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change DFA nodes where needed. */ @@ -1154,7 +1117,6 @@ optimize_utf8 (re_dfa_t *dfa) dfa->is_utf8 = 0; dfa->has_mb_node = dfa->nbackref > 0 || has_period; } -#endif /* Analyze the structure tree, and calculate "first", "next", "edest", "eclosure", and "inveclosure". */ @@ -1792,7 +1754,6 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) token->opr.c = c; token->word_char = 0; -#ifdef RE_ENABLE_I18N token->mb_partial = 0; if (input->mb_cur_max > 1 && !re_string_first_byte (input, re_string_cur_idx (input))) @@ -1801,7 +1762,6 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) token->mb_partial = 1; return 1; } -#endif if (c == '\\') { unsigned char c2; @@ -1814,7 +1774,6 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) c2 = re_string_peek_byte_case (input, 1); token->opr.c = c2; token->type = CHARACTER; -#ifdef RE_ENABLE_I18N if (input->mb_cur_max > 1) { wint_t wc = re_string_wchar_at (input, @@ -1822,7 +1781,6 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; } else -#endif token->word_char = IS_WORD_CHAR (c2) != 0; switch (c2) @@ -1928,14 +1886,12 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) } token->type = CHARACTER; -#ifdef RE_ENABLE_I18N if (input->mb_cur_max > 1) { wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; } else -#endif token->word_char = IS_WORD_CHAR (token->opr.c); switch (c) @@ -2027,14 +1983,12 @@ peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) c = re_string_peek_byte (input, 0); token->opr.c = c; -#ifdef RE_ENABLE_I18N if (input->mb_cur_max > 1 && !re_string_first_byte (input, re_string_cur_idx (input))) { token->type = CHARACTER; return 1; } -#endif /* RE_ENABLE_I18N */ if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && re_string_cur_idx (input) + 1 < re_string_length (input)) @@ -2256,7 +2210,6 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, *err = REG_ESPACE; return NULL; } -#ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) { while (!re_string_eoi (regexp) @@ -2273,7 +2226,6 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, } } } -#endif break; case OP_OPEN_SUBEXP: @@ -2666,40 +2618,30 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, #ifndef _LIBC -# ifdef RE_ENABLE_I18N /* Convert the byte B to the corresponding wide character. In a unibyte locale, treat B as itself. In a multibyte locale, return WEOF if B is an encoding error. */ static wint_t -parse_byte (unsigned char b, re_charset_t *mbcset) +parse_byte (unsigned char b, re_dfa_t const *dfa) { - return mbcset == NULL ? b : __btowc (b); + return dfa->mb_cur_max > 1 ? __btowc (b) : b; } -# endif - /* Local function for parse_bracket_exp only used in case of NOT _LIBC. - Build the range expression which starts from START_ELEM, and ends - at END_ELEM. The result are written to MBCSET and SBCSET. - RANGE_ALLOC is the allocated size of mbcset->range_starts, and - mbcset->range_ends, is a pointer argument since we may - update it. */ +/* Local function for parse_bracket_exp used in _LIBC environment. + Build the range expression which starts from START_ELEM, and ends + at END_ELEM. The result are written to MBCSET and SBCSET. + RANGE_ALLOC is the allocated size of mbcset->range_starts, and + mbcset->range_ends, is a pointer argument since we may + update it. */ static reg_errcode_t -# ifdef RE_ENABLE_I18N -build_range_exp (const reg_syntax_t syntax, - bitset_t sbcset, - re_charset_t *mbcset, - Idx *range_alloc, - const bracket_elem_t *start_elem, - const bracket_elem_t *end_elem) -# else /* not RE_ENABLE_I18N */ -build_range_exp (const reg_syntax_t syntax, - bitset_t sbcset, - const bracket_elem_t *start_elem, - const bracket_elem_t *end_elem) -# endif /* not RE_ENABLE_I18N */ +build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc, + bracket_elem_t *start_elem, bracket_elem_t *end_elem, + re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules, + const unsigned char *collseqmb, const char *collseqwc, + int_fast32_t table_size, const void *symb_table, + const unsigned char *extra) { - unsigned int start_ch, end_ch; /* Equivalence Classes and Character Classes can't be a range start/end. */ if (__glibc_unlikely (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS @@ -2715,110 +2657,88 @@ build_range_exp (const reg_syntax_t syntax, && strlen ((char *) end_elem->opr.name) > 1))) return REG_ECOLLATE; -# ifdef RE_ENABLE_I18N - { - wchar_t wc; - wint_t start_wc; - wint_t end_wc; - + unsigned int start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] - : 0)); + : 0)), end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] : 0)); + wint_t start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) - ? parse_byte (start_ch, mbcset) : start_elem->opr.wch); + ? parse_byte (start_ch, dfa) : start_elem->opr.wch), end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) - ? parse_byte (end_ch, mbcset) : end_elem->opr.wch); - if (start_wc == WEOF || end_wc == WEOF) - return REG_ECOLLATE; - else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) - && start_wc > end_wc)) - return REG_ERANGE; - - /* Got valid collation sequence values, add them as a new entry. - However, for !_LIBC we have no collation elements: if the - character set is single byte, the single byte character set - that we build below suffices. parse_bracket_exp passes - no MBCSET if dfa->mb_cur_max == 1. */ - if (mbcset) - { - /* Check the space of the arrays. */ - if (__glibc_unlikely (*range_alloc == mbcset->nranges)) - { - /* There is not enough space, need realloc. */ - wchar_t *new_array_start, *new_array_end; - Idx new_nranges; - - /* +1 in case of mbcset->nranges is 0. */ - new_nranges = 2 * mbcset->nranges + 1; - /* Use realloc since mbcset->range_starts and mbcset->range_ends - are NULL if *range_alloc == 0. */ - new_array_start = re_realloc (mbcset->range_starts, wchar_t, - new_nranges); - new_array_end = re_realloc (mbcset->range_ends, wchar_t, - new_nranges); + ? parse_byte (end_ch, dfa) : end_elem->opr.wch); - if (__glibc_unlikely (new_array_start == NULL - || new_array_end == NULL)) - { - re_free (new_array_start); - re_free (new_array_end); - return REG_ESPACE; - } + if (start_wc == WEOF || end_wc == WEOF) + return REG_ECOLLATE; + else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) + && start_wc > end_wc)) + return REG_ERANGE; - mbcset->range_starts = new_array_start; - mbcset->range_ends = new_array_end; - *range_alloc = new_nranges; - } + /* Got valid collation sequence values, add them as a new entry. + However, for !_LIBC we have no collation elements: if the + character set is single byte, the single byte character set + that we build below suffices. parse_bracket_exp passes + no MBCSET if dfa->mb_cur_max == 1. */ + if (dfa->mb_cur_max > 1) + { + /* Check the space of the arrays. */ + if (__glibc_unlikely (*range_alloc == mbcset->nranges)) + { + /* There is not enough space, need realloc. */ + wchar_t *new_array_start, *new_array_end; + Idx new_nranges; - mbcset->range_starts[mbcset->nranges] = start_wc; - mbcset->range_ends[mbcset->nranges++] = end_wc; - } + /* +1 in case of mbcset->nranges is 0. */ + new_nranges = 2 * mbcset->nranges + 1; + /* Use realloc since mbcset->range_starts and mbcset->range_ends + are NULL if *range_alloc == 0. */ + new_array_start = re_realloc (mbcset->range_starts, wchar_t, + new_nranges); + new_array_end = re_realloc (mbcset->range_ends, wchar_t, + new_nranges); + + if (__glibc_unlikely (new_array_start == NULL + || new_array_end == NULL)) + { + re_free (new_array_start); + re_free (new_array_end); + return REG_ESPACE; + } + + mbcset->range_starts = new_array_start; + mbcset->range_ends = new_array_end; + *range_alloc = new_nranges; + } + + mbcset->range_starts[mbcset->nranges] = start_wc; + mbcset->range_ends[mbcset->nranges++] = end_wc; + } + + /* Build the table for single byte characters. */ + for (wchar_t wc = 0; wc < SBC_MAX; ++wc) + { + if (start_wc <= wc && wc <= end_wc) + bitset_set (sbcset, wc); + } - /* Build the table for single byte characters. */ - for (wc = 0; wc < SBC_MAX; ++wc) - { - if (start_wc <= wc && wc <= end_wc) - bitset_set (sbcset, wc); - } - } -# else /* not RE_ENABLE_I18N */ - { - unsigned int ch; - start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch - : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] - : 0)); - end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch - : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] - : 0)); - if (start_ch > end_ch) - return REG_ERANGE; - /* Build the table for single byte characters. */ - for (ch = 0; ch < SBC_MAX; ++ch) - if (start_ch <= ch && ch <= end_ch) - bitset_set (sbcset, ch); - } -# endif /* not RE_ENABLE_I18N */ return REG_NOERROR; } #endif /* not _LIBC */ #ifndef _LIBC -/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. +/* Helper function for parse_bracket_exp only used in case of NOT _LIBC. Build the collating element which is represented by NAME. The result are written to MBCSET and SBCSET. COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a pointer argument since we may update it. */ static reg_errcode_t -# ifdef RE_ENABLE_I18N build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, - Idx *coll_sym_alloc, const unsigned char *name) -# else /* not RE_ENABLE_I18N */ -build_collating_symbol (bitset_t sbcset, const unsigned char *name) -# endif /* not RE_ENABLE_I18N */ + Idx *coll_sym_alloc, const unsigned char *name, + uint_fast32_t nrules, int_fast32_t table_size, + const void *symb_table, const unsigned char *extra) { size_t name_len = strlen ((const char *) name); if (__glibc_unlikely (name_len != 1)) @@ -2831,271 +2751,280 @@ build_collating_symbol (bitset_t sbcset, const unsigned char *name) } #endif /* not _LIBC */ -/* This function parse bracket expression like "[abc]", "[a-c]", - "[[.a-a.]]" etc. */ - -static bin_tree_t * -parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, - reg_syntax_t syntax, reg_errcode_t *err) -{ #ifdef _LIBC - const unsigned char *collseqmb; - const char *collseqwc; - uint32_t nrules; - int32_t table_size; - const int32_t *symb_table; - const unsigned char *extra; - - /* Local function for parse_bracket_exp used in _LIBC environment. - Seek the collating symbol entry corresponding to NAME. - Return the index of the symbol in the SYMB_TABLE, - or -1 if not found. */ - - auto inline int32_t - __attribute__ ((always_inline)) - seek_collating_symbol_entry (const unsigned char *name, size_t name_len) - { - int32_t elem; - - for (elem = 0; elem < table_size; elem++) - if (symb_table[2 * elem] != 0) - { - int32_t idx = symb_table[2 * elem + 1]; - /* Skip the name of collating element name. */ - idx += 1 + extra[idx]; - if (/* Compare the length of the name. */ - name_len == extra[idx] - /* Compare the name. */ - && memcmp (name, &extra[idx + 1], name_len) == 0) - /* Yep, this is the entry. */ - return elem; - } - return -1; - } +/* Local function for parse_bracket_exp used in _LIBC environment. + Seek the collating symbol entry corresponding to NAME. + Return the index of the symbol in the SYMB_TABLE, + or -1 if not found. */ + +static __always_inline int32_t +seek_collating_symbol_entry (const unsigned char *name, size_t name_len, + const int32_t *symb_table, + int_fast32_t table_size, + const unsigned char *extra) +{ + int_fast32_t elem; - /* Local function for parse_bracket_exp used in _LIBC environment. - Look up the collation sequence value of BR_ELEM. - Return the value if succeeded, UINT_MAX otherwise. */ + for (elem = 0; elem < table_size; elem++) + if (symb_table[2 * elem] != 0) + { + int32_t idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + if (/* Compare the length of the name. */ + name_len == extra[idx] + /* Compare the name. */ + && memcmp (name, &extra[idx + 1], name_len) == 0) + /* Yep, this is the entry. */ + return elem; + } + return -1; +} - auto inline unsigned int - __attribute__ ((always_inline)) - lookup_collation_sequence_value (bracket_elem_t *br_elem) +/* Local function for parse_bracket_exp used in _LIBC environment. + Look up the collation sequence value of BR_ELEM. + Return the value if succeeded, UINT_MAX otherwise. */ + +static __always_inline unsigned int +lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules, + const unsigned char *collseqmb, + const char *collseqwc, + int_fast32_t table_size, + const int32_t *symb_table, + const unsigned char *extra) +{ + if (br_elem->type == SB_CHAR) { - if (br_elem->type == SB_CHAR) - { - /* - if (MB_CUR_MAX == 1) - */ - if (nrules == 0) - return collseqmb[br_elem->opr.ch]; - else - { - wint_t wc = __btowc (br_elem->opr.ch); - return __collseq_table_lookup (collseqwc, wc); - } - } - else if (br_elem->type == MB_CHAR) + /* if (MB_CUR_MAX == 1) */ + if (nrules == 0) + return collseqmb[br_elem->opr.ch]; + else { - if (nrules != 0) - return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + wint_t wc = __btowc (br_elem->opr.ch); + return __collseq_table_lookup (collseqwc, wc); } - else if (br_elem->type == COLL_SYM) + } + else if (br_elem->type == MB_CHAR) + { + if (nrules != 0) + return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + } + else if (br_elem->type == COLL_SYM) + { + size_t sym_name_len = strlen ((char *) br_elem->opr.name); + if (nrules != 0) { - size_t sym_name_len = strlen ((char *) br_elem->opr.name); - if (nrules != 0) + int32_t elem, idx; + elem = seek_collating_symbol_entry (br_elem->opr.name, + sym_name_len, + symb_table, table_size, + extra); + if (elem != -1) { - int32_t elem, idx; - elem = seek_collating_symbol_entry (br_elem->opr.name, - sym_name_len); - if (elem != -1) - { - /* We found the entry. */ - idx = symb_table[2 * elem + 1]; - /* Skip the name of collating element name. */ - idx += 1 + extra[idx]; - /* Skip the byte sequence of the collating element. */ - idx += 1 + extra[idx]; - /* Adjust for the alignment. */ - idx = (idx + 3) & ~3; - /* Skip the multibyte collation sequence value. */ - idx += sizeof (unsigned int); - /* Skip the wide char sequence of the collating element. */ - idx += sizeof (unsigned int) * - (1 + *(unsigned int *) (extra + idx)); - /* Return the collation sequence value. */ - return *(unsigned int *) (extra + idx); - } - else if (sym_name_len == 1) - { - /* No valid character. Match it as a single byte - character. */ - return collseqmb[br_elem->opr.name[0]]; - } + /* We found the entry. */ + idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + /* Skip the byte sequence of the collating element. */ + idx += 1 + extra[idx]; + /* Adjust for the alignment. */ + idx = (idx + 3) & ~3; + /* Skip the multibyte collation sequence value. */ + idx += sizeof (unsigned int); + /* Skip the wide char sequence of the collating element. */ + idx += sizeof (unsigned int) * + (1 + *(unsigned int *) (extra + idx)); + /* Return the collation sequence value. */ + return *(unsigned int *) (extra + idx); } else if (sym_name_len == 1) - return collseqmb[br_elem->opr.name[0]]; + { + /* No valid character. Match it as a single byte + character. */ + return collseqmb[br_elem->opr.name[0]]; + } } - return UINT_MAX; + else if (sym_name_len == 1) + return collseqmb[br_elem->opr.name[0]]; } + return UINT_MAX; +} - /* Local function for parse_bracket_exp used in _LIBC environment. - Build the range expression which starts from START_ELEM, and ends - at END_ELEM. The result are written to MBCSET and SBCSET. - RANGE_ALLOC is the allocated size of mbcset->range_starts, and - mbcset->range_ends, is a pointer argument since we may - update it. */ +/* Local function for parse_bracket_exp used in _LIBC environment. + Build the range expression which starts from START_ELEM, and ends + at END_ELEM. The result are written to MBCSET and SBCSET. + RANGE_ALLOC is the allocated size of mbcset->range_starts, and + mbcset->range_ends, is a pointer argument since we may + update it. */ + +static __always_inline reg_errcode_t +build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc, + bracket_elem_t *start_elem, bracket_elem_t *end_elem, + re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules, + const unsigned char *collseqmb, const char *collseqwc, + int_fast32_t table_size, const int32_t *symb_table, + const unsigned char *extra) +{ + unsigned int ch; + uint32_t start_collseq; + uint32_t end_collseq; - auto inline reg_errcode_t - __attribute__ ((always_inline)) - build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, - bracket_elem_t *start_elem, bracket_elem_t *end_elem) - { - unsigned int ch; - uint32_t start_collseq; - uint32_t end_collseq; - - /* Equivalence Classes and Character Classes can't be a range - start/end. */ - if (__glibc_unlikely (start_elem->type == EQUIV_CLASS - || start_elem->type == CHAR_CLASS - || end_elem->type == EQUIV_CLASS - || end_elem->type == CHAR_CLASS)) - return REG_ERANGE; + /* Equivalence Classes and Character Classes can't be a range + start/end. */ + if (__glibc_unlikely (start_elem->type == EQUIV_CLASS + || start_elem->type == CHAR_CLASS + || end_elem->type == EQUIV_CLASS + || end_elem->type == CHAR_CLASS)) + return REG_ERANGE; - /* FIXME: Implement rational ranges here, too. */ - start_collseq = lookup_collation_sequence_value (start_elem); - end_collseq = lookup_collation_sequence_value (end_elem); - /* Check start/end collation sequence values. */ - if (__glibc_unlikely (start_collseq == UINT_MAX - || end_collseq == UINT_MAX)) - return REG_ECOLLATE; - if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) - && start_collseq > end_collseq)) - return REG_ERANGE; + /* FIXME: Implement rational ranges here, too. */ + start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc, + table_size, symb_table, extra); + end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc, + table_size, symb_table, extra); + /* Check start/end collation sequence values. */ + if (__glibc_unlikely (start_collseq == UINT_MAX + || end_collseq == UINT_MAX)) + return REG_ECOLLATE; + if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) + && start_collseq > end_collseq)) + return REG_ERANGE; - /* Got valid collation sequence values, add them as a new entry. - However, if we have no collation elements, and the character set - is single byte, the single byte character set that we - build below suffices. */ - if (nrules > 0 || dfa->mb_cur_max > 1) + /* Got valid collation sequence values, add them as a new entry. + However, if we have no collation elements, and the character set + is single byte, the single byte character set that we + build below suffices. */ + if (nrules > 0 || dfa->mb_cur_max > 1) + { + /* Check the space of the arrays. */ + if (__glibc_unlikely (*range_alloc == mbcset->nranges)) { - /* Check the space of the arrays. */ - if (__glibc_unlikely (*range_alloc == mbcset->nranges)) - { - /* There is not enough space, need realloc. */ - uint32_t *new_array_start; - uint32_t *new_array_end; - Idx new_nranges; - - /* +1 in case of mbcset->nranges is 0. */ - new_nranges = 2 * mbcset->nranges + 1; - new_array_start = re_realloc (mbcset->range_starts, uint32_t, - new_nranges); - new_array_end = re_realloc (mbcset->range_ends, uint32_t, - new_nranges); - - if (__glibc_unlikely (new_array_start == NULL - || new_array_end == NULL)) - return REG_ESPACE; + /* There is not enough space, need realloc. */ + uint32_t *new_array_start; + uint32_t *new_array_end; + int new_nranges; - mbcset->range_starts = new_array_start; - mbcset->range_ends = new_array_end; - *range_alloc = new_nranges; - } + /* +1 in case of mbcset->nranges is 0. */ + new_nranges = 2 * mbcset->nranges + 1; + new_array_start = re_realloc (mbcset->range_starts, uint32_t, + new_nranges); + new_array_end = re_realloc (mbcset->range_ends, uint32_t, + new_nranges); - mbcset->range_starts[mbcset->nranges] = start_collseq; - mbcset->range_ends[mbcset->nranges++] = end_collseq; - } + if (__glibc_unlikely (new_array_start == NULL + || new_array_end == NULL)) + return REG_ESPACE; - /* Build the table for single byte characters. */ - for (ch = 0; ch < SBC_MAX; ch++) - { - uint32_t ch_collseq; - /* - if (MB_CUR_MAX == 1) - */ - if (nrules == 0) - ch_collseq = collseqmb[ch]; - else - ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); - if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) - bitset_set (sbcset, ch); + mbcset->range_starts = new_array_start; + mbcset->range_ends = new_array_end; + *range_alloc = new_nranges; } - return REG_NOERROR; + + mbcset->range_starts[mbcset->nranges] = start_collseq; + mbcset->range_ends[mbcset->nranges++] = end_collseq; } - /* Local function for parse_bracket_exp used in _LIBC environment. - Build the collating element which is represented by NAME. - The result are written to MBCSET and SBCSET. - COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a - pointer argument since we may update it. */ + /* Build the table for single byte characters. */ + for (ch = 0; ch < SBC_MAX; ch++) + { + uint32_t ch_collseq; + /* if (MB_CUR_MAX == 1) */ + if (nrules == 0) + ch_collseq = collseqmb[ch]; + else + ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); + if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) + bitset_set (sbcset, ch); + } + return REG_NOERROR; +} - auto inline reg_errcode_t - __attribute__ ((always_inline)) - build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, - Idx *coll_sym_alloc, const unsigned char *name) +/* Local function for parse_bracket_exp used in _LIBC environment. + Build the collating element which is represented by NAME. + The result are written to MBCSET and SBCSET. + COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a + pointer argument since we may update it. */ + +static __always_inline reg_errcode_t +build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, + Idx *coll_sym_alloc, const unsigned char *name, + uint_fast32_t nrules, int_fast32_t table_size, + const int32_t *symb_table, const unsigned char *extra) +{ + int32_t elem, idx; + size_t name_len = strlen ((const char *) name); + if (nrules != 0) { - int32_t elem, idx; - size_t name_len = strlen ((const char *) name); - if (nrules != 0) + elem = seek_collating_symbol_entry (name, name_len, symb_table, + table_size, extra); + if (elem != -1) { - elem = seek_collating_symbol_entry (name, name_len); - if (elem != -1) - { - /* We found the entry. */ - idx = symb_table[2 * elem + 1]; - /* Skip the name of collating element name. */ - idx += 1 + extra[idx]; - } - else if (name_len == 1) - { - /* No valid character, treat it as a normal - character. */ - bitset_set (sbcset, name[0]); - return REG_NOERROR; - } - else - return REG_ECOLLATE; - - /* Got valid collation sequence, add it as a new entry. */ - /* Check the space of the arrays. */ - if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) - { - /* Not enough, realloc it. */ - /* +1 in case of mbcset->ncoll_syms is 0. */ - Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; - /* Use realloc since mbcset->coll_syms is NULL - if *alloc == 0. */ - int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, - new_coll_sym_alloc); - if (__glibc_unlikely (new_coll_syms == NULL)) - return REG_ESPACE; - mbcset->coll_syms = new_coll_syms; - *coll_sym_alloc = new_coll_sym_alloc; - } - mbcset->coll_syms[mbcset->ncoll_syms++] = idx; + /* We found the entry. */ + idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + } + else if (name_len == 1) + { + /* No valid character, treat it as a normal + character. */ + bitset_set (sbcset, name[0]); return REG_NOERROR; } else + return REG_ECOLLATE; + + /* Got valid collation sequence, add it as a new entry. */ + /* Check the space of the arrays. */ + if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) { - if (__glibc_unlikely (name_len != 1)) - return REG_ECOLLATE; - else - { - bitset_set (sbcset, name[0]); - return REG_NOERROR; - } + /* Not enough, realloc it. */ + /* +1 in case of mbcset->ncoll_syms is 0. */ + int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; + /* Use realloc since mbcset->coll_syms is NULL + if *alloc == 0. */ + int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, + new_coll_sym_alloc); + if (__glibc_unlikely (new_coll_syms == NULL)) + return REG_ESPACE; + mbcset->coll_syms = new_coll_syms; + *coll_sym_alloc = new_coll_sym_alloc; } + mbcset->coll_syms[mbcset->ncoll_syms++] = idx; + return REG_NOERROR; } -#endif + else + { + if (__glibc_unlikely (name_len != 1)) + return REG_ECOLLATE; + else + { + bitset_set (sbcset, name[0]); + return REG_NOERROR; + } + } +} +#endif /* _LIBC */ + +/* This function parse bracket expression like "[abc]", "[a-c]", + "[[.a-a.]]" etc. */ + +static bin_tree_t * +parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, + reg_syntax_t syntax, reg_errcode_t *err) +{ + const unsigned char *collseqmb = NULL; + const char *collseqwc = NULL; + uint_fast32_t nrules = 0; + int_fast32_t table_size = 0; + const void *symb_table = NULL; + const unsigned char *extra = NULL; re_token_t br_token; re_bitset_ptr_t sbcset; -#ifdef RE_ENABLE_I18N re_charset_t *mbcset; Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; Idx equiv_class_alloc = 0, char_class_alloc = 0; -#endif /* not RE_ENABLE_I18N */ bool non_match = false; bin_tree_t *work_tree; int token_len; @@ -3111,26 +3040,17 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, */ collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); - symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, - _NL_COLLATE_SYMB_TABLEMB); + symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB); extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); } #endif sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); -#ifdef RE_ENABLE_I18N mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); -#endif /* RE_ENABLE_I18N */ -#ifdef RE_ENABLE_I18N if (__glibc_unlikely (sbcset == NULL || mbcset == NULL)) -#else - if (__glibc_unlikely (sbcset == NULL)) -#endif /* RE_ENABLE_I18N */ { re_free (sbcset); -#ifdef RE_ENABLE_I18N re_free (mbcset); -#endif *err = REG_ESPACE; return NULL; } @@ -3143,9 +3063,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, } if (token->type == OP_NON_MATCH_LIST) { -#ifdef RE_ENABLE_I18N mbcset->non_match = 1; -#endif /* not RE_ENABLE_I18N */ non_match = true; if (syntax & RE_HAT_LISTS_NOT_NEWLINE) bitset_set (sbcset, '\n'); @@ -3228,18 +3146,10 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, token_len = peek_token_bracket (token, regexp, syntax); -#ifdef _LIBC *err = build_range_exp (sbcset, mbcset, &range_alloc, - &start_elem, &end_elem); -#else -# ifdef RE_ENABLE_I18N - *err = build_range_exp (syntax, sbcset, - dfa->mb_cur_max > 1 ? mbcset : NULL, - &range_alloc, &start_elem, &end_elem); -# else - *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); -# endif -#endif /* RE_ENABLE_I18N */ + &start_elem, &end_elem, + dfa, syntax, nrules, collseqmb, collseqwc, + table_size, symb_table, extra); if (__glibc_unlikely (*err != REG_NOERROR)) goto parse_bracket_exp_free_return; } @@ -3250,7 +3160,6 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, case SB_CHAR: bitset_set (sbcset, start_elem.opr.ch); break; -#ifdef RE_ENABLE_I18N case MB_CHAR: /* Check whether the array has enough space. */ if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars)) @@ -3268,30 +3177,24 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, } mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; break; -#endif /* RE_ENABLE_I18N */ case EQUIV_CLASS: *err = build_equiv_class (sbcset, -#ifdef RE_ENABLE_I18N mbcset, &equiv_class_alloc, -#endif /* RE_ENABLE_I18N */ start_elem.opr.name); if (__glibc_unlikely (*err != REG_NOERROR)) goto parse_bracket_exp_free_return; break; case COLL_SYM: *err = build_collating_symbol (sbcset, -#ifdef RE_ENABLE_I18N mbcset, &coll_sym_alloc, -#endif /* RE_ENABLE_I18N */ - start_elem.opr.name); + start_elem.opr.name, + nrules, table_size, symb_table, extra); if (__glibc_unlikely (*err != REG_NOERROR)) goto parse_bracket_exp_free_return; break; case CHAR_CLASS: *err = build_charclass (regexp->trans, sbcset, -#ifdef RE_ENABLE_I18N mbcset, &char_class_alloc, -#endif /* RE_ENABLE_I18N */ (const char *) start_elem.opr.name, syntax); if (__glibc_unlikely (*err != REG_NOERROR)) @@ -3317,7 +3220,6 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, if (non_match) bitset_not (sbcset); -#ifdef RE_ENABLE_I18N /* Ensure only single byte characters are set. */ if (dfa->mb_cur_max > 1) bitset_mask (sbcset, dfa->sb_char); @@ -3361,11 +3263,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, } } else -#endif /* not RE_ENABLE_I18N */ { -#ifdef RE_ENABLE_I18N free_charset (mbcset); -#endif /* Build a tree for simple bracket. */ br_token.type = SIMPLE_BRACKET; br_token.opr.sbcset = sbcset; @@ -3379,9 +3278,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, *err = REG_ESPACE; parse_bracket_exp_free_return: re_free (sbcset); -#ifdef RE_ENABLE_I18N free_charset (mbcset); -#endif /* RE_ENABLE_I18N */ return NULL; } @@ -3392,7 +3289,6 @@ parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, re_token_t *token, int token_len, re_dfa_t *dfa, reg_syntax_t syntax, bool accept_hyphen) { -#ifdef RE_ENABLE_I18N int cur_char_size; cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); if (cur_char_size > 1) @@ -3402,7 +3298,6 @@ parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, re_string_skip_bytes (regexp, cur_char_size); return REG_NOERROR; } -#endif /* RE_ENABLE_I18N */ re_string_skip_bytes (regexp, token_len); /* Skip a token. */ if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS || token->type == OP_OPEN_EQUIV_CLASS) @@ -3475,12 +3370,8 @@ parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, is a pointer argument since we may update it. */ static reg_errcode_t -#ifdef RE_ENABLE_I18N build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, Idx *equiv_class_alloc, const unsigned char *name) -#else /* not RE_ENABLE_I18N */ -build_equiv_class (bitset_t sbcset, const unsigned char *name) -#endif /* not RE_ENABLE_I18N */ { #ifdef _LIBC uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); @@ -3560,14 +3451,9 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) is a pointer argument since we may update it. */ static reg_errcode_t -#ifdef RE_ENABLE_I18N build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, re_charset_t *mbcset, Idx *char_class_alloc, const char *class_name, reg_syntax_t syntax) -#else /* not RE_ENABLE_I18N */ -build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, - const char *class_name, reg_syntax_t syntax) -#endif /* not RE_ENABLE_I18N */ { int i; const char *name = class_name; @@ -3578,7 +3464,6 @@ build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) name = "alpha"; -#ifdef RE_ENABLE_I18N /* Check the space of the arrays. */ if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes)) { @@ -3594,7 +3479,6 @@ build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, *char_class_alloc = new_char_class_alloc; } mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); -#endif /* RE_ENABLE_I18N */ #define BUILD_CHARCLASS_LOOP(ctype_func) \ do { \ @@ -3649,10 +3533,8 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, reg_errcode_t *err) { re_bitset_ptr_t sbcset; -#ifdef RE_ENABLE_I18N re_charset_t *mbcset; Idx alloc = 0; -#endif /* not RE_ENABLE_I18N */ reg_errcode_t ret; bin_tree_t *tree; @@ -3662,7 +3544,6 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, *err = REG_ESPACE; return NULL; } -#ifdef RE_ENABLE_I18N mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); if (__glibc_unlikely (mbcset == NULL)) { @@ -3671,21 +3552,14 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, return NULL; } mbcset->non_match = non_match; -#endif /* RE_ENABLE_I18N */ /* We don't care the syntax in this case. */ - ret = build_charclass (trans, sbcset, -#ifdef RE_ENABLE_I18N - mbcset, &alloc, -#endif /* RE_ENABLE_I18N */ - class_name, 0); + ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0); if (__glibc_unlikely (ret != REG_NOERROR)) { re_free (sbcset); -#ifdef RE_ENABLE_I18N free_charset (mbcset); -#endif /* RE_ENABLE_I18N */ *err = ret; return NULL; } @@ -3697,11 +3571,9 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, if (non_match) bitset_not (sbcset); -#ifdef RE_ENABLE_I18N /* Ensure only single byte characters are set. */ if (dfa->mb_cur_max > 1) bitset_mask (sbcset, dfa->sb_char); -#endif /* Build a tree for simple bracket. */ re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset }; @@ -3709,7 +3581,6 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, if (__glibc_unlikely (tree == NULL)) goto build_word_op_espace; -#ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) { bin_tree_t *mbc_tree; @@ -3730,15 +3601,10 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, free_charset (mbcset); return tree; } -#else /* not RE_ENABLE_I18N */ - return tree; -#endif /* not RE_ENABLE_I18N */ build_word_op_espace: re_free (sbcset); -#ifdef RE_ENABLE_I18N free_charset (mbcset); -#endif /* RE_ENABLE_I18N */ *err = REG_ESPACE; return NULL; } @@ -3771,21 +3637,19 @@ fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) return num; } -#ifdef RE_ENABLE_I18N static void free_charset (re_charset_t *cset) { re_free (cset->mbchars); -# ifdef _LIBC +#ifdef _LIBC re_free (cset->coll_syms); re_free (cset->equiv_classes); -# endif +#endif re_free (cset->range_starts); re_free (cset->range_ends); re_free (cset->char_classes); re_free (cset); } -#endif /* RE_ENABLE_I18N */ /* Functions for binary tree operation. */ @@ -3851,13 +3715,10 @@ mark_opt_subexp (void *extra, bin_tree_t *node) static void free_token (re_token_t *node) { -#ifdef RE_ENABLE_I18N if (node->type == COMPLEX_BRACKET && node->duplicated == 0) free_charset (node->opr.mbcset); - else -#endif /* RE_ENABLE_I18N */ - if (node->type == SIMPLE_BRACKET && node->duplicated == 0) - re_free (node->opr.sbcset); + else if (node->type == SIMPLE_BRACKET && node->duplicated == 0) + re_free (node->opr.sbcset); } /* Worker function for tree walking. Free the allocated memory inside NODE diff --git a/support/regex_internal.c b/support/regex_internal.c index aefcfa2f..9767cd0d 100644 --- a/support/regex_internal.c +++ b/support/regex_internal.c @@ -30,10 +30,8 @@ static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, re_hashval_t hash); static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len); -#ifdef RE_ENABLE_I18N static void build_wcs_buffer (re_string_t *pstr); static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr); -#endif /* RE_ENABLE_I18N */ static void build_upper_buffer (re_string_t *pstr); static void re_string_translate_buffer (re_string_t *pstr); static unsigned int re_string_context_at (const re_string_t *input, Idx idx, @@ -91,7 +89,6 @@ re_string_construct (re_string_t *pstr, const char *str, Idx len, if (icase) { -#ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) { while (1) @@ -109,16 +106,13 @@ re_string_construct (re_string_t *pstr, const char *str, Idx len, } } else -#endif /* RE_ENABLE_I18N */ build_upper_buffer (pstr); } else { -#ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) build_wcs_buffer (pstr); else -#endif /* RE_ENABLE_I18N */ { if (trans != NULL) re_string_translate_buffer (pstr); @@ -139,7 +133,6 @@ static reg_errcode_t __attribute_warn_unused_result__ re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) { -#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) { wint_t *new_wcs; @@ -162,7 +155,6 @@ re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) pstr->offsets = new_offsets; } } -#endif /* RE_ENABLE_I18N */ if (pstr->mbs_allocated) { unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, @@ -194,7 +186,6 @@ re_string_construct_common (const char *str, Idx len, re_string_t *pstr, pstr->raw_stop = pstr->stop; } -#ifdef RE_ENABLE_I18N /* Build wide character buffer PSTR->WCS. If the byte sequence of the string are: @@ -530,7 +521,6 @@ re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) *last_wc = wc; return rawbuf_idx; } -#endif /* RE_ENABLE_I18N */ /* Build the buffer PSTR->MBS, and apply the translation if we need. This function is used in case of REG_ICASE. */ @@ -585,10 +575,8 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) else { /* Reset buffer. */ -#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); -#endif /* RE_ENABLE_I18N */ pstr->len = pstr->raw_len; pstr->stop = pstr->raw_stop; pstr->valid_len = 0; @@ -608,7 +596,6 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) if (__glibc_likely (offset < pstr->valid_raw_len)) { /* Yes, move them to the front of the buffer. */ -#ifdef RE_ENABLE_I18N if (__glibc_unlikely (pstr->offsets_needed)) { Idx low = 0, high = pstr->valid_len, mid; @@ -672,15 +659,12 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) } } else -#endif { pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags); -#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) memmove (pstr->wcs, pstr->wcs + offset, (pstr->valid_len - offset) * sizeof (wint_t)); -#endif /* RE_ENABLE_I18N */ if (__glibc_unlikely (pstr->mbs_allocated)) memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); @@ -691,7 +675,6 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) } else { -#ifdef RE_ENABLE_I18N /* No, skip all characters until IDX. */ Idx prev_valid_len = pstr->valid_len; @@ -701,9 +684,7 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) pstr->stop = pstr->raw_stop - idx + offset; pstr->offsets_needed = 0; } -#endif pstr->valid_len = 0; -#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) { Idx wcs_idx; @@ -787,7 +768,6 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) pstr->valid_raw_len = pstr->valid_len; } else -#endif /* RE_ENABLE_I18N */ { int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; pstr->valid_raw_len = 0; @@ -807,7 +787,6 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) pstr->stop -= offset; /* Then build the buffers. */ -#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) { if (pstr->icase) @@ -820,7 +799,6 @@ re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) build_wcs_buffer (pstr); } else -#endif /* RE_ENABLE_I18N */ if (__glibc_unlikely (pstr->mbs_allocated)) { if (pstr->icase) @@ -846,28 +824,22 @@ re_string_peek_byte_case (const re_string_t *pstr, Idx idx) if (__glibc_likely (!pstr->mbs_allocated)) return re_string_peek_byte (pstr, idx); -#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) return re_string_peek_byte (pstr, idx); -#endif off = pstr->cur_idx + idx; -#ifdef RE_ENABLE_I18N if (pstr->offsets_needed) off = pstr->offsets[off]; -#endif ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; -#ifdef RE_ENABLE_I18N /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I this function returns CAPITAL LETTER I instead of first byte of DOTLESS SMALL LETTER I. The latter would confuse the parser, since peek_byte_case doesn't advance cur_idx in any way. */ if (pstr->offsets_needed && !isascii (ch)) return re_string_peek_byte (pstr, idx); -#endif return ch; } @@ -878,7 +850,6 @@ re_string_fetch_byte_case (re_string_t *pstr) if (__glibc_likely (!pstr->mbs_allocated)) return re_string_fetch_byte (pstr); -#ifdef RE_ENABLE_I18N if (pstr->offsets_needed) { Idx off; @@ -904,7 +875,6 @@ re_string_fetch_byte_case (re_string_t *pstr) re_string_char_size_at (pstr, pstr->cur_idx)); return ch; } -#endif return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; } @@ -912,10 +882,8 @@ re_string_fetch_byte_case (re_string_t *pstr) static void re_string_destruct (re_string_t *pstr) { -#ifdef RE_ENABLE_I18N re_free (pstr->wcs); re_free (pstr->offsets); -#endif /* RE_ENABLE_I18N */ if (pstr->mbs_allocated) re_free (pstr->mbs); } @@ -933,7 +901,6 @@ re_string_context_at (const re_string_t *input, Idx idx, int eflags) if (__glibc_unlikely (idx == input->len)) return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF : CONTEXT_NEWLINE | CONTEXT_ENDBUF); -#ifdef RE_ENABLE_I18N if (input->mb_cur_max > 1) { wint_t wc; @@ -953,7 +920,6 @@ re_string_context_at (const re_string_t *input, Idx idx, int eflags) ? CONTEXT_NEWLINE : 0); } else -#endif { c = re_string_byte_at (input, idx); if (bitset_contain (input->word_char, c)) @@ -1451,11 +1417,9 @@ re_dfa_add_node (re_dfa_t *dfa, re_token_t token) } dfa->nodes[dfa->nodes_len] = token; dfa->nodes[dfa->nodes_len].constraint = 0; -#ifdef RE_ENABLE_I18N dfa->nodes[dfa->nodes_len].accept_mb = ((token.type == OP_PERIOD && dfa->mb_cur_max > 1) || token.type == COMPLEX_BRACKET); -#endif dfa->nexts[dfa->nodes_len] = -1; re_node_set_init_empty (dfa->edests + dfa->nodes_len); re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); @@ -1651,9 +1615,7 @@ create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, re_token_type_t type = node->type; if (type == CHARACTER && !node->constraint) continue; -#ifdef RE_ENABLE_I18N newstate->accept_mb |= node->accept_mb; -#endif /* RE_ENABLE_I18N */ /* If the state has the halt node, the state is a halt state. */ if (type == END_OF_RE) @@ -1705,9 +1667,7 @@ create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, if (type == CHARACTER && !constraint) continue; -#ifdef RE_ENABLE_I18N newstate->accept_mb |= node->accept_mb; -#endif /* RE_ENABLE_I18N */ /* If the state has the halt node, the state is a halt state. */ if (type == END_OF_RE) diff --git a/support/regex_internal.h b/support/regex_internal.h index 1245e782..8493db27 100644 --- a/support/regex_internal.h +++ b/support/regex_internal.h @@ -116,10 +116,6 @@ # define gettext_noop(String) String #endif -#if (defined MB_CUR_MAX && HAVE_WCTYPE_H && HAVE_ISWCTYPE) || _LIBC -# define RE_ENABLE_I18N -#endif - /* Number of ASCII characters. */ #define ASCII_CHARS 0x80 @@ -150,6 +146,11 @@ # define __regfree regfree #endif /* not _LIBC */ +/* Types related to integers. Unless protected by #ifdef _LIBC, the + regex code should avoid exact-width types like int32_t and uint64_t + as some non-GCC platforms lack them, an issue when this code is + used in Gnulib. */ + #ifndef SSIZE_MAX # define SSIZE_MAX ((ssize_t) (SIZE_MAX / 2)) #endif @@ -246,10 +247,8 @@ typedef enum SIMPLE_BRACKET = 3, OP_BACK_REF = 4, OP_PERIOD = 5, -#ifdef RE_ENABLE_I18N COMPLEX_BRACKET = 6, OP_UTF8_PERIOD = 7, -#endif /* RE_ENABLE_I18N */ /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used when the debugger shows values of this enum type. */ @@ -287,30 +286,29 @@ typedef enum } re_token_type_t; -#ifdef RE_ENABLE_I18N typedef struct { /* Multibyte characters. */ wchar_t *mbchars; +#ifdef _LIBC /* Collating symbols. */ -# ifdef _LIBC int32_t *coll_syms; -# endif +#endif +#ifdef _LIBC /* Equivalence classes. */ -# ifdef _LIBC int32_t *equiv_classes; -# endif +#endif /* Range expressions. */ -# ifdef _LIBC +#ifdef _LIBC uint32_t *range_starts; uint32_t *range_ends; -# else /* not _LIBC */ +#else wchar_t *range_starts; wchar_t *range_ends; -# endif /* not _LIBC */ +#endif /* Character classes. */ wctype_t *char_classes; @@ -333,7 +331,6 @@ typedef struct /* # of character classes. */ Idx nchar_classes; } re_charset_t; -#endif /* RE_ENABLE_I18N */ typedef struct { @@ -341,9 +338,7 @@ typedef struct { unsigned char c; /* for CHARACTER */ re_bitset_ptr_t sbcset; /* for SIMPLE_BRACKET */ -#ifdef RE_ENABLE_I18N re_charset_t *mbcset; /* for COMPLEX_BRACKET */ -#endif /* RE_ENABLE_I18N */ Idx idx; /* for BACK_REF */ re_context_type ctx_type; /* for ANCHOR */ } opr; @@ -355,12 +350,10 @@ typedef struct unsigned int constraint : 10; /* context constraint */ unsigned int duplicated : 1; unsigned int opt_subexp : 1; -#ifdef RE_ENABLE_I18N unsigned int accept_mb : 1; /* These 2 bits can be moved into the union if needed (e.g. if running out of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */ unsigned int mb_partial : 1; -#endif unsigned int word_char : 1; } re_token_t; @@ -375,12 +368,10 @@ struct re_string_t REG_ICASE, upper cases of the string are stored, otherwise MBS points the same address that RAW_MBS points. */ unsigned char *mbs; -#ifdef RE_ENABLE_I18N /* Store the wide character string which is corresponding to MBS. */ wint_t *wcs; Idx *offsets; mbstate_t cur_state; -#endif /* Index in RAW_MBS. Each character mbs[i] corresponds to raw_mbs[raw_mbs_idx + i]. */ Idx raw_mbs_idx; @@ -779,7 +770,6 @@ bitset_mask (bitset_t dest, const bitset_t src) dest[bitset_i] &= src[bitset_i]; } -#ifdef RE_ENABLE_I18N /* Functions for re_string. */ static int __attribute__ ((pure, unused)) @@ -803,15 +793,15 @@ re_string_wchar_at (const re_string_t *pstr, Idx idx) return (wint_t) pstr->wcs[idx]; } -# ifdef _LIBC -# include <locale/weight.h> -# endif +#ifdef _LIBC +# include <locale/weight.h> +#endif static int __attribute__ ((pure, unused)) re_string_elem_size_at (const re_string_t *pstr, Idx idx) { -# ifdef _LIBC +#ifdef _LIBC const unsigned char *p, *extra; const int32_t *table, *indirect; uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); @@ -827,11 +817,10 @@ re_string_elem_size_at (const re_string_t *pstr, Idx idx) findidx (table, indirect, extra, &p, pstr->len - idx); return p - pstr->mbs - idx; } - else -# endif /* _LIBC */ - return 1; +#endif /* _LIBC */ + + return 1; } -#endif /* RE_ENABLE_I18N */ #ifdef _LIBC # if __GNUC__ >= 7 diff --git a/support/regexec.c b/support/regexec.c index 6aeba3c0..31967083 100644 --- a/support/regexec.c +++ b/support/regexec.c @@ -67,11 +67,9 @@ static reg_errcode_t set_regs (const regex_t *preg, bool fl_backtrack); static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs); -#ifdef RE_ENABLE_I18N static int sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx, Idx node_idx, Idx str_idx, Idx max_str_idx); -#endif /* RE_ENABLE_I18N */ static reg_errcode_t sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx); static reg_errcode_t build_sifted_states (const re_match_context_t *mctx, @@ -123,10 +121,8 @@ static re_dfastate_t *transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx, re_dfastate_t *pstate); #endif -#ifdef RE_ENABLE_I18N static reg_errcode_t transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate); -#endif /* RE_ENABLE_I18N */ static reg_errcode_t transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes); static reg_errcode_t get_subexp (re_match_context_t *mctx, @@ -156,14 +152,12 @@ static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes, Idx cur_str, Idx subexp_num, int type); static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state); -#ifdef RE_ENABLE_I18N static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, const re_string_t *input, Idx idx); -# ifdef _LIBC +#ifdef _LIBC static unsigned int find_collation_sequence_value (const unsigned char *mbs, size_t name_len); -# endif /* _LIBC */ -#endif /* RE_ENABLE_I18N */ +#endif static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, re_node_set *states_node, @@ -779,12 +773,10 @@ re_search_internal (const regex_t *preg, const char *string, Idx length, if (__glibc_unlikely (err != REG_NOERROR)) goto free_return; -#ifdef RE_ENABLE_I18N - /* Don't consider this char as a possible match start if it part, - yet isn't the head, of a multibyte character. */ + /* Don't consider this char as a possible match start if it part, + yet isn't the head, of a multibyte character. */ if (!sb && !re_string_first_byte (&mctx.input, 0)) continue; -#endif /* It seems to be appropriate one, then use the matcher. */ /* We assume that the matching starts from 0. */ @@ -858,7 +850,6 @@ re_search_internal (const regex_t *preg, const char *string, Idx length, for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) if (pmatch[reg_idx].rm_so != -1) { -#ifdef RE_ENABLE_I18N if (__glibc_unlikely (mctx.input.offsets_needed != 0)) { pmatch[reg_idx].rm_so = @@ -870,9 +861,6 @@ re_search_internal (const regex_t *preg, const char *string, Idx length, ? mctx.input.valid_raw_len : mctx.input.offsets[pmatch[reg_idx].rm_eo]); } -#else - DEBUG_ASSERT (mctx.input.offsets_needed == 0); -#endif pmatch[reg_idx].rm_so += match_first; pmatch[reg_idx].rm_eo += match_first; } @@ -996,8 +984,7 @@ prune_impossible_nodes (re_match_context_t *mctx) We must select appropriate initial state depending on the context, since initial states may have constraints like "\<", "^", etc.. */ -static inline re_dfastate_t * -__attribute__ ((always_inline)) +static __always_inline re_dfastate_t * acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx, Idx idx) { @@ -1261,12 +1248,9 @@ proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs, Idx naccepted = 0; re_token_type_t type = dfa->nodes[node].type; -#ifdef RE_ENABLE_I18N if (dfa->nodes[node].accept_mb) naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx); - else -#endif /* RE_ENABLE_I18N */ - if (type == OP_BACK_REF) + else if (type == OP_BACK_REF) { Idx subexp_idx = dfa->nodes[node].opr.idx + 1; if (subexp_idx < nregs) @@ -1634,12 +1618,10 @@ build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx, bool ok; DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type)); -#ifdef RE_ENABLE_I18N /* If the node may accept "multi byte". */ if (dfa->nodes[prev_node].accept_mb) naccepted = sift_states_iter_mb (mctx, sctx, prev_node, str_idx, sctx->last_str_idx); -#endif /* RE_ENABLE_I18N */ /* We don't check backreferences here. See update_cur_sifted_state(). */ @@ -1688,6 +1670,7 @@ clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx) if (top < next_state_log_idx) { + DEBUG_ASSERT (mctx->state_log != NULL); memset (mctx->state_log + top + 1, '\0', sizeof (re_dfastate_t *) * (next_state_log_idx - top)); mctx->state_log_top = next_state_log_idx; @@ -2176,7 +2159,6 @@ sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx, } -#ifdef RE_ENABLE_I18N static int sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx, Idx node_idx, Idx str_idx, Idx max_str_idx) @@ -2196,8 +2178,6 @@ sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx, 'naccepted' bytes input. */ return naccepted; } -#endif /* RE_ENABLE_I18N */ - /* Functions for state transition. */ @@ -2215,7 +2195,6 @@ transit_state (reg_errcode_t *err, re_match_context_t *mctx, re_dfastate_t **trtable; unsigned char ch; -#ifdef RE_ENABLE_I18N /* If the current state can accept multibyte. */ if (__glibc_unlikely (state->accept_mb)) { @@ -2223,7 +2202,6 @@ transit_state (reg_errcode_t *err, re_match_context_t *mctx, if (__glibc_unlikely (*err != REG_NOERROR)) return NULL; } -#endif /* RE_ENABLE_I18N */ /* Then decide the next state with the single byte. */ #if 0 @@ -2444,7 +2422,6 @@ transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx, } #endif -#ifdef RE_ENABLE_I18N static reg_errcode_t transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate) { @@ -2512,7 +2489,6 @@ transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate) } return REG_NOERROR; } -#endif /* RE_ENABLE_I18N */ static reg_errcode_t transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes) @@ -3002,9 +2978,7 @@ check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx, const re_dfa_t *const dfa = mctx->dfa; bool ok; Idx cur_idx; -#ifdef RE_ENABLE_I18N reg_errcode_t err = REG_NOERROR; -#endif re_node_set union_set; re_node_set_init_empty (&union_set); for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx) @@ -3013,7 +2987,6 @@ check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx, Idx cur_node = cur_nodes->elems[cur_idx]; DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type)); -#ifdef RE_ENABLE_I18N /* If the node may accept "multi byte". */ if (dfa->nodes[cur_node].accept_mb) { @@ -3051,7 +3024,7 @@ check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx, } } } -#endif /* RE_ENABLE_I18N */ + if (naccepted || check_node_accept (mctx, dfa->nodes + cur_node, str_idx)) { @@ -3475,18 +3448,15 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, } else if (type == OP_PERIOD) { -#ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) bitset_merge (accepts, dfa->sb_char); else -#endif bitset_set_all (accepts); if (!(dfa->syntax & RE_DOT_NEWLINE)) bitset_clear (accepts, '\n'); if (dfa->syntax & RE_DOT_NOT_NULL) bitset_clear (accepts, '\0'); } -#ifdef RE_ENABLE_I18N else if (type == OP_UTF8_PERIOD) { if (ASCII_CHARS % BITSET_WORD_BITS == 0) @@ -3498,7 +3468,6 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, if (dfa->syntax & RE_DOT_NOT_NULL) bitset_clear (accepts, '\0'); } -#endif else continue; @@ -3529,12 +3498,10 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, bitset_empty (accepts); continue; } -#ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) for (j = 0; j < BITSET_WORDS; ++j) any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j])); else -#endif for (j = 0; j < BITSET_WORDS; ++j) any_set |= (accepts[j] &= dfa->word_char[j]); if (!any_set) @@ -3548,12 +3515,10 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, bitset_empty (accepts); continue; } -#ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) for (j = 0; j < BITSET_WORDS; ++j) any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j])); else -#endif for (j = 0; j < BITSET_WORDS; ++j) any_set |= (accepts[j] &= ~dfa->word_char[j]); if (!any_set) @@ -3630,7 +3595,6 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, return -1; } -#ifdef RE_ENABLE_I18N /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts. Return the number of the bytes the node accepts. STR_IDX is the current index of the input string. @@ -3639,9 +3603,9 @@ group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, one collating element like '.', '[a-z]', opposite to the other nodes can only accept one byte. */ -# ifdef _LIBC -# include <locale/weight.h> -# endif +#ifdef _LIBC +# include <locale/weight.h> +#endif static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, @@ -3725,12 +3689,12 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, if (node->type == COMPLEX_BRACKET) { const re_charset_t *cset = node->opr.mbcset; -# ifdef _LIBC +#ifdef _LIBC const unsigned char *pin = ((const unsigned char *) re_string_get_buffer (input) + str_idx); Idx j; uint32_t nrules; -# endif /* _LIBC */ +#endif int match_len = 0; wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) ? re_string_wchar_at (input, str_idx) : 0); @@ -3753,7 +3717,7 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, } } -# ifdef _LIBC +#ifdef _LIBC nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); if (nrules != 0) { @@ -3842,7 +3806,7 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, } } else -# endif /* _LIBC */ +#endif /* _LIBC */ { /* match with range expression? */ for (i = 0; i < cset->nranges; ++i) @@ -3868,7 +3832,7 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, return 0; } -# ifdef _LIBC +#ifdef _LIBC static unsigned int find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len) { @@ -3926,8 +3890,7 @@ find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len) return UINT_MAX; } } -# endif /* _LIBC */ -#endif /* RE_ENABLE_I18N */ +#endif /* _LIBC */ /* Check whether the node accepts the byte which is IDX-th byte of the INPUT. */ @@ -3950,12 +3913,10 @@ check_node_accept (const re_match_context_t *mctx, const re_token_t *node, return false; break; -#ifdef RE_ENABLE_I18N case OP_UTF8_PERIOD: if (ch >= ASCII_CHARS) return false; FALLTHROUGH; -#endif case OP_PERIOD: if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE)) || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL))) @@ -4016,7 +3977,6 @@ extend_buffers (re_match_context_t *mctx, int min_len) /* Then reconstruct the buffers. */ if (pstr->icase) { -#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) { ret = build_wcs_upper_buffer (pstr); @@ -4024,16 +3984,13 @@ extend_buffers (re_match_context_t *mctx, int min_len) return ret; } else -#endif /* RE_ENABLE_I18N */ build_upper_buffer (pstr); } else { -#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) build_wcs_buffer (pstr); else -#endif /* RE_ENABLE_I18N */ { if (pstr->trans != NULL) re_string_translate_buffer (pstr); |