diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2010-07-16 14:49:57 +0300 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2010-07-16 14:49:57 +0300 |
commit | 6a2caf2157d87b4b582b2494bdd7d6a688dd0b1f (patch) | |
tree | 9a2862cc11be4832f188cfbdce175120ceba5024 /regcomp.c | |
parent | 315bd501ca696bc3e3c938b4604d8dac7a6f512f (diff) | |
download | egawk-6a2caf2157d87b4b582b2494bdd7d6a688dd0b1f.tar.gz egawk-6a2caf2157d87b4b582b2494bdd7d6a688dd0b1f.tar.bz2 egawk-6a2caf2157d87b4b582b2494bdd7d6a688dd0b1f.zip |
Move to gawk-3.1.6.
Diffstat (limited to 'regcomp.c')
-rw-r--r-- | regcomp.c | 697 |
1 files changed, 306 insertions, 391 deletions
@@ -1,5 +1,5 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. @@ -19,12 +19,11 @@ 02110-1301 USA. */ static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, - int length, reg_syntax_t syntax); + size_t length, reg_syntax_t syntax); static void re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, char *fastmap); -static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len); -static void init_word_char (re_dfa_t *dfa); +static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); #ifdef RE_ENABLE_I18N static void free_charset (re_charset_t *cset); #endif /* RE_ENABLE_I18N */ @@ -34,7 +33,6 @@ static reg_errcode_t create_initial_state (re_dfa_t *dfa); static void optimize_utf8 (re_dfa_t *dfa); #endif static reg_errcode_t analyze (regex_t *preg); -static reg_errcode_t create_initial_state (re_dfa_t *dfa); static reg_errcode_t preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), void *extra); @@ -48,12 +46,8 @@ static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, static reg_errcode_t calc_first (void *extra, bin_tree_t *node); static reg_errcode_t calc_next (void *extra, bin_tree_t *node); static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); -static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node, - int top_clone_node, int root_node, - unsigned int constraint); -static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx, - unsigned int constraint); -static int search_duplicated_node (re_dfa_t *dfa, int org_node, +static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint); +static int search_duplicated_node (const re_dfa_t *dfa, int org_node, unsigned int constraint); static reg_errcode_t calc_eclosure (re_dfa_t *dfa); static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, @@ -61,12 +55,8 @@ static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); static int fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax); -static void fetch_token (re_token_t *result, re_string_t *input, - reg_syntax_t syntax); static int peek_token (re_token_t *token, re_string_t *input, - reg_syntax_t syntax); -static int peek_token_bracket (re_token_t *token, re_string_t *input, - reg_syntax_t syntax); + reg_syntax_t syntax) internal_function; static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, reg_errcode_t *err); static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, @@ -96,45 +86,27 @@ static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, re_token_t *token); -#ifndef _LIBC -# ifdef RE_ENABLE_I18N -static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, - re_charset_t *mbcset, int *range_alloc, - bracket_elem_t *start_elem, - bracket_elem_t *end_elem); -static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, - re_charset_t *mbcset, - int *coll_sym_alloc, - const unsigned char *name); -# else /* not RE_ENABLE_I18N */ -static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, - bracket_elem_t *start_elem, - bracket_elem_t *end_elem); -static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, - const unsigned char *name); -# endif /* not RE_ENABLE_I18N */ -#endif /* not _LIBC */ #ifdef RE_ENABLE_I18N -static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, +static reg_errcode_t build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, int *equiv_class_alloc, const unsigned char *name); -static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans, - re_bitset_ptr_t sbcset, +static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, + bitset_t sbcset, re_charset_t *mbcset, int *char_class_alloc, const char *class_name, reg_syntax_t syntax); #else /* not RE_ENABLE_I18N */ -static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, +static reg_errcode_t build_equiv_class (bitset_t sbcset, const unsigned char *name); -static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans, - re_bitset_ptr_t sbcset, +static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, + bitset_t sbcset, const char *class_name, reg_syntax_t syntax); #endif /* not RE_ENABLE_I18N */ static bin_tree_t *build_charclass_op (re_dfa_t *dfa, - unsigned RE_TRANSLATE_TYPE trans, + RE_TRANSLATE_TYPE trans, const char *class_name, const char *extra, int non_match, reg_errcode_t *err); @@ -154,59 +126,58 @@ static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); POSIX doesn't require that we do anything for REG_NOERROR, but why not be nice? */ -const ERRMSG_TYPE __re_error_msgid[] attribute_hidden = +const char __re_error_msgid[] attribute_hidden = { #define REG_NOERROR_IDX 0 gettext_noop ("Success") /* REG_NOERROR */ - ERRMSG_SEPARATOR + "\0" #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") gettext_noop ("No match") /* REG_NOMATCH */ - ERRMSG_SEPARATOR + "\0" #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") gettext_noop ("Invalid regular expression") /* REG_BADPAT */ - ERRMSG_SEPARATOR + "\0" #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ - ERRMSG_SEPARATOR + "\0" #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") gettext_noop ("Invalid character class name") /* REG_ECTYPE */ - ERRMSG_SEPARATOR + "\0" #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") gettext_noop ("Trailing backslash") /* REG_EESCAPE */ - ERRMSG_SEPARATOR + "\0" #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") gettext_noop ("Invalid back reference") /* REG_ESUBREG */ - ERRMSG_SEPARATOR + "\0" #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ - ERRMSG_SEPARATOR + "\0" #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ - ERRMSG_SEPARATOR + "\0" #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") gettext_noop ("Unmatched \\{") /* REG_EBRACE */ - ERRMSG_SEPARATOR + "\0" #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ - ERRMSG_SEPARATOR + "\0" #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") gettext_noop ("Invalid range end") /* REG_ERANGE */ - ERRMSG_SEPARATOR + "\0" #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") gettext_noop ("Memory exhausted") /* REG_ESPACE */ - ERRMSG_SEPARATOR + "\0" #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ - ERRMSG_SEPARATOR + "\0" #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") gettext_noop ("Premature end of regular expression") /* REG_EEND */ - ERRMSG_SEPARATOR + "\0" #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") gettext_noop ("Regular expression too big") /* REG_ESIZE */ - ERRMSG_SEPARATOR + "\0" #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ - ERRMSG_SEPARATOR }; const size_t __re_error_msgid_idx[] attribute_hidden = @@ -259,7 +230,7 @@ re_compile_pattern (pattern, length, bufp) if (!ret) return NULL; - return gettext (RE_ERRMSG(ret)); + return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); } #ifdef _LIBC weak_alias (__re_compile_pattern, re_compile_pattern) @@ -328,12 +299,10 @@ re_set_fastmap (char *fastmap, int icase, int ch) Compile fastmap for the initial_state INIT_STATE. */ static void -re_compile_fastmap_iter (bufp, init_state, fastmap) - regex_t *bufp; - const re_dfastate_t *init_state; - char *fastmap; +re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, + char *fastmap) { - re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; + volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; int node_cnt; int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) @@ -357,10 +326,11 @@ re_compile_fastmap_iter (bufp, init_state, fastmap) && dfa->nodes[node].type == CHARACTER && dfa->nodes[node].mb_partial) *p++ = dfa->nodes[node].opr.c; - memset (&state, 0, sizeof (state)); + memset (&state, '\0', sizeof (state)); if (mbrtowc (&wc, (const char *) buf, p - buf, &state) == p - buf - && __wcrtomb ((char *) buf, towlower (wc), &state) > 0) + && (__wcrtomb ((char *) buf, towlower (wc), &state) + != (size_t) -1)) re_set_fastmap (fastmap, 0, buf[0]); re_free (buf); } @@ -368,11 +338,15 @@ re_compile_fastmap_iter (bufp, init_state, fastmap) } else if (type == SIMPLE_BRACKET) { - int i, j, ch; - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) - if (dfa->nodes[node].opr.sbcset[i] & (1 << j)) - re_set_fastmap (fastmap, icase, ch); + int i, ch; + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + { + int j; + bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) + if (w & ((bitset_word_t) 1 << j)) + re_set_fastmap (fastmap, icase, ch); + } } #ifdef RE_ENABLE_I18N else if (type == COMPLEX_BRACKET) @@ -391,19 +365,21 @@ re_compile_fastmap_iter (bufp, init_state, fastmap) is a valid collation element, and don't catch 'b' since 'b' is the only collation element which starts from 'b'. */ - int j, ch; const int32_t *table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) - if (table[ch] < 0) - re_set_fastmap (fastmap, icase, ch); + for (i = 0; i < SBC_MAX; ++i) + if (table[i] < 0) + re_set_fastmap (fastmap, icase, i); } # else if (dfa->mb_cur_max > 1) - for (i = 0; i < SBC_MAX; ++i) - if (__btowc (i) == WEOF) + for (i = 0; i < SBC_MAX; ++i) { + wint_t wc; + wc = __btowc (i); + + if (wc == WEOF || wc >= SBC_MAX) re_set_fastmap (fastmap, icase, i); + } # endif /* not _LIBC */ } for (i = 0; i < cset->nmbchars; ++i) @@ -411,12 +387,13 @@ re_compile_fastmap_iter (bufp, init_state, fastmap) char buf[256]; mbstate_t state; memset (&state, '\0', sizeof (state)); - __wcrtomb (buf, cset->mbchars[i], &state); - re_set_fastmap (fastmap, icase, *(unsigned char *) buf); + if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) + re_set_fastmap (fastmap, icase, *(unsigned char *) buf); if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) { - __wcrtomb (buf, towlower (cset->mbchars[i]), &state); - re_set_fastmap (fastmap, 0, *(unsigned char *) buf); + if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) + != (size_t) -1) + re_set_fastmap (fastmap, 0, *(unsigned char *) buf); } } } @@ -536,8 +513,8 @@ weak_alias (__regcomp, regcomp) size_t regerror (errcode, preg, errbuf, errbuf_size) int errcode; - const regex_t *preg; - char *errbuf; + const regex_t *__restrict preg; + char *__restrict errbuf; size_t errbuf_size; { const char *msg; @@ -552,7 +529,7 @@ regerror (errcode, preg, errbuf, errbuf_size) Dump core so we can fix it. */ abort (); - msg = gettext (RE_ERRMSG(errcode)); + msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); msg_size = strlen (msg) + 1; /* Includes the null. */ @@ -579,16 +556,15 @@ weak_alias (__regerror, regerror) UTF-8 is used. Otherwise we would allocate memory just to initialize it the same all the time. UTF-8 is the preferred encoding so this is a worthwhile optimization. */ -static const bitset utf8_sb_map = -{ +#if __GNUC__ >= 3 +static const bitset_t utf8_sb_map = { /* Set the first 128 bits. */ -# if UINT_MAX == 0xffffffff - 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff -# else -# error "Add case for new unsigned int size" -# endif + [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX }; -#endif +#else /* ! (__GNUC__ >= 3) */ +static bitset_t utf8_sb_map; +#endif /* __GNUC__ >= 3 */ +#endif /* RE_ENABLE_I18N */ static void @@ -702,7 +678,8 @@ re_comp (s) { re_comp_buf.fastmap = (char *) malloc (SBC_MAX); if (re_comp_buf.fastmap == NULL) - return (char *) gettext (RE_ERRMSG(REG_ESPACE_IDX)); + return (char *) gettext (__re_error_msgid + + __re_error_msgid_idx[(int) REG_ESPACE]); } /* Since `re_exec' always passes NULL for the `regs' argument, we @@ -717,7 +694,7 @@ re_comp (s) return NULL; /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ - return (char *) gettext (RE_ERRMSG(ret)); + return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); } #ifdef _LIBC @@ -734,11 +711,8 @@ libc_freeres_fn (free_mem) SYNTAX indicate regular expression's syntax. */ static reg_errcode_t -re_compile_internal (preg, pattern, length, syntax) - regex_t *preg; - const char * pattern; - int length; - reg_syntax_t syntax; +re_compile_internal (regex_t *preg, const char * pattern, size_t length, + reg_syntax_t syntax) { reg_errcode_t err = REG_NOERROR; re_dfa_t *dfa; @@ -778,10 +752,13 @@ re_compile_internal (preg, pattern, length, syntax) return err; } #ifdef DEBUG + /* Note: length+1 will not overflow since it is checked in init_dfa. */ dfa->re_str = re_malloc (char, length + 1); strncpy (dfa->re_str, pattern, length + 1); #endif + __libc_lock_init (dfa->lock); + err = re_string_construct (®exp, pattern, length, preg->translate, syntax & RE_ICASE, dfa); if (BE (err != REG_NOERROR, 0)) @@ -833,11 +810,9 @@ re_compile_internal (preg, pattern, length, syntax) as the initial length of some arrays. */ static reg_errcode_t -init_dfa (dfa, pat_len) - re_dfa_t *dfa; - int pat_len; +init_dfa (re_dfa_t *dfa, size_t pat_len) { - int table_size; + unsigned int table_size; #ifndef _LIBC char *codeset_name; #endif @@ -847,13 +822,15 @@ init_dfa (dfa, pat_len) /* Force allocation of str_tree_storage the first time. */ dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; + /* Avoid overflows. */ + if (pat_len == SIZE_MAX) + return REG_ESPACE; + dfa->nodes_alloc = pat_len + 1; dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); - dfa->states_alloc = pat_len + 1; - /* table_size = 2 ^ ceil(log pat_len) */ - for (table_size = 1; table_size > 0; table_size <<= 1) + for (table_size = 1; ; table_size <<= 1) if (table_size > pat_len) break; @@ -906,27 +883,38 @@ init_dfa (dfa, pat_len) if (dfa->mb_cur_max > 1) { if (dfa->is_utf8) - dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; + { +#if !defined(__GNUC__) || __GNUC__ < 3 + static short utf8_sb_map_inited = 0; + + if (! utf8_sb_map_inited) + { + int i; + + utf8_sb_map_inited = 0; + for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++) + utf8_sb_map[i] = BITSET_WORD_MAX; + } +#endif + dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; + } else { int i, j, ch; - dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1); + dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); if (BE (dfa->sb_char == NULL, 0)) return REG_ESPACE; - /* Clear all bits by, then set those corresponding to single - byte chars. */ - bitset_empty (dfa->sb_char); - - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) + /* Set the bits corresponding to single byte chars. */ + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) { - wchar_t wch = __btowc (ch); + wint_t wch = __btowc (ch); if (wch != WEOF) - dfa->sb_char[i] |= 1 << j; + dfa->sb_char[i] |= (bitset_word_t) 1 << j; # ifndef _LIBC - if (isascii (ch) && wch != (wchar_t) ch) + if (isascii (ch) && wch != ch) dfa->map_notascii = 1; # endif } @@ -944,22 +932,21 @@ init_dfa (dfa, pat_len) character used by some operators like "\<", "\>", etc. */ static void -init_word_char (dfa) - re_dfa_t *dfa; +internal_function +init_word_char (re_dfa_t *dfa) { int i, j, ch; dfa->word_ops_used = 1; - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) if (isalnum (ch) || ch == '_') - dfa->word_char[i] |= 1 << j; + dfa->word_char[i] |= (bitset_word_t) 1 << j; } /* Free the work area which are only used while compiling. */ static void -free_workarea_compile (preg) - regex_t *preg; +free_workarea_compile (regex_t *preg) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_storage_t *storage, *next; @@ -978,8 +965,7 @@ free_workarea_compile (preg) /* Create initial states for all contexts. */ static reg_errcode_t -create_initial_state (dfa) - re_dfa_t *dfa; +create_initial_state (re_dfa_t *dfa) { int first, i; reg_errcode_t err; @@ -1061,8 +1047,7 @@ create_initial_state (dfa) DFA nodes where needed. */ static void -optimize_utf8 (dfa) - re_dfa_t *dfa; +optimize_utf8 (re_dfa_t *dfa) { int node, i, mb_chars = 0, has_period = 0; @@ -1099,8 +1084,9 @@ optimize_utf8 (dfa) case COMPLEX_BRACKET: return; case SIMPLE_BRACKET: - /* Just double check. */ - for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i) + /* Just double check. The non-ASCII range starts at 0x80. */ + assert (0x80 % BITSET_WORD_BITS == 0); + for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) if (dfa->nodes[node].opr.sbcset[i]) return; break; @@ -1129,8 +1115,7 @@ optimize_utf8 (dfa) "eclosure", and "inveclosure". */ static reg_errcode_t -analyze (preg) - regex_t *preg; +analyze (regex_t *preg) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; reg_errcode_t ret; @@ -1193,10 +1178,8 @@ analyze (preg) implement parse tree visits. Instead, we use parent pointers and some hairy code in these two functions. */ static reg_errcode_t -postorder (root, fn, extra) - bin_tree_t *root; - reg_errcode_t (fn (void *, bin_tree_t *)); - void *extra; +postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), + void *extra) { bin_tree_t *node, *prev; @@ -1227,10 +1210,8 @@ postorder (root, fn, extra) } static reg_errcode_t -preorder (root, fn, extra) - bin_tree_t *root; - reg_errcode_t (fn (void *, bin_tree_t *)); - void *extra; +preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), + void *extra) { bin_tree_t *node; @@ -1262,9 +1243,7 @@ preorder (root, fn, extra) re_search_internal to map the inner one's opr.idx to this one's. Adjust backreferences as well. Requires a preorder visit. */ static reg_errcode_t -optimize_subexps (extra, node) - void *extra; - bin_tree_t *node; +optimize_subexps (void *extra, bin_tree_t *node) { re_dfa_t *dfa = (re_dfa_t *) extra; @@ -1285,8 +1264,8 @@ optimize_subexps (extra, node) node->left->parent = node; dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; - if (other_idx < 8 * sizeof (dfa->used_bkref_map)) - dfa->used_bkref_map &= ~(1 << other_idx); + if (other_idx < BITSET_WORD_BITS) + dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); } return REG_NOERROR; @@ -1295,9 +1274,7 @@ optimize_subexps (extra, node) /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ static reg_errcode_t -lower_subexps (extra, node) - void *extra; - bin_tree_t *node; +lower_subexps (void *extra, bin_tree_t *node) { regex_t *preg = (regex_t *) extra; reg_errcode_t err = REG_NOERROR; @@ -1319,18 +1296,21 @@ lower_subexps (extra, node) } static bin_tree_t * -lower_subexp (err, preg, node) - reg_errcode_t *err; - regex_t *preg; - bin_tree_t *node; +lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *body = node->left; bin_tree_t *op, *cls, *tree1, *tree; if (preg->no_sub - && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map) - || !(dfa->used_bkref_map & (1 << node->token.opr.idx)))) + /* We do not optimize empty subexpressions, because otherwise we may + have bad CONCAT nodes with NULL children. This is obviously not + very common, so we do not lose much. An example that triggers + this case is the sed "script" /\(\)/x. */ + && node->left != NULL + && (node->token.opr.idx >= BITSET_WORD_BITS + || !(dfa->used_bkref_map + & ((bitset_word_t) 1 << node->token.opr.idx)))) return node->left; /* Convert the SUBEXP node to the concatenation of an @@ -1353,9 +1333,7 @@ lower_subexp (err, preg, node) /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton nodes. Requires a postorder visit. */ static reg_errcode_t -calc_first (extra, node) - void *extra; - bin_tree_t *node; +calc_first (void *extra, bin_tree_t *node) { re_dfa_t *dfa = (re_dfa_t *) extra; if (node->token.type == CONCAT) @@ -1375,9 +1353,7 @@ calc_first (extra, node) /* Pass 2: compute NEXT on the tree. Preorder visit. */ static reg_errcode_t -calc_next (extra, node) - void *extra; - bin_tree_t *node; +calc_next (void *extra, bin_tree_t *node) { switch (node->token.type) { @@ -1400,9 +1376,7 @@ calc_next (extra, node) /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ static reg_errcode_t -link_nfa_nodes (extra, node) - void *extra; - bin_tree_t *node; +link_nfa_nodes (void *extra, bin_tree_t *node) { re_dfa_t *dfa = (re_dfa_t *) extra; int idx = node->node_idx; @@ -1462,13 +1436,10 @@ link_nfa_nodes (extra, node) to their own constraint. */ static reg_errcode_t -duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, - init_constraint) - re_dfa_t *dfa; - int top_org_node, top_clone_node, root_node; - unsigned int init_constraint; +internal_function +duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, + int root_node, unsigned int init_constraint) { - reg_errcode_t err; int org_node, clone_node, ret; unsigned int constraint = init_constraint; for (org_node = top_org_node, clone_node = top_clone_node;;) @@ -1482,9 +1453,9 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, edests of the back reference. */ org_dest = dfa->nexts[org_node]; re_node_set_empty (dfa->edests + clone_node); - err = duplicate_node (&clone_dest, dfa, org_dest, constraint); - if (BE (err != REG_NOERROR, 0)) - return err; + clone_dest = duplicate_node (dfa, org_dest, constraint); + if (BE (clone_dest == -1, 0)) + return REG_ESPACE; dfa->nexts[clone_node] = dfa->nexts[org_node]; ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (ret < 0, 0)) @@ -1520,9 +1491,9 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, } constraint |= dfa->nodes[org_node].opr.ctx_type; } - err = duplicate_node (&clone_dest, dfa, org_dest, constraint); - if (BE (err != REG_NOERROR, 0)) - return err; + clone_dest = duplicate_node (dfa, org_dest, constraint); + if (BE (clone_dest == -1, 0)) + return REG_ESPACE; ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (ret < 0, 0)) return REG_ESPACE; @@ -1538,9 +1509,10 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, if (clone_dest == -1) { /* There are no such a duplicated node, create a new one. */ - err = duplicate_node (&clone_dest, dfa, org_dest, constraint); - if (BE (err != REG_NOERROR, 0)) - return err; + reg_errcode_t err; + clone_dest = duplicate_node (dfa, org_dest, constraint); + if (BE (clone_dest == -1, 0)) + return REG_ESPACE; ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (ret < 0, 0)) return REG_ESPACE; @@ -1559,9 +1531,9 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, } org_dest = dfa->edests[org_node].elems[1]; - err = duplicate_node (&clone_dest, dfa, org_dest, constraint); - if (BE (err != REG_NOERROR, 0)) - return err; + clone_dest = duplicate_node (dfa, org_dest, constraint); + if (BE (clone_dest == -1, 0)) + return REG_ESPACE; ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); if (BE (ret < 0, 0)) return REG_ESPACE; @@ -1576,10 +1548,8 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, satisfies the constraint CONSTRAINT. */ static int -search_duplicated_node (dfa, org_node, constraint) - re_dfa_t *dfa; - int org_node; - unsigned int constraint; +search_duplicated_node (const re_dfa_t *dfa, int org_node, + unsigned int constraint) { int idx; for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) @@ -1592,32 +1562,28 @@ search_duplicated_node (dfa, org_node, constraint) } /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. - The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded, - otherwise return the error code. */ + Return the index of the new node, or -1 if insufficient storage is + available. */ -static reg_errcode_t -duplicate_node (new_idx, dfa, org_idx, constraint) - re_dfa_t *dfa; - int *new_idx, org_idx; - unsigned int constraint; +static int +duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint) { int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); - if (BE (dup_idx == -1, 0)) - return REG_ESPACE; - dfa->nodes[dup_idx].constraint = constraint; - if (dfa->nodes[org_idx].type == ANCHOR) - dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type; - dfa->nodes[dup_idx].duplicated = 1; - - /* Store the index of the original node. */ - dfa->org_indices[dup_idx] = org_idx; - *new_idx = dup_idx; - return REG_NOERROR; + if (BE (dup_idx != -1, 1)) + { + dfa->nodes[dup_idx].constraint = constraint; + if (dfa->nodes[org_idx].type == ANCHOR) + dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type; + dfa->nodes[dup_idx].duplicated = 1; + + /* Store the index of the original node. */ + dfa->org_indices[dup_idx] = org_idx; + } + return dup_idx; } static reg_errcode_t -calc_inveclosure (dfa) - re_dfa_t *dfa; +calc_inveclosure (re_dfa_t *dfa) { int src, idx, ret; for (idx = 0; idx < dfa->nodes_len; ++idx) @@ -1640,8 +1606,7 @@ calc_inveclosure (dfa) /* Calculate "eclosure" for all the node in DFA. */ static reg_errcode_t -calc_eclosure (dfa) - re_dfa_t *dfa; +calc_eclosure (re_dfa_t *dfa) { int node_idx, incomplete; #ifdef DEBUG @@ -1685,10 +1650,7 @@ calc_eclosure (dfa) /* Calculate epsilon closure of NODE. */ static reg_errcode_t -calc_eclosure_iter (new_set, dfa, node, root) - re_node_set *new_set; - re_dfa_t *dfa; - int node, root; +calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root) { reg_errcode_t err; unsigned int constraint; @@ -1711,8 +1673,6 @@ calc_eclosure_iter (new_set, dfa, node, root) && dfa->edests[node].nelem && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) { - int org_node, cur_node; - org_node = cur_node = node; err = duplicate_node_closure (dfa, node, node, node, constraint); if (BE (err != REG_NOERROR, 0)) return err; @@ -1768,10 +1728,8 @@ calc_eclosure_iter (new_set, dfa, node, root) We must not use this function inside bracket expressions. */ static void -fetch_token (result, input, syntax) - re_token_t *result; - re_string_t *input; - reg_syntax_t syntax; +internal_function +fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) { re_string_skip_bytes (input, peek_token (result, input, syntax)); } @@ -1780,10 +1738,8 @@ fetch_token (result, input, syntax) We must not use this function inside bracket expressions. */ static int -peek_token (token, input, syntax) - re_token_t *token; - re_string_t *input; - reg_syntax_t syntax; +internal_function +peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) { unsigned char c; @@ -2023,10 +1979,8 @@ peek_token (token, input, syntax) We must not use this function out of bracket expressions. */ static int -peek_token_bracket (token, input, syntax) - re_token_t *token; - re_string_t *input; - reg_syntax_t syntax; +internal_function +peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) { unsigned char c; if (re_string_eoi (input)) @@ -2122,11 +2076,8 @@ peek_token_bracket (token, input, syntax) EOR means end of regular expression. */ static bin_tree_t * -parse (regexp, preg, syntax, err) - re_string_t *regexp; - regex_t *preg; - reg_syntax_t syntax; - reg_errcode_t *err; +parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, + reg_errcode_t *err) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *tree, *eor, *root; @@ -2159,13 +2110,8 @@ parse (regexp, preg, syntax, err) ALT means alternative, which represents the operator `|'. */ static bin_tree_t * -parse_reg_exp (regexp, preg, token, syntax, nest, err) - re_string_t *regexp; - regex_t *preg; - re_token_t *token; - reg_syntax_t syntax; - int nest; - reg_errcode_t *err; +parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, + reg_syntax_t syntax, int nest, reg_errcode_t *err) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *tree, *branch = NULL; @@ -2205,13 +2151,8 @@ parse_reg_exp (regexp, preg, token, syntax, nest, err) CAT means concatenation. */ static bin_tree_t * -parse_branch (regexp, preg, token, syntax, nest, err) - re_string_t *regexp; - regex_t *preg; - re_token_t *token; - reg_syntax_t syntax; - int nest; - reg_errcode_t *err; +parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, + reg_syntax_t syntax, int nest, reg_errcode_t *err) { bin_tree_t *tree, *exp; re_dfa_t *dfa = (re_dfa_t *) preg->buffer; @@ -2250,13 +2191,8 @@ parse_branch (regexp, preg, token, syntax, nest, err) */ static bin_tree_t * -parse_expression (regexp, preg, token, syntax, nest, err) - re_string_t *regexp; - regex_t *preg; - re_token_t *token; - reg_syntax_t syntax; - int nest; - reg_errcode_t *err; +parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, + reg_syntax_t syntax, int nest, reg_errcode_t *err) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *tree; @@ -2414,8 +2350,8 @@ parse_expression (regexp, preg, token, syntax, nest, err) case OP_WORD: case OP_NOTWORD: tree = build_charclass_op (dfa, regexp->trans, - (const char *) "alnum", - (const char *) "_", + "alnum", + "_", token->type == OP_NOTWORD, err); if (BE (*err != REG_NOERROR && tree == NULL, 0)) return NULL; @@ -2423,8 +2359,8 @@ parse_expression (regexp, preg, token, syntax, nest, err) case OP_SPACE: case OP_NOTSPACE: tree = build_charclass_op (dfa, regexp->trans, - (const char *) "space", - (const char *) "", + "space", + "", token->type == OP_NOTSPACE, err); if (BE (*err != REG_NOERROR && tree == NULL, 0)) return NULL; @@ -2471,13 +2407,8 @@ parse_expression (regexp, preg, token, syntax, nest, err) */ static bin_tree_t * -parse_sub_exp (regexp, preg, token, syntax, nest, err) - re_string_t *regexp; - regex_t *preg; - re_token_t *token; - reg_syntax_t syntax; - int nest; - reg_errcode_t *err; +parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, + reg_syntax_t syntax, int nest, reg_errcode_t *err) { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *tree; @@ -2497,7 +2428,9 @@ parse_sub_exp (regexp, preg, token, syntax, nest, err) if (BE (*err != REG_NOERROR, 0)) return NULL; } - dfa->completed_bkref_map |= 1 << cur_nsub; + + if (cur_nsub <= '9' - '1') + dfa->completed_bkref_map |= 1 << cur_nsub; tree = create_tree (dfa, tree, NULL, SUBEXP); if (BE (tree == NULL, 0)) @@ -2512,13 +2445,8 @@ parse_sub_exp (regexp, preg, token, syntax, nest, err) /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ static bin_tree_t * -parse_dup_op (elem, regexp, dfa, token, syntax, err) - bin_tree_t *elem; - re_string_t *regexp; - re_dfa_t *dfa; - re_token_t *token; - reg_syntax_t syntax; - reg_errcode_t *err; +parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, + re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) { bin_tree_t *tree = NULL, *old_tree = NULL; int i, start, end, start_idx = re_string_cur_idx (regexp); @@ -2663,15 +2591,14 @@ parse_dup_op (elem, regexp, dfa, token, syntax, err) update it. */ static reg_errcode_t +internal_function # ifdef RE_ENABLE_I18N -build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) - re_charset_t *mbcset; - int *range_alloc; +build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, + bracket_elem_t *start_elem, bracket_elem_t *end_elem) # else /* not RE_ENABLE_I18N */ -build_range_exp (sbcset, start_elem, end_elem) +build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, + bracket_elem_t *end_elem) # endif /* not RE_ENABLE_I18N */ - re_bitset_ptr_t sbcset; - bracket_elem_t *start_elem, *end_elem; { unsigned int start_ch, end_ch; /* Equivalence Classes and Character Classes can't be a range start/end. */ @@ -2690,7 +2617,9 @@ build_range_exp (sbcset, start_elem, end_elem) # ifdef RE_ENABLE_I18N { - wchar_t wc, start_wc, end_wc; + wchar_t wc; + wint_t start_wc; + wint_t end_wc; wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch @@ -2699,10 +2628,22 @@ build_range_exp (sbcset, start_elem, end_elem) end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] : 0)); +#ifdef GAWK + /* + * Fedora Core 2, maybe others, have broken `btowc' that returns -1 + * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are + * unsigned, so we don't have sign extension problems. + */ + start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) + ? start_ch : start_elem->opr.wch); + end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) + ? end_ch : end_elem->opr.wch); +#else start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) ? __btowc (start_ch) : start_elem->opr.wch); end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) ? __btowc (end_ch) : end_elem->opr.wch); +#endif if (start_wc == WEOF || end_wc == WEOF) return REG_ECOLLATE; cmp_buf[0] = start_wc; @@ -2783,15 +2724,13 @@ build_range_exp (sbcset, start_elem, end_elem) pointer argument since we may update it. */ static reg_errcode_t +internal_function # ifdef RE_ENABLE_I18N -build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) - re_charset_t *mbcset; - int *coll_sym_alloc; +build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, + int *coll_sym_alloc, const unsigned char *name) # else /* not RE_ENABLE_I18N */ -build_collating_symbol (sbcset, name) +build_collating_symbol (bitset_t sbcset, const unsigned char *name) # endif /* not RE_ENABLE_I18N */ - re_bitset_ptr_t sbcset; - const unsigned char *name; { size_t name_len = strlen ((const char *) name); if (BE (name_len != 1, 0)) @@ -2808,12 +2747,8 @@ build_collating_symbol (sbcset, name) "[[.a-a.]]" etc. */ static bin_tree_t * -parse_bracket_exp (regexp, dfa, token, syntax, err) - re_string_t *regexp; - re_dfa_t *dfa; - re_token_t *token; - reg_syntax_t syntax; - reg_errcode_t *err; +parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, + reg_syntax_t syntax, reg_errcode_t *err) { #ifdef _LIBC const unsigned char *collseqmb; @@ -2835,23 +2770,28 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) { int32_t hash = elem_hash ((const char *) name, name_len); int32_t elem = hash % table_size; - int32_t second = hash % (table_size - 2); - while (symb_table[2 * elem] != 0) - { - /* First compare the hashing value. */ - if (symb_table[2 * elem] == hash - /* Compare the length of the name. */ - && name_len == extra[symb_table[2 * elem + 1]] - /* Compare the name. */ - && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], - name_len) == 0) + if (symb_table[2 * elem] != 0) + { + int32_t second = hash % (table_size - 2) + 1; + + do { - /* Yep, this is the entry. */ - break; - } + /* First compare the hashing value. */ + if (symb_table[2 * elem] == hash + /* Compare the length of the name. */ + && name_len == extra[symb_table[2 * elem + 1]] + /* Compare the name. */ + && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], + name_len) == 0) + { + /* Yep, this is the entry. */ + break; + } - /* Next entry. */ - elem += second; + /* Next entry. */ + elem += second; + } + while (symb_table[2 * elem] != 0); } return elem; } @@ -2933,7 +2873,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) re_charset_t *mbcset; int *range_alloc; - re_bitset_ptr_t sbcset; + bitset_t sbcset; bracket_elem_t *start_elem, *end_elem; { unsigned int ch; @@ -3016,7 +2956,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) re_charset_t *mbcset; int *coll_sym_alloc; - re_bitset_ptr_t sbcset; + bitset_t sbcset; const unsigned char *name; { int32_t elem, idx; @@ -3093,7 +3033,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) /* if (MB_CUR_MAX > 1) */ - collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); + collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB); @@ -3101,7 +3041,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) _NL_COLLATE_SYMB_EXTRAMB); } #endif - sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); + sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); #ifdef RE_ENABLE_I18N mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); #endif /* RE_ENABLE_I18N */ @@ -3311,12 +3251,12 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); if (BE (mbc_tree == NULL, 0)) goto parse_bracket_exp_espace; - for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx) + for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) if (sbcset[sbc_idx]) break; /* If there are no bits set in sbcset, there is no point of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ - if (sbc_idx < BITSET_UINTS) + if (sbc_idx < BITSET_WORDS) { /* Build a tree for simple bracket. */ br_token.type = SIMPLE_BRACKET; @@ -3364,15 +3304,9 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) /* Parse an element in the bracket expression. */ static reg_errcode_t -parse_bracket_element (elem, regexp, token, token_len, dfa, syntax, - accept_hyphen) - bracket_elem_t *elem; - re_string_t *regexp; - re_token_t *token; - int token_len; - re_dfa_t *dfa; - reg_syntax_t syntax; - int accept_hyphen; +parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, + re_token_t *token, int token_len, re_dfa_t *dfa, + reg_syntax_t syntax, int accept_hyphen) { #ifdef RE_ENABLE_I18N int cur_char_size; @@ -3410,10 +3344,8 @@ parse_bracket_element (elem, regexp, token, token_len, dfa, syntax, [=<equivalent_class>=]. */ static reg_errcode_t -parse_bracket_symbol (elem, regexp, token) - bracket_elem_t *elem; - re_string_t *regexp; - re_token_t *token; +parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, + re_token_t *token) { unsigned char ch, delim = token->opr.c; int i = 0; @@ -3460,16 +3392,13 @@ parse_bracket_symbol (elem, regexp, token) static reg_errcode_t #ifdef RE_ENABLE_I18N -build_equiv_class (sbcset, mbcset, equiv_class_alloc, name) - re_charset_t *mbcset; - int *equiv_class_alloc; +build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, + int *equiv_class_alloc, const unsigned char *name) #else /* not RE_ENABLE_I18N */ -build_equiv_class (sbcset, name) +build_equiv_class (bitset_t sbcset, const unsigned char *name) #endif /* not RE_ENABLE_I18N */ - re_bitset_ptr_t sbcset; - const unsigned char *name; { -#if defined _LIBC +#ifdef _LIBC uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); if (nrules != 0) { @@ -3555,16 +3484,13 @@ build_equiv_class (sbcset, name) static reg_errcode_t #ifdef RE_ENABLE_I18N -build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax) - re_charset_t *mbcset; - int *char_class_alloc; +build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, + re_charset_t *mbcset, int *char_class_alloc, + const char *class_name, reg_syntax_t syntax) #else /* not RE_ENABLE_I18N */ -build_charclass (trans, sbcset, class_name, syntax) +build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, + const char *class_name, reg_syntax_t syntax) #endif /* not RE_ENABLE_I18N */ - unsigned RE_TRANSLATE_TYPE trans; - re_bitset_ptr_t sbcset; - const char *class_name; - reg_syntax_t syntax; { int i; @@ -3593,45 +3519,50 @@ build_charclass (trans, sbcset, class_name, syntax) #endif /* RE_ENABLE_I18N */ #define BUILD_CHARCLASS_LOOP(ctype_func) \ - for (i = 0; i < SBC_MAX; ++i) \ + do { \ + if (BE (trans != NULL, 0)) \ { \ - if (ctype_func (i)) \ - { \ - int ch = trans ? trans[i] : i; \ - bitset_set (sbcset, ch); \ - } \ - } + for (i = 0; i < SBC_MAX; ++i) \ + if (ctype_func (i)) \ + bitset_set (sbcset, trans[i]); \ + } \ + else \ + { \ + for (i = 0; i < SBC_MAX; ++i) \ + if (ctype_func (i)) \ + bitset_set (sbcset, i); \ + } \ + } while (0) if (strcmp (class_name, "alnum") == 0) - BUILD_CHARCLASS_LOOP (isalnum) + BUILD_CHARCLASS_LOOP (isalnum); else if (strcmp (class_name, "cntrl") == 0) - BUILD_CHARCLASS_LOOP (iscntrl) + BUILD_CHARCLASS_LOOP (iscntrl); else if (strcmp (class_name, "lower") == 0) - BUILD_CHARCLASS_LOOP (islower) + BUILD_CHARCLASS_LOOP (islower); else if (strcmp (class_name, "space") == 0) - BUILD_CHARCLASS_LOOP (isspace) + BUILD_CHARCLASS_LOOP (isspace); else if (strcmp (class_name, "alpha") == 0) - BUILD_CHARCLASS_LOOP (isalpha) + BUILD_CHARCLASS_LOOP (isalpha); else if (strcmp (class_name, "digit") == 0) - BUILD_CHARCLASS_LOOP (isdigit) + BUILD_CHARCLASS_LOOP (isdigit); else if (strcmp (class_name, "print") == 0) - BUILD_CHARCLASS_LOOP (isprint) + BUILD_CHARCLASS_LOOP (isprint); else if (strcmp (class_name, "upper") == 0) - BUILD_CHARCLASS_LOOP (isupper) -#ifndef GAWK + BUILD_CHARCLASS_LOOP (isupper); else if (strcmp (class_name, "blank") == 0) - BUILD_CHARCLASS_LOOP (isblank) +#ifndef GAWK + BUILD_CHARCLASS_LOOP (isblank); #else - /* see comments above */ - else if (strcmp (class_name, "blank") == 0) - BUILD_CHARCLASS_LOOP (is_blank) + /* see comments above */ + BUILD_CHARCLASS_LOOP (is_blank); #endif else if (strcmp (class_name, "graph") == 0) - BUILD_CHARCLASS_LOOP (isgraph) + BUILD_CHARCLASS_LOOP (isgraph); else if (strcmp (class_name, "punct") == 0) - BUILD_CHARCLASS_LOOP (ispunct) + BUILD_CHARCLASS_LOOP (ispunct); else if (strcmp (class_name, "xdigit") == 0) - BUILD_CHARCLASS_LOOP (isxdigit) + BUILD_CHARCLASS_LOOP (isxdigit); else return REG_ECTYPE; @@ -3639,13 +3570,10 @@ build_charclass (trans, sbcset, class_name, syntax) } static bin_tree_t * -build_charclass_op (dfa, trans, class_name, extra, non_match, err) - re_dfa_t *dfa; - unsigned RE_TRANSLATE_TYPE trans; - const char *class_name; - const char *extra; - int non_match; - reg_errcode_t *err; +build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, + const char *class_name, + const char *extra, int non_match, + reg_errcode_t *err) { re_bitset_ptr_t sbcset; #ifdef RE_ENABLE_I18N @@ -3656,7 +3584,7 @@ build_charclass_op (dfa, trans, class_name, extra, non_match, err) re_token_t br_token; bin_tree_t *tree; - sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); + sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); #ifdef RE_ENABLE_I18N mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); #endif /* RE_ENABLE_I18N */ @@ -3759,10 +3687,7 @@ build_charclass_op (dfa, trans, class_name, extra, non_match, err) Return -2, If an error is occured. */ static int -fetch_number (input, token, syntax) - re_string_t *input; - re_token_t *token; - reg_syntax_t syntax; +fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) { int num = -1; unsigned char c; @@ -3802,11 +3727,8 @@ free_charset (re_charset_t *cset) /* Create a tree node. */ static bin_tree_t * -create_tree (dfa, left, right, type) - re_dfa_t *dfa; - bin_tree_t *left; - bin_tree_t *right; - re_token_type_t type; +create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, + re_token_type_t type) { re_token_t t; t.type = type; @@ -3814,11 +3736,8 @@ create_tree (dfa, left, right, type) } static bin_tree_t * -create_token_tree (dfa, left, right, token) - re_dfa_t *dfa; - bin_tree_t *left; - bin_tree_t *right; - const re_token_t *token; +create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, + const re_token_t *token) { bin_tree_t *tree; if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) @@ -3854,9 +3773,7 @@ create_token_tree (dfa, left, right, token) To be called from preorder or postorder. */ static reg_errcode_t -mark_opt_subexp (extra, node) - void *extra; - bin_tree_t *node; +mark_opt_subexp (void *extra, bin_tree_t *node) { int idx = (int) (long) extra; if (node->token.type == SUBEXP && node->token.opr.idx == idx) @@ -3896,9 +3813,7 @@ free_tree (void *extra, bin_tree_t *node) it's easier to duplicate. */ static bin_tree_t * -duplicate_tree (root, dfa) - const bin_tree_t *root; - re_dfa_t *dfa; +duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) { const bin_tree_t *node; bin_tree_t *dup_root; |