diff options
Diffstat (limited to 'regex.c')
-rw-r--r-- | regex.c | 229 |
1 files changed, 153 insertions, 76 deletions
@@ -1,4 +1,4 @@ -/* Copyright 2009-2020 +/* Copyright 2009-2024 * Kaz Kylheku <kaz@kylheku.com> * Vancouver, Canada * All rights reserved. @@ -6,23 +6,24 @@ * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. */ #include <stdio.h> @@ -256,6 +257,9 @@ union regex_machine { int opt_derivative_regex = 0; +struct cobj_class *regex_cls; +static struct cobj_class *chset_cls; + wchar_t spaces[] = { 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x0020, 0x00a0, 0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, @@ -296,7 +300,8 @@ static void L0_fill_range(cset_L0_t *L0, wchar_t ch0, wchar_t ch1) static int L0_contains(cset_L0_t *L0, wchar_t ch) { - return ((*L0)[CHAR_SET_INDEX(ch)] & (1 << CHAR_SET_BIT(ch))) != 0; + return ((*L0)[CHAR_SET_INDEX(ch)] & + (convert(bitcell_t, 1) << CHAR_SET_BIT(ch))) != 0; } static int L1_full(cset_L1_t *L1) @@ -320,8 +325,10 @@ static void L1_fill_range(cset_L1_t *L1, wchar_t ch0, wchar_t ch1) cset_L0_t *L0; if (i1 > i10 && i1 < i11) { - free((*L1)[i1]); - (*L1)[i1] = coerce(cset_L0_t *, -1); + if ((*L1)[i1] != coerce(cset_L0_t *, -1)) { + free((*L1)[i1]); + (*L1)[i1] = coerce(cset_L0_t *, -1); + } continue; } else if (i10 == i11) { c0 = ch0; @@ -401,8 +408,10 @@ static void L2_fill_range(cset_L2_t *L2, wchar_t ch0, wchar_t ch1) cset_L1_t *L1; if (i2 > i20 && i2 < i21) { - free((*L2)[i2]); - (*L2)[i2] = coerce(cset_L1_t *, -1); + if ((*L2)[i2] != coerce(cset_L1_t *, -1)) { + free((*L2)[i2]); + (*L2)[i2] = coerce(cset_L1_t *, -1); + } continue; } else if (i20 == i21) { c0 = ch0; @@ -473,8 +482,10 @@ static void L3_fill_range(cset_L3_t *L3, wchar_t ch0, wchar_t ch1) cset_L2_t *L2; if 
(i3 > i30 && i3 < i31) { - free((*L3)[i3]); - (*L3)[i3] = coerce(cset_L2_t *, -1); + if ((*L3)[i3] != coerce(cset_L2_t *, -1)) { + free((*L3)[i3]); + (*L3)[i3] = coerce(cset_L2_t *, -1); + } continue; } else if (i30 == i31) { c0 = ch0; @@ -587,7 +598,8 @@ static void char_set_add(char_set_t *set, wchar_t ch) /* fallthrough */ case CHSET_SMALL: assert (ch < 256); - set->s.bitcell[CHAR_SET_INDEX(ch)] |= (1 << CHAR_SET_BIT(ch)); + set->s.bitcell[CHAR_SET_INDEX(ch)] |= (convert(bitcell_t, 1) + << CHAR_SET_BIT(ch)); break; case CHSET_LARGE: assert (ch < 0x10000); @@ -1094,8 +1106,8 @@ static void nfa_map_states(nfa_state_t *s, static void nfa_count_one(nfa_state_t *s, mem_t *ctx) { - (void) s; int *pcount = coerce(int *, ctx); + (void) s; (*pcount)++; } @@ -1603,17 +1615,17 @@ static val reg_nary_to_bin(val regex) static val reg_compile_csets(val exp) { if (exp == space_k) { - return cobj(coerce(mem_t *, space_cs), chset_s, &char_set_obj_ops); + return cobj(coerce(mem_t *, space_cs), chset_cls, &char_set_obj_ops); } else if (exp == digit_k) { - return cobj(coerce(mem_t *, digit_cs), chset_s, &char_set_obj_ops); + return cobj(coerce(mem_t *, digit_cs), chset_cls, &char_set_obj_ops); } else if (exp == word_char_k) { - return cobj(coerce(mem_t *, word_cs), chset_s, &char_set_obj_ops); + return cobj(coerce(mem_t *, word_cs), chset_cls, &char_set_obj_ops); } else if (exp == cspace_k) { - return cobj(coerce(mem_t *, cspace_cs), chset_s, &char_set_obj_ops); + return cobj(coerce(mem_t *, cspace_cs), chset_cls, &char_set_obj_ops); } else if (exp == cdigit_k) { - return cobj(coerce(mem_t *, cdigit_cs), chset_s, &char_set_obj_ops); + return cobj(coerce(mem_t *, cdigit_cs), chset_cls, &char_set_obj_ops); } else if (exp == cword_char_k) { - return cobj(coerce(mem_t *, cword_cs), chset_s, &char_set_obj_ops); + return cobj(coerce(mem_t *, cword_cs), chset_cls, &char_set_obj_ops); } else if (symbolp(exp) || chrp(exp)) { return exp; } else if (stringp(exp)) { @@ -1624,7 +1636,7 @@ 
static val reg_compile_csets(val exp) if (sym == set_s || sym == cset_s) { char_set_t *set = char_set_compile(args, eq(sym, cset_s)); - return cobj(coerce(mem_t *, set), chset_s, &char_set_obj_ops); + return cobj(coerce(mem_t *, set), chset_cls, &char_set_obj_ops); } else if (sym == compound_s || sym == zeroplus_s || sym == oneplus_s || sym == optional_s || sym == compl_s || sym == nongreedy_s || sym == or_s || sym == and_s) @@ -1835,7 +1847,7 @@ static val reg_derivative(val exp, val ch) return t; } else if (chrp(exp)) { return null(eq(exp, ch)); - } else if (cobjclassp(exp, chset_s)) { + } else if (cobjclassp(exp, chset_cls)) { char_set_t *set = coerce(char_set_t *, exp->co.handle); return if3(char_set_contains(set, c_chr(ch)), nil, t); } else if (exp == wild_s) { @@ -2203,16 +2215,23 @@ static val regex_requires_dv(val exp) } } +static val regex_optimize(val regex_sexp) +{ + return reg_optimize(reg_expand_nongreedy(reg_nary_to_bin(regex_sexp))); +} + val regex_compile(val regex_sexp, val error_stream) { - val regex_source = regex_sexp; + val regex_source; if (stringp(regex_sexp)) { - regex_sexp = regex_parse(regex_sexp, default_null_arg(error_stream)); + regex_sexp = regex_parse(regex_sexp, error_stream); return if2(regex_sexp, regex_compile(regex_sexp, error_stream)); } - regex_sexp = reg_optimize(reg_expand_nongreedy(reg_nary_to_bin(regex_sexp))); + regex_source = reg_nary_to_bin(regex_sexp); + + regex_sexp = reg_optimize(reg_expand_nongreedy(regex_source)); if (opt_derivative_regex || regex_requires_dv(regex_sexp)) { regex_t *regex = coerce(regex_t *, chk_malloc(sizeof *regex)); @@ -2221,7 +2240,7 @@ val regex_compile(val regex_sexp, val error_stream) regex->kind = REGEX_DV; regex->nstates = 0; regex->source = nil; - ret = cobj(coerce(mem_t *, regex), regex_s, &regex_obj_ops); + ret = cobj(coerce(mem_t *, regex), regex_cls, &regex_obj_ops); regex->r.dv = dv; regex->source = regex_source; return ret; @@ -2230,7 +2249,7 @@ val regex_compile(val regex_sexp, val
error_stream) val ret; regex->kind = REGEX_NFA; regex->source = nil; - ret = cobj(coerce(mem_t *, regex), regex_s, &regex_obj_ops); + ret = cobj(coerce(mem_t *, regex), regex_cls, &regex_obj_ops); regex->r.nfa = nfa_optimize(nfa_compile_regex(regex_sexp)); regex->nstates = nfa_count_states(regex->r.nfa.start); regex->source = regex_source; @@ -2240,14 +2259,14 @@ val regex_compile(val regex_sexp, val error_stream) val regexp(val obj) { - return cobjclassp(obj, regex_s); + return cobjclassp(obj, regex_cls); } val regex_source(val compiled_regex) { val self = lit("regex-source"); regex_t *regex = coerce(regex_t *, - cobj_handle(self, compiled_regex, regex_s)); + cobj_handle(self, compiled_regex, regex_cls)); return regex->source; } @@ -2290,6 +2309,8 @@ static void paren_print_rec(val exp, val stream, int *semi_flag) static void print_rec(val exp, val stream, int *semi_flag) { + val self = lit("regex-print"); + if (exp == space_k) { puts_clear_flag(lit("\\s"), stream, semi_flag); } else if (exp == digit_k) { @@ -2319,7 +2340,7 @@ static void print_rec(val exp, val stream, int *semi_flag) } } else if (stringp(exp)) { cnum i; - cnum l = c_num(length(exp)); + cnum l = c_num(length(exp), self); for (i = 0; i < l; i++) print_rec(chr_str(exp, num(i)), stream, semi_flag); } else if (consp(exp)) { @@ -2327,10 +2348,10 @@ static void print_rec(val exp, val stream, int *semi_flag) val args = rest(exp); if (sym == set_s || sym == cset_s) { - putc_clear_flag(chr('['), stream, semi_flag); - val first_p = t; + putc_clear_flag(chr('['), stream, semi_flag); + if (sym == cset_s) { put_char(chr('^'), stream); first_p = nil; @@ -2419,7 +2440,7 @@ static void print_rec(val exp, val stream, int *semi_flag) static void regex_print(val obj, val stream, val pretty, struct strm_ctx *ctx) { val self = lit("regex-print"); - regex_t *regex = coerce(regex_t *, cobj_handle(self, obj, regex_s)); + regex_t *regex = coerce(regex_t *, cobj_handle(self, obj, regex_cls)); int semi_flag = 0; (void) pretty; @@
-2433,7 +2454,7 @@ static void regex_print(val obj, val stream, val pretty, struct strm_ctx *ctx) static cnum regex_run(val compiled_regex, const wchar_t *str) { val self = lit("regex-run"); - regex_t *regex = coerce(regex_t *, cobj_handle(self, compiled_regex, regex_s)); + regex_t *regex = coerce(regex_t *, cobj_handle(self, compiled_regex, regex_cls)); return if3(regex->kind == REGEX_DV, dv_run(regex->r.dv, str), @@ -2477,7 +2498,7 @@ static void regex_machine_reset(regex_machine_t *regm) static void regex_machine_init(val self, regex_machine_t *regm, val reg) { - regex_t *regex = coerce(regex_t *, cobj_handle(self, reg, regex_s)); + regex_t *regex = coerce(regex_t *, cobj_handle(self, reg, regex_cls)); if (regex->kind == REGEX_DV) { regm->n.is_nfa = 0; @@ -2584,15 +2605,15 @@ val search_regex(val haystack, val needle_regex, val start, if (from_end) { cnum i; - cnum s = c_num(start); - const wchar_t *h = c_str(haystack); + cnum s = c_num(start, self); + const wchar_t *h = c_str(haystack, self); slen = (slen ? 
slen : length_str(haystack)); if (regex_run(needle_regex, L"") >= 0) return cons(slen, zero); - for (i = c_num(slen) - 1; i >= s; i--) { + for (i = c_num(slen, self) - 1; i >= s; i--) { cnum span = regex_run(needle_regex, h + i); if (span >= 0) return cons(num(i), num(span)); @@ -2867,32 +2888,44 @@ val regsub(val regex, val repl, val str) { val rf = from(range); val rt = to(range); + val scopy = copy_str(str); - return replace_str(str, if3(isfunc, - funcall1(repl, sub_str(str, rf, rt)), - repl), + return replace_str(scopy, if3(isfunc, + funcall1(repl, sub_str(scopy, rf, rt)), + repl), rf, rt); } } else { - list_collect_decl (out, ptail); val pos = zero; + val out = mkustring(zero); + val slen = if2(stringp(regex), length(regex)); do { - cons_bind (find, len, search_regex(str, regex, pos, nil)); + val find, len; + + if (slen) { + len = slen; + find = search_str(str, regex, pos, nil); + } else { + cons_bind (a, d, search_regex(str, regex, pos, nil)); + find = a; + len = d; + } + if (!find) { if (pos == zero) return str; - ptail = list_collect(ptail, sub_str(str, pos, nil)); - break; + return string_extend(out, sub_str(str, pos, nil), t); } - ptail = list_collect(ptail, sub_str(str, pos, find)); - ptail = list_collect(ptail, if3(isfunc, - funcall1(repl, sub_str(str, find, - plus(find, len))), - repl)); + string_extend(out, sub_str(str, pos, find), nil); + string_extend(out, if3(isfunc, + funcall1(repl, sub_str(str, find, + plus(find, len))), + repl), + nil); if (len == zero && eql(find, pos)) { if (lt(pos, length_str(str))) { - ptail = list_collect(ptail, chr_str(str, pos)); + string_extend(out, chr_str(str, pos), nil); pos = plus(pos, one); } } else { @@ -2900,7 +2933,7 @@ val regsub(val regex, val repl, val str) } } while (lt(pos, length_str(str))); - return cat_str(out, nil); + return string_finish(out); } } @@ -3159,7 +3192,7 @@ static val scan_until_common(val self, val regex, val stream_in, if (!out) out = mkstring(one, ch); else - string_extend(out, ch); + 
string_extend(out, ch, nil); } else { count++; } @@ -3216,6 +3249,32 @@ val count_until_match(val regex, val stream_in) return scan_until_common(lit("count-until-match"), regex, stream_in, nil, nil); } +static val trim_left(val regex, val string) +{ + if (regexp(regex)) { + val pos = match_regex(string, regex, nil); + if (pos) + return sub_str(string, pos, t); + } else if (starts_with(regex, string, nil, nil)) { + return sub_str(string, length(regex), t); + } + + return string; +} + +static val trim_right(val regex, val string) +{ + if (regexp(regex)) { + val pos = match_regex_right(string, regex, nil); + if (pos) + return sub_str(string, zero, minus(length(string), pos)); + } else if (ends_with(regex, string, nil, nil)) { + return sub_str(string, zero, minus(length(string), length(regex))); + } + + return string; +} + static char_set_t *create_wide_cs(void) { #ifdef FULL_UNICODE @@ -3226,14 +3285,13 @@ static char_set_t *create_wide_cs(void) char_set_t *cs = char_set_create(cst, 0, 1); - char_set_add_range(cs, 0x1100, 0x115F); + char_set_add_range(cs, 0x1100, 0x11F9); char_set_add_range(cs, 0x2329, 0x232A); char_set_add_range(cs, 0x2E80, 0x2E99); char_set_add_range(cs, 0x2E9B, 0x2EF3); char_set_add_range(cs, 0x2F00, 0x2FD5); char_set_add_range(cs, 0x2FF0, 0x2FFB); - char_set_add_range(cs, 0x3000, 0x303E); - char_set_add_range(cs, 0x3000, 0x303E); + char_set_add_range(cs, 0x3000, 0x303F); char_set_add_range(cs, 0x3041, 0x3096); char_set_add_range(cs, 0x3099, 0x30FF); char_set_add_range(cs, 0x3105, 0x312D); @@ -3242,13 +3300,13 @@ static char_set_t *create_wide_cs(void) char_set_add_range(cs, 0x31C0, 0x31E3); char_set_add_range(cs, 0x31F0, 0x321E); char_set_add_range(cs, 0x3220, 0x3247); - char_set_add_range(cs, 0x3250, 0x32FE); - char_set_add_range(cs, 0x3300, 0x4DB5); + char_set_add_range(cs, 0x3250, 0x4DBF); char_set_add_range(cs, 0x4E00, 0x9FFF); char_set_add_range(cs, 0xA000, 0xA48C); char_set_add_range(cs, 0xA490, 0xA4C6); char_set_add_range(cs, 0xA960, 
0xA97C); char_set_add_range(cs, 0xAC00, 0xD7A3); + char_set_add_range(cs, 0xE000, 0xE757); char_set_add_range(cs, 0xF900, 0xFAFF); char_set_add_range(cs, 0xFE10, 0xFE19); char_set_add_range(cs, 0xFE30, 0xFE52); @@ -3258,12 +3316,19 @@ static char_set_t *create_wide_cs(void) #ifdef FULL_UNICODE char_set_add_range(cs, 0x1B000, 0x1B001); + char_set_add_range(cs, 0x1F004, 0x1F004); + char_set_add_range(cs, 0x1F0CF, 0x1F0CF); + char_set_add_range(cs, 0x1F170, 0x1F171); + char_set_add_range(cs, 0x1F17E, 0x1F17F); + char_set_add_range(cs, 0x1F191, 0x1F19A); char_set_add_range(cs, 0x1F200, 0x1F202); char_set_add_range(cs, 0x1F210, 0x1F23A); char_set_add_range(cs, 0x1F240, 0x1F248); char_set_add_range(cs, 0x1F250, 0x1F251); - char_set_add_range(cs, 0x20000, 0x2FFFD); - char_set_add_range(cs, 0x30000, 0x3FFFD); + char_set_add_range(cs, 0x1F300, 0x1F7FF); + char_set_add_range(cs, 0x1F900, 0x1FAFF); + char_set_add_range(cs, 0x20000, 0x2FFFF); + char_set_add_range(cs, 0x30000, 0x3FFFF); #endif return cs; @@ -3294,22 +3359,21 @@ void regex_init(void) cdigit_k = intern(lit("cdigit"), keyword_package); cword_char_k = intern(lit("cword-char"), keyword_package); + regex_cls = cobj_register(regex_s); + chset_cls = cobj_register(chset_s); + reg_fun(intern(lit("regex-compile"), user_package), func_n2o(regex_compile, 1)); reg_fun(intern(lit("regexp"), user_package), func_n1(regexp)); reg_fun(intern(lit("regex-source"), user_package), func_n1(regex_source)); reg_fun(intern(lit("search-regex"), user_package), func_n4o(search_regex, 2)); reg_fun(intern(lit("range-regex"), user_package), func_n4o(range_regex, 2)); reg_fun(intern(lit("search-regst"), user_package), func_n4o(search_regst, 2)); - reg_fun(intern(lit("match-regex"), user_package), - func_n3o((opt_compat && opt_compat <= 150) ? 
- match_regex : match_regex_len, 2)); + reg_fun(intern(lit("match-regex"), user_package), func_n3o(match_regex_len, 2)); reg_fun(intern(lit("match-regst"), user_package), func_n3o(match_regst, 2)); reg_fun(intern(lit("match-regex-right"), user_package), - func_n3o((opt_compat && opt_compat <= 150) ? - match_regex_right_old : match_regex_right, 2)); + func_n3o(match_regex_right, 2)); reg_fun(intern(lit("match-regst-right"), user_package), - func_n3o((opt_compat && opt_compat <= 150) ? - match_regst_right_old : match_regst_right, 2)); + func_n3o(match_regst_right, 2)); reg_fun(intern(lit("regex-prefix-match"), user_package), func_n3o(regex_prefix_match, 2)); reg_fun(intern(lit("regsub"), user_package), func_n3(regsub)); @@ -3317,7 +3381,7 @@ void regex_init(void) reg_fun(intern(lit("reg-expand-nongreedy"), system_package), func_n1(reg_expand_nongreedy)); - reg_fun(intern(lit("reg-optimize"), system_package), func_n1(reg_optimize)); + reg_fun(intern(lit("regex-optimize"), user_package), func_n1(regex_optimize)); reg_fun(intern(lit("read-until-match"), user_package), func_n3o(read_until_match, 1)); reg_fun(intern(lit("scan-until-match"), user_package), func_n2(scan_until_match)); reg_fun(intern(lit("count-until-match"), user_package), func_n2(count_until_match)); @@ -3336,9 +3400,22 @@ void regex_init(void) reg_fun(intern(lit("fr^"), user_package), func_n2o(regex_range_left_fun, 1)); reg_fun(intern(lit("fr$"), user_package), func_n2o(regex_range_right_fun, 1)); reg_fun(intern(lit("frr"), user_package), func_n3o(regex_range_search_fun, 1)); + reg_fun(intern(lit("trim-left"), user_package), func_n2(trim_left)); + reg_fun(intern(lit("trim-right"), user_package), func_n2(trim_right)); init_special_char_sets(); } +void regex_compat_fixup(int compat_ver) +{ + if (compat_ver <= 150) { + reg_fun(intern(lit("match-regex"), user_package), func_n3o(match_regex, 2)); + reg_fun(intern(lit("match-regex-right"), user_package), + func_n3o(match_regex_right_old, 2)); + 
reg_fun(intern(lit("match-regst-right"), user_package), + func_n3o(match_regst_right_old, 2)); + } +} + void regex_free_all(void) { char_set_destroy(space_cs, 1); |