diff options
| field | value | date |
|---|---|---|
| author | Arnold D. Robbins <arnold@skeeve.com> | 2011-06-17 11:03:41 +0300 |
| committer | Arnold D. Robbins <arnold@skeeve.com> | 2011-06-17 11:03:41 +0300 |
| commit | 61e6d1bdd8bb6518d6293ddf2da845c4195d8535 (patch) | |
| tree | 7d161dafa8727b63fa867f5463d6474a33565a44 | |
| parent | 0479a809ad3a0a0437ce16f889d7b07a09c39323 (diff) | |
| download | egawk-61e6d1bdd8bb6518d6293ddf2da845c4195d8535.tar.gz egawk-61e6d1bdd8bb6518d6293ddf2da845c4195d8535.tar.bz2 egawk-61e6d1bdd8bb6518d6293ddf2da845c4195d8535.zip | |
Put RRI into code.
-rw-r--r-- | ChangeLog | 15 | ||||
-rw-r--r-- | Makefile.am | 2 | ||||
-rw-r--r-- | Makefile.in | 10 | ||||
-rw-r--r-- | dfa.c | 31 | ||||
-rw-r--r-- | hard-locale.c | 83 | ||||
-rw-r--r-- | hard-locale.h | 27 | ||||
-rw-r--r-- | re.c | 14 | ||||
-rw-r--r-- | regcomp.c | 28 | ||||
-rw-r--r-- | regex.h | 5 |
9 files changed, 24 insertions, 191 deletions
@@ -1,3 +1,18 @@ +Fri Jun 17 10:55:27 2011 Arnold D. Robbins <arnold@skeeve.com> + + Implement Rational Range Interpretation (RRI) directly in code. + + * regex.h [RE_RANGES_IGNORE_LOCALES]: Remove macro and its use. + * dfa.c (parse_bracket_exp): Remove use of RE_RANGES_IGNORE_LOCALES + and just do it in code. + (hard-locale.h): Remove include. + (hard_LC_COLLATE): Remove variable and its uses. + * re.c (resetup): Remove use of RE_RANGES_IGNORE_LOCALES. + * regcomp.c (build_range_exp): Remove use of RE_RANGES_IGNORE_LOCALES + and just do it in code. Remove cmp_buf array; it's no longer needed. + * Makefile.am (base_sources): Remove hard_locale.h and hard_locale.c. + * hard_locale.h, hard_locale.c: Removed from dist. + Sun Jun 12 23:43:06 2011 Arnold D. Robbins <arnold@skeeve.com> * re.c (resetup): Always turn on RE_RANGES_IGNORE_LOCALES. diff --git a/Makefile.am b/Makefile.am index c89f6423..d08ff172 100644 --- a/Makefile.am +++ b/Makefile.am @@ -96,8 +96,6 @@ base_sources = \ getopt1.c \ getopt_int.h \ gettext.h \ - hard-locale.h \ - hard-locale.c \ io.c \ mbsupport.h \ main.c \ diff --git a/Makefile.in b/Makefile.in index 6348d56b..4feab746 100644 --- a/Makefile.in +++ b/Makefile.in @@ -90,10 +90,9 @@ PROGRAMS = $(bin_PROGRAMS) am__objects_1 = array.$(OBJEXT) awkgram.$(OBJEXT) builtin.$(OBJEXT) \ dfa.$(OBJEXT) ext.$(OBJEXT) field.$(OBJEXT) \ floatcomp.$(OBJEXT) gawkmisc.$(OBJEXT) getopt.$(OBJEXT) \ - getopt1.$(OBJEXT) hard-locale.$(OBJEXT) io.$(OBJEXT) \ - main.$(OBJEXT) msg.$(OBJEXT) node.$(OBJEXT) random.$(OBJEXT) \ - re.$(OBJEXT) regex.$(OBJEXT) replace.$(OBJEXT) \ - version.$(OBJEXT) + getopt1.$(OBJEXT) io.$(OBJEXT) main.$(OBJEXT) msg.$(OBJEXT) \ + node.$(OBJEXT) random.$(OBJEXT) re.$(OBJEXT) regex.$(OBJEXT) \ + replace.$(OBJEXT) version.$(OBJEXT) am_dgawk_OBJECTS = $(am__objects_1) eval_d.$(OBJEXT) profile.$(OBJEXT) \ command.$(OBJEXT) debug.$(OBJEXT) dgawk_OBJECTS = $(am_dgawk_OBJECTS) @@ -372,8 +371,6 @@ base_sources = \ getopt1.c \ getopt_int.h \ 
gettext.h \ - hard-locale.h \ - hard-locale.c \ io.c \ mbsupport.h \ main.c \ @@ -530,7 +527,6 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gawkmisc.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getopt.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getopt1.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hard-locale.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/io.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/msg.Po@am__quote@ @@ -64,7 +64,6 @@ #endif #endif -/* need this before include of hard-locale.h */ #ifdef GAWK #define bool int #define true (1) @@ -73,7 +72,6 @@ #include "regex.h" #include "dfa.h" -#include "hard-locale.h" #include "xalloc.h" #ifdef GAWK @@ -650,7 +648,6 @@ static int laststart; /* True if we're separated from beginning or (, | only by zero-width characters. */ static int parens; /* Count of outstanding left parens. */ static int minrep, maxrep; /* Repeat counts for {m,n}. */ -static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */ static int cur_mb_len = 1; /* Length of the multibyte representation of wctok. */ @@ -1007,29 +1004,8 @@ parse_bracket_exp (void) c1 = tolower (c1); c2 = tolower (c2); } - if (!hard_LC_COLLATE - || (syntax_bits & RE_RANGES_IGNORE_LOCALES)) - for (c = c1; c <= c2; c++) - setbit_case_fold_c (c, ccl); - else - { - /* Defer to the system regex library about the meaning - of range expressions. 
*/ - regex_t re; - char pattern[6] = { '[', 0, '-', 0, ']', 0 }; - char subject[2] = { 0, 0 }; - pattern[1] = c1; - pattern[3] = c2; - regcomp (&re, pattern, REG_NOSUB); - for (c = 0; c < NOTCHAR; ++c) - { - subject[0] = c; - if (!(case_fold && isupper (c)) - && regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH) - setbit_case_fold_c (c, ccl); - } - regfree (&re); - } + for (c = c1; c <= c2; c++) + setbit_case_fold_c (c, ccl); } colon_warning_state |= 8; @@ -1821,9 +1797,6 @@ dfaparse (char const *s, size_t len, struct dfa *d) lasttok = END; laststart = 1; parens = 0; -#ifdef LC_COLLATE - hard_LC_COLLATE = hard_locale (LC_COLLATE); -#endif #if MBS_SUPPORT if (MB_CUR_MAX > 1) { diff --git a/hard-locale.c b/hard-locale.c deleted file mode 100644 index 8b7353bb..00000000 --- a/hard-locale.c +++ /dev/null @@ -1,83 +0,0 @@ -/* hard-locale.c -- Determine whether a locale is hard. - - Copyright (C) 1997, 1998, 1999, 2002, 2003, 2004, 2006, 2007, 2009, 2010, - 2011, - Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ - -#include <config.h> - -#ifdef GAWK -#define bool int -#define true (1) -#define false (0) -#endif - -#include "hard-locale.h" - -#ifdef HAVE_LOCALE_H -#include <locale.h> -#endif -#include <stdlib.h> -#include <string.h> - -#ifdef __GLIBC__ -# define GLIBC_VERSION __GLIBC__ -#else -# define GLIBC_VERSION 0 -#endif - -/* Return true if the current CATEGORY locale is hard, i.e. if you - can't get away with assuming traditional C or POSIX behavior. */ -bool -hard_locale (int category) -{ -#if ! (defined ENABLE_NLS && HAVE_SETLOCALE) - return 0; -#else - bool hard = true; - char const *p = setlocale (category, NULL); - - if (p) - { - if (2 <= GLIBC_VERSION) - { - if (strcmp (p, "C") == 0 || strcmp (p, "POSIX") == 0) - hard = false; - } - else - { - char *locale = strdup (p); - if (locale) - { - /* Temporarily set the locale to the "C" and "POSIX" locales - to find their names, so that we can determine whether one - or the other is the caller's locale. */ - if (((p = setlocale (category, "C")) - && strcmp (p, locale) == 0) - || ((p = setlocale (category, "POSIX")) - && strcmp (p, locale) == 0)) - hard = false; - - /* Restore the caller's locale. */ - setlocale (category, locale); - free (locale); - } - } - } - - return hard; -#endif -} diff --git a/hard-locale.h b/hard-locale.h deleted file mode 100644 index 160d5443..00000000 --- a/hard-locale.h +++ /dev/null @@ -1,27 +0,0 @@ -/* Determine whether a locale is hard. - - Copyright (C) 1999, 2003, 2004, 2009, 2010, 2011 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. */ - -#ifndef HARD_LOCALE_H_ -# define HARD_LOCALE_H_ 1 - -#ifndef GAWK -# include <stdbool.h> -#endif - -bool hard_locale (int); - -#endif /* HARD_LOCALE_H_ */ @@ -388,20 +388,6 @@ resetup() syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */ /* - * As of POSIX 1003.1-2008 (see rule 7 of - * http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_03_05 - * and the rationale, at http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap09.html#tag_21_09_03_05) - * POSIX changed ranges outside the POSIX locale from requiring - * Collation Element Order to being "undefined". This gives an - * implementation, like gawk, the freedom to do ranges as it - * pleases. - * - * We very much please to always use numeric ordering, as - * the Good Lord intended. - */ - syn |= RE_RANGES_IGNORE_LOCALES; - - /* * Interval expressions are now on by default, as POSIX is * wide-spread enough that people want it. The do_intervals * variable remains for use with --traditional. @@ -2651,7 +2651,6 @@ build_range_exp (reg_syntax_t syntax, bitset_t sbcset, # endif /* not RE_ENABLE_I18N */ { unsigned int start_ch, end_ch; - int ignore_locales = (syntax & RE_RANGES_IGNORE_LOCALES) != 0; /* Equivalence Classes and Character Classes can't be a range start/end. */ if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS @@ -2672,7 +2671,6 @@ build_range_exp (reg_syntax_t syntax, bitset_t sbcset, wchar_t wc; wint_t start_wc; wint_t end_wc; - wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch : ((start_elem->type == COLL_SYM) ? 
start_elem->opr.name[0] @@ -2698,12 +2696,7 @@ build_range_exp (reg_syntax_t syntax, bitset_t sbcset, #endif if (start_wc == WEOF || end_wc == WEOF) return REG_ECOLLATE; - cmp_buf[0] = start_wc; - cmp_buf[4] = end_wc; - if (ignore_locales && start_wc > end_wc) - return REG_ERANGE; - else if ((syntax & RE_NO_EMPTY_RANGES) - && wcscoll (cmp_buf, cmp_buf + 4) > 0) + else if ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc) return REG_ERANGE; /* Got valid collation sequence values, add them as a new entry. @@ -2742,23 +2735,10 @@ build_range_exp (reg_syntax_t syntax, bitset_t sbcset, } /* Build the table for single byte characters. */ - if (ignore_locales) - { - for (wc = 0; wc < SBC_MAX; ++wc) - { - if (start_wc <= wc && wc <= end_wc) - bitset_set (sbcset, wc); - } - } - else + for (wc = 0; wc < SBC_MAX; ++wc) { - for (wc = 0; wc < SBC_MAX; ++wc) - { - cmp_buf[2] = wc; - if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 - && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) - bitset_set (sbcset, wc); - } + if (start_wc <= wc && wc <= end_wc) + bitset_set (sbcset, wc); } } # else /* not RE_ENABLE_I18N */ @@ -184,10 +184,6 @@ typedef unsigned long int reg_syntax_t; /* If this bit is set, then no_sub will be set to 1 during re_compile_pattern. */ # define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1) - -/* If this bit is set, then ranges act like they are in - the "C" locale. */ -# define RE_RANGES_IGNORE_LOCALES (RE_NO_SUB << 1) #endif /* This global variable defines the particular regexp syntax to use (for @@ -213,7 +209,6 @@ extern reg_syntax_t re_syntax_options; #define RE_SYNTAX_GNU_AWK \ ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \ - | RE_RANGES_IGNORE_LOCALES \ | RE_INVALID_INTERVAL_ORD) \ & ~(RE_DOT_NOT_NULL | RE_CONTEXT_INDEP_OPS \ | RE_CONTEXT_INVALID_OPS )) |