about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Arnold D. Robbins <arnold@skeeve.com> 2011-06-17 11:03:41 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2011-06-17 11:03:41 +0300
commit: 61e6d1bdd8bb6518d6293ddf2da845c4195d8535 (patch)
tree: 7d161dafa8727b63fa867f5463d6474a33565a44
parent: 0479a809ad3a0a0437ce16f889d7b07a09c39323 (diff)
download: egawk-61e6d1bdd8bb6518d6293ddf2da845c4195d8535.tar.gz
download: egawk-61e6d1bdd8bb6518d6293ddf2da845c4195d8535.tar.bz2
download: egawk-61e6d1bdd8bb6518d6293ddf2da845c4195d8535.zip
Put RRI into code.
-rw-r--r-- ChangeLog 15
-rw-r--r-- Makefile.am 2
-rw-r--r-- Makefile.in 10
-rw-r--r-- dfa.c 31
-rw-r--r-- hard-locale.c 83
-rw-r--r-- hard-locale.h 27
-rw-r--r-- re.c 14
-rw-r--r-- regcomp.c 28
-rw-r--r-- regex.h 5
9 files changed, 24 insertions, 191 deletions
diff --git a/ChangeLog b/ChangeLog
index 8507b985..6e655835 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+Fri Jun 17 10:55:27 2011 Arnold D. Robbins <arnold@skeeve.com>
+
+ Implement Rational Range Interpretation (RRI) directly in code.
+
+ * regex.h [RE_RANGES_IGNORE_LOCALES]: Remove macro and its use.
+ * dfa.c (parse_bracket_exp): Remove use of RE_RANGES_IGNORE_LOCALES
+ and just do it in code.
+ (hard-locale.h): Remove include.
+ (hard_LC_COLLATE): Remove variable and its uses.
+ * re.c (resetup): Remove use of RE_RANGES_IGNORE_LOCALES.
+ * regcomp.c (build_range_exp): Remove use of RE_RANGES_IGNORE_LOCALES
+ and just do it in code. Remove cmp_buf array; it's no longer needed.
+ * Makefile.am (base_sources): Remove hard-locale.h and hard-locale.c.
+ * hard-locale.h, hard-locale.c: Removed from dist.
+
Sun Jun 12 23:43:06 2011 Arnold D. Robbins <arnold@skeeve.com>
* re.c (resetup): Always turn on RE_RANGES_IGNORE_LOCALES.
diff --git a/Makefile.am b/Makefile.am
index c89f6423..d08ff172 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -96,8 +96,6 @@ base_sources = \
getopt1.c \
getopt_int.h \
gettext.h \
- hard-locale.h \
- hard-locale.c \
io.c \
mbsupport.h \
main.c \
diff --git a/Makefile.in b/Makefile.in
index 6348d56b..4feab746 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -90,10 +90,9 @@ PROGRAMS = $(bin_PROGRAMS)
am__objects_1 = array.$(OBJEXT) awkgram.$(OBJEXT) builtin.$(OBJEXT) \
dfa.$(OBJEXT) ext.$(OBJEXT) field.$(OBJEXT) \
floatcomp.$(OBJEXT) gawkmisc.$(OBJEXT) getopt.$(OBJEXT) \
- getopt1.$(OBJEXT) hard-locale.$(OBJEXT) io.$(OBJEXT) \
- main.$(OBJEXT) msg.$(OBJEXT) node.$(OBJEXT) random.$(OBJEXT) \
- re.$(OBJEXT) regex.$(OBJEXT) replace.$(OBJEXT) \
- version.$(OBJEXT)
+ getopt1.$(OBJEXT) io.$(OBJEXT) main.$(OBJEXT) msg.$(OBJEXT) \
+ node.$(OBJEXT) random.$(OBJEXT) re.$(OBJEXT) regex.$(OBJEXT) \
+ replace.$(OBJEXT) version.$(OBJEXT)
am_dgawk_OBJECTS = $(am__objects_1) eval_d.$(OBJEXT) profile.$(OBJEXT) \
command.$(OBJEXT) debug.$(OBJEXT)
dgawk_OBJECTS = $(am_dgawk_OBJECTS)
@@ -372,8 +371,6 @@ base_sources = \
getopt1.c \
getopt_int.h \
gettext.h \
- hard-locale.h \
- hard-locale.c \
io.c \
mbsupport.h \
main.c \
@@ -530,7 +527,6 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gawkmisc.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getopt.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getopt1.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hard-locale.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/io.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/msg.Po@am__quote@
diff --git a/dfa.c b/dfa.c
index 2042dc34..f1557423 100644
--- a/dfa.c
+++ b/dfa.c
@@ -64,7 +64,6 @@
#endif
#endif
-/* need this before include of hard-locale.h */
#ifdef GAWK
#define bool int
#define true (1)
@@ -73,7 +72,6 @@
#include "regex.h"
#include "dfa.h"
-#include "hard-locale.h"
#include "xalloc.h"
#ifdef GAWK
@@ -650,7 +648,6 @@ static int laststart; /* True if we're separated from beginning or (, |
only by zero-width characters. */
static int parens; /* Count of outstanding left parens. */
static int minrep, maxrep; /* Repeat counts for {m,n}. */
-static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
static int cur_mb_len = 1; /* Length of the multibyte representation of
wctok. */
@@ -1007,29 +1004,8 @@ parse_bracket_exp (void)
c1 = tolower (c1);
c2 = tolower (c2);
}
- if (!hard_LC_COLLATE
- || (syntax_bits & RE_RANGES_IGNORE_LOCALES))
- for (c = c1; c <= c2; c++)
- setbit_case_fold_c (c, ccl);
- else
- {
- /* Defer to the system regex library about the meaning
- of range expressions. */
- regex_t re;
- char pattern[6] = { '[', 0, '-', 0, ']', 0 };
- char subject[2] = { 0, 0 };
- pattern[1] = c1;
- pattern[3] = c2;
- regcomp (&re, pattern, REG_NOSUB);
- for (c = 0; c < NOTCHAR; ++c)
- {
- subject[0] = c;
- if (!(case_fold && isupper (c))
- && regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
- setbit_case_fold_c (c, ccl);
- }
- regfree (&re);
- }
+ for (c = c1; c <= c2; c++)
+ setbit_case_fold_c (c, ccl);
}
colon_warning_state |= 8;
@@ -1821,9 +1797,6 @@ dfaparse (char const *s, size_t len, struct dfa *d)
lasttok = END;
laststart = 1;
parens = 0;
-#ifdef LC_COLLATE
- hard_LC_COLLATE = hard_locale (LC_COLLATE);
-#endif
#if MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
diff --git a/hard-locale.c b/hard-locale.c
deleted file mode 100644
index 8b7353bb..00000000
--- a/hard-locale.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/* hard-locale.c -- Determine whether a locale is hard.
-
- Copyright (C) 1997, 1998, 1999, 2002, 2003, 2004, 2006, 2007, 2009, 2010,
- 2011,
- Free Software Foundation, Inc.
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>. */
-
-#include <config.h>
-
-#ifdef GAWK
-#define bool int
-#define true (1)
-#define false (0)
-#endif
-
-#include "hard-locale.h"
-
-#ifdef HAVE_LOCALE_H
-#include <locale.h>
-#endif
-#include <stdlib.h>
-#include <string.h>
-
-#ifdef __GLIBC__
-# define GLIBC_VERSION __GLIBC__
-#else
-# define GLIBC_VERSION 0
-#endif
-
-/* Return true if the current CATEGORY locale is hard, i.e. if you
- can't get away with assuming traditional C or POSIX behavior. */
-bool
-hard_locale (int category)
-{
-#if ! (defined ENABLE_NLS && HAVE_SETLOCALE)
- return 0;
-#else
- bool hard = true;
- char const *p = setlocale (category, NULL);
-
- if (p)
- {
- if (2 <= GLIBC_VERSION)
- {
- if (strcmp (p, "C") == 0 || strcmp (p, "POSIX") == 0)
- hard = false;
- }
- else
- {
- char *locale = strdup (p);
- if (locale)
- {
- /* Temporarily set the locale to the "C" and "POSIX" locales
- to find their names, so that we can determine whether one
- or the other is the caller's locale. */
- if (((p = setlocale (category, "C"))
- && strcmp (p, locale) == 0)
- || ((p = setlocale (category, "POSIX"))
- && strcmp (p, locale) == 0))
- hard = false;
-
- /* Restore the caller's locale. */
- setlocale (category, locale);
- free (locale);
- }
- }
- }
-
- return hard;
-#endif
-}
diff --git a/hard-locale.h b/hard-locale.h
deleted file mode 100644
index 160d5443..00000000
--- a/hard-locale.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Determine whether a locale is hard.
-
- Copyright (C) 1999, 2003, 2004, 2009, 2010, 2011 Free Software Foundation, Inc.
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>. */
-
-#ifndef HARD_LOCALE_H_
-# define HARD_LOCALE_H_ 1
-
-#ifndef GAWK
-# include <stdbool.h>
-#endif
-
-bool hard_locale (int);
-
-#endif /* HARD_LOCALE_H_ */
diff --git a/re.c b/re.c
index 2e1a37e7..234384b7 100644
--- a/re.c
+++ b/re.c
@@ -388,20 +388,6 @@ resetup()
syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
/*
- * As of POSIX 1003.1-2008 (see rule 7 of
- * http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_03_05
- * and the rationale, at http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap09.html#tag_21_09_03_05)
- * POSIX changed ranges outside the POSIX locale from requiring
- * Collation Element Order to being "undefined". This gives an
- * implementation, like gawk, the freedom to do ranges as it
- * pleases.
- *
- * We very much please to always use numeric ordering, as
- * the Good Lord intended.
- */
- syn |= RE_RANGES_IGNORE_LOCALES;
-
- /*
* Interval expressions are now on by default, as POSIX is
* wide-spread enough that people want it. The do_intervals
* variable remains for use with --traditional.
diff --git a/regcomp.c b/regcomp.c
index 22c79cdb..a181d63f 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2651,7 +2651,6 @@ build_range_exp (reg_syntax_t syntax, bitset_t sbcset,
# endif /* not RE_ENABLE_I18N */
{
unsigned int start_ch, end_ch;
- int ignore_locales = (syntax & RE_RANGES_IGNORE_LOCALES) != 0;
/* Equivalence Classes and Character Classes can't be a range start/end. */
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
@@ -2672,7 +2671,6 @@ build_range_exp (reg_syntax_t syntax, bitset_t sbcset,
wchar_t wc;
wint_t start_wc;
wint_t end_wc;
- wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
@@ -2698,12 +2696,7 @@ build_range_exp (reg_syntax_t syntax, bitset_t sbcset,
#endif
if (start_wc == WEOF || end_wc == WEOF)
return REG_ECOLLATE;
- cmp_buf[0] = start_wc;
- cmp_buf[4] = end_wc;
- if (ignore_locales && start_wc > end_wc)
- return REG_ERANGE;
- else if ((syntax & RE_NO_EMPTY_RANGES)
- && wcscoll (cmp_buf, cmp_buf + 4) > 0)
+ else if ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc)
return REG_ERANGE;
/* Got valid collation sequence values, add them as a new entry.
@@ -2742,23 +2735,10 @@ build_range_exp (reg_syntax_t syntax, bitset_t sbcset,
}
/* Build the table for single byte characters. */
- if (ignore_locales)
- {
- for (wc = 0; wc < SBC_MAX; ++wc)
- {
- if (start_wc <= wc && wc <= end_wc)
- bitset_set (sbcset, wc);
- }
- }
- else
+ for (wc = 0; wc < SBC_MAX; ++wc)
{
- for (wc = 0; wc < SBC_MAX; ++wc)
- {
- cmp_buf[2] = wc;
- if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
- && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
- bitset_set (sbcset, wc);
- }
+ if (start_wc <= wc && wc <= end_wc)
+ bitset_set (sbcset, wc);
}
}
# else /* not RE_ENABLE_I18N */
diff --git a/regex.h b/regex.h
index a2d120f9..6bc503b2 100644
--- a/regex.h
+++ b/regex.h
@@ -184,10 +184,6 @@ typedef unsigned long int reg_syntax_t;
/* If this bit is set, then no_sub will be set to 1 during
re_compile_pattern. */
# define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)
-
-/* If this bit is set, then ranges act like they are in
- the "C" locale. */
-# define RE_RANGES_IGNORE_LOCALES (RE_NO_SUB << 1)
#endif
/* This global variable defines the particular regexp syntax to use (for
@@ -213,7 +209,6 @@ extern reg_syntax_t re_syntax_options;
#define RE_SYNTAX_GNU_AWK \
((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \
- | RE_RANGES_IGNORE_LOCALES \
| RE_INVALID_INTERVAL_ORD) \
& ~(RE_DOT_NOT_NULL | RE_CONTEXT_INDEP_OPS \
| RE_CONTEXT_INVALID_OPS ))