aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Arnold D. Robbins <arnold@skeeve.com>, 2019-12-16 21:37:04 +0200
committer: Arnold D. Robbins <arnold@skeeve.com>, 2019-12-16 21:37:04 +0200
commit: 6d1580bfd328fbbb04f4b5627032602dd8dfe98c (patch)
tree: 7eede9ec76045a2460e94389d0ca9243bbaf2a72
parent: fb48abe6ca16de5887b15f7c7774cd6c2e402176 (diff)
download: egawk-6d1580bfd328fbbb04f4b5627032602dd8dfe98c.tar.gz
egawk-6d1580bfd328fbbb04f4b5627032602dd8dfe98c.tar.bz2
egawk-6d1580bfd328fbbb04f4b5627032602dd8dfe98c.zip
Sync localeinfo and partially sync dfa from GNULIB.
-rw-r--r-- support/ChangeLog 7
-rw-r--r-- support/dfa.c 54
-rw-r--r-- support/localeinfo.c 44
-rw-r--r-- support/localeinfo.h 6
4 files changed, 56 insertions, 55 deletions
diff --git a/support/ChangeLog b/support/ChangeLog
index bb323e96..d3213ee4 100644
--- a/support/ChangeLog
+++ b/support/ChangeLog
@@ -1,6 +1,11 @@
+2019-12-16 Arnold D. Robbins <arnold@skeeve.com>
+
+ * localeinfo.h, localeinfo.c: Updated from GNULIB.
+ * dfa.c: Partial sync with GNULIB for localeinfo related stuff.
+
2019-12-13 Arnold D. Robbins <arnold@skeeve.com>
- * dfah, dfa.c: Updated from GNULIB.
+ * dfa.h, dfa.c: Updated from GNULIB.
2019-11-21 Arnold D. Robbins <arnold@skeeve.com>
diff --git a/support/dfa.c b/support/dfa.c
index cfa54211..9e7c8a86 100644
--- a/support/dfa.c
+++ b/support/dfa.c
@@ -40,9 +40,6 @@
#include <stdlib.h>
#include <limits.h>
#include <string.h>
-#if HAVE_SETLOCALE
-#include <locale.h>
-#endif
#include "dfa.h" // gets stdbool.h for us
@@ -58,11 +55,6 @@ isasciidigit (char c)
return '0' <= c && c <= '9';
}
-/* Gawk doesn't use Gnulib, so don't assume that setlocale is present. */
-#ifndef LC_ALL
-# define setlocale(category, locale) NULL
-#endif
-
#include "gettext.h"
#define _(str) gettext (str)
@@ -597,12 +589,6 @@ struct dfa
char *(*dfaexec) (struct dfa *, char const *, char *,
bool, size_t *, bool *);
- /* The locale is simple, like the C locale. These locales can be
- processed more efficiently, as they are single-byte, their native
- character set is in collating-sequence order, and they do not
- have multi-character collating elements. */
- bool simple_locale;
-
/* Other cached information derived from the locale. */
struct localeinfo localeinfo;
};
@@ -921,7 +907,6 @@ void
dfacopysyntax (struct dfa *to, const struct dfa *from)
{
to->dfaexec = from->dfaexec;
- to->simple_locale = from->simple_locale;
to->localeinfo = from->localeinfo;
to->fast = from->fast;
@@ -958,38 +943,6 @@ setbit_case_fold_c (int b, charclass *c)
setbit (i, c);
}
-/* Return true if the locale compatible with the C locale. */
-
-static bool
-using_simple_locale (bool multibyte)
-{
- /* The native character set is known to be compatible with
- the C locale. The following test isn't perfect, but it's good
- enough in practice, as only ASCII and EBCDIC are in common use
- and this test correctly accepts ASCII and rejects EBCDIC. */
- enum { native_c_charset =
- ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
- && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
- && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
- && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
- && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
- && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
- && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
- && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
- && '}' == 125 && '~' == 126)
- };
-
- if (!native_c_charset || multibyte)
- return false;
- else
- {
- /* Treat C and POSIX locales as being compatible. Also, treat
- errors as compatible, as these are invariably from stubs. */
- char const *loc = setlocale (LC_ALL, NULL);
- return !loc || streq (loc, "C") || streq (loc, "POSIX");
- }
-}
-
/* Fetch the next lexical input character from the pattern. There
must at least one byte of pattern input. Set DFA->lex.wctok to the
value of the character or to WEOF depending on whether the input is
@@ -1080,7 +1033,7 @@ parse_bracket_exp (struct dfa *dfa)
if (invert)
{
c = bracket_fetch_wc (dfa);
- known_bracket_exp = dfa->simple_locale;
+ known_bracket_exp = dfa->localeinfo.simple;
}
wint_t wc = dfa->lex.wctok;
int c1;
@@ -1210,7 +1163,7 @@ parse_bracket_exp (struct dfa *dfa)
/* Treat [x-y] as a range if x != y. */
if (wc != wc2 || wc == WEOF)
{
- if (dfa->simple_locale
+ if (dfa->localeinfo.simple
|| (isasciidigit (c) & isasciidigit (c2)))
{
for (int ci = c; ci <= c2; ci++)
@@ -3389,7 +3342,7 @@ skip_remains_mb (struct dfa *d, unsigned char const *p,
- [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
- back-reference: (.)\1
- word-delimiter in multibyte locale: \<, \>, \b, \B
- See using_simple_locale for the definition of "simple locale". */
+ See struct localeinfo.simple for the definition of "simple locale". */
static inline char *
dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
@@ -4352,7 +4305,6 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
{
memset (dfa, 0, offsetof (struct dfa, dfaexec));
dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
- dfa->simple_locale = using_simple_locale (linfo->multibyte);
dfa->localeinfo = *linfo;
dfa->fast = !dfa->localeinfo.multibyte;
diff --git a/support/localeinfo.c b/support/localeinfo.c
index 5c38849e..694735e2 100644
--- a/support/localeinfo.c
+++ b/support/localeinfo.c
@@ -48,17 +48,55 @@ is_using_utf8 (void)
return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
}
+/* Return true if the locale is compatible enough with the C locale so
+   that the locale is single-byte, bytes are in collating-sequence
+   order, and there are no multi-character collating elements.
+
+   MULTIBYTE says whether the current locale is multibyte; a multibyte
+   locale is never simple.  */
+
+static bool
+using_simple_locale (bool multibyte)
+{
+  /* The native character set is known to be compatible with
+     the C locale.  The following test isn't perfect, but it's good
+     enough in practice, as only ASCII and EBCDIC are in common use
+     and this test correctly accepts ASCII and rejects EBCDIC.  */
+  enum { native_c_charset =
+    ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
+     && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
+     && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
+     && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
+     && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
+     && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
+     && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
+     && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
+     && '}' == 125 && '~' == 126)
+  };
+
+  if (!native_c_charset || multibyte)
+    return false;
+
+  /* As a heuristic, use strcoll to compare native character order.
+     If this agrees with byte order the locale should be simple.
+     This heuristic should work for all known practical locales,
+     although it would be invalid for artificially-constructed locales
+     where the native order is the collating-sequence order but there
+     are multi-character collating elements.
+
+     Reject the locale as soon as some byte I fails to collate
+     strictly before byte I + 1.  Testing the opposite sense would
+     reject any locale whose collation agrees with byte order, i.e.,
+     exactly the locales this function is meant to accept.  */
+  for (int i = 0; i < UCHAR_MAX; i++)
+    if (0 <= strcoll (((char []) {i, 0}), ((char []) {i + 1, 0})))
+      return false;
+
+  return true;
+}
+
/* Initialize *LOCALEINFO from the current locale. */
void
init_localeinfo (struct localeinfo *localeinfo)
{
- int i;
-
localeinfo->multibyte = MB_CUR_MAX > 1;
+ localeinfo->simple = using_simple_locale (localeinfo->multibyte);
localeinfo->using_utf8 = is_using_utf8 ();
- for (i = CHAR_MIN; i <= CHAR_MAX; i++)
+ for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
{
char c = i;
unsigned char uc = i;
diff --git a/support/localeinfo.h b/support/localeinfo.h
index a5140164..c827a2bf 100644
--- a/support/localeinfo.h
+++ b/support/localeinfo.h
@@ -28,6 +28,12 @@ struct localeinfo
/* MB_CUR_MAX > 1. */
bool multibyte;
+ /* The locale is simple, like the C locale. These locales can be
+ processed more efficiently, as they are single-byte, their native
+ character set is in collating-sequence order, and they do not
+ have multi-character collating elements. */
+ bool simple;
+
/* The locale uses UTF-8. */
bool using_utf8;