diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2019-12-16 21:37:04 +0200 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2019-12-16 21:37:04 +0200 |
commit | 6d1580bfd328fbbb04f4b5627032602dd8dfe98c (patch) | |
tree | 7eede9ec76045a2460e94389d0ca9243bbaf2a72 | |
parent | fb48abe6ca16de5887b15f7c7774cd6c2e402176 (diff) | |
download | egawk-6d1580bfd328fbbb04f4b5627032602dd8dfe98c.tar.gz egawk-6d1580bfd328fbbb04f4b5627032602dd8dfe98c.tar.bz2 egawk-6d1580bfd328fbbb04f4b5627032602dd8dfe98c.zip |
Sync localeinfo and partially sync dfa from GNULIB.
-rw-r--r-- | support/ChangeLog | 7 | ||||
-rw-r--r-- | support/dfa.c | 54 | ||||
-rw-r--r-- | support/localeinfo.c | 44 | ||||
-rw-r--r-- | support/localeinfo.h | 6 |
4 files changed, 56 insertions, 55 deletions
diff --git a/support/ChangeLog b/support/ChangeLog index bb323e96..d3213ee4 100644 --- a/support/ChangeLog +++ b/support/ChangeLog @@ -1,6 +1,11 @@ +2019-12-16 Arnold D. Robbins <arnold@skeeve.com> + + * localeinfo.h, localeinfo.c: Updated from GNULIB. + * dfa.c: Partial sync with GNULIB for localeinfo related stuff. + 2019-12-13 Arnold D. Robbins <arnold@skeeve.com> - * dfah, dfa.c: Updated from GNULIB. + * dfa.h, dfa.c: Updated from GNULIB. 2019-11-21 Arnold D. Robbins <arnold@skeeve.com> diff --git a/support/dfa.c b/support/dfa.c index cfa54211..9e7c8a86 100644 --- a/support/dfa.c +++ b/support/dfa.c @@ -40,9 +40,6 @@ #include <stdlib.h> #include <limits.h> #include <string.h> -#if HAVE_SETLOCALE -#include <locale.h> -#endif #include "dfa.h" // gets stdbool.h for us @@ -58,11 +55,6 @@ isasciidigit (char c) return '0' <= c && c <= '9'; } -/* Gawk doesn't use Gnulib, so don't assume that setlocale is present. */ -#ifndef LC_ALL -# define setlocale(category, locale) NULL -#endif - #include "gettext.h" #define _(str) gettext (str) @@ -597,12 +589,6 @@ struct dfa char *(*dfaexec) (struct dfa *, char const *, char *, bool, size_t *, bool *); - /* The locale is simple, like the C locale. These locales can be - processed more efficiently, as they are single-byte, their native - character set is in collating-sequence order, and they do not - have multi-character collating elements. */ - bool simple_locale; - /* Other cached information derived from the locale. */ struct localeinfo localeinfo; }; @@ -921,7 +907,6 @@ void dfacopysyntax (struct dfa *to, const struct dfa *from) { to->dfaexec = from->dfaexec; - to->simple_locale = from->simple_locale; to->localeinfo = from->localeinfo; to->fast = from->fast; @@ -958,38 +943,6 @@ setbit_case_fold_c (int b, charclass *c) setbit (i, c); } -/* Return true if the locale compatible with the C locale. 
*/ - -static bool -using_simple_locale (bool multibyte) -{ - /* The native character set is known to be compatible with - the C locale. The following test isn't perfect, but it's good - enough in practice, as only ASCII and EBCDIC are in common use - and this test correctly accepts ASCII and rejects EBCDIC. */ - enum { native_c_charset = - ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12 - && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35 - && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41 - && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46 - && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59 - && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65 - && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94 - && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124 - && '}' == 125 && '~' == 126) - }; - - if (!native_c_charset || multibyte) - return false; - else - { - /* Treat C and POSIX locales as being compatible. Also, treat - errors as compatible, as these are invariably from stubs. */ - char const *loc = setlocale (LC_ALL, NULL); - return !loc || streq (loc, "C") || streq (loc, "POSIX"); - } -} - /* Fetch the next lexical input character from the pattern. There must at least one byte of pattern input. Set DFA->lex.wctok to the value of the character or to WEOF depending on whether the input is @@ -1080,7 +1033,7 @@ parse_bracket_exp (struct dfa *dfa) if (invert) { c = bracket_fetch_wc (dfa); - known_bracket_exp = dfa->simple_locale; + known_bracket_exp = dfa->localeinfo.simple; } wint_t wc = dfa->lex.wctok; int c1; @@ -1210,7 +1163,7 @@ parse_bracket_exp (struct dfa *dfa) /* Treat [x-y] as a range if x != y. 
*/ if (wc != wc2 || wc == WEOF) { - if (dfa->simple_locale + if (dfa->localeinfo.simple || (isasciidigit (c) & isasciidigit (c2))) { for (int ci = c; ci <= c2; ci++) @@ -3389,7 +3342,7 @@ skip_remains_mb (struct dfa *d, unsigned char const *p, - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK) - back-reference: (.)\1 - word-delimiter in multibyte locale: \<, \>, \b, \B - See using_simple_locale for the definition of "simple locale". */ + See struct localeinfo.simple for the definition of "simple locale". */ static inline char * dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, @@ -4352,7 +4305,6 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, { memset (dfa, 0, offsetof (struct dfa, dfaexec)); dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb; - dfa->simple_locale = using_simple_locale (linfo->multibyte); dfa->localeinfo = *linfo; dfa->fast = !dfa->localeinfo.multibyte; diff --git a/support/localeinfo.c b/support/localeinfo.c index 5c38849e..694735e2 100644 --- a/support/localeinfo.c +++ b/support/localeinfo.c @@ -48,17 +48,55 @@ is_using_utf8 (void) return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; } +/* Return true if the locale is compatible enough with the C locale so + that the locale is single-byte, bytes are in collating-sequence + order, and there are no multi-character collating elements. */ + +static bool +using_simple_locale (bool multibyte) +{ + /* The native character set is known to be compatible with + the C locale. The following test isn't perfect, but it's good + enough in practice, as only ASCII and EBCDIC are in common use + and this test correctly accepts ASCII and rejects EBCDIC. */ + enum { native_c_charset = + ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12 + && '\r' == 13 && ' ' == 32 && '!' 
== 33 && '"' == 34 && '#' == 35 + && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41 + && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46 + && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59 + && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65 + && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94 + && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124 + && '}' == 125 && '~' == 126) + }; + + if (!native_c_charset || multibyte) + return false; + + /* As a heuristic, use strcoll to compare native character order. + If this agrees with byte order the locale should be simple. + This heuristic should work for all known practical locales, + although it would be invalid for artificially-constructed locales + where the native order is the collating-sequence order but there + are multi-character collating elements. */ + for (int i = 0; i < UCHAR_MAX; i++) + if (0 <= strcoll (((char []) {i, 0}), ((char []) {i + 1, 0}))) + return false; + + return true; +} + /* Initialize *LOCALEINFO from the current locale. */ void init_localeinfo (struct localeinfo *localeinfo) { - int i; - localeinfo->multibyte = MB_CUR_MAX > 1; + localeinfo->simple = using_simple_locale (localeinfo->multibyte); localeinfo->using_utf8 = is_using_utf8 (); - for (i = CHAR_MIN; i <= CHAR_MAX; i++) + for (int i = CHAR_MIN; i <= CHAR_MAX; i++) { char c = i; unsigned char uc = i; diff --git a/support/localeinfo.h b/support/localeinfo.h index a5140164..c827a2bf 100644 --- a/support/localeinfo.h +++ b/support/localeinfo.h @@ -28,6 +28,12 @@ struct localeinfo /* MB_CUR_MAX > 1. */ bool multibyte; + /* The locale is simple, like the C locale. These locales can be + processed more efficiently, as they are single-byte, their native + character set is in collating-sequence order, and they do not + have multi-character collating elements. */ + bool simple; + /* The locale uses UTF-8. */ bool using_utf8; |