diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2014-03-10 22:40:16 +0200 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2014-03-10 22:40:16 +0200 |
commit | 6736261fb372cce9bc6c71deea6944fc882c79bb (patch) | |
tree | cb892db1cc35026fbca919ef256891dccbe8efb6 | |
parent | 23e7f1057b1abdebb25fc7d2f11ee3f5360976a4 (diff) | |
parent | 0102531b85a7cb85320c0b499c52d44f6822d1f0 (diff) | |
download | egawk-6736261fb372cce9bc6c71deea6944fc882c79bb.tar.gz egawk-6736261fb372cce9bc6c71deea6944fc882c79bb.tar.bz2 egawk-6736261fb372cce9bc6c71deea6944fc882c79bb.zip |
Merge branch 'gawk-4.1-stable'
-rw-r--r-- | ChangeLog | 14 | ||||
-rw-r--r-- | NEWS | 5 | ||||
-rw-r--r-- | builtin.c | 30 | ||||
-rw-r--r-- | dfa.c | 142 | ||||
-rw-r--r-- | dfa.h | 8 | ||||
-rw-r--r-- | doc/ChangeLog | 9 | ||||
-rw-r--r-- | doc/awkcard.in | 5 | ||||
-rw-r--r-- | doc/gawk.1 | 11 | ||||
-rw-r--r-- | doc/gawk.info | 148 | ||||
-rw-r--r-- | doc/gawk.texi | 27 | ||||
-rw-r--r-- | doc/gawktexi.in | 27 | ||||
-rw-r--r-- | helpers/ChangeLog | 5 | ||||
-rwxr-xr-x | helpers/quoteconvert2.sh | 12 | ||||
-rw-r--r-- | regex_internal.c | 15 |
14 files changed, 303 insertions, 155 deletions
@@ -1,3 +1,17 @@ +2014-03-10 Arnold D. Robbins <arnold@skeeve.com> + + * dfa.h, dfa.c: Sync with grep. Yet again. + * regex_internal.c (built_wcs_upper_buffer, build_upper_buffer): + Fixes from GNULIB for mixed case matching on Mac OS X. + + Unrelated: + + * builtin.c (format_tree): Smarten handling of %' flag. Always + pass it in for floating point formats. Then only add the + thousands_sep if there is one. Also, allow for thousands_sep + to be a string, not just one character. Thanks to Michal Jaegermann + for the report. + 2014-03-08 Andrew J. Schorr <aschorr@telemetry-investments.com> * gawkapi.c (api_impl): Add memory allocation function pointers. @@ -52,6 +52,11 @@ extension facility only works on Alpha and Itanium. realloc() and free(), to insure that the same memory allocation functions are always used. This bumps the minor version by one. +13. The printf quote flag now works correctly in locales with a different +decimal point character but without a thousands separator character. +If the thousands separator is a string, it will be correctly added +to decimal numbers. + XXX. A number of bugs have been fixed. See the ChangeLog. Changes from 4.0.2 to 4.1.0 @@ -994,9 +994,7 @@ check_pos: goto check_pos; case '\'': #if defined(HAVE_LOCALE_H) - /* allow quote_flag if there is a thousands separator. */ - if (loc.thousands_sep[0] != '\0') - quote_flag = true; + quote_flag = true; goto check_pos; #else goto retry; @@ -1196,6 +1194,9 @@ out0: } if (i < 1) goto out_of_range; +#if defined(HAVE_LOCALE_H) + quote_flag = (quote_flag && loc.thousands_sep[0] != 0); +#endif chp = &cpbufs[1].buf[i-1]; ii = jj = 0; do { @@ -1203,8 +1204,14 @@ out0: chp--; i--; #if defined(HAVE_LOCALE_H) if (quote_flag && loc.grouping[ii] && ++jj == loc.grouping[ii]) { - if (i) /* only add if more digits coming */ - PREPEND(loc.thousands_sep[0]); /* XXX - assumption it's one char */ + if (i) { /* only add if more digits coming */ + int k; + const char *ts = loc.thousands_sep; + + for (k = strlen(ts) - 1; k >= 0; k--) { + PREPEND(ts[k]); + } + } if (loc.grouping[ii+1] == 0) jj = 0; /* keep using current val in loc.grouping[ii] */ else if (loc.grouping[ii+1] == CHAR_MAX) @@ -1360,6 +1367,9 @@ mpf1: #ifdef HAVE_MPFR int0: #endif +#if defined(HAVE_LOCALE_H) + quote_flag = (quote_flag && loc.thousands_sep[0] != 0); +#endif /* * When to fill with zeroes is of course not simple. * First: No zero fill if left-justifying. @@ -1378,8 +1388,14 @@ mpf1: uval /= base; #if defined(HAVE_LOCALE_H) if (base == 10 && quote_flag && loc.grouping[ii] && ++jj == loc.grouping[ii]) { - if (uval) /* only add if more digits coming */ - PREPEND(loc.thousands_sep[0]); /* XXX --- assumption it's one char */ + if (uval) { /* only add if more digits coming */ + int k; + const char *ts = loc.thousands_sep; + + for (k = strlen(ts) - 1; k >= 0; k--) { + PREPEND(ts[k]); + } + } if (loc.grouping[ii+1] == 0) jj = 0; /* keep using current val in loc.grouping[ii] */ else if (loc.grouping[ii+1] == CHAR_MAX) @@ -45,6 +45,11 @@ #include "dfa.h" +/* Gawk doesn't use Gnulib, so don't assume static_assert is present. */ +#ifndef static_assert +# define static_assert(cond, diagnostic) \ + extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })] +#endif #define STREQ(a, b) (strcmp (a, b) == 0) @@ -748,34 +753,16 @@ setbit_wc (wint_t wc, charclass c) #endif } -/* Set a bit for B in the charclass C, if B is a valid single byte - character in the current character set. If case is folded, set B's - lower and upper case variants similarly. If MB_CUR_MAX > 1, the - resulting charset is used only as an optimization, and the caller - should set the appropriate field of struct mb_char_classes. */ +/* Set a bit for B and its case variants in the charclass C. + MB_CUR_MAX must be 1. */ static void setbit_case_fold_c (int b, charclass c) { - if (MB_CUR_MAX > 1) - { - wint_t wc = btowc (b); - if (wc == WEOF) - return; - if (case_fold) - { - setbit_wc (towlower (wc), c); - setbit_wc (towupper (wc), c); - } - } - else - { - if (case_fold) - { - setbit (tolower (b), c); - setbit (toupper (b), c); - } - } - setbit (b, c); + int ub = toupper (b); + int i; + for (i = 0; i < NOTCHAR; i++) + if (toupper (i) == ub) + setbit (i, c); } @@ -940,6 +927,50 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */ # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif +/* The set of wchar_t values C such that there's a useful locale + somewhere where C != towupper (C) && C != towlower (towupper (C)). + For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because + towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and + towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ +static short const lonesome_lower[] = + { + 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, + 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, + + /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase + counterpart in locales predating Unicode 4.0.0 (April 2003). */ + 0x03F2, + + 0x03F5, 0x1E9B, 0x1FBE, + }; + +static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2 + == CASE_FOLDED_BUFSIZE), + "CASE_FOLDED_BUFSIZE is wrong"); + +/* Find the characters equal to C after case-folding, other than C + itself, and store them into FOLDED. Return the number of characters + stored. */ +int +case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) +{ + int i; + int n = 0; + wint_t uc = towupper (c); + wint_t lc = towlower (uc); + if (uc != c) + folded[n++] = uc; + if (lc != uc && lc != c && towupper (lc) == uc) + folded[n++] = lc; + for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) + { + wint_t li = lonesome_lower[i]; + if (li != lc && li != uc && li != c && towupper (li) == uc) + folded[n++] = li; + } + return n; +} + typedef int predicate (int); /* The following list maps the names of the Posix named character classes @@ -1100,7 +1131,7 @@ parse_bracket_exp (void) for (c2 = 0; c2 < NOTCHAR; ++c2) if (pred->func (c2)) - setbit_case_fold_c (c2, ccl); + setbit (c2, ccl); } else known_bracket_exp = false; @@ -1167,8 +1198,21 @@ parse_bracket_exp (void) } } else if (using_simple_locale ()) - for (; c <= c2; c++) - setbit_case_fold_c (c, ccl); + { + for (c1 = c; c1 <= c2; c1++) + setbit (c1, ccl); + if (case_fold) + { + int uc = toupper (c); + int uc2 = toupper (c2); + for (c1 = 0; c1 < NOTCHAR; c1++) + { + int uc1 = toupper (c1); + if (uc <= uc1 && uc1 <= uc2) + setbit (c1, ccl); + } + } + } else known_bracket_exp = false; @@ -1187,26 +1231,22 @@ parse_bracket_exp (void) if (MB_CUR_MAX == 1) { - setbit_case_fold_c (c, ccl); + if (case_fold) + setbit_case_fold_c (c, ccl); + else + setbit (c, ccl); continue; } if (case_fold) { - wint_t folded = towlower (wc); - if (folded != wc && !setbit_wc (folded, ccl)) - { - REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, - work_mbc->nchars + 1); - work_mbc->chars[work_mbc->nchars++] = folded; - } - folded = towupper (wc); - if (folded != wc && !setbit_wc (folded, ccl)) - { - REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, - work_mbc->nchars + 1); - work_mbc->chars[work_mbc->nchars++] = folded; - } + wchar_t folded[CASE_FOLDED_BUFSIZE]; + int i, n = case_folded_counterparts (wc, folded); + REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, + work_mbc->nchars + n); + for (i = 0; i < n; i++) + if (!setbit_wc (folded[i], ccl)) + work_mbc->chars[work_mbc->nchars++] = folded[i]; } if (!setbit_wc (wc, ccl)) { @@ -1552,7 +1592,7 @@ lex (void) if (MB_CUR_MAX > 1) return lasttok = WCHAR; - if (case_fold && (tolower (c) != c || toupper (c) != c)) + if (case_fold && isalpha (c)) { zeroset (ccl); setbit_case_fold_c (c, ccl); @@ -1799,18 +1839,14 @@ atom (void) if (MBS_SUPPORT && tok == WCHAR) { addtok_wc (wctok); + if (case_fold) { - wint_t folded = towlower (wctok); - if (folded != wctok) - { - addtok_wc (folded); - addtok (OR); - } - folded = towupper (wctok); - if (folded != wctok) + wchar_t folded[CASE_FOLDED_BUFSIZE]; + int i, n = case_folded_counterparts (wctok, folded); + for (i = 0; i < n; i++) { - addtok_wc (folded); + addtok_wc (folded[i]); addtok (OR); } } @@ -101,3 +101,11 @@ extern void dfawarn (const char *); extern _Noreturn void dfaerror (const char *); extern int using_utf8 (void); + +/* Maximum number of characters that can be the case-folded + counterparts of a single character, not counting the character + itself. This is 1 for towupper, 1 for towlower, and 1 for each + entry in LONESOME_LOWER; see dfa.c. */ +enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 }; + +extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]); diff --git a/doc/ChangeLog b/doc/ChangeLog index ac48a973..0b34c55f 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,12 @@ +2014-03-10 Arnold D. Robbins <arnold@skeeve.com> + + * gawktexi.in: Finish indexing improvements. (For now, anyway.) + + Unrelated: + + * gawk.1: Document the quote flag! (Better late than never.) + * awkcard.in: Update documentation of quote flag. + 2014-03-08 Arnold D. Robbins <arnold@skeeve.com> * gawktexi.in: Minor edits to the discussion of the memory allocation diff --git a/doc/awkcard.in b/doc/awkcard.in index 610032b7..5f3a9735 100644 --- a/doc/awkcard.in +++ b/doc/awkcard.in @@ -1,7 +1,7 @@ .\" AWK Reference Card --- Arnold Robbins, arnold@skeeve.com .\" .\" Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, -.\" 2003, 2004, 2005, 2007, 2009, 2010, 2011, 2012, 2013 +.\" 2003, 2004, 2005, 2007, 2009, 2010, 2011, 2012, 2013, 2014 .\" Free Software Foundation, Inc. .\" .\" Permission is granted to make and distribute verbatim copies of @@ -1493,7 +1493,8 @@ Only has an effect when the field width is wider than the value to be printed. T} \*(CB\*(FC'\*(FR T{ -Use the locale's thousands separator for \*(FC%d\fP, \*(FC%i\fP, and \*(FC%u\fP.\*(CD +Use the locale's thousands separator and decimal +point characters.\*(CD T} \*(FIwidth\fP T{ Pad the field to this width. The field is normally @@ -13,7 +13,7 @@ . if \w'\(rq' .ds rq "\(rq . \} .\} -.TH GAWK 1 "Jan 28 2014" "Free Software Foundation" "Utility Commands" +.TH GAWK 1 "Mar 08 2014" "Free Software Foundation" "Utility Commands" .SH NAME gawk \- pattern scanning and processing language .SH SYNOPSIS @@ -2454,6 +2454,15 @@ This applies only to the numeric output formats. This flag only has an effect when the field width is wider than the value to be printed. .TP +.B ' +A single quote character instructs +.I gawk +to insert the locale's thousands-separator character +into decimal numbers, and to also use the locale's +decimal point character with floating point formats. +This requires correct locale support in the C library +and in the definition of the current locale. +.TP .I width The field should be padded to this width. The field is normally padded with spaces. With the diff --git a/doc/gawk.info b/doc/gawk.info index 74773fcb..13f5ea77 100644 --- a/doc/gawk.info +++ b/doc/gawk.info @@ -25937,6 +25937,9 @@ Info file, in approximate chronological order: * Anders Wallin helped keep the VMS port going for several years. + * Assaf Gordon contributed the code to implement the `--sandbox' + option. + * John Haque made the following contributions: - The modifications to convert `gawk' into a byte-code @@ -31301,12 +31304,12 @@ Index * for statement, looping over arrays: Scanning an Array. (line 20) * fork() extension function: Extension Sample Fork. (line 11) +* format specifiers: Basic Printf. (line 15) * format specifiers, mixing regular with positional specifiers: Printf Ordering. (line 57) * format specifiers, printf statement: Control Letters. (line 6) * format specifiers, strftime() function (gawk): Time Functions. (line 88) -* format strings: Basic Printf. (line 15) * formats, numeric output: OFMT. (line 6) * formatting output: Printf. (line 6) * forward slash (/): Regexp. (line 10) @@ -31531,6 +31534,12 @@ Index * gettext() function (C library): Explaining gettext. (line 62) * gettimeofday() extension function: Extension Sample Time. (line 13) +* git utility <1>: Adding Code. (line 111) +* git utility <2>: Accessing The Source. + (line 10) +* git utility <3>: Other Versions. (line 29) +* git utility: gawkextlib. (line 29) +* git, use of for gawk source code: Derived Files. (line 6) * GMP: Arbitrary Precision Arithmetic. (line 6) * GNITS mailing list: Acknowledgments. (line 52) @@ -31547,6 +31556,7 @@ Index * GNU/Linux <1>: Glossary. (line 624) * GNU/Linux <2>: I18N Example. (line 55) * GNU/Linux: Manual History. (line 28) +* Gordon, Assaf: Contributors. (line 105) * GPL (General Public License) <1>: Glossary. (line 314) * GPL (General Public License): Manual History. (line 11) * GPL (General Public License), printing: Options. (line 88) @@ -31564,7 +31574,7 @@ Index (line 66) * Hankerson, Darrel <1>: Contributors. (line 60) * Hankerson, Darrel: Acknowledgments. (line 60) -* Haque, John: Contributors. (line 105) +* Haque, John: Contributors. (line 108) * Hartholz, Elaine: Acknowledgments. (line 38) * Hartholz, Marshall: Acknowledgments. (line 38) * Hasegawa, Isamu: Contributors. (line 94) @@ -31652,9 +31662,9 @@ Index * installing gawk: Installation. (line 6) * INT signal (MS-Windows): Profiling. (line 214) * int() function: Numeric Functions. (line 23) -* integer, arbitrary precision: Arbitrary Precision Integers. - (line 6) * integers: General Arithmetic. (line 6) +* integers, arbitrary precision: Arbitrary Precision Integers. + (line 6) * integers, unsigned: General Arithmetic. (line 15) * interacting with other programs: I/O Functions. (line 72) * internationalization <1>: I18N and L10N. (line 6) @@ -31800,10 +31810,15 @@ Index * login information: Passwd Functions. (line 16) * long options: Command Line. (line 13) * loops: While Statement. (line 6) +* loops, break statement and: Break Statement. (line 6) * loops, continue statements and: For Statement. (line 64) * loops, count for header: Profiling. (line 131) +* loops, do-while: Do Statement. (line 6) * loops, exiting: Break Statement. (line 6) +* loops, for, array scanning: Scanning an Array. (line 6) +* loops, for, iterative: For Statement. (line 6) * loops, See Also while statement: While Statement. (line 6) +* loops, while: While Statement. (line 6) * ls utility: More Complex. (line 15) * lshift() function (gawk): Bitwise Functions. (line 46) * lvalues/rvalues: Assignment Ops. (line 32) @@ -32200,10 +32215,10 @@ Index * programming conventions, private variable names: Library Names. (line 23) * programming language, recipe for: History. (line 6) -* Programming languages, Ada: Glossary. (line 20) +* programming languages, Ada: Glossary. (line 20) * programming languages, data-driven vs. procedural: Getting Started. (line 12) -* Programming languages, Java: Glossary. (line 388) +* programming languages, Java: Glossary. (line 388) * programming, basic steps: Basic High Level. (line 20) * programming, concepts: Basic Concepts. (line 6) * pwcat program: Passwd Functions. (line 23) @@ -32340,7 +32355,7 @@ Index * RLENGTH variable, match() function and: String Functions. (line 220) * Robbins, Arnold <1>: Future Extensions. (line 6) * Robbins, Arnold <2>: Bugs. (line 32) -* Robbins, Arnold <3>: Contributors. (line 132) +* Robbins, Arnold <3>: Contributors. (line 135) * Robbins, Arnold <4>: General Data Types. (line 6) * Robbins, Arnold <5>: Alarm Program. (line 6) * Robbins, Arnold <6>: Passwd Functions. (line 90) @@ -32381,7 +32396,7 @@ Index (line 68) * sandbox mode: Options. (line 279) * scalar values: Basic Data Typing. (line 13) -* Schorr, Andrew <1>: Contributors. (line 128) +* Schorr, Andrew <1>: Contributors. (line 131) * Schorr, Andrew: Acknowledgments. (line 60) * Schreiber, Bert: Acknowledgments. (line 38) * Schreiber, Rita: Acknowledgments. (line 38) @@ -32516,6 +32531,7 @@ Index * source code, mawk: Other Versions. (line 44) * source code, mixing: Options. (line 117) * source code, pawk: Other Versions. (line 78) +* source code, pawk (Python version): Other Versions. (line 124) * source code, QSE Awk: Other Versions. (line 130) * source code, QuikTrim Awk: Other Versions. (line 134) * source code, Solaris awk: Other Versions. (line 96) @@ -32816,7 +32832,7 @@ Index * xgettext utility: String Extraction. (line 13) * XOR bitwise operation: Bitwise Functions. (line 6) * xor() function (gawk): Bitwise Functions. (line 55) -* Yawitz, Efraim: Contributors. (line 126) +* Yawitz, Efraim: Contributors. (line 129) * Zaretskii, Eli <1>: Bugs. (line 70) * Zaretskii, Eli <2>: Contributors. (line 55) * Zaretskii, Eli: Acknowledgments. (line 60) @@ -33317,62 +33333,62 @@ Ref: Ranges and Locales-Footnote-11034931 Ref: Ranges and Locales-Footnote-21034958 Ref: Ranges and Locales-Footnote-31035192 Node: Contributors1035413 -Node: Installation1040558 -Node: Gawk Distribution1041452 -Node: Getting1041936 -Node: Extracting1042762 -Node: Distribution contents1044454 -Node: Unix Installation1050159 -Node: Quick Installation1050776 -Node: Additional Configuration Options1053222 -Node: Configuration Philosophy1054958 -Node: Non-Unix Installation1057312 -Node: PC Installation1057770 -Node: PC Binary Installation1059069 -Node: PC Compiling1060917 -Node: PC Testing1063861 -Node: PC Using1065037 -Node: Cygwin1069205 -Node: MSYS1070014 -Node: VMS Installation1070528 -Node: VMS Compilation1071292 -Ref: VMS Compilation-Footnote-11072544 -Node: VMS Dynamic Extensions1072602 -Node: VMS Installation Details1073975 -Node: VMS Running1076226 -Node: VMS GNV1079060 -Node: VMS Old Gawk1079783 -Node: Bugs1080253 -Node: Other Versions1084171 -Node: Notes1090255 -Node: Compatibility Mode1091055 -Node: Additions1091838 -Node: Accessing The Source1092765 -Node: Adding Code1094205 -Node: New Ports1100250 -Node: Derived Files1104385 -Ref: Derived Files-Footnote-11109706 -Ref: Derived Files-Footnote-21109740 -Ref: Derived Files-Footnote-31110340 -Node: Future Extensions1110438 -Node: Implementation Limitations1111021 -Node: Extension Design1112273 -Node: Old Extension Problems1113427 -Ref: Old Extension Problems-Footnote-11114935 -Node: Extension New Mechanism Goals1114992 -Ref: Extension New Mechanism Goals-Footnote-11118357 -Node: Extension Other Design Decisions1118543 -Node: Extension Future Growth1120649 -Node: Old Extension Mechanism1121485 -Node: Basic Concepts1123225 -Node: Basic High Level1123906 -Ref: figure-general-flow1124177 -Ref: figure-process-flow1124776 -Ref: Basic High Level-Footnote-11128005 -Node: Basic Data Typing1128190 -Node: Glossary1131545 -Node: Copying1157007 -Node: GNU Free Documentation License1194564 -Node: Index1219701 +Node: Installation1040640 +Node: Gawk Distribution1041534 +Node: Getting1042018 +Node: Extracting1042844 +Node: Distribution contents1044536 +Node: Unix Installation1050241 +Node: Quick Installation1050858 +Node: Additional Configuration Options1053304 +Node: Configuration Philosophy1055040 +Node: Non-Unix Installation1057394 +Node: PC Installation1057852 +Node: PC Binary Installation1059151 +Node: PC Compiling1060999 +Node: PC Testing1063943 +Node: PC Using1065119 +Node: Cygwin1069287 +Node: MSYS1070096 +Node: VMS Installation1070610 +Node: VMS Compilation1071374 +Ref: VMS Compilation-Footnote-11072626 +Node: VMS Dynamic Extensions1072684 +Node: VMS Installation Details1074057 +Node: VMS Running1076308 +Node: VMS GNV1079142 +Node: VMS Old Gawk1079865 +Node: Bugs1080335 +Node: Other Versions1084253 +Node: Notes1090337 +Node: Compatibility Mode1091137 +Node: Additions1091920 +Node: Accessing The Source1092847 +Node: Adding Code1094287 +Node: New Ports1100332 +Node: Derived Files1104467 +Ref: Derived Files-Footnote-11109788 +Ref: Derived Files-Footnote-21109822 +Ref: Derived Files-Footnote-31110422 +Node: Future Extensions1110520 +Node: Implementation Limitations1111103 +Node: Extension Design1112355 +Node: Old Extension Problems1113509 +Ref: Old Extension Problems-Footnote-11115017 +Node: Extension New Mechanism Goals1115074 +Ref: Extension New Mechanism Goals-Footnote-11118439 +Node: Extension Other Design Decisions1118625 +Node: Extension Future Growth1120731 +Node: Old Extension Mechanism1121567 +Node: Basic Concepts1123307 +Node: Basic High Level1123988 +Ref: figure-general-flow1124259 +Ref: figure-process-flow1124858 +Ref: Basic High Level-Footnote-11128087 +Node: Basic Data Typing1128272 +Node: Glossary1131627 +Node: Copying1157089 +Node: GNU Free Documentation License1194646 +Node: Index1219783 End Tag Table diff --git a/doc/gawk.texi b/doc/gawk.texi index b9bf2170..1b610c42 100644 --- a/doc/gawk.texi +++ b/doc/gawk.texi @@ -8359,7 +8359,7 @@ parentheses are necessary if any of the item expressions use the @samp{>} relational operator; otherwise, it can be confused with an output redirection (@pxref{Redirection}). -@cindex format strings +@cindex format specifiers The difference between @code{printf} and @code{print} is the @var{format} argument. This is an expression whose value is taken as a string; it specifies how to output each of the other arguments. It is called the @@ -12706,6 +12706,7 @@ the first thing on its line. @subsection The @code{while} Statement @cindex @code{while} statement @cindex loops +@cindex loops, @code{while} @cindex loops, See Also @code{while} statement In programming, a @dfn{loop} is a part of a program that can @@ -12766,6 +12767,7 @@ program is harder to read without it. @node Do Statement @subsection The @code{do}-@code{while} Statement @cindex @code{do}-@code{while} statement +@cindex loops, @code{do}-@code{while} The @code{do} loop is a variation of the @code{while} looping statement. The @code{do} loop executes the @var{body} once and then repeats the @@ -12811,6 +12813,7 @@ occasionally is there a real use for a @code{do} statement. @node For Statement @subsection The @code{for} Statement @cindex @code{for} statement +@cindex loops, @code{for}, iterative The @code{for} statement makes it more convenient to count iterations of a loop. The general form of the @code{for} statement looks like this: @@ -12983,6 +12986,7 @@ it is not available. @subsection The @code{break} Statement @cindex @code{break} statement @cindex loops, exiting +@cindex loops, @code{break} statement and The @code{break} statement jumps out of the innermost @code{for}, @code{while}, or @code{do} loop that encloses it. The following example @@ -14628,6 +14632,7 @@ END @{ @subsection Scanning All Elements of an Array @cindex elements in arrays, scanning @cindex arrays, scanning +@cindex loops, @code{for}, array scanning In programs that use arrays, it is often necessary to use a loop that executes once for each element of an array. In other languages, where @@ -29318,7 +29323,7 @@ the problem at hand is often the correct approach in such situations. @node Arbitrary Precision Integers @section Arbitrary Precision Integer Arithmetic with @command{gawk} -@cindex integer, arbitrary precision +@cindex integers, arbitrary precision If one of the options @option{--bignum} or @option{-M} is specified, @command{gawk} performs all @@ -33068,6 +33073,7 @@ The @code{time} extension described earlier (@pxref{Extension Sample Time}) was originally from this project but has been moved in to the main @command{gawk} distribution. +@cindex @command{git} utility You can check out the code for the @code{gawkextlib} project using the @uref{http://git-scm.com, GIT} distributed source code control system. The command is as follows: @@ -34698,6 +34704,11 @@ environments. Anders Wallin helped keep the VMS port going for several years. @item +@cindex Gordon, Assaf +Assaf Gordon contributed the code to implement the +@option{--sandbox} option. + +@item @cindex Haque, John John Haque made the following contributions: @@ -36110,6 +36121,7 @@ It is available in several archive formats: @uref{http://www.cs.princeton.edu/~bwk/btl.mirror/awk.zip} @end table +@cindex @command{git} utility You can also retrieve it from Git Hub: @example @@ -36238,6 +36250,7 @@ This is an embeddable @command{awk} interpreter derived from @uref{http://repo.hu/projects/libmawk/}. @item @code{pawk} +@cindex source code, @command{pawk} (Python version) @cindex @code{pawk}, @command{awk}-like facilities for Python This is a Python module that claims to bring @command{awk}-like features to Python. See @uref{https://github.com/alecthomas/pawk} @@ -36343,6 +36356,7 @@ As @command{gawk} is Free Software, the source code is always available. @ref{Gawk Distribution}, describes how to get and build the formal, released versions of @command{gawk}. +@cindex @command{git} utility However, if you want to modify @command{gawk} and contribute back your changes, you will probably wish to work with the development version. To do so, you will need to access the @command{gawk} source code @@ -36518,6 +36532,7 @@ If possible, please update the @command{man} page as well. You will also have to sign paperwork for your documentation changes. +@cindex @command{git} utility @item Submit changes as unified diffs. Use @samp{diff -u -r -N} to compare @@ -36651,6 +36666,8 @@ coding style and brace layout that suits your taste. @node Derived Files @appendixsubsec Why Generated Files Are Kept In @command{git} +@c STARTOFRANGE gawkgit +@cindex @command{git}, use of for @command{gawk} source code @c From emails written March 22, 2012, to the gawk developers list. If you look at the @command{gawk} source in the @command{git} @@ -36830,7 +36847,7 @@ wget http://git.savannah.gnu.org/cgit/gawk.git/snapshot/gawk-@var{branchname}.ta @noindent to retrieve a snapshot of the given branch. - +@c ENDOFRANGE gawkgit @node Future Extensions @appendixsec Probable Future Extensions @@ -37409,7 +37426,7 @@ better written in another language. You can get it from @uref{http://awk.info/?awk100/aaa}. @cindex Ada programming language -@cindex Programming languages, Ada +@cindex programming languages, Ada @item Ada A programming language originally defined by the U.S.@: Department of Defense for embedded programming. It was designed to enforce good @@ -37892,7 +37909,7 @@ information about the name of the organization and its language-independent three-letter acronym. @cindex Java programming language -@cindex Programming languages, Java +@cindex programming languages, Java @item Java A modern programming language originally developed by Sun Microsystems (now Oracle) supporting Object-Oriented programming. Although usually diff --git a/doc/gawktexi.in b/doc/gawktexi.in index 6054ed40..e9a31935 100644 --- a/doc/gawktexi.in +++ b/doc/gawktexi.in @@ -7977,7 +7977,7 @@ parentheses are necessary if any of the item expressions use the @samp{>} relational operator; otherwise, it can be confused with an output redirection (@pxref{Redirection}). -@cindex format strings +@cindex format specifiers The difference between @code{printf} and @code{print} is the @var{format} argument. This is an expression whose value is taken as a string; it specifies how to output each of the other arguments. It is called the @@ -12083,6 +12083,7 @@ the first thing on its line. @subsection The @code{while} Statement @cindex @code{while} statement @cindex loops +@cindex loops, @code{while} @cindex loops, See Also @code{while} statement In programming, a @dfn{loop} is a part of a program that can @@ -12143,6 +12144,7 @@ program is harder to read without it. @node Do Statement @subsection The @code{do}-@code{while} Statement @cindex @code{do}-@code{while} statement +@cindex loops, @code{do}-@code{while} The @code{do} loop is a variation of the @code{while} looping statement. The @code{do} loop executes the @var{body} once and then repeats the @@ -12188,6 +12190,7 @@ occasionally is there a real use for a @code{do} statement. @node For Statement @subsection The @code{for} Statement @cindex @code{for} statement +@cindex loops, @code{for}, iterative The @code{for} statement makes it more convenient to count iterations of a loop. The general form of the @code{for} statement looks like this: @@ -12360,6 +12363,7 @@ it is not available. @subsection The @code{break} Statement @cindex @code{break} statement @cindex loops, exiting +@cindex loops, @code{break} statement and The @code{break} statement jumps out of the innermost @code{for}, @code{while}, or @code{do} loop that encloses it. The following example @@ -13959,6 +13963,7 @@ END @{ @subsection Scanning All Elements of an Array @cindex elements in arrays, scanning @cindex arrays, scanning +@cindex loops, @code{for}, array scanning In programs that use arrays, it is often necessary to use a loop that executes once for each element of an array. In other languages, where @@ -28459,7 +28464,7 @@ the problem at hand is often the correct approach in such situations. @node Arbitrary Precision Integers @section Arbitrary Precision Integer Arithmetic with @command{gawk} -@cindex integer, arbitrary precision +@cindex integers, arbitrary precision If one of the options @option{--bignum} or @option{-M} is specified, @command{gawk} performs all @@ -32209,6 +32214,7 @@ The @code{time} extension described earlier (@pxref{Extension Sample Time}) was originally from this project but has been moved in to the main @command{gawk} distribution. +@cindex @command{git} utility You can check out the code for the @code{gawkextlib} project using the @uref{http://git-scm.com, GIT} distributed source code control system. The command is as follows: @@ -33839,6 +33845,11 @@ environments. Anders Wallin helped keep the VMS port going for several years. @item +@cindex Gordon, Assaf +Assaf Gordon contributed the code to implement the +@option{--sandbox} option. + +@item @cindex Haque, John John Haque made the following contributions: @@ -35251,6 +35262,7 @@ It is available in several archive formats: @uref{http://www.cs.princeton.edu/~bwk/btl.mirror/awk.zip} @end table +@cindex @command{git} utility You can also retrieve it from Git Hub: @example @@ -35379,6 +35391,7 @@ This is an embeddable @command{awk} interpreter derived from @uref{http://repo.hu/projects/libmawk/}. @item @code{pawk} +@cindex source code, @command{pawk} (Python version) @cindex @code{pawk}, @command{awk}-like facilities for Python This is a Python module that claims to bring @command{awk}-like features to Python. See @uref{https://github.com/alecthomas/pawk} @@ -35484,6 +35497,7 @@ As @command{gawk} is Free Software, the source code is always available. @ref{Gawk Distribution}, describes how to get and build the formal, released versions of @command{gawk}. +@cindex @command{git} utility However, if you want to modify @command{gawk} and contribute back your changes, you will probably wish to work with the development version. To do so, you will need to access the @command{gawk} source code @@ -35659,6 +35673,7 @@ If possible, please update the @command{man} page as well. You will also have to sign paperwork for your documentation changes. +@cindex @command{git} utility @item Submit changes as unified diffs. Use @samp{diff -u -r -N} to compare @@ -35792,6 +35807,8 @@ coding style and brace layout that suits your taste. @node Derived Files @appendixsubsec Why Generated Files Are Kept In @command{git} +@c STARTOFRANGE gawkgit +@cindex @command{git}, use of for @command{gawk} source code @c From emails written March 22, 2012, to the gawk developers list. If you look at the @command{gawk} source in the @command{git} @@ -35971,7 +35988,7 @@ wget http://git.savannah.gnu.org/cgit/gawk.git/snapshot/gawk-@var{branchname}.ta @noindent to retrieve a snapshot of the given branch. - +@c ENDOFRANGE gawkgit @node Future Extensions @appendixsec Probable Future Extensions @@ -36550,7 +36567,7 @@ better written in another language. You can get it from @uref{http://awk.info/?awk100/aaa}. @cindex Ada programming language -@cindex Programming languages, Ada +@cindex programming languages, Ada @item Ada A programming language originally defined by the U.S.@: Department of Defense for embedded programming. It was designed to enforce good @@ -37033,7 +37050,7 @@ information about the name of the organization and its language-independent three-letter acronym. @cindex Java programming language -@cindex Programming languages, Java +@cindex programming languages, Java @item Java A modern programming language originally developed by Sun Microsystems (now Oracle) supporting Object-Oriented programming. Although usually diff --git a/helpers/ChangeLog b/helpers/ChangeLog index 14affebc..e5087f37 100644 --- a/helpers/ChangeLog +++ b/helpers/ChangeLog @@ -1,3 +1,8 @@ +2014-03-10 Arnold D. Robbins <arnold@skeeve.com> + + * quoteconvert2.sh: Use .UTF-8 locales per request from + Michal Jaegermann. + 2014-03-08 Arnold D. Robbins <arnold@skeeve.com> * quoteconvert2.sh, tryfmt.c, scanfmt.c: New files. diff --git a/helpers/quoteconvert2.sh b/helpers/quoteconvert2.sh index e33a4d05..63750a37 100755 --- a/helpers/quoteconvert2.sh +++ b/helpers/quoteconvert2.sh @@ -17,17 +17,17 @@ fi llist=" C en_US -en_US.utf8 +en_US.UTF-8 de_DE -de_DE.utf8 +de_DE.UTF-8 fr_FR -fr_FR.utf8 +fr_FR.UTF-8 pt_PT -pt_PT.utf8 +pt_PT.UTF-8 pt_BR -pt_BR.utf8 +pt_BR.UTF-8 ru_RU -ru_RU.utf8 +ru_RU.UTF-8 pl_PX " diff --git a/regex_internal.c b/regex_internal.c index 10dd6e00..056cff3d 100644 --- a/regex_internal.c +++ b/regex_internal.c @@ -320,12 +320,11 @@ build_wcs_upper_buffer (re_string_t *pstr) + byte_idx), remain_len, &pstr->cur_state); if (BE (mbclen + 2 > 2, 1)) { - wchar_t wcu = wc; - if (iswlower (wc)) + wchar_t wcu = towupper (wc); + if (wcu != wc) { size_t mbcdlen; - wcu = towupper (wc); mbcdlen = wcrtomb (buf, wcu, &prev_st); if (BE (mbclen == mbcdlen, 1)) memcpy (pstr->mbs + byte_idx, buf, mbclen); @@ -390,12 +389,11 @@ build_wcs_upper_buffer (re_string_t *pstr) mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); if (BE (mbclen + 2 > 2, 1)) { - wchar_t wcu = wc; - if (iswlower (wc)) + wchar_t wcu = towupper (wc); + if (wcu != wc) { size_t mbcdlen; - wcu = towupper (wc); mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); if (BE (mbclen == mbcdlen, 1)) memcpy (pstr->mbs + byte_idx, buf, mbclen); @@ -547,10 +545,7 @@ build_upper_buffer (re_string_t *pstr) int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; if (BE (pstr->trans != NULL, 0)) ch = pstr->trans[ch]; - if (islower (ch)) - pstr->mbs[char_idx] = toupper (ch); - else - pstr->mbs[char_idx] = ch; + pstr->mbs[char_idx] = toupper (ch); } pstr->valid_len = char_idx; pstr->valid_raw_len = char_idx; |