about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Arnold D. Robbins <arnold@skeeve.com> 2014-03-10 22:40:16 +0200
committer Arnold D. Robbins <arnold@skeeve.com> 2014-03-10 22:40:16 +0200
commit 6736261fb372cce9bc6c71deea6944fc882c79bb (patch)
tree cb892db1cc35026fbca919ef256891dccbe8efb6
parent 23e7f1057b1abdebb25fc7d2f11ee3f5360976a4 (diff)
parent 0102531b85a7cb85320c0b499c52d44f6822d1f0 (diff)
download egawk-6736261fb372cce9bc6c71deea6944fc882c79bb.tar.gz
egawk-6736261fb372cce9bc6c71deea6944fc882c79bb.tar.bz2
egawk-6736261fb372cce9bc6c71deea6944fc882c79bb.zip
Merge branch 'gawk-4.1-stable'
-rw-r--r-- ChangeLog 14
-rw-r--r-- NEWS 5
-rw-r--r-- builtin.c 30
-rw-r--r-- dfa.c 142
-rw-r--r-- dfa.h 8
-rw-r--r-- doc/ChangeLog 9
-rw-r--r-- doc/awkcard.in 5
-rw-r--r-- doc/gawk.1 11
-rw-r--r-- doc/gawk.info 148
-rw-r--r-- doc/gawk.texi 27
-rw-r--r-- doc/gawktexi.in 27
-rw-r--r-- helpers/ChangeLog 5
-rwxr-xr-x helpers/quoteconvert2.sh 12
-rw-r--r-- regex_internal.c 15
14 files changed, 303 insertions, 155 deletions
diff --git a/ChangeLog b/ChangeLog
index 4ec71357..d9942268 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2014-03-10 Arnold D. Robbins <arnold@skeeve.com>
+
+ * dfa.h, dfa.c: Sync with grep. Yet again.
+ * regex_internal.c (build_wcs_upper_buffer, build_upper_buffer):
+ Fixes from GNULIB for mixed case matching on Mac OS X.
+
+ Unrelated:
+
+ * builtin.c (format_tree): Smarten handling of %' flag. Always
+ pass it in for floating point formats. Then only add the
+ thousands_sep if there is one. Also, allow for thousands_sep
+ to be a string, not just one character. Thanks to Michal Jaegermann
+ for the report.
+
2014-03-08 Andrew J. Schorr <aschorr@telemetry-investments.com>
* gawkapi.c (api_impl): Add memory allocation function pointers.
diff --git a/NEWS b/NEWS
index 8d919351..cf741b18 100644
--- a/NEWS
+++ b/NEWS
@@ -52,6 +52,11 @@ extension facility only works on Alpha and Itanium.
realloc() and free(), to insure that the same memory allocation
functions are always used. This bumps the minor version by one.
+13. The printf quote flag now works correctly in locales with a different
+decimal point character but without a thousands separator character.
+If the thousands separator is a string, it will be correctly added
+to decimal numbers.
+
XXX. A number of bugs have been fixed. See the ChangeLog.
Changes from 4.0.2 to 4.1.0
diff --git a/builtin.c b/builtin.c
index 8b1f9868..ebd0fc99 100644
--- a/builtin.c
+++ b/builtin.c
@@ -994,9 +994,7 @@ check_pos:
goto check_pos;
case '\'':
#if defined(HAVE_LOCALE_H)
- /* allow quote_flag if there is a thousands separator. */
- if (loc.thousands_sep[0] != '\0')
- quote_flag = true;
+ quote_flag = true;
goto check_pos;
#else
goto retry;
@@ -1196,6 +1194,9 @@ out0:
}
if (i < 1)
goto out_of_range;
+#if defined(HAVE_LOCALE_H)
+ quote_flag = (quote_flag && loc.thousands_sep[0] != 0);
+#endif
chp = &cpbufs[1].buf[i-1];
ii = jj = 0;
do {
@@ -1203,8 +1204,14 @@ out0:
chp--; i--;
#if defined(HAVE_LOCALE_H)
if (quote_flag && loc.grouping[ii] && ++jj == loc.grouping[ii]) {
- if (i) /* only add if more digits coming */
- PREPEND(loc.thousands_sep[0]); /* XXX - assumption it's one char */
+ if (i) { /* only add if more digits coming */
+ int k;
+ const char *ts = loc.thousands_sep;
+
+ for (k = strlen(ts) - 1; k >= 0; k--) {
+ PREPEND(ts[k]);
+ }
+ }
if (loc.grouping[ii+1] == 0)
jj = 0; /* keep using current val in loc.grouping[ii] */
else if (loc.grouping[ii+1] == CHAR_MAX)
@@ -1360,6 +1367,9 @@ mpf1:
#ifdef HAVE_MPFR
int0:
#endif
+#if defined(HAVE_LOCALE_H)
+ quote_flag = (quote_flag && loc.thousands_sep[0] != 0);
+#endif
/*
* When to fill with zeroes is of course not simple.
* First: No zero fill if left-justifying.
@@ -1378,8 +1388,14 @@ mpf1:
uval /= base;
#if defined(HAVE_LOCALE_H)
if (base == 10 && quote_flag && loc.grouping[ii] && ++jj == loc.grouping[ii]) {
- if (uval) /* only add if more digits coming */
- PREPEND(loc.thousands_sep[0]); /* XXX --- assumption it's one char */
+ if (uval) { /* only add if more digits coming */
+ int k;
+ const char *ts = loc.thousands_sep;
+
+ for (k = strlen(ts) - 1; k >= 0; k--) {
+ PREPEND(ts[k]);
+ }
+ }
if (loc.grouping[ii+1] == 0)
jj = 0; /* keep using current val in loc.grouping[ii] */
else if (loc.grouping[ii+1] == CHAR_MAX)
diff --git a/dfa.c b/dfa.c
index 3dd3c209..8771bbee 100644
--- a/dfa.c
+++ b/dfa.c
@@ -45,6 +45,11 @@
#include "dfa.h"
+/* Gawk doesn't use Gnulib, so don't assume static_assert is present.
+ This fallback declares a struct whose bit-field is 8 bits wide when
+ COND holds and -1 bits wide when it does not; a negative bit-field
+ width is rejected by the compiler, which is what turns a false COND
+ into a build failure. DIAGNOSTIC does not appear in the expansion,
+ so the message text is not shown by this fallback. */
+#ifndef static_assert
+# define static_assert(cond, diagnostic) \
+ extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
+#endif
#define STREQ(a, b) (strcmp (a, b) == 0)
@@ -748,34 +753,16 @@ setbit_wc (wint_t wc, charclass c)
#endif
}
-/* Set a bit for B in the charclass C, if B is a valid single byte
- character in the current character set. If case is folded, set B's
- lower and upper case variants similarly. If MB_CUR_MAX > 1, the
- resulting charset is used only as an optimization, and the caller
- should set the appropriate field of struct mb_char_classes. */
+/* Set a bit for B and its case variants in the charclass C.
+ MB_CUR_MAX must be 1.
+ A case variant is any value I in [0, NOTCHAR) for which
+ toupper (I) equals toupper (B); every such I gets its bit set.
+ This function does not look at case_fold itself; the callers in
+ this file test it before calling. */
static void
setbit_case_fold_c (int b, charclass c)
{
- if (MB_CUR_MAX > 1)
- {
- wint_t wc = btowc (b);
- if (wc == WEOF)
- return;
- if (case_fold)
- {
- setbit_wc (towlower (wc), c);
- setbit_wc (towupper (wc), c);
- }
- }
- else
- {
- if (case_fold)
- {
- setbit (tolower (b), c);
- setbit (toupper (b), c);
- }
- }
- setbit (b, c);
+ int ub = toupper (b); /* uppercase form that every case variant of B maps to */
+ int i;
+ for (i = 0; i < NOTCHAR; i++) /* scan all single-byte values, not just tolower/toupper of B */
+ if (toupper (i) == ub)
+ setbit (i, c);
}
@@ -940,6 +927,50 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
+/* The set of wchar_t values C such that there's a useful locale
+ somewhere where C != towupper (C) && C != towlower (towupper (C)).
+ For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+ towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+ towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).
+ case_folded_counterparts tests each entry with towupper at run time,
+ so an entry is reported only when the current locale maps it to the
+ same uppercase character as the input. */
+static short const lonesome_lower[] =
+ {
+ 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+ 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+ /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+ counterpart in locales predating Unicode 4.0.0 (April 2003). */
+ 0x03F2,
+
+ 0x03F5, 0x1E9B, 0x1FBE,
+ };
+
+static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2
+ == CASE_FOLDED_BUFSIZE),
+ "CASE_FOLDED_BUFSIZE is wrong"); /* the 2 extra slots are for the towupper and towlower results */
+
+/* Find the characters equal to C after case-folding, other than C
+ itself, and store them into FOLDED. Return the number of characters
+ stored.
+ Candidates are tried in this order: towupper (C), then towlower of
+ that uppercase value, then each entry of lonesome_lower. The
+ uppercase value is stored when it differs from C. A lowercase
+ candidate is stored only when it differs from C and from the
+ uppercase value (table entries must also differ from the towlower
+ result) and towupper maps it back to the same uppercase value. */
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+ int i;
+ int n = 0; /* number of entries written to FOLDED so far */
+ wint_t uc = towupper (c);
+ wint_t lc = towlower (uc); /* lowercase of the uppercase form, not of C directly */
+ if (uc != c)
+ folded[n++] = uc;
+ if (lc != uc && lc != c && towupper (lc) == uc) /* skip LC unless it round-trips to UC */
+ folded[n++] = lc;
+ for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+ {
+ wint_t li = lonesome_lower[i];
+ if (li != lc && li != uc && li != c && towupper (li) == uc) /* extra lowercase form sharing UC, not already accounted for */
+ folded[n++] = li;
+ }
+ return n;
+}
+
typedef int predicate (int);
/* The following list maps the names of the Posix named character classes
@@ -1100,7 +1131,7 @@ parse_bracket_exp (void)
for (c2 = 0; c2 < NOTCHAR; ++c2)
if (pred->func (c2))
- setbit_case_fold_c (c2, ccl);
+ setbit (c2, ccl);
}
else
known_bracket_exp = false;
@@ -1167,8 +1198,21 @@ parse_bracket_exp (void)
}
}
else if (using_simple_locale ())
- for (; c <= c2; c++)
- setbit_case_fold_c (c, ccl);
+ {
+ for (c1 = c; c1 <= c2; c1++)
+ setbit (c1, ccl);
+ if (case_fold)
+ {
+ int uc = toupper (c);
+ int uc2 = toupper (c2);
+ for (c1 = 0; c1 < NOTCHAR; c1++)
+ {
+ int uc1 = toupper (c1);
+ if (uc <= uc1 && uc1 <= uc2)
+ setbit (c1, ccl);
+ }
+ }
+ }
else
known_bracket_exp = false;
@@ -1187,26 +1231,22 @@ parse_bracket_exp (void)
if (MB_CUR_MAX == 1)
{
- setbit_case_fold_c (c, ccl);
+ if (case_fold)
+ setbit_case_fold_c (c, ccl);
+ else
+ setbit (c, ccl);
continue;
}
if (case_fold)
{
- wint_t folded = towlower (wc);
- if (folded != wc && !setbit_wc (folded, ccl))
- {
- REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
- work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] = folded;
- }
- folded = towupper (wc);
- if (folded != wc && !setbit_wc (folded, ccl))
- {
- REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
- work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] = folded;
- }
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ int i, n = case_folded_counterparts (wc, folded);
+ REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+ work_mbc->nchars + n);
+ for (i = 0; i < n; i++)
+ if (!setbit_wc (folded[i], ccl))
+ work_mbc->chars[work_mbc->nchars++] = folded[i];
}
if (!setbit_wc (wc, ccl))
{
@@ -1552,7 +1592,7 @@ lex (void)
if (MB_CUR_MAX > 1)
return lasttok = WCHAR;
- if (case_fold && (tolower (c) != c || toupper (c) != c))
+ if (case_fold && isalpha (c))
{
zeroset (ccl);
setbit_case_fold_c (c, ccl);
@@ -1799,18 +1839,14 @@ atom (void)
if (MBS_SUPPORT && tok == WCHAR)
{
addtok_wc (wctok);
+
if (case_fold)
{
- wint_t folded = towlower (wctok);
- if (folded != wctok)
- {
- addtok_wc (folded);
- addtok (OR);
- }
- folded = towupper (wctok);
- if (folded != wctok)
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ int i, n = case_folded_counterparts (wctok, folded);
+ for (i = 0; i < n; i++)
{
- addtok_wc (folded);
+ addtok_wc (folded[i]);
addtok (OR);
}
}
diff --git a/dfa.h b/dfa.h
index 7e0674fc..24fbcbe7 100644
--- a/dfa.h
+++ b/dfa.h
@@ -101,3 +101,11 @@ extern void dfawarn (const char *);
extern _Noreturn void dfaerror (const char *);
extern int using_utf8 (void);
+
+/* Maximum number of characters that can be the case-folded
+ counterparts of a single character, not counting the character
+ itself. This is 1 for towupper, 1 for towlower, and 1 for each
+ entry in LONESOME_LOWER; see dfa.c. */
+enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 }; /* dfa.c has a static_assert that checks this against the table size */
+
+extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]); /* returns how many counterparts were stored in the array */
diff --git a/doc/ChangeLog b/doc/ChangeLog
index ac48a973..0b34c55f 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,12 @@
+2014-03-10 Arnold D. Robbins <arnold@skeeve.com>
+
+ * gawktexi.in: Finish indexing improvements. (For now, anyway.)
+
+ Unrelated:
+
+ * gawk.1: Document the quote flag! (Better late than never.)
+ * awkcard.in: Update documentation of quote flag.
+
2014-03-08 Arnold D. Robbins <arnold@skeeve.com>
* gawktexi.in: Minor edits to the discussion of the memory allocation
diff --git a/doc/awkcard.in b/doc/awkcard.in
index 610032b7..5f3a9735 100644
--- a/doc/awkcard.in
+++ b/doc/awkcard.in
@@ -1,7 +1,7 @@
.\" AWK Reference Card --- Arnold Robbins, arnold@skeeve.com
.\"
.\" Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002,
-.\" 2003, 2004, 2005, 2007, 2009, 2010, 2011, 2012, 2013
+.\" 2003, 2004, 2005, 2007, 2009, 2010, 2011, 2012, 2013, 2014
.\" Free Software Foundation, Inc.
.\"
.\" Permission is granted to make and distribute verbatim copies of
@@ -1493,7 +1493,8 @@ Only has an effect when the field width is wider
than the value to be printed.
T}
\*(CB\*(FC'\*(FR T{
-Use the locale's thousands separator for \*(FC%d\fP, \*(FC%i\fP, and \*(FC%u\fP.\*(CD
+Use the locale's thousands separator and decimal
+point characters.\*(CD
T}
\*(FIwidth\fP T{
Pad the field to this width. The field is normally
diff --git a/doc/gawk.1 b/doc/gawk.1
index bd58b10c..a879b527 100644
--- a/doc/gawk.1
+++ b/doc/gawk.1
@@ -13,7 +13,7 @@
. if \w'\(rq' .ds rq "\(rq
. \}
.\}
-.TH GAWK 1 "Jan 28 2014" "Free Software Foundation" "Utility Commands"
+.TH GAWK 1 "Mar 08 2014" "Free Software Foundation" "Utility Commands"
.SH NAME
gawk \- pattern scanning and processing language
.SH SYNOPSIS
@@ -2454,6 +2454,15 @@ This applies only to the numeric output formats.
This flag only has an effect when the field width is wider than the
value to be printed.
.TP
+.B '
+A single quote character instructs
+.I gawk
+to insert the locale's thousands-separator character
+into decimal numbers, and to also use the locale's
+decimal point character with floating point formats.
+This requires correct locale support in the C library
+and in the definition of the current locale.
+.TP
.I width
The field should be padded to this width. The field is normally padded
with spaces. With the
diff --git a/doc/gawk.info b/doc/gawk.info
index 74773fcb..13f5ea77 100644
--- a/doc/gawk.info
+++ b/doc/gawk.info
@@ -25937,6 +25937,9 @@ Info file, in approximate chronological order:
* Anders Wallin helped keep the VMS port going for several years.
+ * Assaf Gordon contributed the code to implement the `--sandbox'
+ option.
+
* John Haque made the following contributions:
- The modifications to convert `gawk' into a byte-code
@@ -31301,12 +31304,12 @@ Index
* for statement, looping over arrays: Scanning an Array. (line 20)
* fork() extension function: Extension Sample Fork.
(line 11)
+* format specifiers: Basic Printf. (line 15)
* format specifiers, mixing regular with positional specifiers: Printf Ordering.
(line 57)
* format specifiers, printf statement: Control Letters. (line 6)
* format specifiers, strftime() function (gawk): Time Functions.
(line 88)
-* format strings: Basic Printf. (line 15)
* formats, numeric output: OFMT. (line 6)
* formatting output: Printf. (line 6)
* forward slash (/): Regexp. (line 10)
@@ -31531,6 +31534,12 @@ Index
* gettext() function (C library): Explaining gettext. (line 62)
* gettimeofday() extension function: Extension Sample Time.
(line 13)
+* git utility <1>: Adding Code. (line 111)
+* git utility <2>: Accessing The Source.
+ (line 10)
+* git utility <3>: Other Versions. (line 29)
+* git utility: gawkextlib. (line 29)
+* git, use of for gawk source code: Derived Files. (line 6)
* GMP: Arbitrary Precision Arithmetic.
(line 6)
* GNITS mailing list: Acknowledgments. (line 52)
@@ -31547,6 +31556,7 @@ Index
* GNU/Linux <1>: Glossary. (line 624)
* GNU/Linux <2>: I18N Example. (line 55)
* GNU/Linux: Manual History. (line 28)
+* Gordon, Assaf: Contributors. (line 105)
* GPL (General Public License) <1>: Glossary. (line 314)
* GPL (General Public License): Manual History. (line 11)
* GPL (General Public License), printing: Options. (line 88)
@@ -31564,7 +31574,7 @@ Index
(line 66)
* Hankerson, Darrel <1>: Contributors. (line 60)
* Hankerson, Darrel: Acknowledgments. (line 60)
-* Haque, John: Contributors. (line 105)
+* Haque, John: Contributors. (line 108)
* Hartholz, Elaine: Acknowledgments. (line 38)
* Hartholz, Marshall: Acknowledgments. (line 38)
* Hasegawa, Isamu: Contributors. (line 94)
@@ -31652,9 +31662,9 @@ Index
* installing gawk: Installation. (line 6)
* INT signal (MS-Windows): Profiling. (line 214)
* int() function: Numeric Functions. (line 23)
-* integer, arbitrary precision: Arbitrary Precision Integers.
- (line 6)
* integers: General Arithmetic. (line 6)
+* integers, arbitrary precision: Arbitrary Precision Integers.
+ (line 6)
* integers, unsigned: General Arithmetic. (line 15)
* interacting with other programs: I/O Functions. (line 72)
* internationalization <1>: I18N and L10N. (line 6)
@@ -31800,10 +31810,15 @@ Index
* login information: Passwd Functions. (line 16)
* long options: Command Line. (line 13)
* loops: While Statement. (line 6)
+* loops, break statement and: Break Statement. (line 6)
* loops, continue statements and: For Statement. (line 64)
* loops, count for header: Profiling. (line 131)
+* loops, do-while: Do Statement. (line 6)
* loops, exiting: Break Statement. (line 6)
+* loops, for, array scanning: Scanning an Array. (line 6)
+* loops, for, iterative: For Statement. (line 6)
* loops, See Also while statement: While Statement. (line 6)
+* loops, while: While Statement. (line 6)
* ls utility: More Complex. (line 15)
* lshift() function (gawk): Bitwise Functions. (line 46)
* lvalues/rvalues: Assignment Ops. (line 32)
@@ -32200,10 +32215,10 @@ Index
* programming conventions, private variable names: Library Names.
(line 23)
* programming language, recipe for: History. (line 6)
-* Programming languages, Ada: Glossary. (line 20)
+* programming languages, Ada: Glossary. (line 20)
* programming languages, data-driven vs. procedural: Getting Started.
(line 12)
-* Programming languages, Java: Glossary. (line 388)
+* programming languages, Java: Glossary. (line 388)
* programming, basic steps: Basic High Level. (line 20)
* programming, concepts: Basic Concepts. (line 6)
* pwcat program: Passwd Functions. (line 23)
@@ -32340,7 +32355,7 @@ Index
* RLENGTH variable, match() function and: String Functions. (line 220)
* Robbins, Arnold <1>: Future Extensions. (line 6)
* Robbins, Arnold <2>: Bugs. (line 32)
-* Robbins, Arnold <3>: Contributors. (line 132)
+* Robbins, Arnold <3>: Contributors. (line 135)
* Robbins, Arnold <4>: General Data Types. (line 6)
* Robbins, Arnold <5>: Alarm Program. (line 6)
* Robbins, Arnold <6>: Passwd Functions. (line 90)
@@ -32381,7 +32396,7 @@ Index
(line 68)
* sandbox mode: Options. (line 279)
* scalar values: Basic Data Typing. (line 13)
-* Schorr, Andrew <1>: Contributors. (line 128)
+* Schorr, Andrew <1>: Contributors. (line 131)
* Schorr, Andrew: Acknowledgments. (line 60)
* Schreiber, Bert: Acknowledgments. (line 38)
* Schreiber, Rita: Acknowledgments. (line 38)
@@ -32516,6 +32531,7 @@ Index
* source code, mawk: Other Versions. (line 44)
* source code, mixing: Options. (line 117)
* source code, pawk: Other Versions. (line 78)
+* source code, pawk (Python version): Other Versions. (line 124)
* source code, QSE Awk: Other Versions. (line 130)
* source code, QuikTrim Awk: Other Versions. (line 134)
* source code, Solaris awk: Other Versions. (line 96)
@@ -32816,7 +32832,7 @@ Index
* xgettext utility: String Extraction. (line 13)
* XOR bitwise operation: Bitwise Functions. (line 6)
* xor() function (gawk): Bitwise Functions. (line 55)
-* Yawitz, Efraim: Contributors. (line 126)
+* Yawitz, Efraim: Contributors. (line 129)
* Zaretskii, Eli <1>: Bugs. (line 70)
* Zaretskii, Eli <2>: Contributors. (line 55)
* Zaretskii, Eli: Acknowledgments. (line 60)
@@ -33317,62 +33333,62 @@ Ref: Ranges and Locales-Footnote-11034931
Ref: Ranges and Locales-Footnote-21034958
Ref: Ranges and Locales-Footnote-31035192
Node: Contributors1035413
-Node: Installation1040558
-Node: Gawk Distribution1041452
-Node: Getting1041936
-Node: Extracting1042762
-Node: Distribution contents1044454
-Node: Unix Installation1050159
-Node: Quick Installation1050776
-Node: Additional Configuration Options1053222
-Node: Configuration Philosophy1054958
-Node: Non-Unix Installation1057312
-Node: PC Installation1057770
-Node: PC Binary Installation1059069
-Node: PC Compiling1060917
-Node: PC Testing1063861
-Node: PC Using1065037
-Node: Cygwin1069205
-Node: MSYS1070014
-Node: VMS Installation1070528
-Node: VMS Compilation1071292
-Ref: VMS Compilation-Footnote-11072544
-Node: VMS Dynamic Extensions1072602
-Node: VMS Installation Details1073975
-Node: VMS Running1076226
-Node: VMS GNV1079060
-Node: VMS Old Gawk1079783
-Node: Bugs1080253
-Node: Other Versions1084171
-Node: Notes1090255
-Node: Compatibility Mode1091055
-Node: Additions1091838
-Node: Accessing The Source1092765
-Node: Adding Code1094205
-Node: New Ports1100250
-Node: Derived Files1104385
-Ref: Derived Files-Footnote-11109706
-Ref: Derived Files-Footnote-21109740
-Ref: Derived Files-Footnote-31110340
-Node: Future Extensions1110438
-Node: Implementation Limitations1111021
-Node: Extension Design1112273
-Node: Old Extension Problems1113427
-Ref: Old Extension Problems-Footnote-11114935
-Node: Extension New Mechanism Goals1114992
-Ref: Extension New Mechanism Goals-Footnote-11118357
-Node: Extension Other Design Decisions1118543
-Node: Extension Future Growth1120649
-Node: Old Extension Mechanism1121485
-Node: Basic Concepts1123225
-Node: Basic High Level1123906
-Ref: figure-general-flow1124177
-Ref: figure-process-flow1124776
-Ref: Basic High Level-Footnote-11128005
-Node: Basic Data Typing1128190
-Node: Glossary1131545
-Node: Copying1157007
-Node: GNU Free Documentation License1194564
-Node: Index1219701
+Node: Installation1040640
+Node: Gawk Distribution1041534
+Node: Getting1042018
+Node: Extracting1042844
+Node: Distribution contents1044536
+Node: Unix Installation1050241
+Node: Quick Installation1050858
+Node: Additional Configuration Options1053304
+Node: Configuration Philosophy1055040
+Node: Non-Unix Installation1057394
+Node: PC Installation1057852
+Node: PC Binary Installation1059151
+Node: PC Compiling1060999
+Node: PC Testing1063943
+Node: PC Using1065119
+Node: Cygwin1069287
+Node: MSYS1070096
+Node: VMS Installation1070610
+Node: VMS Compilation1071374
+Ref: VMS Compilation-Footnote-11072626
+Node: VMS Dynamic Extensions1072684
+Node: VMS Installation Details1074057
+Node: VMS Running1076308
+Node: VMS GNV1079142
+Node: VMS Old Gawk1079865
+Node: Bugs1080335
+Node: Other Versions1084253
+Node: Notes1090337
+Node: Compatibility Mode1091137
+Node: Additions1091920
+Node: Accessing The Source1092847
+Node: Adding Code1094287
+Node: New Ports1100332
+Node: Derived Files1104467
+Ref: Derived Files-Footnote-11109788
+Ref: Derived Files-Footnote-21109822
+Ref: Derived Files-Footnote-31110422
+Node: Future Extensions1110520
+Node: Implementation Limitations1111103
+Node: Extension Design1112355
+Node: Old Extension Problems1113509
+Ref: Old Extension Problems-Footnote-11115017
+Node: Extension New Mechanism Goals1115074
+Ref: Extension New Mechanism Goals-Footnote-11118439
+Node: Extension Other Design Decisions1118625
+Node: Extension Future Growth1120731
+Node: Old Extension Mechanism1121567
+Node: Basic Concepts1123307
+Node: Basic High Level1123988
+Ref: figure-general-flow1124259
+Ref: figure-process-flow1124858
+Ref: Basic High Level-Footnote-11128087
+Node: Basic Data Typing1128272
+Node: Glossary1131627
+Node: Copying1157089
+Node: GNU Free Documentation License1194646
+Node: Index1219783

End Tag Table
diff --git a/doc/gawk.texi b/doc/gawk.texi
index b9bf2170..1b610c42 100644
--- a/doc/gawk.texi
+++ b/doc/gawk.texi
@@ -8359,7 +8359,7 @@ parentheses are necessary if any of the item expressions use the @samp{>}
relational operator; otherwise, it can be confused with an output redirection
(@pxref{Redirection}).
-@cindex format strings
+@cindex format specifiers
The difference between @code{printf} and @code{print} is the @var{format}
argument. This is an expression whose value is taken as a string; it
specifies how to output each of the other arguments. It is called the
@@ -12706,6 +12706,7 @@ the first thing on its line.
@subsection The @code{while} Statement
@cindex @code{while} statement
@cindex loops
+@cindex loops, @code{while}
@cindex loops, See Also @code{while} statement
In programming, a @dfn{loop} is a part of a program that can
@@ -12766,6 +12767,7 @@ program is harder to read without it.
@node Do Statement
@subsection The @code{do}-@code{while} Statement
@cindex @code{do}-@code{while} statement
+@cindex loops, @code{do}-@code{while}
The @code{do} loop is a variation of the @code{while} looping statement.
The @code{do} loop executes the @var{body} once and then repeats the
@@ -12811,6 +12813,7 @@ occasionally is there a real use for a @code{do} statement.
@node For Statement
@subsection The @code{for} Statement
@cindex @code{for} statement
+@cindex loops, @code{for}, iterative
The @code{for} statement makes it more convenient to count iterations of a
loop. The general form of the @code{for} statement looks like this:
@@ -12983,6 +12986,7 @@ it is not available.
@subsection The @code{break} Statement
@cindex @code{break} statement
@cindex loops, exiting
+@cindex loops, @code{break} statement and
The @code{break} statement jumps out of the innermost @code{for},
@code{while}, or @code{do} loop that encloses it. The following example
@@ -14628,6 +14632,7 @@ END @{
@subsection Scanning All Elements of an Array
@cindex elements in arrays, scanning
@cindex arrays, scanning
+@cindex loops, @code{for}, array scanning
In programs that use arrays, it is often necessary to use a loop that
executes once for each element of an array. In other languages, where
@@ -29318,7 +29323,7 @@ the problem at hand is often the correct approach in such situations.
@node Arbitrary Precision Integers
@section Arbitrary Precision Integer Arithmetic with @command{gawk}
-@cindex integer, arbitrary precision
+@cindex integers, arbitrary precision
If one of the options @option{--bignum} or @option{-M} is specified,
@command{gawk} performs all
@@ -33068,6 +33073,7 @@ The @code{time} extension described earlier (@pxref{Extension Sample
Time}) was originally from this project but has been moved in to the
main @command{gawk} distribution.
+@cindex @command{git} utility
You can check out the code for the @code{gawkextlib} project
using the @uref{http://git-scm.com, GIT} distributed source
code control system. The command is as follows:
@@ -34698,6 +34704,11 @@ environments.
Anders Wallin helped keep the VMS port going for several years.
@item
+@cindex Gordon, Assaf
+Assaf Gordon contributed the code to implement the
+@option{--sandbox} option.
+
+@item
@cindex Haque, John
John Haque made the following contributions:
@@ -36110,6 +36121,7 @@ It is available in several archive formats:
@uref{http://www.cs.princeton.edu/~bwk/btl.mirror/awk.zip}
@end table
+@cindex @command{git} utility
You can also retrieve it from GitHub:
@example
@@ -36238,6 +36250,7 @@ This is an embeddable @command{awk} interpreter derived from
@uref{http://repo.hu/projects/libmawk/}.
@item @code{pawk}
+@cindex source code, @command{pawk} (Python version)
@cindex @code{pawk}, @command{awk}-like facilities for Python
This is a Python module that claims to bring @command{awk}-like
features to Python. See @uref{https://github.com/alecthomas/pawk}
@@ -36343,6 +36356,7 @@ As @command{gawk} is Free Software, the source code is always available.
@ref{Gawk Distribution}, describes how to get and build the formal,
released versions of @command{gawk}.
+@cindex @command{git} utility
However, if you want to modify @command{gawk} and contribute back your
changes, you will probably wish to work with the development version.
To do so, you will need to access the @command{gawk} source code
@@ -36518,6 +36532,7 @@ If possible, please update the @command{man} page as well.
You will also have to sign paperwork for your documentation changes.
+@cindex @command{git} utility
@item
Submit changes as unified diffs.
Use @samp{diff -u -r -N} to compare
@@ -36651,6 +36666,8 @@ coding style and brace layout that suits your taste.
@node Derived Files
@appendixsubsec Why Generated Files Are Kept In @command{git}
+@c STARTOFRANGE gawkgit
+@cindex @command{git}, use of for @command{gawk} source code
@c From emails written March 22, 2012, to the gawk developers list.
If you look at the @command{gawk} source in the @command{git}
@@ -36830,7 +36847,7 @@ wget http://git.savannah.gnu.org/cgit/gawk.git/snapshot/gawk-@var{branchname}.ta
@noindent
to retrieve a snapshot of the given branch.
-
+@c ENDOFRANGE gawkgit
@node Future Extensions
@appendixsec Probable Future Extensions
@@ -37409,7 +37426,7 @@ better written in another language.
You can get it from @uref{http://awk.info/?awk100/aaa}.
@cindex Ada programming language
-@cindex Programming languages, Ada
+@cindex programming languages, Ada
@item Ada
A programming language originally defined by the U.S.@: Department of
Defense for embedded programming. It was designed to enforce good
@@ -37892,7 +37909,7 @@ information about the name of the organization and its language-independent
three-letter acronym.
@cindex Java programming language
-@cindex Programming languages, Java
+@cindex programming languages, Java
@item Java
A modern programming language originally developed by Sun Microsystems
(now Oracle) supporting Object-Oriented programming. Although usually
diff --git a/doc/gawktexi.in b/doc/gawktexi.in
index 6054ed40..e9a31935 100644
--- a/doc/gawktexi.in
+++ b/doc/gawktexi.in
@@ -7977,7 +7977,7 @@ parentheses are necessary if any of the item expressions use the @samp{>}
relational operator; otherwise, it can be confused with an output redirection
(@pxref{Redirection}).
-@cindex format strings
+@cindex format specifiers
The difference between @code{printf} and @code{print} is the @var{format}
argument. This is an expression whose value is taken as a string; it
specifies how to output each of the other arguments. It is called the
@@ -12083,6 +12083,7 @@ the first thing on its line.
@subsection The @code{while} Statement
@cindex @code{while} statement
@cindex loops
+@cindex loops, @code{while}
@cindex loops, See Also @code{while} statement
In programming, a @dfn{loop} is a part of a program that can
@@ -12143,6 +12144,7 @@ program is harder to read without it.
@node Do Statement
@subsection The @code{do}-@code{while} Statement
@cindex @code{do}-@code{while} statement
+@cindex loops, @code{do}-@code{while}
The @code{do} loop is a variation of the @code{while} looping statement.
The @code{do} loop executes the @var{body} once and then repeats the
@@ -12188,6 +12190,7 @@ occasionally is there a real use for a @code{do} statement.
@node For Statement
@subsection The @code{for} Statement
@cindex @code{for} statement
+@cindex loops, @code{for}, iterative
The @code{for} statement makes it more convenient to count iterations of a
loop. The general form of the @code{for} statement looks like this:
@@ -12360,6 +12363,7 @@ it is not available.
@subsection The @code{break} Statement
@cindex @code{break} statement
@cindex loops, exiting
+@cindex loops, @code{break} statement and
The @code{break} statement jumps out of the innermost @code{for},
@code{while}, or @code{do} loop that encloses it. The following example
@@ -13959,6 +13963,7 @@ END @{
@subsection Scanning All Elements of an Array
@cindex elements in arrays, scanning
@cindex arrays, scanning
+@cindex loops, @code{for}, array scanning
In programs that use arrays, it is often necessary to use a loop that
executes once for each element of an array. In other languages, where
@@ -28459,7 +28464,7 @@ the problem at hand is often the correct approach in such situations.
@node Arbitrary Precision Integers
@section Arbitrary Precision Integer Arithmetic with @command{gawk}
-@cindex integer, arbitrary precision
+@cindex integers, arbitrary precision
If one of the options @option{--bignum} or @option{-M} is specified,
@command{gawk} performs all
@@ -32209,6 +32214,7 @@ The @code{time} extension described earlier (@pxref{Extension Sample
Time}) was originally from this project but has been moved in to the
main @command{gawk} distribution.
+@cindex @command{git} utility
You can check out the code for the @code{gawkextlib} project
using the @uref{http://git-scm.com, GIT} distributed source
code control system. The command is as follows:
@@ -33839,6 +33845,11 @@ environments.
Anders Wallin helped keep the VMS port going for several years.
@item
+@cindex Gordon, Assaf
+Assaf Gordon contributed the code to implement the
+@option{--sandbox} option.
+
+@item
@cindex Haque, John
John Haque made the following contributions:
@@ -35251,6 +35262,7 @@ It is available in several archive formats:
@uref{http://www.cs.princeton.edu/~bwk/btl.mirror/awk.zip}
@end table
+@cindex @command{git} utility
You can also retrieve it from GitHub:
@example
@@ -35379,6 +35391,7 @@ This is an embeddable @command{awk} interpreter derived from
@uref{http://repo.hu/projects/libmawk/}.
@item @code{pawk}
+@cindex source code, @command{pawk} (Python version)
@cindex @code{pawk}, @command{awk}-like facilities for Python
This is a Python module that claims to bring @command{awk}-like
features to Python. See @uref{https://github.com/alecthomas/pawk}
@@ -35484,6 +35497,7 @@ As @command{gawk} is Free Software, the source code is always available.
@ref{Gawk Distribution}, describes how to get and build the formal,
released versions of @command{gawk}.
+@cindex @command{git} utility
However, if you want to modify @command{gawk} and contribute back your
changes, you will probably wish to work with the development version.
To do so, you will need to access the @command{gawk} source code
@@ -35659,6 +35673,7 @@ If possible, please update the @command{man} page as well.
You will also have to sign paperwork for your documentation changes.
+@cindex @command{git} utility
@item
Submit changes as unified diffs.
Use @samp{diff -u -r -N} to compare
@@ -35792,6 +35807,8 @@ coding style and brace layout that suits your taste.
@node Derived Files
@appendixsubsec Why Generated Files Are Kept In @command{git}
+@c STARTOFRANGE gawkgit
+@cindex @command{git}, use of for @command{gawk} source code
@c From emails written March 22, 2012, to the gawk developers list.
If you look at the @command{gawk} source in the @command{git}
@@ -35971,7 +35988,7 @@ wget http://git.savannah.gnu.org/cgit/gawk.git/snapshot/gawk-@var{branchname}.ta
@noindent
to retrieve a snapshot of the given branch.
-
+@c ENDOFRANGE gawkgit
@node Future Extensions
@appendixsec Probable Future Extensions
@@ -36550,7 +36567,7 @@ better written in another language.
You can get it from @uref{http://awk.info/?awk100/aaa}.
@cindex Ada programming language
-@cindex Programming languages, Ada
+@cindex programming languages, Ada
@item Ada
A programming language originally defined by the U.S.@: Department of
Defense for embedded programming. It was designed to enforce good
@@ -37033,7 +37050,7 @@ information about the name of the organization and its language-independent
three-letter acronym.
@cindex Java programming language
-@cindex Programming languages, Java
+@cindex programming languages, Java
@item Java
A modern programming language originally developed by Sun Microsystems
(now Oracle) supporting Object-Oriented programming. Although usually
diff --git a/helpers/ChangeLog b/helpers/ChangeLog
index 14affebc..e5087f37 100644
--- a/helpers/ChangeLog
+++ b/helpers/ChangeLog
@@ -1,3 +1,8 @@
+2014-03-10 Arnold D. Robbins <arnold@skeeve.com>
+
+ * quoteconvert2.sh: Use .UTF-8 locales per request from
+ Michal Jaegermann.
+
2014-03-08 Arnold D. Robbins <arnold@skeeve.com>
* quoteconvert2.sh, tryfmt.c, scanfmt.c: New files.
diff --git a/helpers/quoteconvert2.sh b/helpers/quoteconvert2.sh
index e33a4d05..63750a37 100755
--- a/helpers/quoteconvert2.sh
+++ b/helpers/quoteconvert2.sh
@@ -17,17 +17,17 @@ fi
llist="
C
en_US
-en_US.utf8
+en_US.UTF-8
de_DE
-de_DE.utf8
+de_DE.UTF-8
fr_FR
-fr_FR.utf8
+fr_FR.UTF-8
pt_PT
-pt_PT.utf8
+pt_PT.UTF-8
pt_BR
-pt_BR.utf8
+pt_BR.UTF-8
ru_RU
-ru_RU.utf8
+ru_RU.UTF-8
pl_PX
"
diff --git a/regex_internal.c b/regex_internal.c
index 10dd6e00..056cff3d 100644
--- a/regex_internal.c
+++ b/regex_internal.c
@@ -320,12 +320,11 @@ build_wcs_upper_buffer (re_string_t *pstr)
+ byte_idx), remain_len, &pstr->cur_state);
if (BE (mbclen + 2 > 2, 1))
{
- wchar_t wcu = wc;
- if (iswlower (wc))
+ wchar_t wcu = towupper (wc);
+ if (wcu != wc)
{
size_t mbcdlen;
- wcu = towupper (wc);
mbcdlen = wcrtomb (buf, wcu, &prev_st);
if (BE (mbclen == mbcdlen, 1))
memcpy (pstr->mbs + byte_idx, buf, mbclen);
@@ -390,12 +389,11 @@ build_wcs_upper_buffer (re_string_t *pstr)
mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
if (BE (mbclen + 2 > 2, 1))
{
- wchar_t wcu = wc;
- if (iswlower (wc))
+ wchar_t wcu = towupper (wc);
+ if (wcu != wc)
{
size_t mbcdlen;
- wcu = towupper (wc);
mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
if (BE (mbclen == mbcdlen, 1))
memcpy (pstr->mbs + byte_idx, buf, mbclen);
@@ -547,10 +545,7 @@ build_upper_buffer (re_string_t *pstr)
int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
if (BE (pstr->trans != NULL, 0))
ch = pstr->trans[ch];
- if (islower (ch))
- pstr->mbs[char_idx] = toupper (ch);
- else
- pstr->mbs[char_idx] = ch;
+ pstr->mbs[char_idx] = toupper (ch);
}
pstr->valid_len = char_idx;
pstr->valid_raw_len = char_idx;