about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Arnold D. Robbins <arnold@skeeve.com> 2016-08-25 22:14:15 +0300
committer Arnold D. Robbins <arnold@skeeve.com> 2016-08-25 22:14:15 +0300
commit b03d089e9b87c4e64bd539a1703e740923a67aa4 (patch)
tree c7351e0b46c45d282eba64e478c99c0771a055a1
parent e0dd835cc155c900ca9725a0d36eb0f5a856d9bf (diff)
parent 00682d87a1a1c0535c0fa5adb27867578dc76d49 (diff)
download egawk-b03d089e9b87c4e64bd539a1703e740923a67aa4.tar.gz
egawk-b03d089e9b87c4e64bd539a1703e740923a67aa4.tar.bz2
egawk-b03d089e9b87c4e64bd539a1703e740923a67aa4.zip
Merge branch 'master' into feature/typed-regex
-rw-r--r-- ChangeLog 33
-rw-r--r-- awk.h 13
-rw-r--r-- debug.c 2
-rw-r--r-- dfa.c 393
-rw-r--r-- doc/ChangeLog 6
-rw-r--r-- doc/gawk.info 830
-rw-r--r-- doc/gawk.texi 42
-rw-r--r-- doc/gawktexi.in 42
-rw-r--r-- eval.c 15
-rw-r--r-- extension/configure.ac 2
-rw-r--r-- interpret.h 32
-rw-r--r-- re.c 57
12 files changed, 761 insertions, 706 deletions
diff --git a/ChangeLog b/ChangeLog
index 54242f64..75d1bd37 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,38 @@
2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
+ POSIX now says use strcmp for == and !=. Thanks to Chet Ramey
+ for pointing me at the change. Make it so:
+
+ * awk.h (cmp_nodes): New 3rd param indicating strcmp, not strcoll.
+ * debug.c (cmp_val): Update call to cmp_nodes.
+ * eval.c (cmp_nodes): New 3rd param indicating strcmp, not strcoll.
+ Adjust code and all callers.
+ (scalar_cmp_t): New enum type. Used in ...
+ (cmp_scalars): ... in order to call cmp_nodes correctly.
+ * interpret.h: Use the enum type in calls to cmp_scalars.
+ * re.c (re_update): Adjust call to cmp_nodes.
+
+2016-08-25 Norihiro Tanaka <noritnk@kcn.ne.jp>
+
+ * awk.h (struct Regexp): Remove dfa member; dfareg is now used instead.
+ All referrers changed.
+ * re.c (research): Arrange caller of dfaexec and research.
+ * (avoid_dfa): Removed. All callers changed.
+ * awk.h (avoid_dfa): Removed.
+
+ Other changes by Arnold Robbins:
+
+ * awk.h (struct Regexp): Change various boolean members to bool.
+ (RE_NO_FLAGS): New #define.
+ * interpret.h: Use RE_NO_FLAGS instead of zero.
+ * re.c (research): Prettify the logic a little bit.
+
+2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
+
+ * dfa.c: Sync with grep.
+
+2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
+
* 4.1.4: Release tar ball made.
2016-08-23 Arnold D. Robbins <arnold@skeeve.com>
diff --git a/awk.h b/awk.h
index f11105ba..89106588 100644
--- a/awk.h
+++ b/awk.h
@@ -206,11 +206,10 @@ typedef struct Regexp {
struct re_pattern_buffer pat;
struct re_registers regs;
struct dfa *dfareg;
- short dfa;
- short has_anchor; /* speed up of avoid_dfa kludge, temporary */
- short non_empty; /* for use in fpat_parse_field */
- short has_meta; /* re has meta chars so (probably) isn't simple string */
- short maybe_long; /* re has meta chars that can match long text */
+ bool has_anchor; /* re has anchors which dfa avoids */
+ bool non_empty; /* for use in fpat_parse_field */
+ bool has_meta; /* re has meta chars so (probably) isn't simple string */
+ bool maybe_long; /* re has meta chars that can match long text */
} Regexp;
#define RESTART(rp,s) (rp)->regs.start[0]
#define REEND(rp,s) (rp)->regs.end[0]
@@ -219,6 +218,7 @@ typedef struct Regexp {
#define NUMSUBPATS(rp,s) (rp)->regs.num_regs
/* regexp matching flags: */
+#define RE_NO_FLAGS 0 /* empty flags */
#define RE_NEED_START 1 /* need to know start/end of match */
#define RE_NO_BOL 2 /* not allowed to match ^ in regexp */
@@ -1443,7 +1443,7 @@ extern int sanitize_exit_status(int status);
extern void PUSH_CODE(INSTRUCTION *cp);
extern INSTRUCTION *POP_CODE(void);
extern void init_interpret(void);
-extern int cmp_nodes(NODE *t1, NODE *t2);
+extern int cmp_nodes(NODE *t1, NODE *t2, bool use_strcmp);
extern int cmp_awknums(const NODE *t1, const NODE *t2);
extern void set_IGNORECASE(void);
extern void set_OFS(void);
@@ -1651,7 +1651,6 @@ extern void reg_error(const char *s);
extern Regexp *re_update(NODE *t);
extern void resyntax(int syntax);
extern void resetup(void);
-extern int avoid_dfa(NODE *re, char *str, size_t len);
extern int reisstring(const char *text, size_t len, Regexp *re, const char *buf);
extern int get_numbase(const char *str, bool use_locale);
diff --git a/debug.c b/debug.c
index a0830621..c3d149d6 100644
--- a/debug.c
+++ b/debug.c
@@ -1670,7 +1670,7 @@ cmp_val(struct list_item *w, NODE *old, NODE *new)
if (new->type == Node_var_array) /* 5 */
return true;
- return cmp_nodes(old, new); /* 4 */
+ return cmp_nodes(old, new, true); /* 4 */
}
/* watchpoint_triggered --- check if we should stop at this watchpoint;
diff --git a/dfa.c b/dfa.c
index cb11043e..85cb46ad 100644
--- a/dfa.c
+++ b/dfa.c
@@ -387,8 +387,8 @@ struct regex_syntax
meaning of the @#%!@#%^!@ syntax bits. */
struct lexer_state
{
- char const *lexptr; /* Pointer to next input character. */
- size_t lexleft; /* Number of characters remaining. */
+ char const *ptr; /* Pointer to next input character. */
+ size_t left; /* Number of characters remaining. */
token lasttok; /* Previous token returned; initially END. */
size_t parens; /* Count of outstanding left parens. */
int minrep, maxrep; /* Repeat counts for {m,n}. */
@@ -429,10 +429,10 @@ struct dfa
size_t calloc; /* Number of charclasses allocated. */
/* Scanner state */
- struct lexer_state lexstate;
+ struct lexer_state lex;
/* Parser state */
- struct parser_state parsestate;
+ struct parser_state parse;
/* Fields filled by the parser. */
token *tokens; /* Postfix parse array. */
@@ -910,7 +910,7 @@ using_simple_locale (struct dfa const *dfa)
&& '}' == 125 && '~' == 126)
};
- return (!native_c_charset || dfa->multibyte) ? false : unibyte_c;
+ return (native_c_charset & !dfa->multibyte) | unibyte_c;
}
/* Fetch the next lexical input character. Set C (of type int) to the
@@ -922,23 +922,23 @@ using_simple_locale (struct dfa const *dfa)
otherwise. */
# define FETCH_WC(dfa, c, wc, eoferr) \
do { \
- if (! dfa->lexstate.lexleft) \
+ if (! (dfa)->lex.left) \
{ \
if ((eoferr) != 0) \
dfaerror (eoferr); \
else \
- return dfa->lexstate.lasttok = END; \
+ return (dfa)->lex.lasttok = END; \
} \
else \
{ \
wint_t _wc; \
- size_t nbytes = mbs_to_wchar (&_wc, dfa->lexstate.lexptr, \
- dfa->lexstate.lexleft, dfa); \
- dfa->lexstate.cur_mb_len = nbytes; \
+ size_t nbytes = mbs_to_wchar (&_wc, (dfa)->lex.ptr, \
+ (dfa)->lex.left, dfa); \
+ (dfa)->lex.cur_mb_len = nbytes; \
(wc) = _wc; \
- (c) = nbytes == 1 ? to_uchar (*dfa->lexstate.lexptr) : EOF; \
- dfa->lexstate.lexptr += nbytes; \
- dfa->lexstate.lexleft -= nbytes; \
+ (c) = nbytes == 1 ? to_uchar ((dfa)->lex.ptr[0]) : EOF; \
+ (dfa)->lex.ptr += nbytes; \
+ (dfa)->lex.left -= nbytes; \
} \
} while (false)
@@ -1112,8 +1112,8 @@ parse_bracket_exp (struct dfa *dfa)
for (;;)
{
FETCH_WC (dfa, c, wc, _("unbalanced ["));
- if ((c == c1 && *dfa->lexstate.lexptr == ']')
- || dfa->lexstate.lexleft == 0)
+ if (dfa->lex.left == 0
+ || (c == c1 && dfa->lex.ptr[0] == ']'))
break;
if (len < MAX_BRACKET_STRING_LEN)
str[len++] = c;
@@ -1133,8 +1133,8 @@ parse_bracket_exp (struct dfa *dfa)
{
char const *class
= (dfa->syntax.case_fold && (STREQ (str, "upper")
- || STREQ (str, "lower")) ?
- "alpha" : str);
+ || STREQ (str, "lower"))
+ ? "alpha" : str);
const struct dfa_ctype *pred = find_pred (class);
if (!pred)
dfaerror (_("invalid character class"));
@@ -1174,7 +1174,7 @@ parse_bracket_exp (struct dfa *dfa)
/* A bracket expression like [a-[.aa.]] matches an unknown set.
Treat it like [-a[.aa.]] while parsing it, and
remember that the set is unknown. */
- if (c2 == '[' && *dfa->lexstate.lexptr == '.')
+ if (c2 == '[' && dfa->lex.ptr[0] == '.')
{
known_bracket_exp = false;
c2 = ']';
@@ -1184,8 +1184,8 @@ parse_bracket_exp (struct dfa *dfa)
{
/* In the case [x-], the - is an ordinary hyphen,
which is left in c1, the lookahead character. */
- dfa->lexstate.lexptr -= dfa->lexstate.cur_mb_len;
- dfa->lexstate.lexleft += dfa->lexstate.cur_mb_len;
+ dfa->lex.ptr -= dfa->lex.cur_mb_len;
+ dfa->lex.left += dfa->lex.cur_mb_len;
}
else
{
@@ -1283,19 +1283,27 @@ parse_bracket_exp (struct dfa *dfa)
return CSET + dfa_charclass_index (dfa, ccl);
}
-#define PUSH_LEX_STATE(s) \
- do \
- { \
- char const *lexptr_saved = dfa->lexstate.lexptr; \
- size_t lexleft_saved = dfa->lexstate.lexleft; \
- dfa->lexstate.lexptr = (s); \
- dfa->lexstate.lexleft = strlen (dfa->lexstate.lexptr)
+struct lexptr
+{
+ char const *ptr;
+ size_t left;
+};
+
+static void
+push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
+{
+ ls->ptr = dfa->lex.ptr;
+ ls->left = dfa->lex.left;
+ dfa->lex.ptr = s;
+ dfa->lex.left = strlen (s);
+}
-#define POP_LEX_STATE() \
- dfa->lexstate.lexptr = lexptr_saved; \
- dfa->lexstate.lexleft = lexleft_saved; \
- } \
- while (false)
+static void
+pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
+{
+ dfa->lex.ptr = ls->ptr;
+ dfa->lex.left = ls->left;
+}
static token
lex (struct dfa *dfa)
@@ -1313,14 +1321,14 @@ lex (struct dfa *dfa)
"if (backslash) ...". */
for (i = 0; i < 2; ++i)
{
- FETCH_WC (dfa, c, dfa->lexstate.wctok, NULL);
+ FETCH_WC (dfa, c, dfa->lex.wctok, NULL);
switch (c)
{
case '\\':
if (backslash)
goto normal_char;
- if (dfa->lexstate.lexleft == 0)
+ if (dfa->lex.left == 0)
dfaerror (_("unfinished \\ escape"));
backslash = true;
break;
@@ -1329,28 +1337,29 @@ lex (struct dfa *dfa)
if (backslash)
goto normal_char;
if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
- || dfa->lexstate.lasttok == END || dfa->lexstate.lasttok == LPAREN
- || dfa->lexstate.lasttok == OR)
- return dfa->lexstate.lasttok = BEGLINE;
+ || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
+ || dfa->lex.lasttok == OR)
+ return dfa->lex.lasttok = BEGLINE;
goto normal_char;
case '$':
if (backslash)
goto normal_char;
if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
- || dfa->lexstate.lexleft == 0
- || (dfa->syntax.syntax_bits & RE_NO_BK_PARENS
- ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == ')'
- : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\'
- && dfa->lexstate.lexptr[1] == ')')
- || (dfa->syntax.syntax_bits & RE_NO_BK_VBAR
- ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == '|'
- : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\'
- && dfa->lexstate.lexptr[1] == '|')
+ || dfa->lex.left == 0
+ || ((dfa->lex.left
+ > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
+ && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
+ & (dfa->lex.ptr[0] == '\\')]
+ == ')'))
+ || ((dfa->lex.left
+ > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
+ && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
+ & (dfa->lex.ptr[0] == '\\')]
+ == '|'))
|| ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
- && dfa->lexstate.lexleft > 0
- && *dfa->lexstate.lexptr == '\n'))
- return dfa->lexstate.lasttok = ENDLINE;
+ && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
+ return dfa->lex.lasttok = ENDLINE;
goto normal_char;
case '1':
@@ -1364,8 +1373,8 @@ lex (struct dfa *dfa)
case '9':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
{
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = BACKREF;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = BACKREF;
}
goto normal_char;
@@ -1373,7 +1382,7 @@ lex (struct dfa *dfa)
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
{
/* FIXME: should be beginning of string */
- return dfa->lexstate.lasttok = BEGLINE;
+ return dfa->lex.lasttok = BEGLINE;
}
goto normal_char;
@@ -1381,28 +1390,28 @@ lex (struct dfa *dfa)
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
{
/* FIXME: should be end of string */
- return dfa->lexstate.lasttok = ENDLINE;
+ return dfa->lex.lasttok = ENDLINE;
}
goto normal_char;
case '<':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lexstate.lasttok = BEGWORD;
+ return dfa->lex.lasttok = BEGWORD;
goto normal_char;
case '>':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lexstate.lasttok = ENDWORD;
+ return dfa->lex.lasttok = ENDWORD;
goto normal_char;
case 'b':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lexstate.lasttok = LIMWORD;
+ return dfa->lex.lasttok = LIMWORD;
goto normal_char;
case 'B':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lexstate.lasttok = NOTLIMWORD;
+ return dfa->lex.lasttok = NOTLIMWORD;
goto normal_char;
case '?':
@@ -1411,17 +1420,17 @@ lex (struct dfa *dfa)
if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
- && dfa->lexstate.laststart)
+ && dfa->lex.laststart)
goto normal_char;
- return dfa->lexstate.lasttok = QMARK;
+ return dfa->lex.lasttok = QMARK;
case '*':
if (backslash)
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
- && dfa->lexstate.laststart)
+ && dfa->lex.laststart)
goto normal_char;
- return dfa->lexstate.lasttok = STAR;
+ return dfa->lex.lasttok = STAR;
case '+':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
@@ -1429,9 +1438,9 @@ lex (struct dfa *dfa)
if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
- && dfa->lexstate.laststart)
+ && dfa->lex.laststart)
goto normal_char;
- return dfa->lexstate.lasttok = PLUS;
+ return dfa->lex.lasttok = PLUS;
case '{':
if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
@@ -1439,7 +1448,7 @@ lex (struct dfa *dfa)
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
- && dfa->lexstate.laststart)
+ && dfa->lex.laststart)
goto normal_char;
/* Cases:
@@ -1449,86 +1458,79 @@ lex (struct dfa *dfa)
{,} - 0 to infinity (same as '*')
{M,N} - M through N */
{
- char const *p = dfa->lexstate.lexptr;
- char const *lim = p + dfa->lexstate.lexleft;
- dfa->lexstate.minrep = dfa->lexstate.maxrep = -1;
+ char const *p = dfa->lex.ptr;
+ char const *lim = p + dfa->lex.left;
+ dfa->lex.minrep = dfa->lex.maxrep = -1;
for (; p != lim && ISASCIIDIGIT (*p); p++)
- {
- if (dfa->lexstate.minrep < 0)
- dfa->lexstate.minrep = *p - '0';
- else
- dfa->lexstate.minrep = MIN (RE_DUP_MAX + 1,
- (dfa->lexstate.minrep
- * 10 + *p - '0'));
- }
+ dfa->lex.minrep = (dfa->lex.minrep < 0
+ ? *p - '0'
+ : MIN (RE_DUP_MAX + 1,
+ dfa->lex.minrep * 10 + *p - '0'));
if (p != lim)
{
if (*p != ',')
- dfa->lexstate.maxrep = dfa->lexstate.minrep;
+ dfa->lex.maxrep = dfa->lex.minrep;
else
{
- if (dfa->lexstate.minrep < 0)
- dfa->lexstate.minrep = 0;
+ if (dfa->lex.minrep < 0)
+ dfa->lex.minrep = 0;
while (++p != lim && ISASCIIDIGIT (*p))
- {
- if (dfa->lexstate.maxrep < 0)
- dfa->lexstate.maxrep = *p - '0';
- else
- dfa->lexstate.maxrep = MIN (RE_DUP_MAX + 1,
- (dfa->lexstate.maxrep
- * 10 + *p - '0'));
- }
+ dfa->lex.maxrep
+ = (dfa->lex.maxrep < 0
+ ? *p - '0'
+ : MIN (RE_DUP_MAX + 1,
+ dfa->lex.maxrep * 10 + *p - '0'));
}
}
if (! ((! backslash || (p != lim && *p++ == '\\'))
&& p != lim && *p++ == '}'
- && 0 <= dfa->lexstate.minrep
- && (dfa->lexstate.maxrep < 0
- || dfa->lexstate.minrep <= dfa->lexstate.maxrep)))
+ && 0 <= dfa->lex.minrep
+ && (dfa->lex.maxrep < 0
+ || dfa->lex.minrep <= dfa->lex.maxrep)))
{
if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
goto normal_char;
dfaerror (_("invalid content of \\{\\}"));
}
- if (RE_DUP_MAX < dfa->lexstate.maxrep)
+ if (RE_DUP_MAX < dfa->lex.maxrep)
dfaerror (_("regular expression too big"));
- dfa->lexstate.lexptr = p;
- dfa->lexstate.lexleft = lim - p;
+ dfa->lex.ptr = p;
+ dfa->lex.left = lim - p;
}
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = REPMN;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = REPMN;
case '|':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
goto normal_char;
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
goto normal_char;
- dfa->lexstate.laststart = true;
- return dfa->lexstate.lasttok = OR;
+ dfa->lex.laststart = true;
+ return dfa->lex.lasttok = OR;
case '\n':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
|| backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
goto normal_char;
- dfa->lexstate.laststart = true;
- return dfa->lexstate.lasttok = OR;
+ dfa->lex.laststart = true;
+ return dfa->lex.lasttok = OR;
case '(':
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
goto normal_char;
- ++dfa->lexstate.parens;
- dfa->lexstate.laststart = true;
- return dfa->lexstate.lasttok = LPAREN;
+ dfa->lex.parens++;
+ dfa->lex.laststart = true;
+ return dfa->lex.lasttok = LPAREN;
case ')':
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
goto normal_char;
- if (dfa->lexstate.parens == 0
+ if (dfa->lex.parens == 0
&& dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
goto normal_char;
- --dfa->lexstate.parens;
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = RPAREN;
+ dfa->lex.parens--;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = RPAREN;
case '.':
if (backslash)
@@ -1537,8 +1539,8 @@ lex (struct dfa *dfa)
{
/* In multibyte environment period must match with a single
character not a byte. So we use ANYCHAR. */
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = ANYCHAR;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = ANYCHAR;
}
zeroset (ccl);
notset (ccl);
@@ -1546,8 +1548,8 @@ lex (struct dfa *dfa)
clrbit ('\n', ccl);
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', ccl);
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
case 's':
case 'S':
@@ -1561,9 +1563,8 @@ lex (struct dfa *dfa)
setbit (c2, ccl);
if (c == 'S')
notset (ccl);
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa,
- ccl);
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1572,14 +1573,15 @@ lex (struct dfa *dfa)
/* \s and \S are documented to be equivalent to [[:space:]] and
[^[:space:]] respectively, so tell the lexer to process those
strings, each minus its "already processed" '['. */
- PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]");
-
- dfa->lexstate.lasttok = parse_bracket_exp (dfa);
-
- POP_LEX_STATE ();
+ {
+ struct lexptr ls;
+ push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
+ dfa->lex.lasttok = parse_bracket_exp (dfa);
+ pop_lex_state (dfa, &ls);
+ }
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok;
case 'w':
case 'W':
@@ -1594,9 +1596,8 @@ lex (struct dfa *dfa)
setbit (c2, ccl);
if (c == 'W')
notset (ccl);
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa,
- ccl);
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1605,38 +1606,38 @@ lex (struct dfa *dfa)
/* \w and \W are documented to be equivalent to [_[:alnum:]] and
[^_[:alnum:]] respectively, so tell the lexer to process those
strings, each minus its "already processed" '['. */
- PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]");
-
- dfa->lexstate.lasttok = parse_bracket_exp (dfa);
-
- POP_LEX_STATE ();
+ {
+ struct lexptr ls;
+ push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
+ dfa->lex.lasttok = parse_bracket_exp (dfa);
+ pop_lex_state (dfa, &ls);
+ }
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok;
case '[':
if (backslash)
goto normal_char;
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = parse_bracket_exp (dfa);
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = parse_bracket_exp (dfa);
default:
normal_char:
- dfa->lexstate.laststart = false;
+ dfa->lex.laststart = false;
/* For multibyte character sets, folding is done in atom. Always
return WCHAR. */
if (dfa->multibyte)
- return dfa->lexstate.lasttok = WCHAR;
+ return dfa->lex.lasttok = WCHAR;
if (dfa->syntax.case_fold && isalpha (c))
{
zeroset (ccl);
setbit_case_fold_c (c, ccl);
- return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa,
- ccl);
+ return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
}
- return dfa->lexstate.lasttok = c;
+ return dfa->lex.lasttok = c;
}
}
@@ -1670,21 +1671,21 @@ addtok_mb (struct dfa *dfa, token t, int mbprop)
case CAT:
case OR:
- --dfa->parsestate.depth;
+ dfa->parse.depth--;
break;
case BACKREF:
dfa->fast = false;
/* fallthrough */
default:
- ++dfa->nleaves;
+ dfa->nleaves++;
/* fallthrough */
case EMPTY:
- ++dfa->parsestate.depth;
+ dfa->parse.depth++;
break;
}
- if (dfa->parsestate.depth > dfa->depth)
- dfa->depth = dfa->parsestate.depth;
+ if (dfa->parse.depth > dfa->depth)
+ dfa->depth = dfa->parse.depth;
}
static void addtok_wc (struct dfa *dfa, wint_t wc);
@@ -1741,19 +1742,19 @@ addtok_wc (struct dfa *dfa, wint_t wc)
size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
if (stored_bytes != (size_t) -1)
- dfa->lexstate.cur_mb_len = stored_bytes;
+ dfa->lex.cur_mb_len = stored_bytes;
else
{
/* This is merely stop-gap. buf[0] is undefined, yet skipping
the addtok_mb call altogether can corrupt the heap. */
- dfa->lexstate.cur_mb_len = 1;
+ dfa->lex.cur_mb_len = 1;
buf[0] = 0;
}
- addtok_mb (dfa, buf[0], dfa->lexstate.cur_mb_len == 1 ? 3 : 1);
- for (i = 1; i < dfa->lexstate.cur_mb_len; i++)
+ addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1);
+ for (i = 1; i < dfa->lex.cur_mb_len; i++)
{
- addtok_mb (dfa, buf[i], i == dfa->lexstate.cur_mb_len - 1 ? 2 : 0);
+ addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0);
addtok (dfa, CAT);
}
}
@@ -1854,18 +1855,18 @@ add_utf8_anychar (struct dfa *dfa)
static void
atom (struct dfa *dfa)
{
- if (dfa->parsestate.tok == WCHAR)
+ if (dfa->parse.tok == WCHAR)
{
- if (dfa->lexstate.wctok == WEOF)
+ if (dfa->lex.wctok == WEOF)
addtok (dfa, BACKREF);
else
{
- addtok_wc (dfa, dfa->lexstate.wctok);
+ addtok_wc (dfa, dfa->lex.wctok);
if (dfa->syntax.case_fold)
{
wchar_t folded[CASE_FOLDED_BUFSIZE];
- unsigned int i, n = case_folded_counterparts (dfa->lexstate.wctok,
+ unsigned int i, n = case_folded_counterparts (dfa->lex.wctok,
folded);
for (i = 0; i < n; i++)
{
@@ -1875,9 +1876,9 @@ atom (struct dfa *dfa)
}
}
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
}
- else if (dfa->parsestate.tok == ANYCHAR && using_utf8)
+ else if (dfa->parse.tok == ANYCHAR && using_utf8)
{
/* For UTF-8 expand the period to a series of CSETs that define a valid
UTF-8 character. This avoids using the slow multibyte path. I'm
@@ -1887,26 +1888,25 @@ atom (struct dfa *dfa)
UTF-8: it is the most used, and the structure of the encoding
makes the correctness more obvious. */
add_utf8_anychar (dfa);
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
}
- else if ((dfa->parsestate.tok >= 0 && dfa->parsestate.tok < NOTCHAR)
- || dfa->parsestate.tok >= CSET || dfa->parsestate.tok == BACKREF
- || dfa->parsestate.tok == BEGLINE || dfa->parsestate.tok == ENDLINE
- || dfa->parsestate.tok == BEGWORD || dfa->parsestate.tok == ANYCHAR
- || dfa->parsestate.tok == MBCSET || dfa->parsestate.tok == ENDWORD
- || dfa->parsestate.tok == LIMWORD
- || dfa->parsestate.tok == NOTLIMWORD)
+ else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
+ || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF
+ || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
+ || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR
+ || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD
+ || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)
{
- addtok (dfa, dfa->parsestate.tok);
- dfa->parsestate.tok = lex (dfa);
+ addtok (dfa, dfa->parse.tok);
+ dfa->parse.tok = lex (dfa);
}
- else if (dfa->parsestate.tok == LPAREN)
+ else if (dfa->parse.tok == LPAREN)
{
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
regexp (dfa);
- if (dfa->parsestate.tok != RPAREN)
+ if (dfa->parse.tok != RPAREN)
dfaerror (_("unbalanced ("));
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
}
else
addtok (dfa, EMPTY);
@@ -1954,40 +1954,39 @@ closure (struct dfa *dfa)
size_t tindex, ntokens;
atom (dfa);
- while (dfa->parsestate.tok == QMARK || dfa->parsestate.tok == STAR
- || dfa->parsestate.tok == PLUS || dfa->parsestate.tok == REPMN)
- if (dfa->parsestate.tok == REPMN
- && (dfa->lexstate.minrep || dfa->lexstate.maxrep))
+ while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
+ || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
+ if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
{
ntokens = nsubtoks (dfa, dfa->tindex);
tindex = dfa->tindex - ntokens;
- if (dfa->lexstate.maxrep < 0)
+ if (dfa->lex.maxrep < 0)
addtok (dfa, PLUS);
- if (dfa->lexstate.minrep == 0)
+ if (dfa->lex.minrep == 0)
addtok (dfa, QMARK);
- for (i = 1; i < dfa->lexstate.minrep; ++i)
+ for (i = 1; i < dfa->lex.minrep; i++)
{
copytoks (dfa, tindex, ntokens);
addtok (dfa, CAT);
}
- for (; i < dfa->lexstate.maxrep; ++i)
+ for (; i < dfa->lex.maxrep; i++)
{
copytoks (dfa, tindex, ntokens);
addtok (dfa, QMARK);
addtok (dfa, CAT);
}
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
}
- else if (dfa->parsestate.tok == REPMN)
+ else if (dfa->parse.tok == REPMN)
{
dfa->tindex -= nsubtoks (dfa, dfa->tindex);
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
closure (dfa);
}
else
{
- addtok (dfa, dfa->parsestate.tok);
- dfa->parsestate.tok = lex (dfa);
+ addtok (dfa, dfa->parse.tok);
+ dfa->parse.tok = lex (dfa);
}
}
@@ -1995,8 +1994,8 @@ static void
branch (struct dfa* dfa)
{
closure (dfa);
- while (dfa->parsestate.tok != RPAREN && dfa->parsestate.tok != OR
- && dfa->parsestate.tok >= 0)
+ while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
+ && dfa->parse.tok >= 0)
{
closure (dfa);
addtok (dfa, CAT);
@@ -2007,9 +2006,9 @@ static void
regexp (struct dfa *dfa)
{
branch (dfa);
- while (dfa->parsestate.tok == OR)
+ while (dfa->parse.tok == OR)
{
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
branch (dfa);
addtok (dfa, OR);
}
@@ -2021,26 +2020,26 @@ regexp (struct dfa *dfa)
static void
dfaparse (char const *s, size_t len, struct dfa *d)
{
- d->lexstate.lexptr = s;
- d->lexstate.lexleft = len;
- d->lexstate.lasttok = END;
- d->lexstate.laststart = true;
- d->lexstate.parens = 0;
+ d->lex.ptr = s;
+ d->lex.left = len;
+ d->lex.lasttok = END;
+ d->lex.laststart = true;
+ d->lex.parens = 0;
if (d->multibyte)
{
- d->lexstate.cur_mb_len = 0;
+ d->lex.cur_mb_len = 0;
memset (&d->mbs, 0, sizeof d->mbs);
}
if (!d->syntax.syntax_bits_set)
dfaerror (_("no syntax specified"));
- d->parsestate.tok = lex (d);
- d->parsestate.depth = d->depth;
+ d->parse.tok = lex (d);
+ d->parse.depth = d->depth;
regexp (d);
- if (d->parsestate.tok != END)
+ if (d->parse.tok != END)
dfaerror (_("unbalanced )"));
addtok (d, END - d->nregexps);
@@ -3990,11 +3989,9 @@ dfamust (struct dfa const *d)
bool exact = false;
bool begline = false;
bool endline = false;
- size_t rj;
bool need_begline = false;
bool need_endline = false;
bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
- struct dfamust *dm;
for (ri = 0; ri < d->tindex; ++ri)
{
@@ -4171,7 +4168,7 @@ dfamust (struct dfa const *d)
}
}
- rj = ri + 2;
+ size_t rj = ri + 2;
if (d->tokens[ri + 1] == CAT)
{
for (; rj < d->tindex - 1; rj += 2)
@@ -4200,7 +4197,7 @@ dfamust (struct dfa const *d)
}
done:;
- dm = NULL;
+ struct dfamust *dm = NULL;
if (*result)
{
dm = xmalloc (sizeof *dm);
@@ -4230,11 +4227,11 @@ dfamustfree (struct dfamust *dm)
struct dfa *
dfaalloc (void)
{
- struct dfa *d = xcalloc (1, sizeof (struct dfa));
+ struct dfa *d = xzalloc (sizeof *d);
d->multibyte = MB_CUR_MAX > 1;
d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
d->fast = !d->multibyte;
- d->lexstate.cur_mb_len = 1;
+ d->lex.cur_mb_len = 1;
return d;
}
diff --git a/doc/ChangeLog b/doc/ChangeLog
index 2dc83a60..ce21ba92 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,5 +1,11 @@
2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
+ * gawktexi.in (POSIX String Comparison): Update for new
+ spec where == and != use strcmp, rest use strcoll. Thanks to
+ Chet Ramey for pointing me at the new rules.
+
+2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
+
* 4.1.4: Release tar ball made.
2016-08-24 Arnold D. Robbins <arnold@skeeve.com>
diff --git a/doc/gawk.info b/doc/gawk.info
index fd3a5b8a..c39afa4f 100644
--- a/doc/gawk.info
+++ b/doc/gawk.info
@@ -8666,18 +8666,18 @@ Constant Regexps::, where this is discussed in more detail.

File: gawk.info, Node: POSIX String Comparison, Prev: Comparison Operators, Up: Typing and Comparison
-6.3.2.3 String Comparison with POSIX Rules
-..........................................
+6.3.2.3 String Comparison Based on Locale Collating Order
+.........................................................
-The POSIX standard says that string comparison is performed based on the
-locale's "collating order". This is the order in which characters sort,
-as defined by the locale (for more discussion, *note Locales::). This
-order is usually very different from the results obtained when doing
-straight character-by-character comparison.(1)
+The POSIX standard used to say that all string comparisons are performed
+based on the locale's "collating order". This is the order in which
+characters sort, as defined by the locale (for more discussion, *note
+Locales::). This order is usually very different from the results
+obtained when doing straight byte-by-byte comparison.(1)
Because this behavior differs considerably from existing practice,
-'gawk' only implements it when in POSIX mode (*note Options::). Here is
-an example to illustrate the difference, in an 'en_US.UTF-8' locale:
+'gawk' only implemented it when in POSIX mode (*note Options::). Here
+is an example to illustrate the difference, in an 'en_US.UTF-8' locale:
$ gawk 'BEGIN { printf("ABC < abc = %s\n",
> ("ABC" < "abc" ? "TRUE" : "FALSE")) }'
@@ -8686,11 +8686,28 @@ an example to illustrate the difference, in an 'en_US.UTF-8' locale:
> ("ABC" < "abc" ? "TRUE" : "FALSE")) }'
-| ABC < abc = FALSE
+ Fortunately, as of August 2016, comparison based on locale collating
+order is no longer required for the '==' and '!=' operators.(2)
+However, comparison based on locales is still required for '<', '<=',
+'>', and '>='. POSIX thus recommends as follows:
+
+ Since the '==' operator checks whether strings are identical, not
+ whether they collate equally, applications needing to check whether
+ strings collate equally can use:
+
+ a <= b && a >= b
+
+ As of version 4.2, 'gawk' continues to use locale collating order for
+'<', '<=', '>', and '>=' only in POSIX mode.
+
---------- Footnotes ----------
(1) Technically, string comparison is supposed to behave the same way
as if the strings were compared with the C 'strcoll()' function.
+ (2) See the Austin Group website
+(http://austingroupbugs.net/view.php?id=1070).
+

File: gawk.info, Node: Boolean Ops, Next: Conditional Exp, Prev: Typing and Comparison, Up: Truth Values and Conditions
@@ -27659,7 +27676,7 @@ ranges, such that outside the '"C"' and '"POSIX"' locales, the meaning
of range expressions was _undefined_.(3)
By using this lovely technical term, the standard gives license to
-implementors to implement ranges in whatever way they choose. The
+implementers to implement ranges in whatever way they choose. The
'gawk' maintainer chose to apply the pre-POSIX meaning both with the
default regexp matching and when '--traditional' or '--posix' are used.
In all cases 'gawk' remains POSIX-compliant.
@@ -35483,401 +35500,402 @@ Node: Variable Typing367063
Node: Comparison Operators370687
Ref: table-relational-ops371106
Node: POSIX String Comparison374601
-Ref: POSIX String Comparison-Footnote-1375675
-Node: Boolean Ops375814
-Ref: Boolean Ops-Footnote-1380296
-Node: Conditional Exp380388
-Node: Function Calls382124
-Node: Precedence386001
-Node: Locales389660
-Node: Expressions Summary391292
-Node: Patterns and Actions393865
-Node: Pattern Overview394985
-Node: Regexp Patterns396662
-Node: Expression Patterns397204
-Node: Ranges400985
-Node: BEGIN/END404093
-Node: Using BEGIN/END404854
-Ref: Using BEGIN/END-Footnote-1407590
-Node: I/O And BEGIN/END407696
-Node: BEGINFILE/ENDFILE410010
-Node: Empty412917
-Node: Using Shell Variables413234
-Node: Action Overview415508
-Node: Statements417833
-Node: If Statement419681
-Node: While Statement421176
-Node: Do Statement423204
-Node: For Statement424352
-Node: Switch Statement427510
-Node: Break Statement429896
-Node: Continue Statement431988
-Node: Next Statement433815
-Node: Nextfile Statement436198
-Node: Exit Statement438850
-Node: Built-in Variables441253
-Node: User-modified442386
-Node: Auto-set449972
-Ref: Auto-set-Footnote-1464625
-Ref: Auto-set-Footnote-2464831
-Node: ARGC and ARGV464887
-Node: Pattern Action Summary469100
-Node: Arrays471530
-Node: Array Basics472859
-Node: Array Intro473703
-Ref: figure-array-elements475678
-Ref: Array Intro-Footnote-1478382
-Node: Reference to Elements478510
-Node: Assigning Elements480974
-Node: Array Example481465
-Node: Scanning an Array483224
-Node: Controlling Scanning486246
-Ref: Controlling Scanning-Footnote-1491645
-Node: Numeric Array Subscripts491961
-Node: Uninitialized Subscripts494145
-Node: Delete495764
-Ref: Delete-Footnote-1498516
-Node: Multidimensional498573
-Node: Multiscanning501668
-Node: Arrays of Arrays503259
-Node: Arrays Summary508026
-Node: Functions510119
-Node: Built-in511157
-Node: Calling Built-in512238
-Node: Numeric Functions514234
-Ref: Numeric Functions-Footnote-1519067
-Ref: Numeric Functions-Footnote-2519424
-Ref: Numeric Functions-Footnote-3519472
-Node: String Functions519744
-Ref: String Functions-Footnote-1543248
-Ref: String Functions-Footnote-2543376
-Ref: String Functions-Footnote-3543624
-Node: Gory Details543711
-Ref: table-sub-escapes545502
-Ref: table-sub-proposed547021
-Ref: table-posix-sub548384
-Ref: table-gensub-escapes549925
-Ref: Gory Details-Footnote-1550748
-Node: I/O Functions550902
-Ref: table-system-return-values557484
-Ref: I/O Functions-Footnote-1559464
-Ref: I/O Functions-Footnote-2559612
-Node: Time Functions559732
-Ref: Time Functions-Footnote-1570237
-Ref: Time Functions-Footnote-2570305
-Ref: Time Functions-Footnote-3570463
-Ref: Time Functions-Footnote-4570574
-Ref: Time Functions-Footnote-5570686
-Ref: Time Functions-Footnote-6570913
-Node: Bitwise Functions571179
-Ref: table-bitwise-ops571773
-Ref: Bitwise Functions-Footnote-1576111
-Node: Type Functions576284
-Node: I18N Functions578945
-Node: User-defined580596
-Node: Definition Syntax581401
-Ref: Definition Syntax-Footnote-1587088
-Node: Function Example587159
-Ref: Function Example-Footnote-1590081
-Node: Function Caveats590103
-Node: Calling A Function590621
-Node: Variable Scope591579
-Node: Pass By Value/Reference594573
-Node: Return Statement598072
-Node: Dynamic Typing601051
-Node: Indirect Calls601981
-Ref: Indirect Calls-Footnote-1612232
-Node: Functions Summary612360
-Node: Library Functions615065
-Ref: Library Functions-Footnote-1618672
-Ref: Library Functions-Footnote-2618815
-Node: Library Names618986
-Ref: Library Names-Footnote-1622446
-Ref: Library Names-Footnote-2622669
-Node: General Functions622755
-Node: Strtonum Function623858
-Node: Assert Function626880
-Node: Round Function630206
-Node: Cliff Random Function631747
-Node: Ordinal Functions632763
-Ref: Ordinal Functions-Footnote-1635826
-Ref: Ordinal Functions-Footnote-2636078
-Node: Join Function636288
-Ref: Join Function-Footnote-1638058
-Node: Getlocaltime Function638258
-Node: Readfile Function642000
-Node: Shell Quoting643972
-Node: Data File Management645373
-Node: Filetrans Function646005
-Node: Rewind Function650101
-Node: File Checking652007
-Ref: File Checking-Footnote-1653341
-Node: Empty Files653542
-Node: Ignoring Assigns655521
-Node: Getopt Function657071
-Ref: Getopt Function-Footnote-1668540
-Node: Passwd Functions668740
-Ref: Passwd Functions-Footnote-1677579
-Node: Group Functions677667
-Ref: Group Functions-Footnote-1685564
-Node: Walking Arrays685771
-Node: Library Functions Summary688779
-Node: Library Exercises690185
-Node: Sample Programs690650
-Node: Running Examples691420
-Node: Clones692148
-Node: Cut Program693372
-Node: Egrep Program703301
-Ref: Egrep Program-Footnote-1710813
-Node: Id Program710923
-Node: Split Program714603
-Ref: Split Program-Footnote-1718062
-Node: Tee Program718191
-Node: Uniq Program720981
-Node: Wc Program728407
-Ref: Wc Program-Footnote-1732662
-Node: Miscellaneous Programs732756
-Node: Dupword Program733969
-Node: Alarm Program735999
-Node: Translate Program740854
-Ref: Translate Program-Footnote-1745419
-Node: Labels Program745689
-Ref: Labels Program-Footnote-1749040
-Node: Word Sorting749124
-Node: History Sorting753196
-Node: Extract Program755031
-Node: Simple Sed762560
-Node: Igawk Program765634
-Ref: Igawk Program-Footnote-1779965
-Ref: Igawk Program-Footnote-2780167
-Ref: Igawk Program-Footnote-3780289
-Node: Anagram Program780404
-Node: Signature Program783466
-Node: Programs Summary784713
-Node: Programs Exercises785927
-Ref: Programs Exercises-Footnote-1790056
-Node: Advanced Features790147
-Node: Nondecimal Data792137
-Node: Array Sorting793728
-Node: Controlling Array Traversal794428
-Ref: Controlling Array Traversal-Footnote-1802795
-Node: Array Sorting Functions802913
-Ref: Array Sorting Functions-Footnote-1808004
-Node: Two-way I/O808200
-Ref: Two-way I/O-Footnote-1814750
-Ref: Two-way I/O-Footnote-2814937
-Node: TCP/IP Networking815019
-Node: Profiling818137
-Ref: Profiling-Footnote-1826630
-Node: Advanced Features Summary826953
-Node: Internationalization828797
-Node: I18N and L10N830277
-Node: Explaining gettext830964
-Ref: Explaining gettext-Footnote-1836856
-Ref: Explaining gettext-Footnote-2837041
-Node: Programmer i18n837206
-Ref: Programmer i18n-Footnote-1842061
-Node: Translator i18n842110
-Node: String Extraction842904
-Ref: String Extraction-Footnote-1844036
-Node: Printf Ordering844122
-Ref: Printf Ordering-Footnote-1846908
-Node: I18N Portability846972
-Ref: I18N Portability-Footnote-1849428
-Node: I18N Example849491
-Ref: I18N Example-Footnote-1852297
-Node: Gawk I18N852370
-Node: I18N Summary853015
-Node: Debugger854356
-Node: Debugging855378
-Node: Debugging Concepts855819
-Node: Debugging Terms857628
-Node: Awk Debugging860203
-Node: Sample Debugging Session861109
-Node: Debugger Invocation861643
-Node: Finding The Bug863029
-Node: List of Debugger Commands869507
-Node: Breakpoint Control870840
-Node: Debugger Execution Control874534
-Node: Viewing And Changing Data877896
-Node: Execution Stack881270
-Node: Debugger Info882907
-Node: Miscellaneous Debugger Commands886978
-Node: Readline Support892066
-Node: Limitations892962
-Ref: Limitations-Footnote-1897193
-Node: Debugging Summary897244
-Node: Arbitrary Precision Arithmetic898523
-Node: Computer Arithmetic899939
-Ref: table-numeric-ranges903530
-Ref: Computer Arithmetic-Footnote-1904252
-Node: Math Definitions904309
-Ref: table-ieee-formats907623
-Ref: Math Definitions-Footnote-1908226
-Node: MPFR features908331
-Node: FP Math Caution910048
-Ref: FP Math Caution-Footnote-1911120
-Node: Inexactness of computations911489
-Node: Inexact representation912449
-Node: Comparing FP Values913809
-Node: Errors accumulate914891
-Node: Getting Accuracy916324
-Node: Try To Round919034
-Node: Setting precision919933
-Ref: table-predefined-precision-strings920630
-Node: Setting the rounding mode922460
-Ref: table-gawk-rounding-modes922834
-Ref: Setting the rounding mode-Footnote-1926242
-Node: Arbitrary Precision Integers926421
-Ref: Arbitrary Precision Integers-Footnote-1931338
-Node: POSIX Floating Point Problems931487
-Ref: POSIX Floating Point Problems-Footnote-1935369
-Node: Floating point summary935407
-Node: Dynamic Extensions937597
-Node: Extension Intro939150
-Node: Plugin License940416
-Node: Extension Mechanism Outline941213
-Ref: figure-load-extension941652
-Ref: figure-register-new-function943217
-Ref: figure-call-new-function944309
-Node: Extension API Description946371
-Node: Extension API Functions Introduction947903
-Node: General Data Types952762
-Ref: General Data Types-Footnote-1958717
-Node: Memory Allocation Functions959016
-Ref: Memory Allocation Functions-Footnote-1961861
-Node: Constructor Functions961960
-Node: Registration Functions963705
-Node: Extension Functions964390
-Node: Exit Callback Functions967013
-Node: Extension Version String968263
-Node: Input Parsers968926
-Node: Output Wrappers978808
-Node: Two-way processors983320
-Node: Printing Messages985585
-Ref: Printing Messages-Footnote-1986756
-Node: Updating ERRNO986909
-Node: Requesting Values987648
-Ref: table-value-types-returned988385
-Node: Accessing Parameters989268
-Node: Symbol Table Access990503
-Node: Symbol table by name991015
-Node: Symbol table by cookie993036
-Ref: Symbol table by cookie-Footnote-1997188
-Node: Cached values997252
-Ref: Cached values-Footnote-11000759
-Node: Array Manipulation1000850
-Ref: Array Manipulation-Footnote-11001941
-Node: Array Data Types1001978
-Ref: Array Data Types-Footnote-11004636
-Node: Array Functions1004728
-Node: Flattening Arrays1008586
-Node: Creating Arrays1015494
-Node: Redirection API1020263
-Node: Extension API Variables1023094
-Node: Extension Versioning1023727
-Ref: gawk-api-version1024164
-Node: Extension API Informational Variables1025920
-Node: Extension API Boilerplate1026984
-Node: Finding Extensions1030798
-Node: Extension Example1031357
-Node: Internal File Description1032155
-Node: Internal File Ops1036235
-Ref: Internal File Ops-Footnote-11047997
-Node: Using Internal File Ops1048137
-Ref: Using Internal File Ops-Footnote-11050520
-Node: Extension Samples1050794
-Node: Extension Sample File Functions1052323
-Node: Extension Sample Fnmatch1059972
-Node: Extension Sample Fork1061459
-Node: Extension Sample Inplace1062677
-Node: Extension Sample Ord1065887
-Node: Extension Sample Readdir1066723
-Ref: table-readdir-file-types1067612
-Node: Extension Sample Revout1068417
-Node: Extension Sample Rev2way1069006
-Node: Extension Sample Read write array1069746
-Node: Extension Sample Readfile1071688
-Node: Extension Sample Time1072783
-Node: Extension Sample API Tests1074131
-Node: gawkextlib1074623
-Node: Extension summary1077070
-Node: Extension Exercises1080772
-Node: Language History1082270
-Node: V7/SVR3.11083926
-Node: SVR41086078
-Node: POSIX1087512
-Node: BTL1088891
-Node: POSIX/GNU1089620
-Node: Feature History1095482
-Node: Common Extensions1109852
-Node: Ranges and Locales1111135
-Ref: Ranges and Locales-Footnote-11115751
-Ref: Ranges and Locales-Footnote-21115778
-Ref: Ranges and Locales-Footnote-31116013
-Node: Contributors1116234
-Node: History summary1121794
-Node: Installation1123174
-Node: Gawk Distribution1124118
-Node: Getting1124602
-Node: Extracting1125563
-Node: Distribution contents1127201
-Node: Unix Installation1133295
-Node: Quick Installation1133977
-Node: Shell Startup Files1136391
-Node: Additional Configuration Options1137469
-Node: Configuration Philosophy1139274
-Node: Non-Unix Installation1141643
-Node: PC Installation1142101
-Node: PC Binary Installation1143421
-Node: PC Compiling1145273
-Ref: PC Compiling-Footnote-11148067
-Node: PC Testing1148176
-Node: PC Using1149356
-Ref: PC Using-Footnote-11153509
-Node: Cygwin1153582
-Node: MSYS1154352
-Node: VMS Installation1154853
-Node: VMS Compilation1155644
-Ref: VMS Compilation-Footnote-11156873
-Node: VMS Dynamic Extensions1156931
-Node: VMS Installation Details1158616
-Node: VMS Running1160869
-Node: VMS GNV1165148
-Node: VMS Old Gawk1165883
-Node: Bugs1166354
-Node: Other Versions1170669
-Node: Installation summary1177253
-Node: Notes1178304
-Node: Compatibility Mode1179169
-Node: Additions1179951
-Node: Accessing The Source1180876
-Node: Adding Code1182311
-Node: New Ports1188530
-Node: Derived Files1193018
-Ref: Derived Files-Footnote-11198503
-Ref: Derived Files-Footnote-21198538
-Ref: Derived Files-Footnote-31199136
-Node: Future Extensions1199250
-Node: Implementation Limitations1199908
-Node: Extension Design1201091
-Node: Old Extension Problems1202245
-Ref: Old Extension Problems-Footnote-11203763
-Node: Extension New Mechanism Goals1203820
-Ref: Extension New Mechanism Goals-Footnote-11207184
-Node: Extension Other Design Decisions1207373
-Node: Extension Future Growth1209486
-Node: Old Extension Mechanism1210322
-Node: Notes summary1212085
-Node: Basic Concepts1213267
-Node: Basic High Level1213948
-Ref: figure-general-flow1214230
-Ref: figure-process-flow1214915
-Ref: Basic High Level-Footnote-11218216
-Node: Basic Data Typing1218401
-Node: Glossary1221729
-Node: Copying1253676
-Node: GNU Free Documentation License1291215
-Node: Index1316333
+Ref: POSIX String Comparison-Footnote-1376296
+Ref: POSIX String Comparison-Footnote-2376435
+Node: Boolean Ops376519
+Ref: Boolean Ops-Footnote-1381001
+Node: Conditional Exp381093
+Node: Function Calls382829
+Node: Precedence386706
+Node: Locales390365
+Node: Expressions Summary391997
+Node: Patterns and Actions394570
+Node: Pattern Overview395690
+Node: Regexp Patterns397367
+Node: Expression Patterns397909
+Node: Ranges401690
+Node: BEGIN/END404798
+Node: Using BEGIN/END405559
+Ref: Using BEGIN/END-Footnote-1408295
+Node: I/O And BEGIN/END408401
+Node: BEGINFILE/ENDFILE410715
+Node: Empty413622
+Node: Using Shell Variables413939
+Node: Action Overview416213
+Node: Statements418538
+Node: If Statement420386
+Node: While Statement421881
+Node: Do Statement423909
+Node: For Statement425057
+Node: Switch Statement428215
+Node: Break Statement430601
+Node: Continue Statement432693
+Node: Next Statement434520
+Node: Nextfile Statement436903
+Node: Exit Statement439555
+Node: Built-in Variables441958
+Node: User-modified443091
+Node: Auto-set450677
+Ref: Auto-set-Footnote-1465330
+Ref: Auto-set-Footnote-2465536
+Node: ARGC and ARGV465592
+Node: Pattern Action Summary469805
+Node: Arrays472235
+Node: Array Basics473564
+Node: Array Intro474408
+Ref: figure-array-elements476383
+Ref: Array Intro-Footnote-1479087
+Node: Reference to Elements479215
+Node: Assigning Elements481679
+Node: Array Example482170
+Node: Scanning an Array483929
+Node: Controlling Scanning486951
+Ref: Controlling Scanning-Footnote-1492350
+Node: Numeric Array Subscripts492666
+Node: Uninitialized Subscripts494850
+Node: Delete496469
+Ref: Delete-Footnote-1499221
+Node: Multidimensional499278
+Node: Multiscanning502373
+Node: Arrays of Arrays503964
+Node: Arrays Summary508731
+Node: Functions510824
+Node: Built-in511862
+Node: Calling Built-in512943
+Node: Numeric Functions514939
+Ref: Numeric Functions-Footnote-1519772
+Ref: Numeric Functions-Footnote-2520129
+Ref: Numeric Functions-Footnote-3520177
+Node: String Functions520449
+Ref: String Functions-Footnote-1543953
+Ref: String Functions-Footnote-2544081
+Ref: String Functions-Footnote-3544329
+Node: Gory Details544416
+Ref: table-sub-escapes546207
+Ref: table-sub-proposed547726
+Ref: table-posix-sub549089
+Ref: table-gensub-escapes550630
+Ref: Gory Details-Footnote-1551453
+Node: I/O Functions551607
+Ref: table-system-return-values558189
+Ref: I/O Functions-Footnote-1560169
+Ref: I/O Functions-Footnote-2560317
+Node: Time Functions560437
+Ref: Time Functions-Footnote-1570942
+Ref: Time Functions-Footnote-2571010
+Ref: Time Functions-Footnote-3571168
+Ref: Time Functions-Footnote-4571279
+Ref: Time Functions-Footnote-5571391
+Ref: Time Functions-Footnote-6571618
+Node: Bitwise Functions571884
+Ref: table-bitwise-ops572478
+Ref: Bitwise Functions-Footnote-1576816
+Node: Type Functions576989
+Node: I18N Functions579650
+Node: User-defined581301
+Node: Definition Syntax582106
+Ref: Definition Syntax-Footnote-1587793
+Node: Function Example587864
+Ref: Function Example-Footnote-1590786
+Node: Function Caveats590808
+Node: Calling A Function591326
+Node: Variable Scope592284
+Node: Pass By Value/Reference595278
+Node: Return Statement598777
+Node: Dynamic Typing601756
+Node: Indirect Calls602686
+Ref: Indirect Calls-Footnote-1612937
+Node: Functions Summary613065
+Node: Library Functions615770
+Ref: Library Functions-Footnote-1619377
+Ref: Library Functions-Footnote-2619520
+Node: Library Names619691
+Ref: Library Names-Footnote-1623151
+Ref: Library Names-Footnote-2623374
+Node: General Functions623460
+Node: Strtonum Function624563
+Node: Assert Function627585
+Node: Round Function630911
+Node: Cliff Random Function632452
+Node: Ordinal Functions633468
+Ref: Ordinal Functions-Footnote-1636531
+Ref: Ordinal Functions-Footnote-2636783
+Node: Join Function636993
+Ref: Join Function-Footnote-1638763
+Node: Getlocaltime Function638963
+Node: Readfile Function642705
+Node: Shell Quoting644677
+Node: Data File Management646078
+Node: Filetrans Function646710
+Node: Rewind Function650806
+Node: File Checking652712
+Ref: File Checking-Footnote-1654046
+Node: Empty Files654247
+Node: Ignoring Assigns656226
+Node: Getopt Function657776
+Ref: Getopt Function-Footnote-1669245
+Node: Passwd Functions669445
+Ref: Passwd Functions-Footnote-1678284
+Node: Group Functions678372
+Ref: Group Functions-Footnote-1686269
+Node: Walking Arrays686476
+Node: Library Functions Summary689484
+Node: Library Exercises690890
+Node: Sample Programs691355
+Node: Running Examples692125
+Node: Clones692853
+Node: Cut Program694077
+Node: Egrep Program704006
+Ref: Egrep Program-Footnote-1711518
+Node: Id Program711628
+Node: Split Program715308
+Ref: Split Program-Footnote-1718767
+Node: Tee Program718896
+Node: Uniq Program721686
+Node: Wc Program729112
+Ref: Wc Program-Footnote-1733367
+Node: Miscellaneous Programs733461
+Node: Dupword Program734674
+Node: Alarm Program736704
+Node: Translate Program741559
+Ref: Translate Program-Footnote-1746124
+Node: Labels Program746394
+Ref: Labels Program-Footnote-1749745
+Node: Word Sorting749829
+Node: History Sorting753901
+Node: Extract Program755736
+Node: Simple Sed763265
+Node: Igawk Program766339
+Ref: Igawk Program-Footnote-1780670
+Ref: Igawk Program-Footnote-2780872
+Ref: Igawk Program-Footnote-3780994
+Node: Anagram Program781109
+Node: Signature Program784171
+Node: Programs Summary785418
+Node: Programs Exercises786632
+Ref: Programs Exercises-Footnote-1790761
+Node: Advanced Features790852
+Node: Nondecimal Data792842
+Node: Array Sorting794433
+Node: Controlling Array Traversal795133
+Ref: Controlling Array Traversal-Footnote-1803500
+Node: Array Sorting Functions803618
+Ref: Array Sorting Functions-Footnote-1808709
+Node: Two-way I/O808905
+Ref: Two-way I/O-Footnote-1815455
+Ref: Two-way I/O-Footnote-2815642
+Node: TCP/IP Networking815724
+Node: Profiling818842
+Ref: Profiling-Footnote-1827335
+Node: Advanced Features Summary827658
+Node: Internationalization829502
+Node: I18N and L10N830982
+Node: Explaining gettext831669
+Ref: Explaining gettext-Footnote-1837561
+Ref: Explaining gettext-Footnote-2837746
+Node: Programmer i18n837911
+Ref: Programmer i18n-Footnote-1842766
+Node: Translator i18n842815
+Node: String Extraction843609
+Ref: String Extraction-Footnote-1844741
+Node: Printf Ordering844827
+Ref: Printf Ordering-Footnote-1847613
+Node: I18N Portability847677
+Ref: I18N Portability-Footnote-1850133
+Node: I18N Example850196
+Ref: I18N Example-Footnote-1853002
+Node: Gawk I18N853075
+Node: I18N Summary853720
+Node: Debugger855061
+Node: Debugging856083
+Node: Debugging Concepts856524
+Node: Debugging Terms858333
+Node: Awk Debugging860908
+Node: Sample Debugging Session861814
+Node: Debugger Invocation862348
+Node: Finding The Bug863734
+Node: List of Debugger Commands870212
+Node: Breakpoint Control871545
+Node: Debugger Execution Control875239
+Node: Viewing And Changing Data878601
+Node: Execution Stack881975
+Node: Debugger Info883612
+Node: Miscellaneous Debugger Commands887683
+Node: Readline Support892771
+Node: Limitations893667
+Ref: Limitations-Footnote-1897898
+Node: Debugging Summary897949
+Node: Arbitrary Precision Arithmetic899228
+Node: Computer Arithmetic900644
+Ref: table-numeric-ranges904235
+Ref: Computer Arithmetic-Footnote-1904957
+Node: Math Definitions905014
+Ref: table-ieee-formats908328
+Ref: Math Definitions-Footnote-1908931
+Node: MPFR features909036
+Node: FP Math Caution910753
+Ref: FP Math Caution-Footnote-1911825
+Node: Inexactness of computations912194
+Node: Inexact representation913154
+Node: Comparing FP Values914514
+Node: Errors accumulate915596
+Node: Getting Accuracy917029
+Node: Try To Round919739
+Node: Setting precision920638
+Ref: table-predefined-precision-strings921335
+Node: Setting the rounding mode923165
+Ref: table-gawk-rounding-modes923539
+Ref: Setting the rounding mode-Footnote-1926947
+Node: Arbitrary Precision Integers927126
+Ref: Arbitrary Precision Integers-Footnote-1932043
+Node: POSIX Floating Point Problems932192
+Ref: POSIX Floating Point Problems-Footnote-1936074
+Node: Floating point summary936112
+Node: Dynamic Extensions938302
+Node: Extension Intro939855
+Node: Plugin License941121
+Node: Extension Mechanism Outline941918
+Ref: figure-load-extension942357
+Ref: figure-register-new-function943922
+Ref: figure-call-new-function945014
+Node: Extension API Description947076
+Node: Extension API Functions Introduction948608
+Node: General Data Types953467
+Ref: General Data Types-Footnote-1959422
+Node: Memory Allocation Functions959721
+Ref: Memory Allocation Functions-Footnote-1962566
+Node: Constructor Functions962665
+Node: Registration Functions964410
+Node: Extension Functions965095
+Node: Exit Callback Functions967718
+Node: Extension Version String968968
+Node: Input Parsers969631
+Node: Output Wrappers979513
+Node: Two-way processors984025
+Node: Printing Messages986290
+Ref: Printing Messages-Footnote-1987461
+Node: Updating ERRNO987614
+Node: Requesting Values988353
+Ref: table-value-types-returned989090
+Node: Accessing Parameters989973
+Node: Symbol Table Access991208
+Node: Symbol table by name991720
+Node: Symbol table by cookie993741
+Ref: Symbol table by cookie-Footnote-1997893
+Node: Cached values997957
+Ref: Cached values-Footnote-11001464
+Node: Array Manipulation1001555
+Ref: Array Manipulation-Footnote-11002646
+Node: Array Data Types1002683
+Ref: Array Data Types-Footnote-11005341
+Node: Array Functions1005433
+Node: Flattening Arrays1009291
+Node: Creating Arrays1016199
+Node: Redirection API1020968
+Node: Extension API Variables1023799
+Node: Extension Versioning1024432
+Ref: gawk-api-version1024869
+Node: Extension API Informational Variables1026625
+Node: Extension API Boilerplate1027689
+Node: Finding Extensions1031503
+Node: Extension Example1032062
+Node: Internal File Description1032860
+Node: Internal File Ops1036940
+Ref: Internal File Ops-Footnote-11048702
+Node: Using Internal File Ops1048842
+Ref: Using Internal File Ops-Footnote-11051225
+Node: Extension Samples1051499
+Node: Extension Sample File Functions1053028
+Node: Extension Sample Fnmatch1060677
+Node: Extension Sample Fork1062164
+Node: Extension Sample Inplace1063382
+Node: Extension Sample Ord1066592
+Node: Extension Sample Readdir1067428
+Ref: table-readdir-file-types1068317
+Node: Extension Sample Revout1069122
+Node: Extension Sample Rev2way1069711
+Node: Extension Sample Read write array1070451
+Node: Extension Sample Readfile1072393
+Node: Extension Sample Time1073488
+Node: Extension Sample API Tests1074836
+Node: gawkextlib1075328
+Node: Extension summary1077775
+Node: Extension Exercises1081477
+Node: Language History1082975
+Node: V7/SVR3.11084631
+Node: SVR41086783
+Node: POSIX1088217
+Node: BTL1089596
+Node: POSIX/GNU1090325
+Node: Feature History1096187
+Node: Common Extensions1110557
+Node: Ranges and Locales1111840
+Ref: Ranges and Locales-Footnote-11116456
+Ref: Ranges and Locales-Footnote-21116483
+Ref: Ranges and Locales-Footnote-31116718
+Node: Contributors1116939
+Node: History summary1122499
+Node: Installation1123879
+Node: Gawk Distribution1124823
+Node: Getting1125307
+Node: Extracting1126268
+Node: Distribution contents1127906
+Node: Unix Installation1134000
+Node: Quick Installation1134682
+Node: Shell Startup Files1137096
+Node: Additional Configuration Options1138174
+Node: Configuration Philosophy1139979
+Node: Non-Unix Installation1142348
+Node: PC Installation1142806
+Node: PC Binary Installation1144126
+Node: PC Compiling1145978
+Ref: PC Compiling-Footnote-11148772
+Node: PC Testing1148881
+Node: PC Using1150061
+Ref: PC Using-Footnote-11154214
+Node: Cygwin1154287
+Node: MSYS1155057
+Node: VMS Installation1155558
+Node: VMS Compilation1156349
+Ref: VMS Compilation-Footnote-11157578
+Node: VMS Dynamic Extensions1157636
+Node: VMS Installation Details1159321
+Node: VMS Running1161574
+Node: VMS GNV1165853
+Node: VMS Old Gawk1166588
+Node: Bugs1167059
+Node: Other Versions1171374
+Node: Installation summary1177958
+Node: Notes1179009
+Node: Compatibility Mode1179874
+Node: Additions1180656
+Node: Accessing The Source1181581
+Node: Adding Code1183016
+Node: New Ports1189235
+Node: Derived Files1193723
+Ref: Derived Files-Footnote-11199208
+Ref: Derived Files-Footnote-21199243
+Ref: Derived Files-Footnote-31199841
+Node: Future Extensions1199955
+Node: Implementation Limitations1200613
+Node: Extension Design1201796
+Node: Old Extension Problems1202950
+Ref: Old Extension Problems-Footnote-11204468
+Node: Extension New Mechanism Goals1204525
+Ref: Extension New Mechanism Goals-Footnote-11207889
+Node: Extension Other Design Decisions1208078
+Node: Extension Future Growth1210191
+Node: Old Extension Mechanism1211027
+Node: Notes summary1212790
+Node: Basic Concepts1213972
+Node: Basic High Level1214653
+Ref: figure-general-flow1214935
+Ref: figure-process-flow1215620
+Ref: Basic High Level-Footnote-11218921
+Node: Basic Data Typing1219106
+Node: Glossary1222434
+Node: Copying1254381
+Node: GNU Free Documentation License1291920
+Node: Index1317038

End Tag Table
diff --git a/doc/gawk.texi b/doc/gawk.texi
index 91c4893e..60dfe961 100644
--- a/doc/gawk.texi
+++ b/doc/gawk.texi
@@ -12646,19 +12646,19 @@ One special place where @code{/foo/} is @emph{not} an abbreviation for
where this is discussed in more detail.
@node POSIX String Comparison
-@subsubsection String Comparison with POSIX Rules
+@subsubsection String Comparison Based on Locale Collating Order
-The POSIX standard says that string comparison is performed based
-on the locale's @dfn{collating order}. This is the order in which
-characters sort, as defined by the locale (for more discussion,
-@pxref{Locales}). This order is usually very different
-from the results obtained when doing straight character-by-character
-comparison.@footnote{Technically, string comparison is supposed
-to behave the same way as if the strings were compared with the C
-@code{strcoll()} function.}
+The POSIX standard used to say that all string comparisons are
+performed based on the locale's @dfn{collating order}. This
+is the order in which characters sort, as defined by the locale
+(for more discussion, @pxref{Locales}). This order is usually very
+different from the results obtained when doing straight byte-by-byte
+comparison.@footnote{Technically, string comparison is supposed to behave
+the same way as if the strings were compared with the C @code{strcoll()}
+function.}
Because this behavior differs considerably from existing practice,
-@command{gawk} only implements it when in POSIX mode (@pxref{Options}).
+@command{gawk} only implemented it when in POSIX mode (@pxref{Options}).
Here is an example to illustrate the difference, in an @code{en_US.UTF-8}
locale:
@@ -12671,6 +12671,26 @@ $ @kbd{gawk --posix 'BEGIN @{ printf("ABC < abc = %s\n",}
@print{} ABC < abc = FALSE
@end example
+Fortunately, as of August 2016, comparison based on locale
+collating order is no longer required for the @code{==} and @code{!=}
+operators.@footnote{See @uref{http://austingroupbugs.net/view.php?id=1070,
+the Austin Group website}.} However, comparison based on locales is still
+required for @code{<}, @code{<=}, @code{>}, and @code{>=}. POSIX thus
+recommends as follows:
+
+@quotation
+Since the @code{==} operator checks whether strings are identical,
+not whether they collate equally, applications needing to check whether
+strings collate equally can use:
+
+@example
+a <= b && a >= b
+@end example
+@end quotation
+
+As of @value{PVERSION} 4.2, @command{gawk} continues to use locale
+collating order for @code{<}, @code{<=}, @code{>}, and @code{>=} only
+in POSIX mode.
@node Boolean Ops
@subsection Boolean Expressions
@@ -37458,7 +37478,7 @@ and
@uref{http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap09.html#tag_21_09_03_05, its rationale}.}
By using this lovely technical term, the standard gives license
-to implementors to implement ranges in whatever way they choose.
+to implementers to implement ranges in whatever way they choose.
The @command{gawk} maintainer chose to apply the pre-POSIX meaning
both with the default regexp matching and when @option{--traditional} or
@option{--posix} are used.
diff --git a/doc/gawktexi.in b/doc/gawktexi.in
index 6d7eceb9..546f7611 100644
--- a/doc/gawktexi.in
+++ b/doc/gawktexi.in
@@ -11965,19 +11965,19 @@ One special place where @code{/foo/} is @emph{not} an abbreviation for
where this is discussed in more detail.
@node POSIX String Comparison
-@subsubsection String Comparison with POSIX Rules
+@subsubsection String Comparison Based on Locale Collating Order
-The POSIX standard says that string comparison is performed based
-on the locale's @dfn{collating order}. This is the order in which
-characters sort, as defined by the locale (for more discussion,
-@pxref{Locales}). This order is usually very different
-from the results obtained when doing straight character-by-character
-comparison.@footnote{Technically, string comparison is supposed
-to behave the same way as if the strings were compared with the C
-@code{strcoll()} function.}
+The POSIX standard used to say that all string comparisons are
+performed based on the locale's @dfn{collating order}. This
+is the order in which characters sort, as defined by the locale
+(for more discussion, @pxref{Locales}). This order is usually very
+different from the results obtained when doing straight byte-by-byte
+comparison.@footnote{Technically, string comparison is supposed to behave
+the same way as if the strings were compared with the C @code{strcoll()}
+function.}
Because this behavior differs considerably from existing practice,
-@command{gawk} only implements it when in POSIX mode (@pxref{Options}).
+@command{gawk} only implemented it when in POSIX mode (@pxref{Options}).
Here is an example to illustrate the difference, in an @code{en_US.UTF-8}
locale:
@@ -11990,6 +11990,26 @@ $ @kbd{gawk --posix 'BEGIN @{ printf("ABC < abc = %s\n",}
@print{} ABC < abc = FALSE
@end example
+Fortunately, as of August 2016, comparison based on locale
+collating order is no longer required for the @code{==} and @code{!=}
+operators.@footnote{See @uref{http://austingroupbugs.net/view.php?id=1070,
+the Austin Group website}.} However, comparison based on locales is still
+required for @code{<}, @code{<=}, @code{>}, and @code{>=}. POSIX thus
+recommends as follows:
+
+@quotation
+Since the @code{==} operator checks whether strings are identical,
+not whether they collate equally, applications needing to check whether
+strings collate equally can use:
+
+@example
+a <= b && a >= b
+@end example
+@end quotation
+
+As of @value{PVERSION} 4.2, @command{gawk} continues to use locale
+collating order for @code{<}, @code{<=}, @code{>}, and @code{>=} only
+in POSIX mode.
@node Boolean Ops
@subsection Boolean Expressions
@@ -36540,7 +36560,7 @@ and
@uref{http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap09.html#tag_21_09_03_05, its rationale}.}
By using this lovely technical term, the standard gives license
-to implementors to implement ranges in whatever way they choose.
+to implementers to implement ranges in whatever way they choose.
The @command{gawk} maintainer chose to apply the pre-POSIX meaning
both with the default regexp matching and when @option{--traditional} or
@option{--posix} are used.
diff --git a/eval.c b/eval.c
index 6bd854e9..bfe6b3c0 100644
--- a/eval.c
+++ b/eval.c
@@ -575,7 +575,7 @@ posix_compare(NODE *s1, NODE *s2)
/* cmp_nodes --- compare two nodes, returning negative, 0, positive */
int
-cmp_nodes(NODE *t1, NODE *t2)
+cmp_nodes(NODE *t1, NODE *t2, bool use_strcmp)
{
int ret = 0;
size_t len1, len2;
@@ -598,7 +598,7 @@ cmp_nodes(NODE *t1, NODE *t2)
if (len1 == 0 || len2 == 0)
return ldiff;
- if (do_posix)
+ if (do_posix && ! use_strcmp)
return posix_compare(t1, t2);
l = (ldiff <= 0 ? len1 : len2);
@@ -885,7 +885,7 @@ fmt_index(NODE *n)
emalloc(fmt_list, NODE **, fmt_num*sizeof(*fmt_list), "fmt_index");
n = force_string(n);
while (ix < fmt_hiwater) {
- if (cmp_nodes(fmt_list[ix], n) == 0)
+ if (cmp_nodes(fmt_list[ix], n, true) == 0)
return ix;
ix++;
}
@@ -1514,10 +1514,15 @@ eval_condition(NODE *t)
return boolval(t);
}
+typedef enum {
+ SCALAR_EQ_NEQ,
+ SCALAR_RELATIONAL
+} scalar_cmp_t;
+
/* cmp_scalars -- compare two nodes on the stack */
static inline int
-cmp_scalars()
+cmp_scalars(scalar_cmp_t comparison_type)
{
NODE *t1, *t2;
int di;
@@ -1528,7 +1533,7 @@ cmp_scalars()
DEREF(t2);
fatal(_("attempt to use array `%s' in a scalar context"), array_vname(t1));
}
- di = cmp_nodes(t1, t2);
+ di = cmp_nodes(t1, t2, comparison_type == SCALAR_EQ_NEQ);
DEREF(t1);
DEREF(t2);
return di;
diff --git a/extension/configure.ac b/extension/configure.ac
index b723a3c1..b5b27d03 100644
--- a/extension/configure.ac
+++ b/extension/configure.ac
@@ -23,7 +23,7 @@ dnl
dnl Process this file with autoconf to produce a configure script.
-AC_INIT([GNU Awk Bundled Extensions],[4.1.3],[bug-gawk@gnu.org],[gawk-extensions])
+AC_INIT([GNU Awk Bundled Extensions],[4.1.4],[bug-gawk@gnu.org],[gawk-extensions])
AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_AUX_DIR([build-aux])
diff --git a/interpret.h b/interpret.h
index 3bb4532e..5467aa87 100644
--- a/interpret.h
+++ b/interpret.h
@@ -444,37 +444,37 @@ uninitialized_scalar:
break;
case Op_equal:
- r = node_Boolean[cmp_scalars() == 0];
+ r = node_Boolean[cmp_scalars(SCALAR_EQ_NEQ) == 0];
UPREF(r);
REPLACE(r);
break;
case Op_notequal:
- r = node_Boolean[cmp_scalars() != 0];
+ r = node_Boolean[cmp_scalars(SCALAR_EQ_NEQ) != 0];
UPREF(r);
REPLACE(r);
break;
case Op_less:
- r = node_Boolean[cmp_scalars() < 0];
+ r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) < 0];
UPREF(r);
REPLACE(r);
break;
case Op_greater:
- r = node_Boolean[cmp_scalars() > 0];
+ r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) > 0];
UPREF(r);
REPLACE(r);
break;
case Op_leq:
- r = node_Boolean[cmp_scalars() <= 0];
+ r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) <= 0];
UPREF(r);
REPLACE(r);
break;
case Op_geq:
- r = node_Boolean[cmp_scalars() >= 0];
+ r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) >= 0];
UPREF(r);
REPLACE(r);
break;
@@ -832,12 +832,11 @@ mod:
t2 = TOP_SCALAR(); /* switch expression */
t2 = force_string(t2);
rp = re_update(m);
- di = (research(rp, t2->stptr, 0, t2->stlen,
- avoid_dfa(m, t2->stptr, t2->stlen)) >= 0);
+ di = (research(rp, t2->stptr, 0, t2->stlen, RE_NO_FLAGS) >= 0);
} else {
t1 = POP_SCALAR(); /* case value */
t2 = TOP_SCALAR(); /* switch expression */
- di = (cmp_nodes(t2, t1) == 0);
+ di = (cmp_nodes(t2, t1, true) == 0);
DEREF(t1);
}
@@ -998,20 +997,7 @@ arrayfor:
t1 = *get_field(0, (Func_ptr *) 0);
match_re:
rp = re_update(m);
- /*
- * Any place where research() is called with a last parameter of
- * zero, we need to use the avoid_dfa test. This appears here and
- * in the code for Op_K_case.
- *
- * A new or improved dfa that distinguishes beginning/end of
- * string from beginning/end of line will allow us to get rid of
- * this hack.
- *
- * The avoid_dfa() function is in re.c; it is not very smart.
- */
-
- di = research(rp, t1->stptr, 0, t1->stlen,
- avoid_dfa(m, t1->stptr, t1->stlen));
+ di = research(rp, t1->stptr, 0, t1->stlen, RE_NO_FLAGS);
di = (di == -1) ^ (op != Op_nomatch);
if (op != Op_match_rec) {
decr_sp();
diff --git a/re.c b/re.c
index a4a03904..f05cc467 100644
--- a/re.c
+++ b/re.c
@@ -170,7 +170,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
memset((char *) rp, 0, sizeof(*rp));
- rp->dfareg = NULL;
rp->pat.allocated = 0; /* regex will allocate the buffer */
emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
@@ -223,12 +222,11 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
/* gack. this must be done *after* re_compile_pattern */
rp->pat.newline_anchor = false; /* don't get \n in middle of string */
if (dfa && ! no_dfa) {
- rp->dfa = true;
rp->dfareg = dfaalloc();
dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n');
dfacomp(buf, len, rp->dfareg, true);
} else
- rp->dfa = false;
+ rp->dfareg = NULL;
rp->has_anchor = has_anchor;
/* Additional flags that help with RS as regexp. */
@@ -278,26 +276,25 @@ research(Regexp *rp, char *str, int start,
* starts in the middle of a string, so don't bother trying it
* in that case.
*/
- if (rp->dfa && ! no_bol && start == 0) {
- char save;
- size_t count = 0;
+ if (rp->dfareg != NULL && ! no_bol && start == 0) {
struct dfa *superset = dfasuperset(rp->dfareg);
- /*
- * dfa likes to stick a '\n' right after the matched
- * text. So we just save and restore the character.
- */
- save = str[start+len];
if (superset)
ret = dfaexec(superset, str+start, str+start+len,
true, NULL, NULL);
- if (ret)
+
+ if (ret && ((! need_start && ! rp->has_anchor)
+ || (! superset && dfaisfast(rp->dfareg))))
ret = dfaexec(rp->dfareg, str+start, str+start+len,
- true, &count, &try_backref);
- str[start+len] = save;
+ true, NULL, &try_backref);
}
if (ret) {
- if (need_start || rp->dfa == false || try_backref) {
+ if ( rp->dfareg == NULL
+ || start != 0
+ || no_bol
+ || need_start
+ || rp->has_anchor
+ || try_backref) {
/*
* Passing NULL as last arg speeds up search for cases
* where we don't need the start/end info.
@@ -326,7 +323,7 @@ refree(Regexp *rp)
free(rp->regs.start);
if (rp->regs.end)
free(rp->regs.end);
- if (rp->dfa) {
+ if (rp->dfareg != NULL) {
dfafree(rp->dfareg);
free(rp->dfareg);
}
@@ -363,7 +360,7 @@ re_update(NODE *t)
}
if (t->re_text != NULL) {
/* if contents haven't changed, just return it */
- if (cmp_nodes(t->re_text, t1) == 0)
+ if (cmp_nodes(t->re_text, t1, true) == 0)
return t->re_reg;
/* things changed, fall through to recompile */
unref(t->re_text);
@@ -429,32 +426,6 @@ resetup()
dfa_init();
}
-/* avoid_dfa --- return true if we should not use the DFA matcher */
-
-int
-avoid_dfa(NODE *re, char *str, size_t len)
-{
- char *end;
-
- /*
- * f = @/.../
- * if ("foo" ~ f) ...
- *
- * This creates a Node_dynregex with NULL re_reg.
- */
- if (re->re_reg == NULL)
- return false;
-
- if (! re->re_reg->has_anchor)
- return false;
-
- for (end = str + len; str < end; str++)
- if (*str == '\n')
- return true;
-
- return false;
-}
-
/* reisstring --- return true if the RE match is a simple string match */
int