aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 33
-rw-r--r-- awk.h 13
-rw-r--r-- debug.c 2
-rw-r--r-- dfa.c 393
-rw-r--r-- doc/ChangeLog 6
-rw-r--r-- doc/gawk.info 830
-rw-r--r-- doc/gawk.texi 42
-rw-r--r-- doc/gawktexi.in 42
-rw-r--r-- eval.c 15
-rw-r--r-- extension/configure.ac 2
-rw-r--r-- interpret.h 32
-rw-r--r-- re.c 57
12 files changed, 761 insertions, 706 deletions
diff --git a/ChangeLog b/ChangeLog
index 06c4f84d..9b29fe9d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,38 @@
2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
+ POSIX now says use strcmp for == and !=. Thanks to Chet Ramey
+ for pointing me at the change. Make it so:
+
+ * awk.h (cmp_nodes): New 3rd param indicating strcmp, not strcoll.
+ * debug.c (cmp_val): Update call to cmp_nodes.
+ * eval.c (cmp_nodes): New 3rd param indicating strcmp, not strcoll.
+ Adjust code and all callers.
+ (scalar_cmp_t): New enum type. Used in ...
+ (cmp_scalars): ... in order to call cmp_nodes correctly.
+ * interpret.h: Use the enum type in calls to cmp_scalars.
+ * re.c (re_update): Adjust call to cmp_nodes.
+
+2016-08-25 Norihiro Tanaka <noritnk@kcn.ne.jp>
+
+ * awk.h (struct Regexp): Remove dfa. Now dfareg instead of it. All
referrers changed.
+ * re.c (research): Arrange caller of dfaexec and research.
+ * (avoid_dfa): Removed. All callers changed.
+ * awk.h (avoid_dfa): Removed.
+
+ Other changes by Arnold Robbins:
+
+ * awk.h (struct Regexp): Change various boolean members to bool.
+ (RE_NO_FLAGS): New #define.
+ * interpret.h: Use RE_NO_FLAGS instead of zero.
+ * re.c (research): Prettify the logic a little bit.
+
+2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
+
+ * dfa.c: Sync with grep.
+
+2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
+
* 4.1.4: Release tar ball made.
2016-08-23 Arnold D. Robbins <arnold@skeeve.com>
diff --git a/awk.h b/awk.h
index 15336c05..c107adb7 100644
--- a/awk.h
+++ b/awk.h
@@ -206,11 +206,10 @@ typedef struct Regexp {
struct re_pattern_buffer pat;
struct re_registers regs;
struct dfa *dfareg;
- short dfa;
- short has_anchor; /* speed up of avoid_dfa kludge, temporary */
- short non_empty; /* for use in fpat_parse_field */
- short has_meta; /* re has meta chars so (probably) isn't simple string */
- short maybe_long; /* re has meta chars that can match long text */
+ bool has_anchor; /* re has anchors which dfa avoids */
+ bool non_empty; /* for use in fpat_parse_field */
+ bool has_meta; /* re has meta chars so (probably) isn't simple string */
+ bool maybe_long; /* re has meta chars that can match long text */
} Regexp;
#define RESTART(rp,s) (rp)->regs.start[0]
#define REEND(rp,s) (rp)->regs.end[0]
@@ -219,6 +218,7 @@ typedef struct Regexp {
#define NUMSUBPATS(rp,s) (rp)->regs.num_regs
/* regexp matching flags: */
+#define RE_NO_FLAGS 0 /* empty flags */
#define RE_NEED_START 1 /* need to know start/end of match */
#define RE_NO_BOL 2 /* not allowed to match ^ in regexp */
@@ -1442,7 +1442,7 @@ extern int sanitize_exit_status(int status);
extern void PUSH_CODE(INSTRUCTION *cp);
extern INSTRUCTION *POP_CODE(void);
extern void init_interpret(void);
-extern int cmp_nodes(NODE *t1, NODE *t2);
+extern int cmp_nodes(NODE *t1, NODE *t2, bool use_strcmp);
extern int cmp_awknums(const NODE *t1, const NODE *t2);
extern void set_IGNORECASE(void);
extern void set_OFS(void);
@@ -1651,7 +1651,6 @@ extern void reg_error(const char *s);
extern Regexp *re_update(NODE *t);
extern void resyntax(int syntax);
extern void resetup(void);
-extern int avoid_dfa(NODE *re, char *str, size_t len);
extern int reisstring(const char *text, size_t len, Regexp *re, const char *buf);
extern int get_numbase(const char *str, bool use_locale);
diff --git a/debug.c b/debug.c
index f4640adb..9f2d948b 100644
--- a/debug.c
+++ b/debug.c
@@ -1670,7 +1670,7 @@ cmp_val(struct list_item *w, NODE *old, NODE *new)
if (new->type == Node_var_array) /* 5 */
return true;
- return cmp_nodes(old, new); /* 4 */
+ return cmp_nodes(old, new, true); /* 4 */
}
/* watchpoint_triggered --- check if we should stop at this watchpoint;
diff --git a/dfa.c b/dfa.c
index cb11043e..85cb46ad 100644
--- a/dfa.c
+++ b/dfa.c
@@ -387,8 +387,8 @@ struct regex_syntax
meaning of the @#%!@#%^!@ syntax bits. */
struct lexer_state
{
- char const *lexptr; /* Pointer to next input character. */
- size_t lexleft; /* Number of characters remaining. */
+ char const *ptr; /* Pointer to next input character. */
+ size_t left; /* Number of characters remaining. */
token lasttok; /* Previous token returned; initially END. */
size_t parens; /* Count of outstanding left parens. */
int minrep, maxrep; /* Repeat counts for {m,n}. */
@@ -429,10 +429,10 @@ struct dfa
size_t calloc; /* Number of charclasses allocated. */
/* Scanner state */
- struct lexer_state lexstate;
+ struct lexer_state lex;
/* Parser state */
- struct parser_state parsestate;
+ struct parser_state parse;
/* Fields filled by the parser. */
token *tokens; /* Postfix parse array. */
@@ -910,7 +910,7 @@ using_simple_locale (struct dfa const *dfa)
&& '}' == 125 && '~' == 126)
};
- return (!native_c_charset || dfa->multibyte) ? false : unibyte_c;
+ return (native_c_charset & !dfa->multibyte) | unibyte_c;
}
/* Fetch the next lexical input character. Set C (of type int) to the
@@ -922,23 +922,23 @@ using_simple_locale (struct dfa const *dfa)
otherwise. */
# define FETCH_WC(dfa, c, wc, eoferr) \
do { \
- if (! dfa->lexstate.lexleft) \
+ if (! (dfa)->lex.left) \
{ \
if ((eoferr) != 0) \
dfaerror (eoferr); \
else \
- return dfa->lexstate.lasttok = END; \
+ return (dfa)->lex.lasttok = END; \
} \
else \
{ \
wint_t _wc; \
- size_t nbytes = mbs_to_wchar (&_wc, dfa->lexstate.lexptr, \
- dfa->lexstate.lexleft, dfa); \
- dfa->lexstate.cur_mb_len = nbytes; \
+ size_t nbytes = mbs_to_wchar (&_wc, (dfa)->lex.ptr, \
+ (dfa)->lex.left, dfa); \
+ (dfa)->lex.cur_mb_len = nbytes; \
(wc) = _wc; \
- (c) = nbytes == 1 ? to_uchar (*dfa->lexstate.lexptr) : EOF; \
- dfa->lexstate.lexptr += nbytes; \
- dfa->lexstate.lexleft -= nbytes; \
+ (c) = nbytes == 1 ? to_uchar ((dfa)->lex.ptr[0]) : EOF; \
+ (dfa)->lex.ptr += nbytes; \
+ (dfa)->lex.left -= nbytes; \
} \
} while (false)
@@ -1112,8 +1112,8 @@ parse_bracket_exp (struct dfa *dfa)
for (;;)
{
FETCH_WC (dfa, c, wc, _("unbalanced ["));
- if ((c == c1 && *dfa->lexstate.lexptr == ']')
- || dfa->lexstate.lexleft == 0)
+ if (dfa->lex.left == 0
+ || (c == c1 && dfa->lex.ptr[0] == ']'))
break;
if (len < MAX_BRACKET_STRING_LEN)
str[len++] = c;
@@ -1133,8 +1133,8 @@ parse_bracket_exp (struct dfa *dfa)
{
char const *class
= (dfa->syntax.case_fold && (STREQ (str, "upper")
- || STREQ (str, "lower")) ?
- "alpha" : str);
+ || STREQ (str, "lower"))
+ ? "alpha" : str);
const struct dfa_ctype *pred = find_pred (class);
if (!pred)
dfaerror (_("invalid character class"));
@@ -1174,7 +1174,7 @@ parse_bracket_exp (struct dfa *dfa)
/* A bracket expression like [a-[.aa.]] matches an unknown set.
Treat it like [-a[.aa.]] while parsing it, and
remember that the set is unknown. */
- if (c2 == '[' && *dfa->lexstate.lexptr == '.')
+ if (c2 == '[' && dfa->lex.ptr[0] == '.')
{
known_bracket_exp = false;
c2 = ']';
@@ -1184,8 +1184,8 @@ parse_bracket_exp (struct dfa *dfa)
{
/* In the case [x-], the - is an ordinary hyphen,
which is left in c1, the lookahead character. */
- dfa->lexstate.lexptr -= dfa->lexstate.cur_mb_len;
- dfa->lexstate.lexleft += dfa->lexstate.cur_mb_len;
+ dfa->lex.ptr -= dfa->lex.cur_mb_len;
+ dfa->lex.left += dfa->lex.cur_mb_len;
}
else
{
@@ -1283,19 +1283,27 @@ parse_bracket_exp (struct dfa *dfa)
return CSET + dfa_charclass_index (dfa, ccl);
}
-#define PUSH_LEX_STATE(s) \
- do \
- { \
- char const *lexptr_saved = dfa->lexstate.lexptr; \
- size_t lexleft_saved = dfa->lexstate.lexleft; \
- dfa->lexstate.lexptr = (s); \
- dfa->lexstate.lexleft = strlen (dfa->lexstate.lexptr)
+struct lexptr
+{
+ char const *ptr;
+ size_t left;
+};
+
+static void
+push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
+{
+ ls->ptr = dfa->lex.ptr;
+ ls->left = dfa->lex.left;
+ dfa->lex.ptr = s;
+ dfa->lex.left = strlen (s);
+}
-#define POP_LEX_STATE() \
- dfa->lexstate.lexptr = lexptr_saved; \
- dfa->lexstate.lexleft = lexleft_saved; \
- } \
- while (false)
+static void
+pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
+{
+ dfa->lex.ptr = ls->ptr;
+ dfa->lex.left = ls->left;
+}
static token
lex (struct dfa *dfa)
@@ -1313,14 +1321,14 @@ lex (struct dfa *dfa)
"if (backslash) ...". */
for (i = 0; i < 2; ++i)
{
- FETCH_WC (dfa, c, dfa->lexstate.wctok, NULL);
+ FETCH_WC (dfa, c, dfa->lex.wctok, NULL);
switch (c)
{
case '\\':
if (backslash)
goto normal_char;
- if (dfa->lexstate.lexleft == 0)
+ if (dfa->lex.left == 0)
dfaerror (_("unfinished \\ escape"));
backslash = true;
break;
@@ -1329,28 +1337,29 @@ lex (struct dfa *dfa)
if (backslash)
goto normal_char;
if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
- || dfa->lexstate.lasttok == END || dfa->lexstate.lasttok == LPAREN
- || dfa->lexstate.lasttok == OR)
- return dfa->lexstate.lasttok = BEGLINE;
+ || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
+ || dfa->lex.lasttok == OR)
+ return dfa->lex.lasttok = BEGLINE;
goto normal_char;
case '$':
if (backslash)
goto normal_char;
if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
- || dfa->lexstate.lexleft == 0
- || (dfa->syntax.syntax_bits & RE_NO_BK_PARENS
- ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == ')'
- : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\'
- && dfa->lexstate.lexptr[1] == ')')
- || (dfa->syntax.syntax_bits & RE_NO_BK_VBAR
- ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == '|'
- : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\'
- && dfa->lexstate.lexptr[1] == '|')
+ || dfa->lex.left == 0
+ || ((dfa->lex.left
+ > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
+ && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
+ & (dfa->lex.ptr[0] == '\\')]
+ == ')'))
+ || ((dfa->lex.left
+ > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
+ && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
+ & (dfa->lex.ptr[0] == '\\')]
+ == '|'))
|| ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
- && dfa->lexstate.lexleft > 0
- && *dfa->lexstate.lexptr == '\n'))
- return dfa->lexstate.lasttok = ENDLINE;
+ && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
+ return dfa->lex.lasttok = ENDLINE;
goto normal_char;
case '1':
@@ -1364,8 +1373,8 @@ lex (struct dfa *dfa)
case '9':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
{
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = BACKREF;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = BACKREF;
}
goto normal_char;
@@ -1373,7 +1382,7 @@ lex (struct dfa *dfa)
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
{
/* FIXME: should be beginning of string */
- return dfa->lexstate.lasttok = BEGLINE;
+ return dfa->lex.lasttok = BEGLINE;
}
goto normal_char;
@@ -1381,28 +1390,28 @@ lex (struct dfa *dfa)
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
{
/* FIXME: should be end of string */
- return dfa->lexstate.lasttok = ENDLINE;
+ return dfa->lex.lasttok = ENDLINE;
}
goto normal_char;
case '<':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lexstate.lasttok = BEGWORD;
+ return dfa->lex.lasttok = BEGWORD;
goto normal_char;
case '>':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lexstate.lasttok = ENDWORD;
+ return dfa->lex.lasttok = ENDWORD;
goto normal_char;
case 'b':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lexstate.lasttok = LIMWORD;
+ return dfa->lex.lasttok = LIMWORD;
goto normal_char;
case 'B':
if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
- return dfa->lexstate.lasttok = NOTLIMWORD;
+ return dfa->lex.lasttok = NOTLIMWORD;
goto normal_char;
case '?':
@@ -1411,17 +1420,17 @@ lex (struct dfa *dfa)
if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
- && dfa->lexstate.laststart)
+ && dfa->lex.laststart)
goto normal_char;
- return dfa->lexstate.lasttok = QMARK;
+ return dfa->lex.lasttok = QMARK;
case '*':
if (backslash)
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
- && dfa->lexstate.laststart)
+ && dfa->lex.laststart)
goto normal_char;
- return dfa->lexstate.lasttok = STAR;
+ return dfa->lex.lasttok = STAR;
case '+':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
@@ -1429,9 +1438,9 @@ lex (struct dfa *dfa)
if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
- && dfa->lexstate.laststart)
+ && dfa->lex.laststart)
goto normal_char;
- return dfa->lexstate.lasttok = PLUS;
+ return dfa->lex.lasttok = PLUS;
case '{':
if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
@@ -1439,7 +1448,7 @@ lex (struct dfa *dfa)
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
goto normal_char;
if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
- && dfa->lexstate.laststart)
+ && dfa->lex.laststart)
goto normal_char;
/* Cases:
@@ -1449,86 +1458,79 @@ lex (struct dfa *dfa)
{,} - 0 to infinity (same as '*')
{M,N} - M through N */
{
- char const *p = dfa->lexstate.lexptr;
- char const *lim = p + dfa->lexstate.lexleft;
- dfa->lexstate.minrep = dfa->lexstate.maxrep = -1;
+ char const *p = dfa->lex.ptr;
+ char const *lim = p + dfa->lex.left;
+ dfa->lex.minrep = dfa->lex.maxrep = -1;
for (; p != lim && ISASCIIDIGIT (*p); p++)
- {
- if (dfa->lexstate.minrep < 0)
- dfa->lexstate.minrep = *p - '0';
- else
- dfa->lexstate.minrep = MIN (RE_DUP_MAX + 1,
- (dfa->lexstate.minrep
- * 10 + *p - '0'));
- }
+ dfa->lex.minrep = (dfa->lex.minrep < 0
+ ? *p - '0'
+ : MIN (RE_DUP_MAX + 1,
+ dfa->lex.minrep * 10 + *p - '0'));
if (p != lim)
{
if (*p != ',')
- dfa->lexstate.maxrep = dfa->lexstate.minrep;
+ dfa->lex.maxrep = dfa->lex.minrep;
else
{
- if (dfa->lexstate.minrep < 0)
- dfa->lexstate.minrep = 0;
+ if (dfa->lex.minrep < 0)
+ dfa->lex.minrep = 0;
while (++p != lim && ISASCIIDIGIT (*p))
- {
- if (dfa->lexstate.maxrep < 0)
- dfa->lexstate.maxrep = *p - '0';
- else
- dfa->lexstate.maxrep = MIN (RE_DUP_MAX + 1,
- (dfa->lexstate.maxrep
- * 10 + *p - '0'));
- }
+ dfa->lex.maxrep
+ = (dfa->lex.maxrep < 0
+ ? *p - '0'
+ : MIN (RE_DUP_MAX + 1,
+ dfa->lex.maxrep * 10 + *p - '0'));
}
}
if (! ((! backslash || (p != lim && *p++ == '\\'))
&& p != lim && *p++ == '}'
- && 0 <= dfa->lexstate.minrep
- && (dfa->lexstate.maxrep < 0
- || dfa->lexstate.minrep <= dfa->lexstate.maxrep)))
+ && 0 <= dfa->lex.minrep
+ && (dfa->lex.maxrep < 0
+ || dfa->lex.minrep <= dfa->lex.maxrep)))
{
if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
goto normal_char;
dfaerror (_("invalid content of \\{\\}"));
}
- if (RE_DUP_MAX < dfa->lexstate.maxrep)
+ if (RE_DUP_MAX < dfa->lex.maxrep)
dfaerror (_("regular expression too big"));
- dfa->lexstate.lexptr = p;
- dfa->lexstate.lexleft = lim - p;
+ dfa->lex.ptr = p;
+ dfa->lex.left = lim - p;
}
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = REPMN;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = REPMN;
case '|':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
goto normal_char;
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
goto normal_char;
- dfa->lexstate.laststart = true;
- return dfa->lexstate.lasttok = OR;
+ dfa->lex.laststart = true;
+ return dfa->lex.lasttok = OR;
case '\n':
if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
|| backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
goto normal_char;
- dfa->lexstate.laststart = true;
- return dfa->lexstate.lasttok = OR;
+ dfa->lex.laststart = true;
+ return dfa->lex.lasttok = OR;
case '(':
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
goto normal_char;
- ++dfa->lexstate.parens;
- dfa->lexstate.laststart = true;
- return dfa->lexstate.lasttok = LPAREN;
+ dfa->lex.parens++;
+ dfa->lex.laststart = true;
+ return dfa->lex.lasttok = LPAREN;
case ')':
if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
goto normal_char;
- if (dfa->lexstate.parens == 0
+ if (dfa->lex.parens == 0
&& dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
goto normal_char;
- --dfa->lexstate.parens;
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = RPAREN;
+ dfa->lex.parens--;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = RPAREN;
case '.':
if (backslash)
@@ -1537,8 +1539,8 @@ lex (struct dfa *dfa)
{
/* In multibyte environment period must match with a single
character not a byte. So we use ANYCHAR. */
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = ANYCHAR;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = ANYCHAR;
}
zeroset (ccl);
notset (ccl);
@@ -1546,8 +1548,8 @@ lex (struct dfa *dfa)
clrbit ('\n', ccl);
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', ccl);
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
case 's':
case 'S':
@@ -1561,9 +1563,8 @@ lex (struct dfa *dfa)
setbit (c2, ccl);
if (c == 'S')
notset (ccl);
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa,
- ccl);
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1572,14 +1573,15 @@ lex (struct dfa *dfa)
/* \s and \S are documented to be equivalent to [[:space:]] and
[^[:space:]] respectively, so tell the lexer to process those
strings, each minus its "already processed" '['. */
- PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]");
-
- dfa->lexstate.lasttok = parse_bracket_exp (dfa);
-
- POP_LEX_STATE ();
+ {
+ struct lexptr ls;
+ push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
+ dfa->lex.lasttok = parse_bracket_exp (dfa);
+ pop_lex_state (dfa, &ls);
+ }
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok;
case 'w':
case 'W':
@@ -1594,9 +1596,8 @@ lex (struct dfa *dfa)
setbit (c2, ccl);
if (c == 'W')
notset (ccl);
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa,
- ccl);
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1605,38 +1606,38 @@ lex (struct dfa *dfa)
/* \w and \W are documented to be equivalent to [_[:alnum:]] and
[^_[:alnum:]] respectively, so tell the lexer to process those
strings, each minus its "already processed" '['. */
- PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]");
-
- dfa->lexstate.lasttok = parse_bracket_exp (dfa);
-
- POP_LEX_STATE ();
+ {
+ struct lexptr ls;
+ push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
+ dfa->lex.lasttok = parse_bracket_exp (dfa);
+ pop_lex_state (dfa, &ls);
+ }
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok;
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok;
case '[':
if (backslash)
goto normal_char;
- dfa->lexstate.laststart = false;
- return dfa->lexstate.lasttok = parse_bracket_exp (dfa);
+ dfa->lex.laststart = false;
+ return dfa->lex.lasttok = parse_bracket_exp (dfa);
default:
normal_char:
- dfa->lexstate.laststart = false;
+ dfa->lex.laststart = false;
/* For multibyte character sets, folding is done in atom. Always
return WCHAR. */
if (dfa->multibyte)
- return dfa->lexstate.lasttok = WCHAR;
+ return dfa->lex.lasttok = WCHAR;
if (dfa->syntax.case_fold && isalpha (c))
{
zeroset (ccl);
setbit_case_fold_c (c, ccl);
- return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa,
- ccl);
+ return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
}
- return dfa->lexstate.lasttok = c;
+ return dfa->lex.lasttok = c;
}
}
@@ -1670,21 +1671,21 @@ addtok_mb (struct dfa *dfa, token t, int mbprop)
case CAT:
case OR:
- --dfa->parsestate.depth;
+ dfa->parse.depth--;
break;
case BACKREF:
dfa->fast = false;
/* fallthrough */
default:
- ++dfa->nleaves;
+ dfa->nleaves++;
/* fallthrough */
case EMPTY:
- ++dfa->parsestate.depth;
+ dfa->parse.depth++;
break;
}
- if (dfa->parsestate.depth > dfa->depth)
- dfa->depth = dfa->parsestate.depth;
+ if (dfa->parse.depth > dfa->depth)
+ dfa->depth = dfa->parse.depth;
}
static void addtok_wc (struct dfa *dfa, wint_t wc);
@@ -1741,19 +1742,19 @@ addtok_wc (struct dfa *dfa, wint_t wc)
size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
if (stored_bytes != (size_t) -1)
- dfa->lexstate.cur_mb_len = stored_bytes;
+ dfa->lex.cur_mb_len = stored_bytes;
else
{
/* This is merely stop-gap. buf[0] is undefined, yet skipping
the addtok_mb call altogether can corrupt the heap. */
- dfa->lexstate.cur_mb_len = 1;
+ dfa->lex.cur_mb_len = 1;
buf[0] = 0;
}
- addtok_mb (dfa, buf[0], dfa->lexstate.cur_mb_len == 1 ? 3 : 1);
- for (i = 1; i < dfa->lexstate.cur_mb_len; i++)
+ addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1);
+ for (i = 1; i < dfa->lex.cur_mb_len; i++)
{
- addtok_mb (dfa, buf[i], i == dfa->lexstate.cur_mb_len - 1 ? 2 : 0);
+ addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0);
addtok (dfa, CAT);
}
}
@@ -1854,18 +1855,18 @@ add_utf8_anychar (struct dfa *dfa)
static void
atom (struct dfa *dfa)
{
- if (dfa->parsestate.tok == WCHAR)
+ if (dfa->parse.tok == WCHAR)
{
- if (dfa->lexstate.wctok == WEOF)
+ if (dfa->lex.wctok == WEOF)
addtok (dfa, BACKREF);
else
{
- addtok_wc (dfa, dfa->lexstate.wctok);
+ addtok_wc (dfa, dfa->lex.wctok);
if (dfa->syntax.case_fold)
{
wchar_t folded[CASE_FOLDED_BUFSIZE];
- unsigned int i, n = case_folded_counterparts (dfa->lexstate.wctok,
+ unsigned int i, n = case_folded_counterparts (dfa->lex.wctok,
folded);
for (i = 0; i < n; i++)
{
@@ -1875,9 +1876,9 @@ atom (struct dfa *dfa)
}
}
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
}
- else if (dfa->parsestate.tok == ANYCHAR && using_utf8)
+ else if (dfa->parse.tok == ANYCHAR && using_utf8)
{
/* For UTF-8 expand the period to a series of CSETs that define a valid
UTF-8 character. This avoids using the slow multibyte path. I'm
@@ -1887,26 +1888,25 @@ atom (struct dfa *dfa)
UTF-8: it is the most used, and the structure of the encoding
makes the correctness more obvious. */
add_utf8_anychar (dfa);
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
}
- else if ((dfa->parsestate.tok >= 0 && dfa->parsestate.tok < NOTCHAR)
- || dfa->parsestate.tok >= CSET || dfa->parsestate.tok == BACKREF
- || dfa->parsestate.tok == BEGLINE || dfa->parsestate.tok == ENDLINE
- || dfa->parsestate.tok == BEGWORD || dfa->parsestate.tok == ANYCHAR
- || dfa->parsestate.tok == MBCSET || dfa->parsestate.tok == ENDWORD
- || dfa->parsestate.tok == LIMWORD
- || dfa->parsestate.tok == NOTLIMWORD)
+ else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
+ || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF
+ || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
+ || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR
+ || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD
+ || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)
{
- addtok (dfa, dfa->parsestate.tok);
- dfa->parsestate.tok = lex (dfa);
+ addtok (dfa, dfa->parse.tok);
+ dfa->parse.tok = lex (dfa);
}
- else if (dfa->parsestate.tok == LPAREN)
+ else if (dfa->parse.tok == LPAREN)
{
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
regexp (dfa);
- if (dfa->parsestate.tok != RPAREN)
+ if (dfa->parse.tok != RPAREN)
dfaerror (_("unbalanced ("));
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
}
else
addtok (dfa, EMPTY);
@@ -1954,40 +1954,39 @@ closure (struct dfa *dfa)
size_t tindex, ntokens;
atom (dfa);
- while (dfa->parsestate.tok == QMARK || dfa->parsestate.tok == STAR
- || dfa->parsestate.tok == PLUS || dfa->parsestate.tok == REPMN)
- if (dfa->parsestate.tok == REPMN
- && (dfa->lexstate.minrep || dfa->lexstate.maxrep))
+ while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
+ || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
+ if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
{
ntokens = nsubtoks (dfa, dfa->tindex);
tindex = dfa->tindex - ntokens;
- if (dfa->lexstate.maxrep < 0)
+ if (dfa->lex.maxrep < 0)
addtok (dfa, PLUS);
- if (dfa->lexstate.minrep == 0)
+ if (dfa->lex.minrep == 0)
addtok (dfa, QMARK);
- for (i = 1; i < dfa->lexstate.minrep; ++i)
+ for (i = 1; i < dfa->lex.minrep; i++)
{
copytoks (dfa, tindex, ntokens);
addtok (dfa, CAT);
}
- for (; i < dfa->lexstate.maxrep; ++i)
+ for (; i < dfa->lex.maxrep; i++)
{
copytoks (dfa, tindex, ntokens);
addtok (dfa, QMARK);
addtok (dfa, CAT);
}
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
}
- else if (dfa->parsestate.tok == REPMN)
+ else if (dfa->parse.tok == REPMN)
{
dfa->tindex -= nsubtoks (dfa, dfa->tindex);
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
closure (dfa);
}
else
{
- addtok (dfa, dfa->parsestate.tok);
- dfa->parsestate.tok = lex (dfa);
+ addtok (dfa, dfa->parse.tok);
+ dfa->parse.tok = lex (dfa);
}
}
@@ -1995,8 +1994,8 @@ static void
branch (struct dfa* dfa)
{
closure (dfa);
- while (dfa->parsestate.tok != RPAREN && dfa->parsestate.tok != OR
- && dfa->parsestate.tok >= 0)
+ while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
+ && dfa->parse.tok >= 0)
{
closure (dfa);
addtok (dfa, CAT);
@@ -2007,9 +2006,9 @@ static void
regexp (struct dfa *dfa)
{
branch (dfa);
- while (dfa->parsestate.tok == OR)
+ while (dfa->parse.tok == OR)
{
- dfa->parsestate.tok = lex (dfa);
+ dfa->parse.tok = lex (dfa);
branch (dfa);
addtok (dfa, OR);
}
@@ -2021,26 +2020,26 @@ regexp (struct dfa *dfa)
static void
dfaparse (char const *s, size_t len, struct dfa *d)
{
- d->lexstate.lexptr = s;
- d->lexstate.lexleft = len;
- d->lexstate.lasttok = END;
- d->lexstate.laststart = true;
- d->lexstate.parens = 0;
+ d->lex.ptr = s;
+ d->lex.left = len;
+ d->lex.lasttok = END;
+ d->lex.laststart = true;
+ d->lex.parens = 0;
if (d->multibyte)
{
- d->lexstate.cur_mb_len = 0;
+ d->lex.cur_mb_len = 0;
memset (&d->mbs, 0, sizeof d->mbs);
}
if (!d->syntax.syntax_bits_set)
dfaerror (_("no syntax specified"));
- d->parsestate.tok = lex (d);
- d->parsestate.depth = d->depth;
+ d->parse.tok = lex (d);
+ d->parse.depth = d->depth;
regexp (d);
- if (d->parsestate.tok != END)
+ if (d->parse.tok != END)
dfaerror (_("unbalanced )"));
addtok (d, END - d->nregexps);
@@ -3990,11 +3989,9 @@ dfamust (struct dfa const *d)
bool exact = false;
bool begline = false;
bool endline = false;
- size_t rj;
bool need_begline = false;
bool need_endline = false;
bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
- struct dfamust *dm;
for (ri = 0; ri < d->tindex; ++ri)
{
@@ -4171,7 +4168,7 @@ dfamust (struct dfa const *d)
}
}
- rj = ri + 2;
+ size_t rj = ri + 2;
if (d->tokens[ri + 1] == CAT)
{
for (; rj < d->tindex - 1; rj += 2)
@@ -4200,7 +4197,7 @@ dfamust (struct dfa const *d)
}
done:;
- dm = NULL;
+ struct dfamust *dm = NULL;
if (*result)
{
dm = xmalloc (sizeof *dm);
@@ -4230,11 +4227,11 @@ dfamustfree (struct dfamust *dm)
struct dfa *
dfaalloc (void)
{
- struct dfa *d = xcalloc (1, sizeof (struct dfa));
+ struct dfa *d = xzalloc (sizeof *d);
d->multibyte = MB_CUR_MAX > 1;
d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
d->fast = !d->multibyte;
- d->lexstate.cur_mb_len = 1;
+ d->lex.cur_mb_len = 1;
return d;
}
diff --git a/doc/ChangeLog b/doc/ChangeLog
index e6d9bf05..efa2b561 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,5 +1,11 @@
2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
+ * gawktexi.in (POSIX String Comparison): Update for new
+ spec where == and != use strcmp, rest use strcoll. Thanks to
+ Chet Ramey for pointing me at the new rules.
+
+2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
+
* 4.1.4: Release tar ball made.
2016-08-24 Arnold D. Robbins <arnold@skeeve.com>
diff --git a/doc/gawk.info b/doc/gawk.info
index 1766ab94..973af87c 100644
--- a/doc/gawk.info
+++ b/doc/gawk.info
@@ -8615,18 +8615,18 @@ Constant Regexps::, where this is discussed in more detail.

File: gawk.info, Node: POSIX String Comparison, Prev: Comparison Operators, Up: Typing and Comparison
-6.3.2.3 String Comparison with POSIX Rules
-..........................................
+6.3.2.3 String Comparison Based on Locale Collating Order
+.........................................................
-The POSIX standard says that string comparison is performed based on the
-locale's "collating order". This is the order in which characters sort,
-as defined by the locale (for more discussion, *note Locales::). This
-order is usually very different from the results obtained when doing
-straight character-by-character comparison.(1)
+The POSIX standard used to say that all string comparisons are performed
+based on the locale's "collating order". This is the order in which
+characters sort, as defined by the locale (for more discussion, *note
+Locales::). This order is usually very different from the results
+obtained when doing straight byte-by-byte comparison.(1)
Because this behavior differs considerably from existing practice,
-'gawk' only implements it when in POSIX mode (*note Options::). Here is
-an example to illustrate the difference, in an 'en_US.UTF-8' locale:
+'gawk' only implemented it when in POSIX mode (*note Options::). Here
+is an example to illustrate the difference, in an 'en_US.UTF-8' locale:
$ gawk 'BEGIN { printf("ABC < abc = %s\n",
> ("ABC" < "abc" ? "TRUE" : "FALSE")) }'
@@ -8635,11 +8635,28 @@ an example to illustrate the difference, in an 'en_US.UTF-8' locale:
> ("ABC" < "abc" ? "TRUE" : "FALSE")) }'
-| ABC < abc = FALSE
+ Fortunately, as of August 2016, comparison based on locale collating
+order is no longer required for the '==' and '!=' operators.(2)
+However, comparison based on locales is still required for '<', '<=',
+'>', and '>='. POSIX thus recommends as follows:
+
+ Since the '==' operator checks whether strings are identical, not
+ whether they collate equally, applications needing to check whether
+ strings collate equally can use:
+
+ a <= b && a >= b
+
+ As of version 4.2, 'gawk' continues to use locale collating order for
+'<', '<=', '>', and '>=' only in POSIX mode.
+
---------- Footnotes ----------
(1) Technically, string comparison is supposed to behave the same way
as if the strings were compared with the C 'strcoll()' function.
+ (2) See the Austin Group website
+(http://austingroupbugs.net/view.php?id=1070).
+

File: gawk.info, Node: Boolean Ops, Next: Conditional Exp, Prev: Typing and Comparison, Up: Truth Values and Conditions
@@ -27603,7 +27620,7 @@ ranges, such that outside the '"C"' and '"POSIX"' locales, the meaning
of range expressions was _undefined_.(3)
By using this lovely technical term, the standard gives license to
-implementors to implement ranges in whatever way they choose. The
+implementers to implement ranges in whatever way they choose. The
'gawk' maintainer chose to apply the pre-POSIX meaning both with the
default regexp matching and when '--traditional' or '--posix' are used.
In all cases 'gawk' remains POSIX-compliant.
@@ -35427,401 +35444,402 @@ Node: Variable Typing364779
Node: Comparison Operators368403
Ref: table-relational-ops368822
Node: POSIX String Comparison372317
-Ref: POSIX String Comparison-Footnote-1373391
-Node: Boolean Ops373530
-Ref: Boolean Ops-Footnote-1378012
-Node: Conditional Exp378104
-Node: Function Calls379840
-Node: Precedence383717
-Node: Locales387376
-Node: Expressions Summary389008
-Node: Patterns and Actions391581
-Node: Pattern Overview392701
-Node: Regexp Patterns394378
-Node: Expression Patterns394920
-Node: Ranges398701
-Node: BEGIN/END401809
-Node: Using BEGIN/END402570
-Ref: Using BEGIN/END-Footnote-1405306
-Node: I/O And BEGIN/END405412
-Node: BEGINFILE/ENDFILE407726
-Node: Empty410633
-Node: Using Shell Variables410950
-Node: Action Overview413224
-Node: Statements415549
-Node: If Statement417397
-Node: While Statement418892
-Node: Do Statement420920
-Node: For Statement422068
-Node: Switch Statement425226
-Node: Break Statement427612
-Node: Continue Statement429704
-Node: Next Statement431531
-Node: Nextfile Statement433914
-Node: Exit Statement436566
-Node: Built-in Variables438969
-Node: User-modified440102
-Node: Auto-set447688
-Ref: Auto-set-Footnote-1462341
-Ref: Auto-set-Footnote-2462547
-Node: ARGC and ARGV462603
-Node: Pattern Action Summary466816
-Node: Arrays469246
-Node: Array Basics470575
-Node: Array Intro471419
-Ref: figure-array-elements473394
-Ref: Array Intro-Footnote-1476098
-Node: Reference to Elements476226
-Node: Assigning Elements478690
-Node: Array Example479181
-Node: Scanning an Array480940
-Node: Controlling Scanning483962
-Ref: Controlling Scanning-Footnote-1489361
-Node: Numeric Array Subscripts489677
-Node: Uninitialized Subscripts491861
-Node: Delete493480
-Ref: Delete-Footnote-1496232
-Node: Multidimensional496289
-Node: Multiscanning499384
-Node: Arrays of Arrays500975
-Node: Arrays Summary505742
-Node: Functions507835
-Node: Built-in508873
-Node: Calling Built-in509954
-Node: Numeric Functions511950
-Ref: Numeric Functions-Footnote-1516783
-Ref: Numeric Functions-Footnote-2517140
-Ref: Numeric Functions-Footnote-3517188
-Node: String Functions517460
-Ref: String Functions-Footnote-1540964
-Ref: String Functions-Footnote-2541092
-Ref: String Functions-Footnote-3541340
-Node: Gory Details541427
-Ref: table-sub-escapes543218
-Ref: table-sub-proposed544737
-Ref: table-posix-sub546100
-Ref: table-gensub-escapes547641
-Ref: Gory Details-Footnote-1548464
-Node: I/O Functions548618
-Ref: table-system-return-values555200
-Ref: I/O Functions-Footnote-1557180
-Ref: I/O Functions-Footnote-2557328
-Node: Time Functions557448
-Ref: Time Functions-Footnote-1567953
-Ref: Time Functions-Footnote-2568021
-Ref: Time Functions-Footnote-3568179
-Ref: Time Functions-Footnote-4568290
-Ref: Time Functions-Footnote-5568402
-Ref: Time Functions-Footnote-6568629
-Node: Bitwise Functions568895
-Ref: table-bitwise-ops569489
-Ref: Bitwise Functions-Footnote-1573827
-Node: Type Functions574000
-Node: I18N Functions576532
-Node: User-defined578183
-Node: Definition Syntax578988
-Ref: Definition Syntax-Footnote-1584675
-Node: Function Example584746
-Ref: Function Example-Footnote-1587668
-Node: Function Caveats587690
-Node: Calling A Function588208
-Node: Variable Scope589166
-Node: Pass By Value/Reference592160
-Node: Return Statement595659
-Node: Dynamic Typing598638
-Node: Indirect Calls599568
-Ref: Indirect Calls-Footnote-1609819
-Node: Functions Summary609947
-Node: Library Functions612652
-Ref: Library Functions-Footnote-1616259
-Ref: Library Functions-Footnote-2616402
-Node: Library Names616573
-Ref: Library Names-Footnote-1620033
-Ref: Library Names-Footnote-2620256
-Node: General Functions620342
-Node: Strtonum Function621445
-Node: Assert Function624467
-Node: Round Function627793
-Node: Cliff Random Function629334
-Node: Ordinal Functions630350
-Ref: Ordinal Functions-Footnote-1633413
-Ref: Ordinal Functions-Footnote-2633665
-Node: Join Function633875
-Ref: Join Function-Footnote-1635645
-Node: Getlocaltime Function635845
-Node: Readfile Function639587
-Node: Shell Quoting641559
-Node: Data File Management642960
-Node: Filetrans Function643592
-Node: Rewind Function647688
-Node: File Checking649594
-Ref: File Checking-Footnote-1650928
-Node: Empty Files651129
-Node: Ignoring Assigns653108
-Node: Getopt Function654658
-Ref: Getopt Function-Footnote-1666127
-Node: Passwd Functions666327
-Ref: Passwd Functions-Footnote-1675166
-Node: Group Functions675254
-Ref: Group Functions-Footnote-1683151
-Node: Walking Arrays683358
-Node: Library Functions Summary686366
-Node: Library Exercises687772
-Node: Sample Programs688237
-Node: Running Examples689007
-Node: Clones689735
-Node: Cut Program690959
-Node: Egrep Program700888
-Ref: Egrep Program-Footnote-1708400
-Node: Id Program708510
-Node: Split Program712190
-Ref: Split Program-Footnote-1715649
-Node: Tee Program715778
-Node: Uniq Program718568
-Node: Wc Program725994
-Ref: Wc Program-Footnote-1730249
-Node: Miscellaneous Programs730343
-Node: Dupword Program731556
-Node: Alarm Program733586
-Node: Translate Program738441
-Ref: Translate Program-Footnote-1743006
-Node: Labels Program743276
-Ref: Labels Program-Footnote-1746627
-Node: Word Sorting746711
-Node: History Sorting750783
-Node: Extract Program752618
-Node: Simple Sed760147
-Node: Igawk Program763221
-Ref: Igawk Program-Footnote-1777552
-Ref: Igawk Program-Footnote-2777754
-Ref: Igawk Program-Footnote-3777876
-Node: Anagram Program777991
-Node: Signature Program781053
-Node: Programs Summary782300
-Node: Programs Exercises783514
-Ref: Programs Exercises-Footnote-1787643
-Node: Advanced Features787734
-Node: Nondecimal Data789724
-Node: Array Sorting791315
-Node: Controlling Array Traversal792015
-Ref: Controlling Array Traversal-Footnote-1800382
-Node: Array Sorting Functions800500
-Ref: Array Sorting Functions-Footnote-1805591
-Node: Two-way I/O805787
-Ref: Two-way I/O-Footnote-1812337
-Ref: Two-way I/O-Footnote-2812524
-Node: TCP/IP Networking812606
-Node: Profiling815724
-Ref: Profiling-Footnote-1824217
-Node: Advanced Features Summary824540
-Node: Internationalization826384
-Node: I18N and L10N827864
-Node: Explaining gettext828551
-Ref: Explaining gettext-Footnote-1834443
-Ref: Explaining gettext-Footnote-2834628
-Node: Programmer i18n834793
-Ref: Programmer i18n-Footnote-1839648
-Node: Translator i18n839697
-Node: String Extraction840491
-Ref: String Extraction-Footnote-1841623
-Node: Printf Ordering841709
-Ref: Printf Ordering-Footnote-1844495
-Node: I18N Portability844559
-Ref: I18N Portability-Footnote-1847015
-Node: I18N Example847078
-Ref: I18N Example-Footnote-1849884
-Node: Gawk I18N849957
-Node: I18N Summary850602
-Node: Debugger851943
-Node: Debugging852965
-Node: Debugging Concepts853406
-Node: Debugging Terms855215
-Node: Awk Debugging857790
-Node: Sample Debugging Session858696
-Node: Debugger Invocation859230
-Node: Finding The Bug860616
-Node: List of Debugger Commands867094
-Node: Breakpoint Control868427
-Node: Debugger Execution Control872121
-Node: Viewing And Changing Data875483
-Node: Execution Stack878857
-Node: Debugger Info880494
-Node: Miscellaneous Debugger Commands884565
-Node: Readline Support889653
-Node: Limitations890549
-Ref: Limitations-Footnote-1894780
-Node: Debugging Summary894831
-Node: Arbitrary Precision Arithmetic896110
-Node: Computer Arithmetic897526
-Ref: table-numeric-ranges901117
-Ref: Computer Arithmetic-Footnote-1901839
-Node: Math Definitions901896
-Ref: table-ieee-formats905210
-Ref: Math Definitions-Footnote-1905813
-Node: MPFR features905918
-Node: FP Math Caution907635
-Ref: FP Math Caution-Footnote-1908707
-Node: Inexactness of computations909076
-Node: Inexact representation910036
-Node: Comparing FP Values911396
-Node: Errors accumulate912478
-Node: Getting Accuracy913911
-Node: Try To Round916621
-Node: Setting precision917520
-Ref: table-predefined-precision-strings918217
-Node: Setting the rounding mode920047
-Ref: table-gawk-rounding-modes920421
-Ref: Setting the rounding mode-Footnote-1923829
-Node: Arbitrary Precision Integers924008
-Ref: Arbitrary Precision Integers-Footnote-1928925
-Node: POSIX Floating Point Problems929074
-Ref: POSIX Floating Point Problems-Footnote-1932956
-Node: Floating point summary932994
-Node: Dynamic Extensions935184
-Node: Extension Intro936737
-Node: Plugin License938003
-Node: Extension Mechanism Outline938800
-Ref: figure-load-extension939239
-Ref: figure-register-new-function940804
-Ref: figure-call-new-function941896
-Node: Extension API Description943958
-Node: Extension API Functions Introduction945490
-Node: General Data Types950349
-Ref: General Data Types-Footnote-1956304
-Node: Memory Allocation Functions956603
-Ref: Memory Allocation Functions-Footnote-1959448
-Node: Constructor Functions959547
-Node: Registration Functions961292
-Node: Extension Functions961977
-Node: Exit Callback Functions964600
-Node: Extension Version String965850
-Node: Input Parsers966513
-Node: Output Wrappers976395
-Node: Two-way processors980907
-Node: Printing Messages983172
-Ref: Printing Messages-Footnote-1984343
-Node: Updating ERRNO984496
-Node: Requesting Values985235
-Ref: table-value-types-returned985972
-Node: Accessing Parameters986855
-Node: Symbol Table Access988090
-Node: Symbol table by name988602
-Node: Symbol table by cookie990623
-Ref: Symbol table by cookie-Footnote-1994775
-Node: Cached values994839
-Ref: Cached values-Footnote-1998346
-Node: Array Manipulation998437
-Ref: Array Manipulation-Footnote-1999528
-Node: Array Data Types999565
-Ref: Array Data Types-Footnote-11002223
-Node: Array Functions1002315
-Node: Flattening Arrays1006173
-Node: Creating Arrays1013081
-Node: Redirection API1017850
-Node: Extension API Variables1020681
-Node: Extension Versioning1021314
-Ref: gawk-api-version1021751
-Node: Extension API Informational Variables1023507
-Node: Extension API Boilerplate1024571
-Node: Finding Extensions1028385
-Node: Extension Example1028944
-Node: Internal File Description1029742
-Node: Internal File Ops1033822
-Ref: Internal File Ops-Footnote-11045584
-Node: Using Internal File Ops1045724
-Ref: Using Internal File Ops-Footnote-11048107
-Node: Extension Samples1048381
-Node: Extension Sample File Functions1049910
-Node: Extension Sample Fnmatch1057559
-Node: Extension Sample Fork1059046
-Node: Extension Sample Inplace1060264
-Node: Extension Sample Ord1063474
-Node: Extension Sample Readdir1064310
-Ref: table-readdir-file-types1065199
-Node: Extension Sample Revout1066004
-Node: Extension Sample Rev2way1066593
-Node: Extension Sample Read write array1067333
-Node: Extension Sample Readfile1069275
-Node: Extension Sample Time1070370
-Node: Extension Sample API Tests1071718
-Node: gawkextlib1072210
-Node: Extension summary1074657
-Node: Extension Exercises1078359
-Node: Language History1079857
-Node: V7/SVR3.11081513
-Node: SVR41083665
-Node: POSIX1085099
-Node: BTL1086478
-Node: POSIX/GNU1087207
-Node: Feature History1093069
-Node: Common Extensions1107439
-Node: Ranges and Locales1108722
-Ref: Ranges and Locales-Footnote-11113338
-Ref: Ranges and Locales-Footnote-21113365
-Ref: Ranges and Locales-Footnote-31113600
-Node: Contributors1113821
-Node: History summary1119381
-Node: Installation1120761
-Node: Gawk Distribution1121705
-Node: Getting1122189
-Node: Extracting1123150
-Node: Distribution contents1124788
-Node: Unix Installation1130882
-Node: Quick Installation1131564
-Node: Shell Startup Files1133978
-Node: Additional Configuration Options1135056
-Node: Configuration Philosophy1136861
-Node: Non-Unix Installation1139230
-Node: PC Installation1139688
-Node: PC Binary Installation1141008
-Node: PC Compiling1142860
-Ref: PC Compiling-Footnote-11145654
-Node: PC Testing1145763
-Node: PC Using1146943
-Ref: PC Using-Footnote-11151096
-Node: Cygwin1151169
-Node: MSYS1151939
-Node: VMS Installation1152440
-Node: VMS Compilation1153231
-Ref: VMS Compilation-Footnote-11154460
-Node: VMS Dynamic Extensions1154518
-Node: VMS Installation Details1156203
-Node: VMS Running1158456
-Node: VMS GNV1162735
-Node: VMS Old Gawk1163470
-Node: Bugs1163941
-Node: Other Versions1168256
-Node: Installation summary1174840
-Node: Notes1175891
-Node: Compatibility Mode1176756
-Node: Additions1177538
-Node: Accessing The Source1178463
-Node: Adding Code1179898
-Node: New Ports1186117
-Node: Derived Files1190605
-Ref: Derived Files-Footnote-11196090
-Ref: Derived Files-Footnote-21196125
-Ref: Derived Files-Footnote-31196723
-Node: Future Extensions1196837
-Node: Implementation Limitations1197495
-Node: Extension Design1198678
-Node: Old Extension Problems1199832
-Ref: Old Extension Problems-Footnote-11201350
-Node: Extension New Mechanism Goals1201407
-Ref: Extension New Mechanism Goals-Footnote-11204771
-Node: Extension Other Design Decisions1204960
-Node: Extension Future Growth1207073
-Node: Old Extension Mechanism1207909
-Node: Notes summary1209672
-Node: Basic Concepts1210854
-Node: Basic High Level1211535
-Ref: figure-general-flow1211817
-Ref: figure-process-flow1212502
-Ref: Basic High Level-Footnote-11215803
-Node: Basic Data Typing1215988
-Node: Glossary1219316
-Node: Copying1251263
-Node: GNU Free Documentation License1288802
-Node: Index1313920
+Ref: POSIX String Comparison-Footnote-1374012
+Ref: POSIX String Comparison-Footnote-2374151
+Node: Boolean Ops374235
+Ref: Boolean Ops-Footnote-1378717
+Node: Conditional Exp378809
+Node: Function Calls380545
+Node: Precedence384422
+Node: Locales388081
+Node: Expressions Summary389713
+Node: Patterns and Actions392286
+Node: Pattern Overview393406
+Node: Regexp Patterns395083
+Node: Expression Patterns395625
+Node: Ranges399406
+Node: BEGIN/END402514
+Node: Using BEGIN/END403275
+Ref: Using BEGIN/END-Footnote-1406011
+Node: I/O And BEGIN/END406117
+Node: BEGINFILE/ENDFILE408431
+Node: Empty411338
+Node: Using Shell Variables411655
+Node: Action Overview413929
+Node: Statements416254
+Node: If Statement418102
+Node: While Statement419597
+Node: Do Statement421625
+Node: For Statement422773
+Node: Switch Statement425931
+Node: Break Statement428317
+Node: Continue Statement430409
+Node: Next Statement432236
+Node: Nextfile Statement434619
+Node: Exit Statement437271
+Node: Built-in Variables439674
+Node: User-modified440807
+Node: Auto-set448393
+Ref: Auto-set-Footnote-1463046
+Ref: Auto-set-Footnote-2463252
+Node: ARGC and ARGV463308
+Node: Pattern Action Summary467521
+Node: Arrays469951
+Node: Array Basics471280
+Node: Array Intro472124
+Ref: figure-array-elements474099
+Ref: Array Intro-Footnote-1476803
+Node: Reference to Elements476931
+Node: Assigning Elements479395
+Node: Array Example479886
+Node: Scanning an Array481645
+Node: Controlling Scanning484667
+Ref: Controlling Scanning-Footnote-1490066
+Node: Numeric Array Subscripts490382
+Node: Uninitialized Subscripts492566
+Node: Delete494185
+Ref: Delete-Footnote-1496937
+Node: Multidimensional496994
+Node: Multiscanning500089
+Node: Arrays of Arrays501680
+Node: Arrays Summary506447
+Node: Functions508540
+Node: Built-in509578
+Node: Calling Built-in510659
+Node: Numeric Functions512655
+Ref: Numeric Functions-Footnote-1517488
+Ref: Numeric Functions-Footnote-2517845
+Ref: Numeric Functions-Footnote-3517893
+Node: String Functions518165
+Ref: String Functions-Footnote-1541669
+Ref: String Functions-Footnote-2541797
+Ref: String Functions-Footnote-3542045
+Node: Gory Details542132
+Ref: table-sub-escapes543923
+Ref: table-sub-proposed545442
+Ref: table-posix-sub546805
+Ref: table-gensub-escapes548346
+Ref: Gory Details-Footnote-1549169
+Node: I/O Functions549323
+Ref: table-system-return-values555905
+Ref: I/O Functions-Footnote-1557885
+Ref: I/O Functions-Footnote-2558033
+Node: Time Functions558153
+Ref: Time Functions-Footnote-1568658
+Ref: Time Functions-Footnote-2568726
+Ref: Time Functions-Footnote-3568884
+Ref: Time Functions-Footnote-4568995
+Ref: Time Functions-Footnote-5569107
+Ref: Time Functions-Footnote-6569334
+Node: Bitwise Functions569600
+Ref: table-bitwise-ops570194
+Ref: Bitwise Functions-Footnote-1574532
+Node: Type Functions574705
+Node: I18N Functions577237
+Node: User-defined578888
+Node: Definition Syntax579693
+Ref: Definition Syntax-Footnote-1585380
+Node: Function Example585451
+Ref: Function Example-Footnote-1588373
+Node: Function Caveats588395
+Node: Calling A Function588913
+Node: Variable Scope589871
+Node: Pass By Value/Reference592865
+Node: Return Statement596364
+Node: Dynamic Typing599343
+Node: Indirect Calls600273
+Ref: Indirect Calls-Footnote-1610524
+Node: Functions Summary610652
+Node: Library Functions613357
+Ref: Library Functions-Footnote-1616964
+Ref: Library Functions-Footnote-2617107
+Node: Library Names617278
+Ref: Library Names-Footnote-1620738
+Ref: Library Names-Footnote-2620961
+Node: General Functions621047
+Node: Strtonum Function622150
+Node: Assert Function625172
+Node: Round Function628498
+Node: Cliff Random Function630039
+Node: Ordinal Functions631055
+Ref: Ordinal Functions-Footnote-1634118
+Ref: Ordinal Functions-Footnote-2634370
+Node: Join Function634580
+Ref: Join Function-Footnote-1636350
+Node: Getlocaltime Function636550
+Node: Readfile Function640292
+Node: Shell Quoting642264
+Node: Data File Management643665
+Node: Filetrans Function644297
+Node: Rewind Function648393
+Node: File Checking650299
+Ref: File Checking-Footnote-1651633
+Node: Empty Files651834
+Node: Ignoring Assigns653813
+Node: Getopt Function655363
+Ref: Getopt Function-Footnote-1666832
+Node: Passwd Functions667032
+Ref: Passwd Functions-Footnote-1675871
+Node: Group Functions675959
+Ref: Group Functions-Footnote-1683856
+Node: Walking Arrays684063
+Node: Library Functions Summary687071
+Node: Library Exercises688477
+Node: Sample Programs688942
+Node: Running Examples689712
+Node: Clones690440
+Node: Cut Program691664
+Node: Egrep Program701593
+Ref: Egrep Program-Footnote-1709105
+Node: Id Program709215
+Node: Split Program712895
+Ref: Split Program-Footnote-1716354
+Node: Tee Program716483
+Node: Uniq Program719273
+Node: Wc Program726699
+Ref: Wc Program-Footnote-1730954
+Node: Miscellaneous Programs731048
+Node: Dupword Program732261
+Node: Alarm Program734291
+Node: Translate Program739146
+Ref: Translate Program-Footnote-1743711
+Node: Labels Program743981
+Ref: Labels Program-Footnote-1747332
+Node: Word Sorting747416
+Node: History Sorting751488
+Node: Extract Program753323
+Node: Simple Sed760852
+Node: Igawk Program763926
+Ref: Igawk Program-Footnote-1778257
+Ref: Igawk Program-Footnote-2778459
+Ref: Igawk Program-Footnote-3778581
+Node: Anagram Program778696
+Node: Signature Program781758
+Node: Programs Summary783005
+Node: Programs Exercises784219
+Ref: Programs Exercises-Footnote-1788348
+Node: Advanced Features788439
+Node: Nondecimal Data790429
+Node: Array Sorting792020
+Node: Controlling Array Traversal792720
+Ref: Controlling Array Traversal-Footnote-1801087
+Node: Array Sorting Functions801205
+Ref: Array Sorting Functions-Footnote-1806296
+Node: Two-way I/O806492
+Ref: Two-way I/O-Footnote-1813042
+Ref: Two-way I/O-Footnote-2813229
+Node: TCP/IP Networking813311
+Node: Profiling816429
+Ref: Profiling-Footnote-1824922
+Node: Advanced Features Summary825245
+Node: Internationalization827089
+Node: I18N and L10N828569
+Node: Explaining gettext829256
+Ref: Explaining gettext-Footnote-1835148
+Ref: Explaining gettext-Footnote-2835333
+Node: Programmer i18n835498
+Ref: Programmer i18n-Footnote-1840353
+Node: Translator i18n840402
+Node: String Extraction841196
+Ref: String Extraction-Footnote-1842328
+Node: Printf Ordering842414
+Ref: Printf Ordering-Footnote-1845200
+Node: I18N Portability845264
+Ref: I18N Portability-Footnote-1847720
+Node: I18N Example847783
+Ref: I18N Example-Footnote-1850589
+Node: Gawk I18N850662
+Node: I18N Summary851307
+Node: Debugger852648
+Node: Debugging853670
+Node: Debugging Concepts854111
+Node: Debugging Terms855920
+Node: Awk Debugging858495
+Node: Sample Debugging Session859401
+Node: Debugger Invocation859935
+Node: Finding The Bug861321
+Node: List of Debugger Commands867799
+Node: Breakpoint Control869132
+Node: Debugger Execution Control872826
+Node: Viewing And Changing Data876188
+Node: Execution Stack879562
+Node: Debugger Info881199
+Node: Miscellaneous Debugger Commands885270
+Node: Readline Support890358
+Node: Limitations891254
+Ref: Limitations-Footnote-1895485
+Node: Debugging Summary895536
+Node: Arbitrary Precision Arithmetic896815
+Node: Computer Arithmetic898231
+Ref: table-numeric-ranges901822
+Ref: Computer Arithmetic-Footnote-1902544
+Node: Math Definitions902601
+Ref: table-ieee-formats905915
+Ref: Math Definitions-Footnote-1906518
+Node: MPFR features906623
+Node: FP Math Caution908340
+Ref: FP Math Caution-Footnote-1909412
+Node: Inexactness of computations909781
+Node: Inexact representation910741
+Node: Comparing FP Values912101
+Node: Errors accumulate913183
+Node: Getting Accuracy914616
+Node: Try To Round917326
+Node: Setting precision918225
+Ref: table-predefined-precision-strings918922
+Node: Setting the rounding mode920752
+Ref: table-gawk-rounding-modes921126
+Ref: Setting the rounding mode-Footnote-1924534
+Node: Arbitrary Precision Integers924713
+Ref: Arbitrary Precision Integers-Footnote-1929630
+Node: POSIX Floating Point Problems929779
+Ref: POSIX Floating Point Problems-Footnote-1933661
+Node: Floating point summary933699
+Node: Dynamic Extensions935889
+Node: Extension Intro937442
+Node: Plugin License938708
+Node: Extension Mechanism Outline939505
+Ref: figure-load-extension939944
+Ref: figure-register-new-function941509
+Ref: figure-call-new-function942601
+Node: Extension API Description944663
+Node: Extension API Functions Introduction946195
+Node: General Data Types951054
+Ref: General Data Types-Footnote-1957009
+Node: Memory Allocation Functions957308
+Ref: Memory Allocation Functions-Footnote-1960153
+Node: Constructor Functions960252
+Node: Registration Functions961997
+Node: Extension Functions962682
+Node: Exit Callback Functions965305
+Node: Extension Version String966555
+Node: Input Parsers967218
+Node: Output Wrappers977100
+Node: Two-way processors981612
+Node: Printing Messages983877
+Ref: Printing Messages-Footnote-1985048
+Node: Updating ERRNO985201
+Node: Requesting Values985940
+Ref: table-value-types-returned986677
+Node: Accessing Parameters987560
+Node: Symbol Table Access988795
+Node: Symbol table by name989307
+Node: Symbol table by cookie991328
+Ref: Symbol table by cookie-Footnote-1995480
+Node: Cached values995544
+Ref: Cached values-Footnote-1999051
+Node: Array Manipulation999142
+Ref: Array Manipulation-Footnote-11000233
+Node: Array Data Types1000270
+Ref: Array Data Types-Footnote-11002928
+Node: Array Functions1003020
+Node: Flattening Arrays1006878
+Node: Creating Arrays1013786
+Node: Redirection API1018555
+Node: Extension API Variables1021386
+Node: Extension Versioning1022019
+Ref: gawk-api-version1022456
+Node: Extension API Informational Variables1024212
+Node: Extension API Boilerplate1025276
+Node: Finding Extensions1029090
+Node: Extension Example1029649
+Node: Internal File Description1030447
+Node: Internal File Ops1034527
+Ref: Internal File Ops-Footnote-11046289
+Node: Using Internal File Ops1046429
+Ref: Using Internal File Ops-Footnote-11048812
+Node: Extension Samples1049086
+Node: Extension Sample File Functions1050615
+Node: Extension Sample Fnmatch1058264
+Node: Extension Sample Fork1059751
+Node: Extension Sample Inplace1060969
+Node: Extension Sample Ord1064179
+Node: Extension Sample Readdir1065015
+Ref: table-readdir-file-types1065904
+Node: Extension Sample Revout1066709
+Node: Extension Sample Rev2way1067298
+Node: Extension Sample Read write array1068038
+Node: Extension Sample Readfile1069980
+Node: Extension Sample Time1071075
+Node: Extension Sample API Tests1072423
+Node: gawkextlib1072915
+Node: Extension summary1075362
+Node: Extension Exercises1079064
+Node: Language History1080562
+Node: V7/SVR3.11082218
+Node: SVR41084370
+Node: POSIX1085804
+Node: BTL1087183
+Node: POSIX/GNU1087912
+Node: Feature History1093774
+Node: Common Extensions1108144
+Node: Ranges and Locales1109427
+Ref: Ranges and Locales-Footnote-11114043
+Ref: Ranges and Locales-Footnote-21114070
+Ref: Ranges and Locales-Footnote-31114305
+Node: Contributors1114526
+Node: History summary1120086
+Node: Installation1121466
+Node: Gawk Distribution1122410
+Node: Getting1122894
+Node: Extracting1123855
+Node: Distribution contents1125493
+Node: Unix Installation1131587
+Node: Quick Installation1132269
+Node: Shell Startup Files1134683
+Node: Additional Configuration Options1135761
+Node: Configuration Philosophy1137566
+Node: Non-Unix Installation1139935
+Node: PC Installation1140393
+Node: PC Binary Installation1141713
+Node: PC Compiling1143565
+Ref: PC Compiling-Footnote-11146359
+Node: PC Testing1146468
+Node: PC Using1147648
+Ref: PC Using-Footnote-11151801
+Node: Cygwin1151874
+Node: MSYS1152644
+Node: VMS Installation1153145
+Node: VMS Compilation1153936
+Ref: VMS Compilation-Footnote-11155165
+Node: VMS Dynamic Extensions1155223
+Node: VMS Installation Details1156908
+Node: VMS Running1159161
+Node: VMS GNV1163440
+Node: VMS Old Gawk1164175
+Node: Bugs1164646
+Node: Other Versions1168961
+Node: Installation summary1175545
+Node: Notes1176596
+Node: Compatibility Mode1177461
+Node: Additions1178243
+Node: Accessing The Source1179168
+Node: Adding Code1180603
+Node: New Ports1186822
+Node: Derived Files1191310
+Ref: Derived Files-Footnote-11196795
+Ref: Derived Files-Footnote-21196830
+Ref: Derived Files-Footnote-31197428
+Node: Future Extensions1197542
+Node: Implementation Limitations1198200
+Node: Extension Design1199383
+Node: Old Extension Problems1200537
+Ref: Old Extension Problems-Footnote-11202055
+Node: Extension New Mechanism Goals1202112
+Ref: Extension New Mechanism Goals-Footnote-11205476
+Node: Extension Other Design Decisions1205665
+Node: Extension Future Growth1207778
+Node: Old Extension Mechanism1208614
+Node: Notes summary1210377
+Node: Basic Concepts1211559
+Node: Basic High Level1212240
+Ref: figure-general-flow1212522
+Ref: figure-process-flow1213207
+Ref: Basic High Level-Footnote-11216508
+Node: Basic Data Typing1216693
+Node: Glossary1220021
+Node: Copying1251968
+Node: GNU Free Documentation License1289507
+Node: Index1314625

End Tag Table
diff --git a/doc/gawk.texi b/doc/gawk.texi
index a4b61895..90f6dcfc 100644
--- a/doc/gawk.texi
+++ b/doc/gawk.texi
@@ -12577,19 +12577,19 @@ One special place where @code{/foo/} is @emph{not} an abbreviation for
where this is discussed in more detail.
@node POSIX String Comparison
-@subsubsection String Comparison with POSIX Rules
+@subsubsection String Comparison Based on Locale Collating Order
-The POSIX standard says that string comparison is performed based
-on the locale's @dfn{collating order}. This is the order in which
-characters sort, as defined by the locale (for more discussion,
-@pxref{Locales}). This order is usually very different
-from the results obtained when doing straight character-by-character
-comparison.@footnote{Technically, string comparison is supposed
-to behave the same way as if the strings were compared with the C
-@code{strcoll()} function.}
+The POSIX standard used to say that all string comparisons are
+performed based on the locale's @dfn{collating order}. This
+is the order in which characters sort, as defined by the locale
+(for more discussion, @pxref{Locales}). This order is usually very
+different from the results obtained when doing straight byte-by-byte
+comparison.@footnote{Technically, string comparison is supposed to behave
+the same way as if the strings were compared with the C @code{strcoll()}
+function.}
Because this behavior differs considerably from existing practice,
-@command{gawk} only implements it when in POSIX mode (@pxref{Options}).
+@command{gawk} only implemented it when in POSIX mode (@pxref{Options}).
Here is an example to illustrate the difference, in an @code{en_US.UTF-8}
locale:
@@ -12602,6 +12602,26 @@ $ @kbd{gawk --posix 'BEGIN @{ printf("ABC < abc = %s\n",}
@print{} ABC < abc = FALSE
@end example
+Fortunately, as of August 2016, comparison based on locale
+collating order is no longer required for the @code{==} and @code{!=}
+operators.@footnote{See @uref{http://austingroupbugs.net/view.php?id=1070,
+the Austin Group website}.} However, comparison based on locales is still
+required for @code{<}, @code{<=}, @code{>}, and @code{>=}. POSIX thus
+recommends as follows:
+
+@quotation
+Since the @code{==} operator checks whether strings are identical,
+not whether they collate equally, applications needing to check whether
+strings collate equally can use:
+
+@example
+a <= b && a >= b
+@end example
+@end quotation
+
+As of @value{PVERSION} 4.2, @command{gawk} continues to use locale
+collating order for @code{<}, @code{<=}, @code{>}, and @code{>=} only
+in POSIX mode.
@node Boolean Ops
@subsection Boolean Expressions
@@ -37385,7 +37405,7 @@ and
@uref{http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap09.html#tag_21_09_03_05, its rationale}.}
By using this lovely technical term, the standard gives license
-to implementors to implement ranges in whatever way they choose.
+to implementers to implement ranges in whatever way they choose.
The @command{gawk} maintainer chose to apply the pre-POSIX meaning
both with the default regexp matching and when @option{--traditional} or
@option{--posix} are used.
diff --git a/doc/gawktexi.in b/doc/gawktexi.in
index 9c2864cd..782884bb 100644
--- a/doc/gawktexi.in
+++ b/doc/gawktexi.in
@@ -11896,19 +11896,19 @@ One special place where @code{/foo/} is @emph{not} an abbreviation for
where this is discussed in more detail.
@node POSIX String Comparison
-@subsubsection String Comparison with POSIX Rules
+@subsubsection String Comparison Based on Locale Collating Order
-The POSIX standard says that string comparison is performed based
-on the locale's @dfn{collating order}. This is the order in which
-characters sort, as defined by the locale (for more discussion,
-@pxref{Locales}). This order is usually very different
-from the results obtained when doing straight character-by-character
-comparison.@footnote{Technically, string comparison is supposed
-to behave the same way as if the strings were compared with the C
-@code{strcoll()} function.}
+The POSIX standard used to say that all string comparisons are
+performed based on the locale's @dfn{collating order}. This
+is the order in which characters sort, as defined by the locale
+(for more discussion, @pxref{Locales}). This order is usually very
+different from the results obtained when doing straight byte-by-byte
+comparison.@footnote{Technically, string comparison is supposed to behave
+the same way as if the strings were compared with the C @code{strcoll()}
+function.}
Because this behavior differs considerably from existing practice,
-@command{gawk} only implements it when in POSIX mode (@pxref{Options}).
+@command{gawk} only implemented it when in POSIX mode (@pxref{Options}).
Here is an example to illustrate the difference, in an @code{en_US.UTF-8}
locale:
@@ -11921,6 +11921,26 @@ $ @kbd{gawk --posix 'BEGIN @{ printf("ABC < abc = %s\n",}
@print{} ABC < abc = FALSE
@end example
+Fortunately, as of August 2016, comparison based on locale
+collating order is no longer required for the @code{==} and @code{!=}
+operators.@footnote{See @uref{http://austingroupbugs.net/view.php?id=1070,
+the Austin Group website}.} However, comparison based on locales is still
+required for @code{<}, @code{<=}, @code{>}, and @code{>=}. POSIX thus
+recommends as follows:
+
+@quotation
+Since the @code{==} operator checks whether strings are identical,
+not whether they collate equally, applications needing to check whether
+strings collate equally can use:
+
+@example
+a <= b && a >= b
+@end example
+@end quotation
+
+As of @value{PVERSION} 4.2, @command{gawk} continues to use locale
+collating order for @code{<}, @code{<=}, @code{>}, and @code{>=} only
+in POSIX mode.
@node Boolean Ops
@subsection Boolean Expressions
@@ -36467,7 +36487,7 @@ and
@uref{http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap09.html#tag_21_09_03_05, its rationale}.}
By using this lovely technical term, the standard gives license
-to implementors to implement ranges in whatever way they choose.
+to implementers to implement ranges in whatever way they choose.
The @command{gawk} maintainer chose to apply the pre-POSIX meaning
both with the default regexp matching and when @option{--traditional} or
@option{--posix} are used.
diff --git a/eval.c b/eval.c
index fc468543..cfd71b43 100644
--- a/eval.c
+++ b/eval.c
@@ -573,7 +573,7 @@ posix_compare(NODE *s1, NODE *s2)
/* cmp_nodes --- compare two nodes, returning negative, 0, positive */
int
-cmp_nodes(NODE *t1, NODE *t2)
+cmp_nodes(NODE *t1, NODE *t2, bool use_strcmp)
{
int ret = 0;
size_t len1, len2;
@@ -596,7 +596,7 @@ cmp_nodes(NODE *t1, NODE *t2)
if (len1 == 0 || len2 == 0)
return ldiff;
- if (do_posix)
+ if (do_posix && ! use_strcmp)
return posix_compare(t1, t2);
l = (ldiff <= 0 ? len1 : len2);
@@ -882,7 +882,7 @@ fmt_index(NODE *n)
emalloc(fmt_list, NODE **, fmt_num*sizeof(*fmt_list), "fmt_index");
n = force_string(n);
while (ix < fmt_hiwater) {
- if (cmp_nodes(fmt_list[ix], n) == 0)
+ if (cmp_nodes(fmt_list[ix], n, true) == 0)
return ix;
ix++;
}
@@ -1502,10 +1502,15 @@ eval_condition(NODE *t)
return boolval(t);
}
+typedef enum {
+ SCALAR_EQ_NEQ,
+ SCALAR_RELATIONAL
+} scalar_cmp_t;
+
/* cmp_scalars -- compare two nodes on the stack */
static inline int
-cmp_scalars()
+cmp_scalars(scalar_cmp_t comparison_type)
{
NODE *t1, *t2;
int di;
@@ -1516,7 +1521,7 @@ cmp_scalars()
DEREF(t2);
fatal(_("attempt to use array `%s' in a scalar context"), array_vname(t1));
}
- di = cmp_nodes(t1, t2);
+ di = cmp_nodes(t1, t2, comparison_type == SCALAR_EQ_NEQ);
DEREF(t1);
DEREF(t2);
return di;
diff --git a/extension/configure.ac b/extension/configure.ac
index b723a3c1..b5b27d03 100644
--- a/extension/configure.ac
+++ b/extension/configure.ac
@@ -23,7 +23,7 @@ dnl
dnl Process this file with autoconf to produce a configure script.
-AC_INIT([GNU Awk Bundled Extensions],[4.1.3],[bug-gawk@gnu.org],[gawk-extensions])
+AC_INIT([GNU Awk Bundled Extensions],[4.1.4],[bug-gawk@gnu.org],[gawk-extensions])
AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_AUX_DIR([build-aux])
diff --git a/interpret.h b/interpret.h
index 2f85049a..46c44cdb 100644
--- a/interpret.h
+++ b/interpret.h
@@ -446,37 +446,37 @@ uninitialized_scalar:
break;
case Op_equal:
- r = node_Boolean[cmp_scalars() == 0];
+ r = node_Boolean[cmp_scalars(SCALAR_EQ_NEQ) == 0];
UPREF(r);
REPLACE(r);
break;
case Op_notequal:
- r = node_Boolean[cmp_scalars() != 0];
+ r = node_Boolean[cmp_scalars(SCALAR_EQ_NEQ) != 0];
UPREF(r);
REPLACE(r);
break;
case Op_less:
- r = node_Boolean[cmp_scalars() < 0];
+ r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) < 0];
UPREF(r);
REPLACE(r);
break;
case Op_greater:
- r = node_Boolean[cmp_scalars() > 0];
+ r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) > 0];
UPREF(r);
REPLACE(r);
break;
case Op_leq:
- r = node_Boolean[cmp_scalars() <= 0];
+ r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) <= 0];
UPREF(r);
REPLACE(r);
break;
case Op_geq:
- r = node_Boolean[cmp_scalars() >= 0];
+ r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) >= 0];
UPREF(r);
REPLACE(r);
break;
@@ -834,12 +834,11 @@ mod:
t2 = TOP_SCALAR(); /* switch expression */
t2 = force_string(t2);
rp = re_update(m);
- di = (research(rp, t2->stptr, 0, t2->stlen,
- avoid_dfa(m, t2->stptr, t2->stlen)) >= 0);
+ di = (research(rp, t2->stptr, 0, t2->stlen, RE_NO_FLAGS) >= 0);
} else {
t1 = POP_SCALAR(); /* case value */
t2 = TOP_SCALAR(); /* switch expression */
- di = (cmp_nodes(t2, t1) == 0);
+ di = (cmp_nodes(t2, t1, true) == 0);
DEREF(t1);
}
@@ -999,20 +998,7 @@ arrayfor:
t1 = *get_field(0, (Func_ptr *) 0);
match_re:
rp = re_update(m);
- /*
- * Any place where research() is called with a last parameter of
- * zero, we need to use the avoid_dfa test. This appears here and
- * in the code for Op_K_case.
- *
- * A new or improved dfa that distinguishes beginning/end of
- * string from beginning/end of line will allow us to get rid of
- * this hack.
- *
- * The avoid_dfa() function is in re.c; it is not very smart.
- */
-
- di = research(rp, t1->stptr, 0, t1->stlen,
- avoid_dfa(m, t1->stptr, t1->stlen));
+ di = research(rp, t1->stptr, 0, t1->stlen, RE_NO_FLAGS);
di = (di == -1) ^ (op != Op_nomatch);
if (op != Op_match_rec) {
decr_sp();
diff --git a/re.c b/re.c
index c7899694..c822c90f 100644
--- a/re.c
+++ b/re.c
@@ -170,7 +170,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
memset((char *) rp, 0, sizeof(*rp));
- rp->dfareg = NULL;
rp->pat.allocated = 0; /* regex will allocate the buffer */
emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
@@ -223,12 +222,11 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
/* gack. this must be done *after* re_compile_pattern */
rp->pat.newline_anchor = false; /* don't get \n in middle of string */
if (dfa && ! no_dfa) {
- rp->dfa = true;
rp->dfareg = dfaalloc();
dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n');
dfacomp(buf, len, rp->dfareg, true);
} else
- rp->dfa = false;
+ rp->dfareg = NULL;
rp->has_anchor = has_anchor;
/* Additional flags that help with RS as regexp. */
@@ -278,26 +276,25 @@ research(Regexp *rp, char *str, int start,
* starts in the middle of a string, so don't bother trying it
* in that case.
*/
- if (rp->dfa && ! no_bol && start == 0) {
- char save;
- size_t count = 0;
+ if (rp->dfareg != NULL && ! no_bol && start == 0) {
struct dfa *superset = dfasuperset(rp->dfareg);
- /*
- * dfa likes to stick a '\n' right after the matched
- * text. So we just save and restore the character.
- */
- save = str[start+len];
if (superset)
ret = dfaexec(superset, str+start, str+start+len,
true, NULL, NULL);
- if (ret)
+
+ if (ret && ((! need_start && ! rp->has_anchor)
+ || (! superset && dfaisfast(rp->dfareg))))
ret = dfaexec(rp->dfareg, str+start, str+start+len,
- true, &count, &try_backref);
- str[start+len] = save;
+ true, NULL, &try_backref);
}
if (ret) {
- if (need_start || rp->dfa == false || try_backref) {
+ if ( rp->dfareg == NULL
+ || start != 0
+ || no_bol
+ || need_start
+ || rp->has_anchor
+ || try_backref) {
/*
* Passing NULL as last arg speeds up search for cases
* where we don't need the start/end info.
@@ -326,7 +323,7 @@ refree(Regexp *rp)
free(rp->regs.start);
if (rp->regs.end)
free(rp->regs.end);
- if (rp->dfa) {
+ if (rp->dfareg != NULL) {
dfafree(rp->dfareg);
free(rp->dfareg);
}
@@ -359,7 +356,7 @@ re_update(NODE *t)
t1 = t->re_exp;
if (t->re_text != NULL) {
/* if contents haven't changed, just return it */
- if (cmp_nodes(t->re_text, t1) == 0)
+ if (cmp_nodes(t->re_text, t1, true) == 0)
return t->re_reg;
/* things changed, fall through to recompile */
unref(t->re_text);
@@ -425,32 +422,6 @@ resetup()
dfa_init();
}
-/* avoid_dfa --- return true if we should not use the DFA matcher */
-
-int
-avoid_dfa(NODE *re, char *str, size_t len)
-{
- char *end;
-
- /*
- * f = @/.../
- * if ("foo" ~ f) ...
- *
- * This creates a Node_dynregex with NULL re_reg.
- */
- if (re->re_reg == NULL)
- return false;
-
- if (! re->re_reg->has_anchor)
- return false;
-
- for (end = str + len; str < end; str++)
- if (*str == '\n')
- return true;
-
- return false;
-}
-
/* reisstring --- return true if the RE match is a simple string match */
int