diff options
-rw-r--r-- | ChangeLog | 33 | ||||
-rw-r--r-- | awk.h | 13 | ||||
-rw-r--r-- | debug.c | 2 | ||||
-rw-r--r-- | dfa.c | 393 | ||||
-rw-r--r-- | doc/ChangeLog | 6 | ||||
-rw-r--r-- | doc/gawk.info | 830 | ||||
-rw-r--r-- | doc/gawk.texi | 42 | ||||
-rw-r--r-- | doc/gawktexi.in | 42 | ||||
-rw-r--r-- | eval.c | 15 | ||||
-rw-r--r-- | extension/configure.ac | 2 | ||||
-rw-r--r-- | interpret.h | 32 | ||||
-rw-r--r-- | re.c | 57 |
12 files changed, 761 insertions, 706 deletions
@@ -1,5 +1,38 @@ 2016-08-25 Arnold D. Robbins <arnold@skeeve.com> + POSIX now says use strcmp for == and !=. Thanks to Chet Ramey + for pointing me at the change. Make it so: + + * awk.h (cmp_nodes): New 3rd param indicating strcmp, not strcoll. + * debug.c (cmp_val): Update call to cmp_nodes. + * eval.c (cmp_nodes): New 3rd param indicating strcmp, not strcoll. + Adjust code and all callers. + (scalar_cmp_t): New enum type. Used in ... + (cmp_scalars): ... in order to call cmp_nodes correctly. + * interpret.h: Use the enum type in calls to cmp_scalars. + * re.c (re_update): Adjust call to cmp_nodes. + +2016-08-25 Norihiro Tanaka <noritnk@kcn.ne.jp> + + * awk.h (struct Regexp): Remove dfa. Now dfareg instead of it. All + referers changed. + * re.c (research): Arrange caller of dfaexec and research. + * (avoid_dfa): Removed. All callers changed. + * awk.h (avoid_dfa): Removed. + + Other changes by Arnold Robbins: + + * awk.h (struct Regexp): Change various boolean members to bool. + (RE_NO_FLAGS): New #define. + * interpret.h: Use RE_NO_FLAGS instead of zero. + * re.c (research): Prettify the logic a little bit. + +2016-08-25 Arnold D. Robbins <arnold@skeeve.com> + + * dfa.c: Sync with grep. + +2016-08-25 Arnold D. Robbins <arnold@skeeve.com> + * 4.1.4: Release tar ball made. 2016-08-23 Arnold D. Robbins <arnold@skeeve.com> @@ -206,11 +206,10 @@ typedef struct Regexp { struct re_pattern_buffer pat; struct re_registers regs; struct dfa *dfareg; - short dfa; - short has_anchor; /* speed up of avoid_dfa kludge, temporary */ - short non_empty; /* for use in fpat_parse_field */ - short has_meta; /* re has meta chars so (probably) isn't simple string */ - short maybe_long; /* re has meta chars that can match long text */ + bool has_anchor; /* re has anchors which dfa avoids */ + bool non_empty; /* for use in fpat_parse_field */ + bool has_meta; /* re has meta chars so (probably) isn't simple string */ + bool maybe_long; /* re has meta chars that can match long text */ } Regexp; #define RESTART(rp,s) (rp)->regs.start[0] #define REEND(rp,s) (rp)->regs.end[0] @@ -219,6 +218,7 @@ typedef struct Regexp { #define NUMSUBPATS(rp,s) (rp)->regs.num_regs /* regexp matching flags: */ +#define RE_NO_FLAGS 0 /* empty flags */ #define RE_NEED_START 1 /* need to know start/end of match */ #define RE_NO_BOL 2 /* not allowed to match ^ in regexp */ @@ -1443,7 +1443,7 @@ extern int sanitize_exit_status(int status); extern void PUSH_CODE(INSTRUCTION *cp); extern INSTRUCTION *POP_CODE(void); extern void init_interpret(void); -extern int cmp_nodes(NODE *t1, NODE *t2); +extern int cmp_nodes(NODE *t1, NODE *t2, bool use_strcmp); extern int cmp_awknums(const NODE *t1, const NODE *t2); extern void set_IGNORECASE(void); extern void set_OFS(void); @@ -1651,7 +1651,6 @@ extern void reg_error(const char *s); extern Regexp *re_update(NODE *t); extern void resyntax(int syntax); extern void resetup(void); -extern int avoid_dfa(NODE *re, char *str, size_t len); extern int reisstring(const char *text, size_t len, Regexp *re, const char *buf); extern int get_numbase(const char *str, bool use_locale); @@ -1670,7 +1670,7 @@ cmp_val(struct list_item *w, NODE *old, NODE *new) if (new->type == Node_var_array) /* 5 */ return true; - return cmp_nodes(old, new); /* 4 */ + return cmp_nodes(old, new, true); /* 4 */ } /* watchpoint_triggered --- check if we should stop at this watchpoint; @@ -387,8 +387,8 @@ struct regex_syntax meaning of the @#%!@#%^!@ syntax bits. */ struct lexer_state { - char const *lexptr; /* Pointer to next input character. */ - size_t lexleft; /* Number of characters remaining. */ + char const *ptr; /* Pointer to next input character. */ + size_t left; /* Number of characters remaining. */ token lasttok; /* Previous token returned; initially END. */ size_t parens; /* Count of outstanding left parens. */ int minrep, maxrep; /* Repeat counts for {m,n}. */ @@ -429,10 +429,10 @@ struct dfa size_t calloc; /* Number of charclasses allocated. */ /* Scanner state */ - struct lexer_state lexstate; + struct lexer_state lex; /* Parser state */ - struct parser_state parsestate; + struct parser_state parse; /* Fields filled by the parser. */ token *tokens; /* Postfix parse array. */ @@ -910,7 +910,7 @@ using_simple_locale (struct dfa const *dfa) && '}' == 125 && '~' == 126) }; - return (!native_c_charset || dfa->multibyte) ? false : unibyte_c; + return (native_c_charset & !dfa->multibyte) | unibyte_c; } /* Fetch the next lexical input character. Set C (of type int) to the @@ -922,23 +922,23 @@ using_simple_locale (struct dfa const *dfa) otherwise. */ # define FETCH_WC(dfa, c, wc, eoferr) \ do { \ - if (! dfa->lexstate.lexleft) \ + if (! (dfa)->lex.left) \ { \ if ((eoferr) != 0) \ dfaerror (eoferr); \ else \ - return dfa->lexstate.lasttok = END; \ + return (dfa)->lex.lasttok = END; \ } \ else \ { \ wint_t _wc; \ - size_t nbytes = mbs_to_wchar (&_wc, dfa->lexstate.lexptr, \ - dfa->lexstate.lexleft, dfa); \ - dfa->lexstate.cur_mb_len = nbytes; \ + size_t nbytes = mbs_to_wchar (&_wc, (dfa)->lex.ptr, \ + (dfa)->lex.left, dfa); \ + (dfa)->lex.cur_mb_len = nbytes; \ (wc) = _wc; \ - (c) = nbytes == 1 ? to_uchar (*dfa->lexstate.lexptr) : EOF; \ - dfa->lexstate.lexptr += nbytes; \ - dfa->lexstate.lexleft -= nbytes; \ + (c) = nbytes == 1 ? to_uchar ((dfa)->lex.ptr[0]) : EOF; \ + (dfa)->lex.ptr += nbytes; \ + (dfa)->lex.left -= nbytes; \ } \ } while (false) @@ -1112,8 +1112,8 @@ parse_bracket_exp (struct dfa *dfa) for (;;) { FETCH_WC (dfa, c, wc, _("unbalanced [")); - if ((c == c1 && *dfa->lexstate.lexptr == ']') - || dfa->lexstate.lexleft == 0) + if (dfa->lex.left == 0 + || (c == c1 && dfa->lex.ptr[0] == ']')) break; if (len < MAX_BRACKET_STRING_LEN) str[len++] = c; @@ -1133,8 +1133,8 @@ parse_bracket_exp (struct dfa *dfa) { char const *class = (dfa->syntax.case_fold && (STREQ (str, "upper") - || STREQ (str, "lower")) ? - "alpha" : str); + || STREQ (str, "lower")) + ? "alpha" : str); const struct dfa_ctype *pred = find_pred (class); if (!pred) dfaerror (_("invalid character class")); @@ -1174,7 +1174,7 @@ parse_bracket_exp (struct dfa *dfa) /* A bracket expression like [a-[.aa.]] matches an unknown set. Treat it like [-a[.aa.]] while parsing it, and remember that the set is unknown. */ - if (c2 == '[' && *dfa->lexstate.lexptr == '.') + if (c2 == '[' && dfa->lex.ptr[0] == '.') { known_bracket_exp = false; c2 = ']'; @@ -1184,8 +1184,8 @@ parse_bracket_exp (struct dfa *dfa) { /* In the case [x-], the - is an ordinary hyphen, which is left in c1, the lookahead character. */ - dfa->lexstate.lexptr -= dfa->lexstate.cur_mb_len; - dfa->lexstate.lexleft += dfa->lexstate.cur_mb_len; + dfa->lex.ptr -= dfa->lex.cur_mb_len; + dfa->lex.left += dfa->lex.cur_mb_len; } else { @@ -1283,19 +1283,27 @@ parse_bracket_exp (struct dfa *dfa) return CSET + dfa_charclass_index (dfa, ccl); } -#define PUSH_LEX_STATE(s) \ - do \ - { \ - char const *lexptr_saved = dfa->lexstate.lexptr; \ - size_t lexleft_saved = dfa->lexstate.lexleft; \ - dfa->lexstate.lexptr = (s); \ - dfa->lexstate.lexleft = strlen (dfa->lexstate.lexptr) +struct lexptr +{ + char const *ptr; + size_t left; +}; + +static void +push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s) +{ + ls->ptr = dfa->lex.ptr; + ls->left = dfa->lex.left; + dfa->lex.ptr = s; + dfa->lex.left = strlen (s); +} -#define POP_LEX_STATE() \ - dfa->lexstate.lexptr = lexptr_saved; \ - dfa->lexstate.lexleft = lexleft_saved; \ - } \ - while (false) +static void +pop_lex_state (struct dfa *dfa, struct lexptr const *ls) +{ + dfa->lex.ptr = ls->ptr; + dfa->lex.left = ls->left; +} static token lex (struct dfa *dfa) @@ -1313,14 +1321,14 @@ lex (struct dfa *dfa) "if (backslash) ...". */ for (i = 0; i < 2; ++i) { - FETCH_WC (dfa, c, dfa->lexstate.wctok, NULL); + FETCH_WC (dfa, c, dfa->lex.wctok, NULL); switch (c) { case '\\': if (backslash) goto normal_char; - if (dfa->lexstate.lexleft == 0) + if (dfa->lex.left == 0) dfaerror (_("unfinished \\ escape")); backslash = true; break; @@ -1329,28 +1337,29 @@ lex (struct dfa *dfa) if (backslash) goto normal_char; if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS - || dfa->lexstate.lasttok == END || dfa->lexstate.lasttok == LPAREN - || dfa->lexstate.lasttok == OR) - return dfa->lexstate.lasttok = BEGLINE; + || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN + || dfa->lex.lasttok == OR) + return dfa->lex.lasttok = BEGLINE; goto normal_char; case '$': if (backslash) goto normal_char; if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS - || dfa->lexstate.lexleft == 0 - || (dfa->syntax.syntax_bits & RE_NO_BK_PARENS - ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == ')' - : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\' - && dfa->lexstate.lexptr[1] == ')') - || (dfa->syntax.syntax_bits & RE_NO_BK_VBAR - ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == '|' - : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\' - && dfa->lexstate.lexptr[1] == '|') + || dfa->lex.left == 0 + || ((dfa->lex.left + > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)) + && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS) + & (dfa->lex.ptr[0] == '\\')] + == ')')) + || ((dfa->lex.left + > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)) + && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR) + & (dfa->lex.ptr[0] == '\\')] + == '|')) || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT) - && dfa->lexstate.lexleft > 0 - && *dfa->lexstate.lexptr == '\n')) - return dfa->lexstate.lasttok = ENDLINE; + && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n')) + return dfa->lex.lasttok = ENDLINE; goto normal_char; case '1': @@ -1364,8 +1373,8 @@ lex (struct dfa *dfa) case '9': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS)) { - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = BACKREF; + dfa->lex.laststart = false; + return dfa->lex.lasttok = BACKREF; } goto normal_char; @@ -1373,7 +1382,7 @@ lex (struct dfa *dfa) if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) { /* FIXME: should be beginning of string */ - return dfa->lexstate.lasttok = BEGLINE; + return dfa->lex.lasttok = BEGLINE; } goto normal_char; @@ -1381,28 +1390,28 @@ lex (struct dfa *dfa) if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) { /* FIXME: should be end of string */ - return dfa->lexstate.lasttok = ENDLINE; + return dfa->lex.lasttok = ENDLINE; } goto normal_char; case '<': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = BEGWORD; + return dfa->lex.lasttok = BEGWORD; goto normal_char; case '>': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = ENDWORD; + return dfa->lex.lasttok = ENDWORD; goto normal_char; case 'b': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = LIMWORD; + return dfa->lex.lasttok = LIMWORD; goto normal_char; case 'B': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = NOTLIMWORD; + return dfa->lex.lasttok = NOTLIMWORD; goto normal_char; case '?': @@ -1411,17 +1420,17 @@ lex (struct dfa *dfa) if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0)) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; - return dfa->lexstate.lasttok = QMARK; + return dfa->lex.lasttok = QMARK; case '*': if (backslash) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; - return dfa->lexstate.lasttok = STAR; + return dfa->lex.lasttok = STAR; case '+': if (dfa->syntax.syntax_bits & RE_LIMITED_OPS) @@ -1429,9 +1438,9 @@ lex (struct dfa *dfa) if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0)) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; - return dfa->lexstate.lasttok = PLUS; + return dfa->lex.lasttok = PLUS; case '{': if (!(dfa->syntax.syntax_bits & RE_INTERVALS)) @@ -1439,7 +1448,7 @@ lex (struct dfa *dfa) if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0)) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; /* Cases: @@ -1449,86 +1458,79 @@ lex (struct dfa *dfa) {,} - 0 to infinity (same as '*') {M,N} - M through N */ { - char const *p = dfa->lexstate.lexptr; - char const *lim = p + dfa->lexstate.lexleft; - dfa->lexstate.minrep = dfa->lexstate.maxrep = -1; + char const *p = dfa->lex.ptr; + char const *lim = p + dfa->lex.left; + dfa->lex.minrep = dfa->lex.maxrep = -1; for (; p != lim && ISASCIIDIGIT (*p); p++) - { - if (dfa->lexstate.minrep < 0) - dfa->lexstate.minrep = *p - '0'; - else - dfa->lexstate.minrep = MIN (RE_DUP_MAX + 1, - (dfa->lexstate.minrep - * 10 + *p - '0')); - } + dfa->lex.minrep = (dfa->lex.minrep < 0 + ? *p - '0' + : MIN (RE_DUP_MAX + 1, + dfa->lex.minrep * 10 + *p - '0')); if (p != lim) { if (*p != ',') - dfa->lexstate.maxrep = dfa->lexstate.minrep; + dfa->lex.maxrep = dfa->lex.minrep; else { - if (dfa->lexstate.minrep < 0) - dfa->lexstate.minrep = 0; + if (dfa->lex.minrep < 0) + dfa->lex.minrep = 0; while (++p != lim && ISASCIIDIGIT (*p)) - { - if (dfa->lexstate.maxrep < 0) - dfa->lexstate.maxrep = *p - '0'; - else - dfa->lexstate.maxrep = MIN (RE_DUP_MAX + 1, - (dfa->lexstate.maxrep - * 10 + *p - '0')); - } + dfa->lex.maxrep + = (dfa->lex.maxrep < 0 + ? *p - '0' + : MIN (RE_DUP_MAX + 1, + dfa->lex.maxrep * 10 + *p - '0')); } } if (! ((! backslash || (p != lim && *p++ == '\\')) && p != lim && *p++ == '}' - && 0 <= dfa->lexstate.minrep - && (dfa->lexstate.maxrep < 0 - || dfa->lexstate.minrep <= dfa->lexstate.maxrep))) + && 0 <= dfa->lex.minrep + && (dfa->lex.maxrep < 0 + || dfa->lex.minrep <= dfa->lex.maxrep))) { if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD) goto normal_char; dfaerror (_("invalid content of \\{\\}")); } - if (RE_DUP_MAX < dfa->lexstate.maxrep) + if (RE_DUP_MAX < dfa->lex.maxrep) dfaerror (_("regular expression too big")); - dfa->lexstate.lexptr = p; - dfa->lexstate.lexleft = lim - p; + dfa->lex.ptr = p; + dfa->lex.left = lim - p; } - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = REPMN; + dfa->lex.laststart = false; + return dfa->lex.lasttok = REPMN; case '|': if (dfa->syntax.syntax_bits & RE_LIMITED_OPS) goto normal_char; if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0)) goto normal_char; - dfa->lexstate.laststart = true; - return dfa->lexstate.lasttok = OR; + dfa->lex.laststart = true; + return dfa->lex.lasttok = OR; case '\n': if (dfa->syntax.syntax_bits & RE_LIMITED_OPS || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT)) goto normal_char; - dfa->lexstate.laststart = true; - return dfa->lexstate.lasttok = OR; + dfa->lex.laststart = true; + return dfa->lex.lasttok = OR; case '(': if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; - ++dfa->lexstate.parens; - dfa->lexstate.laststart = true; - return dfa->lexstate.lasttok = LPAREN; + dfa->lex.parens++; + dfa->lex.laststart = true; + return dfa->lex.lasttok = LPAREN; case ')': if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; - if (dfa->lexstate.parens == 0 + if (dfa->lex.parens == 0 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) goto normal_char; - --dfa->lexstate.parens; - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = RPAREN; + dfa->lex.parens--; + dfa->lex.laststart = false; + return dfa->lex.lasttok = RPAREN; case '.': if (backslash) @@ -1537,8 +1539,8 @@ lex (struct dfa *dfa) { /* In multibyte environment period must match with a single character not a byte. So we use ANYCHAR. */ - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = ANYCHAR; + dfa->lex.laststart = false; + return dfa->lex.lasttok = ANYCHAR; } zeroset (ccl); notset (ccl); @@ -1546,8 +1548,8 @@ lex (struct dfa *dfa) clrbit ('\n', ccl); if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', ccl); - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, ccl); + dfa->lex.laststart = false; + return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); case 's': case 'S': @@ -1561,9 +1563,8 @@ lex (struct dfa *dfa) setbit (c2, ccl); if (c == 'S') notset (ccl); - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, - ccl); + dfa->lex.laststart = false; + return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1572,14 +1573,15 @@ lex (struct dfa *dfa) /* \s and \S are documented to be equivalent to [[:space:]] and [^[:space:]] respectively, so tell the lexer to process those strings, each minus its "already processed" '['. */ - PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]"); - - dfa->lexstate.lasttok = parse_bracket_exp (dfa); - - POP_LEX_STATE (); + { + struct lexptr ls; + push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']); + dfa->lex.lasttok = parse_bracket_exp (dfa); + pop_lex_state (dfa, &ls); + } - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok; + dfa->lex.laststart = false; + return dfa->lex.lasttok; case 'w': case 'W': @@ -1594,9 +1596,8 @@ lex (struct dfa *dfa) setbit (c2, ccl); if (c == 'W') notset (ccl); - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, - ccl); + dfa->lex.laststart = false; + return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1605,38 +1606,38 @@ lex (struct dfa *dfa) /* \w and \W are documented to be equivalent to [_[:alnum:]] and [^_[:alnum:]] respectively, so tell the lexer to process those strings, each minus its "already processed" '['. */ - PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]"); - - dfa->lexstate.lasttok = parse_bracket_exp (dfa); - - POP_LEX_STATE (); + { + struct lexptr ls; + push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']); + dfa->lex.lasttok = parse_bracket_exp (dfa); + pop_lex_state (dfa, &ls); + } - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok; + dfa->lex.laststart = false; + return dfa->lex.lasttok; case '[': if (backslash) goto normal_char; - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = parse_bracket_exp (dfa); + dfa->lex.laststart = false; + return dfa->lex.lasttok = parse_bracket_exp (dfa); default: normal_char: - dfa->lexstate.laststart = false; + dfa->lex.laststart = false; /* For multibyte character sets, folding is done in atom. Always return WCHAR. */ if (dfa->multibyte) - return dfa->lexstate.lasttok = WCHAR; + return dfa->lex.lasttok = WCHAR; if (dfa->syntax.case_fold && isalpha (c)) { zeroset (ccl); setbit_case_fold_c (c, ccl); - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, - ccl); + return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); } - return dfa->lexstate.lasttok = c; + return dfa->lex.lasttok = c; } } @@ -1670,21 +1671,21 @@ addtok_mb (struct dfa *dfa, token t, int mbprop) case CAT: case OR: - --dfa->parsestate.depth; + dfa->parse.depth--; break; case BACKREF: dfa->fast = false; /* fallthrough */ default: - ++dfa->nleaves; + dfa->nleaves++; /* fallthrough */ case EMPTY: - ++dfa->parsestate.depth; + dfa->parse.depth++; break; } - if (dfa->parsestate.depth > dfa->depth) - dfa->depth = dfa->parsestate.depth; + if (dfa->parse.depth > dfa->depth) + dfa->depth = dfa->parse.depth; } static void addtok_wc (struct dfa *dfa, wint_t wc); @@ -1741,19 +1742,19 @@ addtok_wc (struct dfa *dfa, wint_t wc) size_t stored_bytes = wcrtomb ((char *) buf, wc, &s); if (stored_bytes != (size_t) -1) - dfa->lexstate.cur_mb_len = stored_bytes; + dfa->lex.cur_mb_len = stored_bytes; else { /* This is merely stop-gap. buf[0] is undefined, yet skipping the addtok_mb call altogether can corrupt the heap. */ - dfa->lexstate.cur_mb_len = 1; + dfa->lex.cur_mb_len = 1; buf[0] = 0; } - addtok_mb (dfa, buf[0], dfa->lexstate.cur_mb_len == 1 ? 3 : 1); - for (i = 1; i < dfa->lexstate.cur_mb_len; i++) + addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1); + for (i = 1; i < dfa->lex.cur_mb_len; i++) { - addtok_mb (dfa, buf[i], i == dfa->lexstate.cur_mb_len - 1 ? 2 : 0); + addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0); addtok (dfa, CAT); } } @@ -1854,18 +1855,18 @@ add_utf8_anychar (struct dfa *dfa) static void atom (struct dfa *dfa) { - if (dfa->parsestate.tok == WCHAR) + if (dfa->parse.tok == WCHAR) { - if (dfa->lexstate.wctok == WEOF) + if (dfa->lex.wctok == WEOF) addtok (dfa, BACKREF); else { - addtok_wc (dfa, dfa->lexstate.wctok); + addtok_wc (dfa, dfa->lex.wctok); if (dfa->syntax.case_fold) { wchar_t folded[CASE_FOLDED_BUFSIZE]; - unsigned int i, n = case_folded_counterparts (dfa->lexstate.wctok, + unsigned int i, n = case_folded_counterparts (dfa->lex.wctok, folded); for (i = 0; i < n; i++) { @@ -1875,9 +1876,9 @@ atom (struct dfa *dfa) } } - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } - else if (dfa->parsestate.tok == ANYCHAR && using_utf8) + else if (dfa->parse.tok == ANYCHAR && using_utf8) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm @@ -1887,26 +1888,25 @@ atom (struct dfa *dfa) UTF-8: it is the most used, and the structure of the encoding makes the correctness more obvious. */ add_utf8_anychar (dfa); - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } - else if ((dfa->parsestate.tok >= 0 && dfa->parsestate.tok < NOTCHAR) - || dfa->parsestate.tok >= CSET || dfa->parsestate.tok == BACKREF - || dfa->parsestate.tok == BEGLINE || dfa->parsestate.tok == ENDLINE - || dfa->parsestate.tok == BEGWORD || dfa->parsestate.tok == ANYCHAR - || dfa->parsestate.tok == MBCSET || dfa->parsestate.tok == ENDWORD - || dfa->parsestate.tok == LIMWORD - || dfa->parsestate.tok == NOTLIMWORD) + else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR) + || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF + || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE + || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR + || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD + || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD) { - addtok (dfa, dfa->parsestate.tok); - dfa->parsestate.tok = lex (dfa); + addtok (dfa, dfa->parse.tok); + dfa->parse.tok = lex (dfa); } - else if (dfa->parsestate.tok == LPAREN) + else if (dfa->parse.tok == LPAREN) { - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); regexp (dfa); - if (dfa->parsestate.tok != RPAREN) + if (dfa->parse.tok != RPAREN) dfaerror (_("unbalanced (")); - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } else addtok (dfa, EMPTY); @@ -1954,40 +1954,39 @@ closure (struct dfa *dfa) size_t tindex, ntokens; atom (dfa); - while (dfa->parsestate.tok == QMARK || dfa->parsestate.tok == STAR - || dfa->parsestate.tok == PLUS || dfa->parsestate.tok == REPMN) - if (dfa->parsestate.tok == REPMN - && (dfa->lexstate.minrep || dfa->lexstate.maxrep)) + while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR + || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN) + if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep)) { ntokens = nsubtoks (dfa, dfa->tindex); tindex = dfa->tindex - ntokens; - if (dfa->lexstate.maxrep < 0) + if (dfa->lex.maxrep < 0) addtok (dfa, PLUS); - if (dfa->lexstate.minrep == 0) + if (dfa->lex.minrep == 0) addtok (dfa, QMARK); - for (i = 1; i < dfa->lexstate.minrep; ++i) + for (i = 1; i < dfa->lex.minrep; i++) { copytoks (dfa, tindex, ntokens); addtok (dfa, CAT); } - for (; i < dfa->lexstate.maxrep; ++i) + for (; i < dfa->lex.maxrep; i++) { copytoks (dfa, tindex, ntokens); addtok (dfa, QMARK); addtok (dfa, CAT); } - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } - else if (dfa->parsestate.tok == REPMN) + else if (dfa->parse.tok == REPMN) { dfa->tindex -= nsubtoks (dfa, dfa->tindex); - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); closure (dfa); } else { - addtok (dfa, dfa->parsestate.tok); - dfa->parsestate.tok = lex (dfa); + addtok (dfa, dfa->parse.tok); + dfa->parse.tok = lex (dfa); } } @@ -1995,8 +1994,8 @@ static void branch (struct dfa* dfa) { closure (dfa); - while (dfa->parsestate.tok != RPAREN && dfa->parsestate.tok != OR - && dfa->parsestate.tok >= 0) + while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR + && dfa->parse.tok >= 0) { closure (dfa); addtok (dfa, CAT); @@ -2007,9 +2006,9 @@ static void regexp (struct dfa *dfa) { branch (dfa); - while (dfa->parsestate.tok == OR) + while (dfa->parse.tok == OR) { - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); branch (dfa); addtok (dfa, OR); } @@ -2021,26 +2020,26 @@ regexp (struct dfa *dfa) static void dfaparse (char const *s, size_t len, struct dfa *d) { - d->lexstate.lexptr = s; - d->lexstate.lexleft = len; - d->lexstate.lasttok = END; - d->lexstate.laststart = true; - d->lexstate.parens = 0; + d->lex.ptr = s; + d->lex.left = len; + d->lex.lasttok = END; + d->lex.laststart = true; + d->lex.parens = 0; if (d->multibyte) { - d->lexstate.cur_mb_len = 0; + d->lex.cur_mb_len = 0; memset (&d->mbs, 0, sizeof d->mbs); } if (!d->syntax.syntax_bits_set) dfaerror (_("no syntax specified")); - d->parsestate.tok = lex (d); - d->parsestate.depth = d->depth; + d->parse.tok = lex (d); + d->parse.depth = d->depth; regexp (d); - if (d->parsestate.tok != END) + if (d->parse.tok != END) dfaerror (_("unbalanced )")); addtok (d, END - d->nregexps); @@ -3990,11 +3989,9 @@ dfamust (struct dfa const *d) bool exact = false; bool begline = false; bool endline = false; - size_t rj; bool need_begline = false; bool need_endline = false; bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1; - struct dfamust *dm; for (ri = 0; ri < d->tindex; ++ri) { @@ -4171,7 +4168,7 @@ dfamust (struct dfa const *d) } } - rj = ri + 2; + size_t rj = ri + 2; if (d->tokens[ri + 1] == CAT) { for (; rj < d->tindex - 1; rj += 2) @@ -4200,7 +4197,7 @@ dfamust (struct dfa const *d) } done:; - dm = NULL; + struct dfamust *dm = NULL; if (*result) { dm = xmalloc (sizeof *dm); @@ -4230,11 +4227,11 @@ dfamustfree (struct dfamust *dm) struct dfa * dfaalloc (void) { - struct dfa *d = xcalloc (1, sizeof (struct dfa)); + struct dfa *d = xzalloc (sizeof *d); d->multibyte = MB_CUR_MAX > 1; d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb; d->fast = !d->multibyte; - d->lexstate.cur_mb_len = 1; + d->lex.cur_mb_len = 1; return d; } diff --git a/doc/ChangeLog b/doc/ChangeLog index 2dc83a60..ce21ba92 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,5 +1,11 @@ 2016-08-25 Arnold D. Robbins <arnold@skeeve.com> + * gawktexi.in (POSIX String Comparison): Update for new + spec where == and != use strcmp, rest use strcoll. Thanks to + Chet Ramey for pointing me at the new rules. + +2016-08-25 Arnold D. Robbins <arnold@skeeve.com> + * 4.1.4: Release tar ball made. 2016-08-24 Arnold D. Robbins <arnold@skeeve.com> diff --git a/doc/gawk.info b/doc/gawk.info index fd3a5b8a..c39afa4f 100644 --- a/doc/gawk.info +++ b/doc/gawk.info @@ -8666,18 +8666,18 @@ Constant Regexps::, where this is discussed in more detail. File: gawk.info, Node: POSIX String Comparison, Prev: Comparison Operators, Up: Typing and Comparison -6.3.2.3 String Comparison with POSIX Rules -.......................................... +6.3.2.3 String Comparison Based on Locale Collating Order +......................................................... -The POSIX standard says that string comparison is performed based on the -locale's "collating order". This is the order in which characters sort, -as defined by the locale (for more discussion, *note Locales::). This -order is usually very different from the results obtained when doing -straight character-by-character comparison.(1) +The POSIX standard used to say that all string comparisons are performed +based on the locale's "collating order". This is the order in which +characters sort, as defined by the locale (for more discussion, *note +Locales::). This order is usually very different from the results +obtained when doing straight byte-by-byte comparison.(1) Because this behavior differs considerably from existing practice, -'gawk' only implements it when in POSIX mode (*note Options::). Here is -an example to illustrate the difference, in an 'en_US.UTF-8' locale: +'gawk' only implemented it when in POSIX mode (*note Options::). Here +is an example to illustrate the difference, in an 'en_US.UTF-8' locale: $ gawk 'BEGIN { printf("ABC < abc = %s\n", > ("ABC" < "abc" ? "TRUE" : "FALSE")) }' @@ -8686,11 +8686,28 @@ an example to illustrate the difference, in an 'en_US.UTF-8' locale: > ("ABC" < "abc" ? "TRUE" : "FALSE")) }' -| ABC < abc = FALSE + Fortunately, as of August 2016, comparison based on locale collating +order is no longer required for the '==' and '!=' operators.(2) +However, comparison based on locales is still required for '<', '<=', +'>', and '>='. POSIX thus recommends as follows: + + Since the '==' operator checks whether strings are identical, not + whether they collate equally, applications needing to check whether + strings collate equally can use: + + a <= b && a >= b + + As of version 4.2, 'gawk' continues to use locale collating order for +'<', '<=', '>', and '>=' only in POSIX mode. + ---------- Footnotes ---------- (1) Technically, string comparison is supposed to behave the same way as if the strings were compared with the C 'strcoll()' function. + (2) See the Austin Group website +(http://austingroupbugs.net/view.php?id=1070). + File: gawk.info, Node: Boolean Ops, Next: Conditional Exp, Prev: Typing and Comparison, Up: Truth Values and Conditions @@ -27659,7 +27676,7 @@ ranges, such that outside the '"C"' and '"POSIX"' locales, the meaning of range expressions was _undefined_.(3) By using this lovely technical term, the standard gives license to -implementors to implement ranges in whatever way they choose. The +implementers to implement ranges in whatever way they choose. The 'gawk' maintainer chose to apply the pre-POSIX meaning both with the default regexp matching and when '--traditional' or '--posix' are used. In all cases 'gawk' remains POSIX-compliant. @@ -35483,401 +35500,402 @@ Node: Variable Typing367063 Node: Comparison Operators370687 Ref: table-relational-ops371106 Node: POSIX String Comparison374601 -Ref: POSIX String Comparison-Footnote-1375675 -Node: Boolean Ops375814 -Ref: Boolean Ops-Footnote-1380296 -Node: Conditional Exp380388 -Node: Function Calls382124 -Node: Precedence386001 -Node: Locales389660 -Node: Expressions Summary391292 -Node: Patterns and Actions393865 -Node: Pattern Overview394985 -Node: Regexp Patterns396662 -Node: Expression Patterns397204 -Node: Ranges400985 -Node: BEGIN/END404093 -Node: Using BEGIN/END404854 -Ref: Using BEGIN/END-Footnote-1407590 -Node: I/O And BEGIN/END407696 -Node: BEGINFILE/ENDFILE410010 -Node: Empty412917 -Node: Using Shell Variables413234 -Node: Action Overview415508 -Node: Statements417833 -Node: If Statement419681 -Node: While Statement421176 -Node: Do Statement423204 -Node: For Statement424352 -Node: Switch Statement427510 -Node: Break Statement429896 -Node: Continue Statement431988 -Node: Next Statement433815 -Node: Nextfile Statement436198 -Node: Exit Statement438850 -Node: Built-in Variables441253 -Node: User-modified442386 -Node: Auto-set449972 -Ref: Auto-set-Footnote-1464625 -Ref: Auto-set-Footnote-2464831 -Node: ARGC and ARGV464887 -Node: Pattern Action Summary469100 -Node: Arrays471530 -Node: Array Basics472859 -Node: Array Intro473703 -Ref: figure-array-elements475678 -Ref: Array Intro-Footnote-1478382 -Node: Reference to Elements478510 -Node: Assigning Elements480974 -Node: Array Example481465 -Node: Scanning an Array483224 -Node: Controlling Scanning486246 -Ref: Controlling Scanning-Footnote-1491645 -Node: Numeric Array Subscripts491961 -Node: Uninitialized Subscripts494145 -Node: Delete495764 -Ref: Delete-Footnote-1498516 -Node: Multidimensional498573 -Node: Multiscanning501668 -Node: Arrays of Arrays503259 -Node: Arrays Summary508026 -Node: Functions510119 -Node: Built-in511157 -Node: Calling Built-in512238 -Node: Numeric Functions514234 -Ref: Numeric Functions-Footnote-1519067 -Ref: Numeric Functions-Footnote-2519424 -Ref: Numeric Functions-Footnote-3519472 -Node: String Functions519744 -Ref: String Functions-Footnote-1543248 -Ref: String Functions-Footnote-2543376 -Ref: String Functions-Footnote-3543624 -Node: Gory Details543711 -Ref: table-sub-escapes545502 -Ref: table-sub-proposed547021 -Ref: table-posix-sub548384 -Ref: table-gensub-escapes549925 -Ref: Gory Details-Footnote-1550748 -Node: I/O Functions550902 -Ref: table-system-return-values557484 -Ref: I/O Functions-Footnote-1559464 -Ref: I/O Functions-Footnote-2559612 -Node: Time Functions559732 -Ref: Time Functions-Footnote-1570237 -Ref: Time Functions-Footnote-2570305 -Ref: Time Functions-Footnote-3570463 -Ref: Time Functions-Footnote-4570574 -Ref: Time Functions-Footnote-5570686 -Ref: Time Functions-Footnote-6570913 -Node: Bitwise Functions571179 -Ref: table-bitwise-ops571773 -Ref: Bitwise Functions-Footnote-1576111 -Node: Type Functions576284 -Node: I18N Functions578945 -Node: User-defined580596 -Node: Definition Syntax581401 -Ref: Definition Syntax-Footnote-1587088 -Node: Function Example587159 -Ref: Function Example-Footnote-1590081 -Node: Function Caveats590103 -Node: Calling A Function590621 -Node: Variable Scope591579 -Node: Pass By Value/Reference594573 -Node: Return Statement598072 -Node: Dynamic Typing601051 -Node: Indirect Calls601981 -Ref: Indirect Calls-Footnote-1612232 -Node: Functions Summary612360 -Node: Library Functions615065 -Ref: Library Functions-Footnote-1618672 -Ref: Library Functions-Footnote-2618815 -Node: Library Names618986 -Ref: Library Names-Footnote-1622446 -Ref: Library Names-Footnote-2622669 -Node: General Functions622755 -Node: Strtonum Function623858 -Node: Assert Function626880 -Node: Round Function630206 -Node: Cliff Random Function631747 -Node: Ordinal Functions632763 -Ref: Ordinal Functions-Footnote-1635826 -Ref: Ordinal Functions-Footnote-2636078 -Node: Join Function636288 -Ref: Join Function-Footnote-1638058 -Node: Getlocaltime Function638258 -Node: Readfile Function642000 -Node: Shell Quoting643972 -Node: Data File Management645373 -Node: Filetrans Function646005 -Node: Rewind Function650101 -Node: File Checking652007 -Ref: File Checking-Footnote-1653341 -Node: Empty Files653542 -Node: Ignoring Assigns655521 -Node: Getopt Function657071 -Ref: Getopt Function-Footnote-1668540 -Node: Passwd Functions668740 -Ref: Passwd Functions-Footnote-1677579 -Node: Group Functions677667 -Ref: Group Functions-Footnote-1685564 -Node: Walking Arrays685771 -Node: Library Functions Summary688779 -Node: Library Exercises690185 -Node: Sample Programs690650 -Node: Running Examples691420 -Node: Clones692148 -Node: Cut Program693372 -Node: Egrep Program703301 -Ref: Egrep Program-Footnote-1710813 -Node: Id Program710923 -Node: Split Program714603 -Ref: Split Program-Footnote-1718062 -Node: Tee Program718191 -Node: Uniq Program720981 -Node: Wc Program728407 -Ref: Wc Program-Footnote-1732662 -Node: Miscellaneous Programs732756 -Node: Dupword Program733969 -Node: Alarm Program735999 -Node: Translate Program740854 -Ref: Translate Program-Footnote-1745419 -Node: Labels Program745689 -Ref: Labels Program-Footnote-1749040 -Node: Word Sorting749124 -Node: History Sorting753196 -Node: Extract Program755031 -Node: Simple Sed762560 -Node: Igawk Program765634 -Ref: Igawk Program-Footnote-1779965 -Ref: Igawk Program-Footnote-2780167 -Ref: Igawk Program-Footnote-3780289 -Node: Anagram Program780404 -Node: Signature Program783466 -Node: Programs Summary784713 -Node: Programs Exercises785927 -Ref: Programs Exercises-Footnote-1790056 -Node: Advanced Features790147 -Node: Nondecimal Data792137 -Node: Array Sorting793728 -Node: Controlling Array Traversal794428 -Ref: Controlling Array Traversal-Footnote-1802795 -Node: Array Sorting Functions802913 -Ref: Array Sorting Functions-Footnote-1808004 -Node: Two-way I/O808200 -Ref: Two-way I/O-Footnote-1814750 -Ref: Two-way I/O-Footnote-2814937 -Node: TCP/IP Networking815019 -Node: Profiling818137 -Ref: Profiling-Footnote-1826630 -Node: Advanced Features Summary826953 -Node: Internationalization828797 -Node: I18N and L10N830277 -Node: Explaining gettext830964 -Ref: Explaining gettext-Footnote-1836856 -Ref: Explaining gettext-Footnote-2837041 -Node: Programmer i18n837206 -Ref: Programmer i18n-Footnote-1842061 -Node: Translator i18n842110 -Node: String Extraction842904 -Ref: String Extraction-Footnote-1844036 -Node: Printf Ordering844122 -Ref: Printf Ordering-Footnote-1846908 -Node: I18N Portability846972 -Ref: I18N Portability-Footnote-1849428 -Node: I18N Example849491 -Ref: I18N Example-Footnote-1852297 -Node: Gawk I18N852370 -Node: I18N Summary853015 -Node: Debugger854356 -Node: Debugging855378 -Node: Debugging Concepts855819 -Node: Debugging Terms857628 -Node: Awk Debugging860203 -Node: Sample Debugging Session861109 -Node: Debugger Invocation861643 -Node: Finding The Bug863029 -Node: List of Debugger Commands869507 -Node: Breakpoint Control870840 -Node: Debugger Execution Control874534 -Node: Viewing And Changing Data877896 -Node: Execution Stack881270 -Node: Debugger Info882907 -Node: Miscellaneous Debugger Commands886978 -Node: Readline Support892066 -Node: Limitations892962 -Ref: Limitations-Footnote-1897193 -Node: Debugging Summary897244 -Node: Arbitrary Precision Arithmetic898523 -Node: Computer Arithmetic899939 -Ref: table-numeric-ranges903530 -Ref: Computer Arithmetic-Footnote-1904252 -Node: Math Definitions904309 -Ref: table-ieee-formats907623 -Ref: Math Definitions-Footnote-1908226 -Node: MPFR features908331 -Node: FP Math Caution910048 -Ref: FP Math Caution-Footnote-1911120 -Node: Inexactness of computations911489 -Node: Inexact representation912449 -Node: Comparing FP Values913809 -Node: Errors accumulate914891 -Node: Getting Accuracy916324 -Node: Try To Round919034 -Node: Setting precision919933 -Ref: table-predefined-precision-strings920630 -Node: Setting the rounding mode922460 -Ref: table-gawk-rounding-modes922834 -Ref: Setting the rounding mode-Footnote-1926242 -Node: Arbitrary Precision Integers926421 -Ref: Arbitrary Precision Integers-Footnote-1931338 -Node: POSIX Floating Point Problems931487 -Ref: POSIX Floating Point Problems-Footnote-1935369 -Node: Floating point summary935407 -Node: Dynamic Extensions937597 -Node: Extension Intro939150 -Node: Plugin License940416 -Node: Extension Mechanism Outline941213 -Ref: figure-load-extension941652 -Ref: figure-register-new-function943217 -Ref: figure-call-new-function944309 -Node: Extension API Description946371 -Node: Extension API Functions Introduction947903 -Node: General Data Types952762 -Ref: General Data Types-Footnote-1958717 -Node: Memory Allocation Functions959016 -Ref: Memory Allocation Functions-Footnote-1961861 -Node: Constructor Functions961960 -Node: Registration Functions963705 -Node: Extension Functions964390 -Node: Exit Callback Functions967013 -Node: Extension Version String968263 -Node: Input Parsers968926 -Node: Output Wrappers978808 -Node: Two-way processors983320 -Node: Printing Messages985585 -Ref: Printing Messages-Footnote-1986756 -Node: Updating ERRNO986909 -Node: Requesting Values987648 -Ref: table-value-types-returned988385 -Node: Accessing Parameters989268 -Node: Symbol Table Access990503 -Node: Symbol table by name991015 -Node: Symbol table by cookie993036 -Ref: Symbol table by cookie-Footnote-1997188 -Node: Cached values997252 -Ref: Cached values-Footnote-11000759 -Node: Array Manipulation1000850 -Ref: Array Manipulation-Footnote-11001941 -Node: Array Data Types1001978 -Ref: Array Data Types-Footnote-11004636 -Node: Array Functions1004728 -Node: Flattening Arrays1008586 -Node: Creating Arrays1015494 -Node: Redirection API1020263 -Node: Extension API Variables1023094 -Node: Extension Versioning1023727 -Ref: gawk-api-version1024164 -Node: Extension API Informational Variables1025920 -Node: Extension API Boilerplate1026984 -Node: Finding Extensions1030798 -Node: Extension Example1031357 -Node: Internal File Description1032155 -Node: Internal File Ops1036235 -Ref: Internal File Ops-Footnote-11047997 -Node: Using Internal File Ops1048137 -Ref: Using Internal File Ops-Footnote-11050520 -Node: Extension Samples1050794 -Node: Extension Sample File Functions1052323 -Node: Extension Sample Fnmatch1059972 -Node: Extension Sample Fork1061459 -Node: Extension Sample Inplace1062677 -Node: Extension Sample Ord1065887 -Node: Extension Sample Readdir1066723 -Ref: table-readdir-file-types1067612 -Node: Extension Sample Revout1068417 -Node: Extension Sample Rev2way1069006 -Node: Extension Sample Read write array1069746 -Node: Extension Sample Readfile1071688 -Node: Extension Sample Time1072783 -Node: Extension Sample API Tests1074131 -Node: gawkextlib1074623 -Node: Extension summary1077070 -Node: Extension Exercises1080772 -Node: Language History1082270 -Node: V7/SVR3.11083926 -Node: SVR41086078 -Node: POSIX1087512 -Node: BTL1088891 -Node: POSIX/GNU1089620 -Node: Feature History1095482 -Node: Common Extensions1109852 -Node: Ranges and Locales1111135 -Ref: Ranges and Locales-Footnote-11115751 -Ref: Ranges and Locales-Footnote-21115778 -Ref: Ranges and Locales-Footnote-31116013 -Node: Contributors1116234 -Node: History summary1121794 -Node: Installation1123174 -Node: Gawk Distribution1124118 -Node: Getting1124602 -Node: Extracting1125563 -Node: Distribution contents1127201 -Node: Unix Installation1133295 -Node: Quick Installation1133977 -Node: Shell Startup Files1136391 -Node: Additional Configuration Options1137469 -Node: Configuration Philosophy1139274 -Node: Non-Unix Installation1141643 -Node: PC Installation1142101 -Node: PC Binary Installation1143421 -Node: PC Compiling1145273 -Ref: PC Compiling-Footnote-11148067 -Node: PC Testing1148176 -Node: PC Using1149356 -Ref: PC Using-Footnote-11153509 -Node: Cygwin1153582 -Node: MSYS1154352 -Node: VMS Installation1154853 -Node: VMS Compilation1155644 -Ref: VMS Compilation-Footnote-11156873 -Node: VMS Dynamic Extensions1156931 -Node: VMS Installation Details1158616 -Node: VMS Running1160869 -Node: VMS GNV1165148 -Node: VMS Old Gawk1165883 -Node: Bugs1166354 -Node: Other Versions1170669 -Node: Installation summary1177253 -Node: Notes1178304 -Node: Compatibility Mode1179169 -Node: Additions1179951 -Node: Accessing The Source1180876 -Node: Adding Code1182311 -Node: New Ports1188530 -Node: Derived Files1193018 -Ref: Derived Files-Footnote-11198503 -Ref: Derived Files-Footnote-21198538 -Ref: Derived Files-Footnote-31199136 -Node: Future Extensions1199250 -Node: Implementation Limitations1199908 -Node: Extension Design1201091 -Node: Old Extension Problems1202245 -Ref: Old Extension Problems-Footnote-11203763 -Node: Extension New Mechanism Goals1203820 -Ref: Extension New Mechanism Goals-Footnote-11207184 -Node: Extension Other Design Decisions1207373 -Node: Extension Future Growth1209486 -Node: Old Extension Mechanism1210322 -Node: Notes summary1212085 -Node: Basic Concepts1213267 -Node: Basic High Level1213948 -Ref: figure-general-flow1214230 -Ref: figure-process-flow1214915 -Ref: Basic High Level-Footnote-11218216 -Node: Basic Data Typing1218401 -Node: Glossary1221729 -Node: Copying1253676 -Node: GNU Free Documentation License1291215 -Node: Index1316333 +Ref: POSIX String Comparison-Footnote-1376296 +Ref: POSIX String Comparison-Footnote-2376435 +Node: Boolean Ops376519 +Ref: Boolean Ops-Footnote-1381001 +Node: Conditional Exp381093 +Node: Function Calls382829 +Node: Precedence386706 +Node: Locales390365 +Node: Expressions Summary391997 +Node: Patterns and Actions394570 +Node: Pattern Overview395690 +Node: Regexp Patterns397367 +Node: Expression Patterns397909 +Node: Ranges401690 +Node: BEGIN/END404798 +Node: Using BEGIN/END405559 +Ref: Using BEGIN/END-Footnote-1408295 +Node: I/O And BEGIN/END408401 +Node: BEGINFILE/ENDFILE410715 +Node: Empty413622 +Node: Using Shell Variables413939 +Node: Action Overview416213 +Node: Statements418538 +Node: If Statement420386 +Node: While Statement421881 +Node: Do Statement423909 +Node: For Statement425057 +Node: Switch Statement428215 +Node: Break Statement430601 +Node: Continue Statement432693 +Node: Next Statement434520 +Node: Nextfile Statement436903 +Node: Exit Statement439555 +Node: Built-in Variables441958 +Node: User-modified443091 +Node: Auto-set450677 +Ref: Auto-set-Footnote-1465330 +Ref: Auto-set-Footnote-2465536 +Node: ARGC and ARGV465592 +Node: Pattern Action Summary469805 +Node: Arrays472235 +Node: Array Basics473564 +Node: Array Intro474408 +Ref: figure-array-elements476383 +Ref: Array Intro-Footnote-1479087 +Node: Reference to Elements479215 +Node: Assigning Elements481679 +Node: Array Example482170 +Node: Scanning an Array483929 +Node: Controlling Scanning486951 +Ref: Controlling Scanning-Footnote-1492350 +Node: Numeric Array Subscripts492666 +Node: Uninitialized Subscripts494850 +Node: Delete496469 +Ref: Delete-Footnote-1499221 +Node: Multidimensional499278 +Node: Multiscanning502373 +Node: Arrays of Arrays503964 +Node: Arrays Summary508731 +Node: Functions510824 +Node: Built-in511862 +Node: Calling Built-in512943 +Node: Numeric Functions514939 +Ref: Numeric Functions-Footnote-1519772 +Ref: Numeric Functions-Footnote-2520129 +Ref: Numeric Functions-Footnote-3520177 +Node: String Functions520449 +Ref: String Functions-Footnote-1543953 +Ref: String Functions-Footnote-2544081 +Ref: String Functions-Footnote-3544329 +Node: Gory Details544416 +Ref: table-sub-escapes546207 +Ref: table-sub-proposed547726 +Ref: table-posix-sub549089 +Ref: table-gensub-escapes550630 +Ref: Gory Details-Footnote-1551453 +Node: I/O Functions551607 +Ref: table-system-return-values558189 +Ref: I/O Functions-Footnote-1560169 +Ref: I/O Functions-Footnote-2560317 +Node: Time Functions560437 +Ref: Time Functions-Footnote-1570942 +Ref: Time Functions-Footnote-2571010 +Ref: Time Functions-Footnote-3571168 +Ref: Time Functions-Footnote-4571279 +Ref: Time Functions-Footnote-5571391 +Ref: Time Functions-Footnote-6571618 +Node: Bitwise Functions571884 +Ref: table-bitwise-ops572478 +Ref: Bitwise Functions-Footnote-1576816 +Node: Type Functions576989 +Node: I18N Functions579650 +Node: User-defined581301 +Node: Definition Syntax582106 +Ref: Definition Syntax-Footnote-1587793 +Node: Function Example587864 +Ref: Function Example-Footnote-1590786 +Node: Function Caveats590808 +Node: Calling A Function591326 +Node: Variable Scope592284 +Node: Pass By Value/Reference595278 +Node: Return Statement598777 +Node: Dynamic Typing601756 +Node: Indirect Calls602686 +Ref: Indirect Calls-Footnote-1612937 +Node: Functions Summary613065 +Node: Library Functions615770 +Ref: Library Functions-Footnote-1619377 +Ref: Library Functions-Footnote-2619520 +Node: Library Names619691 +Ref: Library Names-Footnote-1623151 +Ref: Library Names-Footnote-2623374 +Node: General Functions623460 +Node: Strtonum Function624563 +Node: Assert Function627585 +Node: Round Function630911 +Node: Cliff Random Function632452 +Node: Ordinal Functions633468 +Ref: Ordinal Functions-Footnote-1636531 +Ref: Ordinal Functions-Footnote-2636783 +Node: Join Function636993 +Ref: Join Function-Footnote-1638763 +Node: Getlocaltime Function638963 +Node: Readfile Function642705 +Node: Shell Quoting644677 +Node: Data File Management646078 +Node: Filetrans Function646710 +Node: Rewind Function650806 +Node: File Checking652712 +Ref: File Checking-Footnote-1654046 +Node: Empty Files654247 +Node: Ignoring Assigns656226 +Node: Getopt Function657776 +Ref: Getopt Function-Footnote-1669245 +Node: Passwd Functions669445 +Ref: Passwd Functions-Footnote-1678284 +Node: Group Functions678372 +Ref: Group Functions-Footnote-1686269 +Node: Walking Arrays686476 +Node: Library Functions Summary689484 +Node: Library Exercises690890 +Node: Sample Programs691355 +Node: Running Examples692125 +Node: Clones692853 +Node: Cut Program694077 +Node: Egrep Program704006 +Ref: Egrep Program-Footnote-1711518 +Node: Id Program711628 +Node: Split Program715308 +Ref: Split Program-Footnote-1718767 +Node: Tee Program718896 +Node: Uniq Program721686 +Node: Wc Program729112 +Ref: Wc Program-Footnote-1733367 +Node: Miscellaneous Programs733461 +Node: Dupword Program734674 +Node: Alarm Program736704 +Node: Translate Program741559 +Ref: Translate Program-Footnote-1746124 +Node: Labels Program746394 +Ref: Labels Program-Footnote-1749745 +Node: Word Sorting749829 +Node: History Sorting753901 +Node: Extract Program755736 +Node: Simple Sed763265 +Node: Igawk Program766339 +Ref: Igawk Program-Footnote-1780670 +Ref: Igawk Program-Footnote-2780872 +Ref: Igawk Program-Footnote-3780994 +Node: Anagram Program781109 +Node: Signature Program784171 +Node: Programs Summary785418 +Node: Programs Exercises786632 +Ref: Programs Exercises-Footnote-1790761 +Node: Advanced Features790852 +Node: Nondecimal Data792842 +Node: Array Sorting794433 +Node: Controlling Array Traversal795133 +Ref: Controlling Array Traversal-Footnote-1803500 +Node: Array Sorting Functions803618 +Ref: Array Sorting Functions-Footnote-1808709 +Node: Two-way I/O808905 +Ref: Two-way I/O-Footnote-1815455 +Ref: Two-way I/O-Footnote-2815642 +Node: TCP/IP Networking815724 +Node: Profiling818842 +Ref: Profiling-Footnote-1827335 +Node: Advanced Features Summary827658 +Node: Internationalization829502 +Node: I18N and L10N830982 +Node: Explaining gettext831669 +Ref: Explaining gettext-Footnote-1837561 +Ref: Explaining gettext-Footnote-2837746 +Node: Programmer i18n837911 +Ref: Programmer i18n-Footnote-1842766 +Node: Translator i18n842815 +Node: String Extraction843609 +Ref: String Extraction-Footnote-1844741 +Node: Printf Ordering844827 +Ref: Printf Ordering-Footnote-1847613 +Node: I18N Portability847677 +Ref: I18N Portability-Footnote-1850133 +Node: I18N Example850196 +Ref: I18N Example-Footnote-1853002 +Node: Gawk I18N853075 +Node: I18N Summary853720 +Node: Debugger855061 +Node: Debugging856083 +Node: Debugging Concepts856524 +Node: Debugging Terms858333 +Node: Awk Debugging860908 +Node: Sample Debugging Session861814 +Node: Debugger Invocation862348 +Node: Finding The Bug863734 +Node: List of Debugger Commands870212 +Node: Breakpoint Control871545 +Node: Debugger Execution Control875239 +Node: Viewing And Changing Data878601 +Node: Execution Stack881975 +Node: Debugger Info883612 +Node: Miscellaneous Debugger Commands887683 +Node: Readline Support892771 +Node: Limitations893667 +Ref: Limitations-Footnote-1897898 +Node: Debugging Summary897949 +Node: Arbitrary Precision Arithmetic899228 +Node: Computer Arithmetic900644 +Ref: table-numeric-ranges904235 +Ref: Computer Arithmetic-Footnote-1904957 +Node: Math Definitions905014 +Ref: table-ieee-formats908328 +Ref: Math Definitions-Footnote-1908931 +Node: MPFR features909036 +Node: FP Math Caution910753 +Ref: FP Math Caution-Footnote-1911825 +Node: Inexactness of computations912194 +Node: Inexact representation913154 +Node: Comparing FP Values914514 +Node: Errors accumulate915596 +Node: Getting Accuracy917029 +Node: Try To Round919739 +Node: Setting precision920638 +Ref: table-predefined-precision-strings921335 +Node: Setting the rounding mode923165 +Ref: table-gawk-rounding-modes923539 +Ref: Setting the rounding mode-Footnote-1926947 +Node: Arbitrary Precision Integers927126 +Ref: Arbitrary Precision Integers-Footnote-1932043 +Node: POSIX Floating Point Problems932192 +Ref: POSIX Floating Point Problems-Footnote-1936074 +Node: Floating point summary936112 +Node: Dynamic Extensions938302 +Node: Extension Intro939855 +Node: Plugin License941121 +Node: Extension Mechanism Outline941918 +Ref: figure-load-extension942357 +Ref: figure-register-new-function943922 +Ref: figure-call-new-function945014 +Node: Extension API Description947076 +Node: Extension API Functions Introduction948608 +Node: General Data Types953467 +Ref: General Data Types-Footnote-1959422 +Node: Memory Allocation Functions959721 +Ref: Memory Allocation Functions-Footnote-1962566 +Node: Constructor Functions962665 +Node: Registration Functions964410 +Node: Extension Functions965095 +Node: Exit Callback Functions967718 +Node: Extension Version String968968 +Node: Input Parsers969631 +Node: Output Wrappers979513 +Node: Two-way processors984025 +Node: Printing Messages986290 +Ref: Printing Messages-Footnote-1987461 +Node: Updating ERRNO987614 +Node: Requesting Values988353 +Ref: table-value-types-returned989090 +Node: Accessing Parameters989973 +Node: Symbol Table Access991208 +Node: Symbol table by name991720 +Node: Symbol table by cookie993741 +Ref: Symbol table by cookie-Footnote-1997893 +Node: Cached values997957 +Ref: Cached values-Footnote-11001464 +Node: Array Manipulation1001555 +Ref: Array Manipulation-Footnote-11002646 +Node: Array Data Types1002683 +Ref: Array Data Types-Footnote-11005341 +Node: Array Functions1005433 +Node: Flattening Arrays1009291 +Node: Creating Arrays1016199 +Node: Redirection API1020968 +Node: Extension API Variables1023799 +Node: Extension Versioning1024432 +Ref: gawk-api-version1024869 +Node: Extension API Informational Variables1026625 +Node: Extension API Boilerplate1027689 +Node: Finding Extensions1031503 +Node: Extension Example1032062 +Node: Internal File Description1032860 +Node: Internal File Ops1036940 +Ref: Internal File Ops-Footnote-11048702 +Node: Using Internal File Ops1048842 +Ref: Using Internal File Ops-Footnote-11051225 +Node: Extension Samples1051499 +Node: Extension Sample File Functions1053028 +Node: Extension Sample Fnmatch1060677 +Node: Extension Sample Fork1062164 +Node: Extension Sample Inplace1063382 +Node: Extension Sample Ord1066592 +Node: Extension Sample Readdir1067428 +Ref: table-readdir-file-types1068317 +Node: Extension Sample Revout1069122 +Node: Extension Sample Rev2way1069711 +Node: Extension Sample Read write array1070451 +Node: Extension Sample Readfile1072393 +Node: Extension Sample Time1073488 +Node: Extension Sample API Tests1074836 +Node: gawkextlib1075328 +Node: Extension summary1077775 +Node: Extension Exercises1081477 +Node: Language History1082975 +Node: V7/SVR3.11084631 +Node: SVR41086783 +Node: POSIX1088217 +Node: BTL1089596 +Node: POSIX/GNU1090325 +Node: Feature History1096187 +Node: Common Extensions1110557 +Node: Ranges and Locales1111840 +Ref: Ranges and Locales-Footnote-11116456 +Ref: Ranges and Locales-Footnote-21116483 +Ref: Ranges and Locales-Footnote-31116718 +Node: Contributors1116939 +Node: History summary1122499 +Node: Installation1123879 +Node: Gawk Distribution1124823 +Node: Getting1125307 +Node: Extracting1126268 +Node: Distribution contents1127906 +Node: Unix Installation1134000 +Node: Quick Installation1134682 +Node: Shell Startup Files1137096 +Node: Additional Configuration Options1138174 +Node: Configuration Philosophy1139979 +Node: Non-Unix Installation1142348 +Node: PC Installation1142806 +Node: PC Binary Installation1144126 +Node: PC Compiling1145978 +Ref: PC Compiling-Footnote-11148772 +Node: PC Testing1148881 +Node: PC Using1150061 +Ref: PC Using-Footnote-11154214 +Node: Cygwin1154287 +Node: MSYS1155057 +Node: VMS Installation1155558 +Node: VMS Compilation1156349 +Ref: VMS Compilation-Footnote-11157578 +Node: VMS Dynamic Extensions1157636 +Node: VMS Installation Details1159321 +Node: VMS Running1161574 +Node: VMS GNV1165853 +Node: VMS Old Gawk1166588 +Node: Bugs1167059 +Node: Other Versions1171374 +Node: Installation summary1177958 +Node: Notes1179009 +Node: Compatibility Mode1179874 +Node: Additions1180656 +Node: Accessing The Source1181581 +Node: Adding Code1183016 +Node: New Ports1189235 +Node: Derived Files1193723 +Ref: Derived Files-Footnote-11199208 +Ref: Derived Files-Footnote-21199243 +Ref: Derived Files-Footnote-31199841 +Node: Future Extensions1199955 +Node: Implementation Limitations1200613 +Node: Extension Design1201796 +Node: Old Extension Problems1202950 +Ref: Old Extension Problems-Footnote-11204468 +Node: Extension New Mechanism Goals1204525 +Ref: Extension New Mechanism Goals-Footnote-11207889 +Node: Extension Other Design Decisions1208078 +Node: Extension Future Growth1210191 +Node: Old Extension Mechanism1211027 +Node: Notes summary1212790 +Node: Basic Concepts1213972 +Node: Basic High Level1214653 +Ref: figure-general-flow1214935 +Ref: figure-process-flow1215620 +Ref: Basic High Level-Footnote-11218921 +Node: Basic Data Typing1219106 +Node: Glossary1222434 +Node: Copying1254381 +Node: GNU Free Documentation License1291920 +Node: Index1317038 End Tag Table diff --git a/doc/gawk.texi b/doc/gawk.texi index 91c4893e..60dfe961 100644 --- a/doc/gawk.texi +++ b/doc/gawk.texi @@ -12646,19 +12646,19 @@ One special place where @code{/foo/} is @emph{not} an abbreviation for where this is discussed in more detail. @node POSIX String Comparison -@subsubsection String Comparison with POSIX Rules +@subsubsection String Comparison Based on Locale Collating Order -The POSIX standard says that string comparison is performed based -on the locale's @dfn{collating order}. This is the order in which -characters sort, as defined by the locale (for more discussion, -@pxref{Locales}). This order is usually very different -from the results obtained when doing straight character-by-character -comparison.@footnote{Technically, string comparison is supposed -to behave the same way as if the strings were compared with the C -@code{strcoll()} function.} +The POSIX standard used to say that all string comparisons are +performed based on the locale's @dfn{collating order}. This +is the order in which characters sort, as defined by the locale +(for more discussion, @pxref{Locales}). This order is usually very +different from the results obtained when doing straight byte-by-byte +comparison.@footnote{Technically, string comparison is supposed to behave +the same way as if the strings were compared with the C @code{strcoll()} +function.} Because this behavior differs considerably from existing practice, -@command{gawk} only implements it when in POSIX mode (@pxref{Options}). +@command{gawk} only implemented it when in POSIX mode (@pxref{Options}). Here is an example to illustrate the difference, in an @code{en_US.UTF-8} locale: @@ -12671,6 +12671,26 @@ $ @kbd{gawk --posix 'BEGIN @{ printf("ABC < abc = %s\n",} @print{} ABC < abc = FALSE @end example +Fortunately, as of August 2016, comparison based on locale +collating order is no longer required for the @code{==} and @code{!=} +operators.@footnote{See @uref{http://austingroupbugs.net/view.php?id=1070, +the Austin Group website}.} However, comparison based on locales is still +required for @code{<}, @code{<=}, @code{>}, and @code{>=}. POSIX thus +recommends as follows: + +@quotation +Since the @code{==} operator checks whether strings are identical, +not whether they collate equally, applications needing to check whether +strings collate equally can use: + +@example +a <= b && a >= b +@end example +@end quotation + +As of @value{PVERSION} 4.2, @command{gawk} continues to use locale +collating order for @code{<}, @code{<=}, @code{>}, and @code{>=} only +in POSIX mode. @node Boolean Ops @subsection Boolean Expressions @@ -37458,7 +37478,7 @@ and @uref{http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap09.html#tag_21_09_03_05, its rationale}.} By using this lovely technical term, the standard gives license -to implementors to implement ranges in whatever way they choose. +to implementers to implement ranges in whatever way they choose. The @command{gawk} maintainer chose to apply the pre-POSIX meaning both with the default regexp matching and when @option{--traditional} or @option{--posix} are used. diff --git a/doc/gawktexi.in b/doc/gawktexi.in index 6d7eceb9..546f7611 100644 --- a/doc/gawktexi.in +++ b/doc/gawktexi.in @@ -11965,19 +11965,19 @@ One special place where @code{/foo/} is @emph{not} an abbreviation for where this is discussed in more detail. @node POSIX String Comparison -@subsubsection String Comparison with POSIX Rules +@subsubsection String Comparison Based on Locale Collating Order -The POSIX standard says that string comparison is performed based -on the locale's @dfn{collating order}. This is the order in which -characters sort, as defined by the locale (for more discussion, -@pxref{Locales}). This order is usually very different -from the results obtained when doing straight character-by-character -comparison.@footnote{Technically, string comparison is supposed -to behave the same way as if the strings were compared with the C -@code{strcoll()} function.} +The POSIX standard used to say that all string comparisons are +performed based on the locale's @dfn{collating order}. This +is the order in which characters sort, as defined by the locale +(for more discussion, @pxref{Locales}). This order is usually very +different from the results obtained when doing straight byte-by-byte +comparison.@footnote{Technically, string comparison is supposed to behave +the same way as if the strings were compared with the C @code{strcoll()} +function.} Because this behavior differs considerably from existing practice, -@command{gawk} only implements it when in POSIX mode (@pxref{Options}). +@command{gawk} only implemented it when in POSIX mode (@pxref{Options}). Here is an example to illustrate the difference, in an @code{en_US.UTF-8} locale: @@ -11990,6 +11990,26 @@ $ @kbd{gawk --posix 'BEGIN @{ printf("ABC < abc = %s\n",} @print{} ABC < abc = FALSE @end example +Fortunately, as of August 2016, comparison based on locale +collating order is no longer required for the @code{==} and @code{!=} +operators.@footnote{See @uref{http://austingroupbugs.net/view.php?id=1070, +the Austin Group website}.} However, comparison based on locales is still +required for @code{<}, @code{<=}, @code{>}, and @code{>=}. POSIX thus +recommends as follows: + +@quotation +Since the @code{==} operator checks whether strings are identical, +not whether they collate equally, applications needing to check whether +strings collate equally can use: + +@example +a <= b && a >= b +@end example +@end quotation + +As of @value{PVERSION} 4.2, @command{gawk} continues to use locale +collating order for @code{<}, @code{<=}, @code{>}, and @code{>=} only +in POSIX mode. @node Boolean Ops @subsection Boolean Expressions @@ -36540,7 +36560,7 @@ and @uref{http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap09.html#tag_21_09_03_05, its rationale}.} By using this lovely technical term, the standard gives license -to implementors to implement ranges in whatever way they choose. +to implementers to implement ranges in whatever way they choose. The @command{gawk} maintainer chose to apply the pre-POSIX meaning both with the default regexp matching and when @option{--traditional} or @option{--posix} are used. @@ -575,7 +575,7 @@ posix_compare(NODE *s1, NODE *s2) /* cmp_nodes --- compare two nodes, returning negative, 0, positive */ int -cmp_nodes(NODE *t1, NODE *t2) +cmp_nodes(NODE *t1, NODE *t2, bool use_strcmp) { int ret = 0; size_t len1, len2; @@ -598,7 +598,7 @@ cmp_nodes(NODE *t1, NODE *t2) if (len1 == 0 || len2 == 0) return ldiff; - if (do_posix) + if (do_posix && ! use_strcmp) return posix_compare(t1, t2); l = (ldiff <= 0 ? len1 : len2); @@ -885,7 +885,7 @@ fmt_index(NODE *n) emalloc(fmt_list, NODE **, fmt_num*sizeof(*fmt_list), "fmt_index"); n = force_string(n); while (ix < fmt_hiwater) { - if (cmp_nodes(fmt_list[ix], n) == 0) + if (cmp_nodes(fmt_list[ix], n, true) == 0) return ix; ix++; } @@ -1514,10 +1514,15 @@ eval_condition(NODE *t) return boolval(t); } +typedef enum { + SCALAR_EQ_NEQ, + SCALAR_RELATIONAL +} scalar_cmp_t; + /* cmp_scalars -- compare two nodes on the stack */ static inline int -cmp_scalars() +cmp_scalars(scalar_cmp_t comparison_type) { NODE *t1, *t2; int di; @@ -1528,7 +1533,7 @@ cmp_scalars() DEREF(t2); fatal(_("attempt to use array `%s' in a scalar context"), array_vname(t1)); } - di = cmp_nodes(t1, t2); + di = cmp_nodes(t1, t2, comparison_type == SCALAR_EQ_NEQ); DEREF(t1); DEREF(t2); return di; diff --git a/extension/configure.ac b/extension/configure.ac index b723a3c1..b5b27d03 100644 --- a/extension/configure.ac +++ b/extension/configure.ac @@ -23,7 +23,7 @@ dnl dnl Process this file with autoconf to produce a configure script. -AC_INIT([GNU Awk Bundled Extensions],[4.1.3],[bug-gawk@gnu.org],[gawk-extensions]) +AC_INIT([GNU Awk Bundled Extensions],[4.1.4],[bug-gawk@gnu.org],[gawk-extensions]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) diff --git a/interpret.h b/interpret.h index 3bb4532e..5467aa87 100644 --- a/interpret.h +++ b/interpret.h @@ -444,37 +444,37 @@ uninitialized_scalar: break; case Op_equal: - r = node_Boolean[cmp_scalars() == 0]; + r = node_Boolean[cmp_scalars(SCALAR_EQ_NEQ) == 0]; UPREF(r); REPLACE(r); break; case Op_notequal: - r = node_Boolean[cmp_scalars() != 0]; + r = node_Boolean[cmp_scalars(SCALAR_EQ_NEQ) != 0]; UPREF(r); REPLACE(r); break; case Op_less: - r = node_Boolean[cmp_scalars() < 0]; + r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) < 0]; UPREF(r); REPLACE(r); break; case Op_greater: - r = node_Boolean[cmp_scalars() > 0]; + r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) > 0]; UPREF(r); REPLACE(r); break; case Op_leq: - r = node_Boolean[cmp_scalars() <= 0]; + r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) <= 0]; UPREF(r); REPLACE(r); break; case Op_geq: - r = node_Boolean[cmp_scalars() >= 0]; + r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) >= 0]; UPREF(r); REPLACE(r); break; @@ -832,12 +832,11 @@ mod: t2 = TOP_SCALAR(); /* switch expression */ t2 = force_string(t2); rp = re_update(m); - di = (research(rp, t2->stptr, 0, t2->stlen, - avoid_dfa(m, t2->stptr, t2->stlen)) >= 0); + di = (research(rp, t2->stptr, 0, t2->stlen, RE_NO_FLAGS) >= 0); } else { t1 = POP_SCALAR(); /* case value */ t2 = TOP_SCALAR(); /* switch expression */ - di = (cmp_nodes(t2, t1) == 0); + di = (cmp_nodes(t2, t1, true) == 0); DEREF(t1); } @@ -998,20 +997,7 @@ arrayfor: t1 = *get_field(0, (Func_ptr *) 0); match_re: rp = re_update(m); - /* - * Any place where research() is called with a last parameter of - * zero, we need to use the avoid_dfa test. This appears here and - * in the code for Op_K_case. - * - * A new or improved dfa that distinguishes beginning/end of - * string from beginning/end of line will allow us to get rid of - * this hack. - * - * The avoid_dfa() function is in re.c; it is not very smart. - */ - - di = research(rp, t1->stptr, 0, t1->stlen, - avoid_dfa(m, t1->stptr, t1->stlen)); + di = research(rp, t1->stptr, 0, t1->stlen, RE_NO_FLAGS); di = (di == -1) ^ (op != Op_nomatch); if (op != Op_match_rec) { decr_sp(); @@ -170,7 +170,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) emalloc(rp, Regexp *, sizeof(*rp), "make_regexp"); memset((char *) rp, 0, sizeof(*rp)); - rp->dfareg = NULL; rp->pat.allocated = 0; /* regex will allocate the buffer */ emalloc(rp->pat.fastmap, char *, 256, "make_regexp"); @@ -223,12 +222,11 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) /* gack. this must be done *after* re_compile_pattern */ rp->pat.newline_anchor = false; /* don't get \n in middle of string */ if (dfa && ! no_dfa) { - rp->dfa = true; rp->dfareg = dfaalloc(); dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n'); dfacomp(buf, len, rp->dfareg, true); } else - rp->dfa = false; + rp->dfareg = NULL; rp->has_anchor = has_anchor; /* Additional flags that help with RS as regexp. */ @@ -278,26 +276,25 @@ research(Regexp *rp, char *str, int start, * starts in the middle of a string, so don't bother trying it * in that case. */ - if (rp->dfa && ! no_bol && start == 0) { - char save; - size_t count = 0; + if (rp->dfareg != NULL && ! no_bol && start == 0) { struct dfa *superset = dfasuperset(rp->dfareg); - /* - * dfa likes to stick a '\n' right after the matched - * text. So we just save and restore the character. - */ - save = str[start+len]; if (superset) ret = dfaexec(superset, str+start, str+start+len, true, NULL, NULL); - if (ret) + + if (ret && ((! need_start && ! rp->has_anchor) + || (! superset && dfaisfast(rp->dfareg)))) ret = dfaexec(rp->dfareg, str+start, str+start+len, - true, &count, &try_backref); - str[start+len] = save; + true, NULL, &try_backref); } if (ret) { - if (need_start || rp->dfa == false || try_backref) { + if ( rp->dfareg == NULL + || start != 0 + || no_bol + || need_start + || rp->has_anchor + || try_backref) { /* * Passing NULL as last arg speeds up search for cases * where we don't need the start/end info. @@ -326,7 +323,7 @@ refree(Regexp *rp) free(rp->regs.start); if (rp->regs.end) free(rp->regs.end); - if (rp->dfa) { + if (rp->dfareg != NULL) { dfafree(rp->dfareg); free(rp->dfareg); } @@ -363,7 +360,7 @@ re_update(NODE *t) } if (t->re_text != NULL) { /* if contents haven't changed, just return it */ - if (cmp_nodes(t->re_text, t1) == 0) + if (cmp_nodes(t->re_text, t1, true) == 0) return t->re_reg; /* things changed, fall through to recompile */ unref(t->re_text); @@ -429,32 +426,6 @@ resetup() dfa_init(); } -/* avoid_dfa --- return true if we should not use the DFA matcher */ - -int -avoid_dfa(NODE *re, char *str, size_t len) -{ - char *end; - - /* - * f = @/.../ - * if ("foo" ~ f) ... - * - * This creates a Node_dynregex with NULL re_reg. - */ - if (re->re_reg == NULL) - return false; - - if (! re->re_reg->has_anchor) - return false; - - for (end = str + len; str < end; str++) - if (*str == '\n') - return true; - - return false; -} - /* reisstring --- return true if the RE match is a simple string match */ int |