diff options
-rw-r--r-- | ChangeLog | 33 | ||||
-rw-r--r-- | awk.h | 13 | ||||
-rw-r--r-- | debug.c | 2 | ||||
-rw-r--r-- | dfa.c | 393 | ||||
-rw-r--r-- | doc/ChangeLog | 6 | ||||
-rw-r--r-- | doc/gawk.info | 830 | ||||
-rw-r--r-- | doc/gawk.texi | 42 | ||||
-rw-r--r-- | doc/gawktexi.in | 42 | ||||
-rw-r--r-- | eval.c | 15 | ||||
-rw-r--r-- | extension/configure.ac | 2 | ||||
-rw-r--r-- | interpret.h | 32 | ||||
-rw-r--r-- | re.c | 57 |
12 files changed, 761 insertions, 706 deletions
@@ -1,5 +1,38 @@ 2016-08-25 Arnold D. Robbins <arnold@skeeve.com> + POSIX now says use strcmp for == and !=. Thanks to Chet Ramey + for pointing me at the change. Make it so: + + * awk.h (cmp_nodes): New 3rd param indicating strcmp, not strcoll. + * debug.c (cmp_val): Update call to cmp_nodes. + * eval.c (cmp_nodes): New 3rd param indicating strcmp, not strcoll. + Adjust code and all callers. + (scalar_cmp_t): New enum type. Used in ... + (cmp_scalars): ... in order to call cmp_nodes correctly. + * interpret.h: Use the enum type in calls to cmp_scalars. + * re.c (re_update): Adjust call to cmp_nodes. + +2016-08-25 Norihiro Tanaka <noritnk@kcn.ne.jp> + + * awk.h (struct Regexp): Remove dfa. Now dfareg instead of it. All + referers changed. + * re.c (research): Arrange caller of dfaexec and research. + * (avoid_dfa): Removed. All callers changed. + * awk.h (avoid_dfa): Removed. + + Other changes by Arnold Robbins: + + * awk.h (struct Regexp): Change various boolean members to bool. + (RE_NO_FLAGS): New #define. + * interpret.h: Use RE_NO_FLAGS instead of zero. + * re.c (research): Prettify the logic a little bit. + +2016-08-25 Arnold D. Robbins <arnold@skeeve.com> + + * dfa.c: Sync with grep. + +2016-08-25 Arnold D. Robbins <arnold@skeeve.com> + * 4.1.4: Release tar ball made. 2016-08-23 Arnold D. Robbins <arnold@skeeve.com> @@ -206,11 +206,10 @@ typedef struct Regexp { struct re_pattern_buffer pat; struct re_registers regs; struct dfa *dfareg; - short dfa; - short has_anchor; /* speed up of avoid_dfa kludge, temporary */ - short non_empty; /* for use in fpat_parse_field */ - short has_meta; /* re has meta chars so (probably) isn't simple string */ - short maybe_long; /* re has meta chars that can match long text */ + bool has_anchor; /* re has anchors which dfa avoids */ + bool non_empty; /* for use in fpat_parse_field */ + bool has_meta; /* re has meta chars so (probably) isn't simple string */ + bool maybe_long; /* re has meta chars that can match long text */ } Regexp; #define RESTART(rp,s) (rp)->regs.start[0] #define REEND(rp,s) (rp)->regs.end[0] @@ -219,6 +218,7 @@ typedef struct Regexp { #define NUMSUBPATS(rp,s) (rp)->regs.num_regs /* regexp matching flags: */ +#define RE_NO_FLAGS 0 /* empty flags */ #define RE_NEED_START 1 /* need to know start/end of match */ #define RE_NO_BOL 2 /* not allowed to match ^ in regexp */ @@ -1442,7 +1442,7 @@ extern int sanitize_exit_status(int status); extern void PUSH_CODE(INSTRUCTION *cp); extern INSTRUCTION *POP_CODE(void); extern void init_interpret(void); -extern int cmp_nodes(NODE *t1, NODE *t2); +extern int cmp_nodes(NODE *t1, NODE *t2, bool use_strcmp); extern int cmp_awknums(const NODE *t1, const NODE *t2); extern void set_IGNORECASE(void); extern void set_OFS(void); @@ -1651,7 +1651,6 @@ extern void reg_error(const char *s); extern Regexp *re_update(NODE *t); extern void resyntax(int syntax); extern void resetup(void); -extern int avoid_dfa(NODE *re, char *str, size_t len); extern int reisstring(const char *text, size_t len, Regexp *re, const char *buf); extern int get_numbase(const char *str, bool use_locale); @@ -1670,7 +1670,7 @@ cmp_val(struct list_item *w, NODE *old, NODE *new) if (new->type == Node_var_array) /* 5 */ return true; - return cmp_nodes(old, new); /* 4 */ + return cmp_nodes(old, new, true); /* 4 */ } /* watchpoint_triggered --- check if we should stop at this watchpoint; @@ -387,8 +387,8 @@ struct regex_syntax meaning of the @#%!@#%^!@ syntax bits. */ struct lexer_state { - char const *lexptr; /* Pointer to next input character. */ - size_t lexleft; /* Number of characters remaining. */ + char const *ptr; /* Pointer to next input character. */ + size_t left; /* Number of characters remaining. */ token lasttok; /* Previous token returned; initially END. */ size_t parens; /* Count of outstanding left parens. */ int minrep, maxrep; /* Repeat counts for {m,n}. */ @@ -429,10 +429,10 @@ struct dfa size_t calloc; /* Number of charclasses allocated. */ /* Scanner state */ - struct lexer_state lexstate; + struct lexer_state lex; /* Parser state */ - struct parser_state parsestate; + struct parser_state parse; /* Fields filled by the parser. */ token *tokens; /* Postfix parse array. */ @@ -910,7 +910,7 @@ using_simple_locale (struct dfa const *dfa) && '}' == 125 && '~' == 126) }; - return (!native_c_charset || dfa->multibyte) ? false : unibyte_c; + return (native_c_charset & !dfa->multibyte) | unibyte_c; } /* Fetch the next lexical input character. Set C (of type int) to the @@ -922,23 +922,23 @@ using_simple_locale (struct dfa const *dfa) otherwise. */ # define FETCH_WC(dfa, c, wc, eoferr) \ do { \ - if (! dfa->lexstate.lexleft) \ + if (! (dfa)->lex.left) \ { \ if ((eoferr) != 0) \ dfaerror (eoferr); \ else \ - return dfa->lexstate.lasttok = END; \ + return (dfa)->lex.lasttok = END; \ } \ else \ { \ wint_t _wc; \ - size_t nbytes = mbs_to_wchar (&_wc, dfa->lexstate.lexptr, \ - dfa->lexstate.lexleft, dfa); \ - dfa->lexstate.cur_mb_len = nbytes; \ + size_t nbytes = mbs_to_wchar (&_wc, (dfa)->lex.ptr, \ + (dfa)->lex.left, dfa); \ + (dfa)->lex.cur_mb_len = nbytes; \ (wc) = _wc; \ - (c) = nbytes == 1 ? to_uchar (*dfa->lexstate.lexptr) : EOF; \ - dfa->lexstate.lexptr += nbytes; \ - dfa->lexstate.lexleft -= nbytes; \ + (c) = nbytes == 1 ? to_uchar ((dfa)->lex.ptr[0]) : EOF; \ + (dfa)->lex.ptr += nbytes; \ + (dfa)->lex.left -= nbytes; \ } \ } while (false) @@ -1112,8 +1112,8 @@ parse_bracket_exp (struct dfa *dfa) for (;;) { FETCH_WC (dfa, c, wc, _("unbalanced [")); - if ((c == c1 && *dfa->lexstate.lexptr == ']') - || dfa->lexstate.lexleft == 0) + if (dfa->lex.left == 0 + || (c == c1 && dfa->lex.ptr[0] == ']')) break; if (len < MAX_BRACKET_STRING_LEN) str[len++] = c; @@ -1133,8 +1133,8 @@ parse_bracket_exp (struct dfa *dfa) { char const *class = (dfa->syntax.case_fold && (STREQ (str, "upper") - || STREQ (str, "lower")) ? - "alpha" : str); + || STREQ (str, "lower")) + ? "alpha" : str); const struct dfa_ctype *pred = find_pred (class); if (!pred) dfaerror (_("invalid character class")); @@ -1174,7 +1174,7 @@ parse_bracket_exp (struct dfa *dfa) /* A bracket expression like [a-[.aa.]] matches an unknown set. Treat it like [-a[.aa.]] while parsing it, and remember that the set is unknown. */ - if (c2 == '[' && *dfa->lexstate.lexptr == '.') + if (c2 == '[' && dfa->lex.ptr[0] == '.') { known_bracket_exp = false; c2 = ']'; @@ -1184,8 +1184,8 @@ parse_bracket_exp (struct dfa *dfa) { /* In the case [x-], the - is an ordinary hyphen, which is left in c1, the lookahead character. */ - dfa->lexstate.lexptr -= dfa->lexstate.cur_mb_len; - dfa->lexstate.lexleft += dfa->lexstate.cur_mb_len; + dfa->lex.ptr -= dfa->lex.cur_mb_len; + dfa->lex.left += dfa->lex.cur_mb_len; } else { @@ -1283,19 +1283,27 @@ parse_bracket_exp (struct dfa *dfa) return CSET + dfa_charclass_index (dfa, ccl); } -#define PUSH_LEX_STATE(s) \ - do \ - { \ - char const *lexptr_saved = dfa->lexstate.lexptr; \ - size_t lexleft_saved = dfa->lexstate.lexleft; \ - dfa->lexstate.lexptr = (s); \ - dfa->lexstate.lexleft = strlen (dfa->lexstate.lexptr) +struct lexptr +{ + char const *ptr; + size_t left; +}; + +static void +push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s) +{ + ls->ptr = dfa->lex.ptr; + ls->left = dfa->lex.left; + dfa->lex.ptr = s; + dfa->lex.left = strlen (s); +} -#define POP_LEX_STATE() \ - dfa->lexstate.lexptr = lexptr_saved; \ - dfa->lexstate.lexleft = lexleft_saved; \ - } \ - while (false) +static void +pop_lex_state (struct dfa *dfa, struct lexptr const *ls) +{ + dfa->lex.ptr = ls->ptr; + dfa->lex.left = ls->left; +} static token lex (struct dfa *dfa) @@ -1313,14 +1321,14 @@ lex (struct dfa *dfa) "if (backslash) ...". */ for (i = 0; i < 2; ++i) { - FETCH_WC (dfa, c, dfa->lexstate.wctok, NULL); + FETCH_WC (dfa, c, dfa->lex.wctok, NULL); switch (c) { case '\\': if (backslash) goto normal_char; - if (dfa->lexstate.lexleft == 0) + if (dfa->lex.left == 0) dfaerror (_("unfinished \\ escape")); backslash = true; break; @@ -1329,28 +1337,29 @@ lex (struct dfa *dfa) if (backslash) goto normal_char; if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS - || dfa->lexstate.lasttok == END || dfa->lexstate.lasttok == LPAREN - || dfa->lexstate.lasttok == OR) - return dfa->lexstate.lasttok = BEGLINE; + || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN + || dfa->lex.lasttok == OR) + return dfa->lex.lasttok = BEGLINE; goto normal_char; case '$': if (backslash) goto normal_char; if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS - || dfa->lexstate.lexleft == 0 - || (dfa->syntax.syntax_bits & RE_NO_BK_PARENS - ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == ')' - : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\' - && dfa->lexstate.lexptr[1] == ')') - || (dfa->syntax.syntax_bits & RE_NO_BK_VBAR - ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == '|' - : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\' - && dfa->lexstate.lexptr[1] == '|') + || dfa->lex.left == 0 + || ((dfa->lex.left + > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)) + && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS) + & (dfa->lex.ptr[0] == '\\')] + == ')')) + || ((dfa->lex.left + > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)) + && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR) + & (dfa->lex.ptr[0] == '\\')] + == '|')) || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT) - && dfa->lexstate.lexleft > 0 - && *dfa->lexstate.lexptr == '\n')) - return dfa->lexstate.lasttok = ENDLINE; + && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n')) + return dfa->lex.lasttok = ENDLINE; goto normal_char; case '1': @@ -1364,8 +1373,8 @@ lex (struct dfa *dfa) case '9': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS)) { - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = BACKREF; + dfa->lex.laststart = false; + return dfa->lex.lasttok = BACKREF; } goto normal_char; @@ -1373,7 +1382,7 @@ lex (struct dfa *dfa) if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) { /* FIXME: should be beginning of string */ - return dfa->lexstate.lasttok = BEGLINE; + return dfa->lex.lasttok = BEGLINE; } goto normal_char; @@ -1381,28 +1390,28 @@ lex (struct dfa *dfa) if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) { /* FIXME: should be end of string */ - return dfa->lexstate.lasttok = ENDLINE; + return dfa->lex.lasttok = ENDLINE; } goto normal_char; case '<': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = BEGWORD; + return dfa->lex.lasttok = BEGWORD; goto normal_char; case '>': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = ENDWORD; + return dfa->lex.lasttok = ENDWORD; goto normal_char; case 'b': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = LIMWORD; + return dfa->lex.lasttok = LIMWORD; goto normal_char; case 'B': if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) - return dfa->lexstate.lasttok = NOTLIMWORD; + return dfa->lex.lasttok = NOTLIMWORD; goto normal_char; case '?': @@ -1411,17 +1420,17 @@ lex (struct dfa *dfa) if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0)) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; - return dfa->lexstate.lasttok = QMARK; + return dfa->lex.lasttok = QMARK; case '*': if (backslash) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; - return dfa->lexstate.lasttok = STAR; + return dfa->lex.lasttok = STAR; case '+': if (dfa->syntax.syntax_bits & RE_LIMITED_OPS) @@ -1429,9 +1438,9 @@ lex (struct dfa *dfa) if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0)) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; - return dfa->lexstate.lasttok = PLUS; + return dfa->lex.lasttok = PLUS; case '{': if (!(dfa->syntax.syntax_bits & RE_INTERVALS)) @@ -1439,7 +1448,7 @@ lex (struct dfa *dfa) if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0)) goto normal_char; if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) - && dfa->lexstate.laststart) + && dfa->lex.laststart) goto normal_char; /* Cases: @@ -1449,86 +1458,79 @@ lex (struct dfa *dfa) {,} - 0 to infinity (same as '*') {M,N} - M through N */ { - char const *p = dfa->lexstate.lexptr; - char const *lim = p + dfa->lexstate.lexleft; - dfa->lexstate.minrep = dfa->lexstate.maxrep = -1; + char const *p = dfa->lex.ptr; + char const *lim = p + dfa->lex.left; + dfa->lex.minrep = dfa->lex.maxrep = -1; for (; p != lim && ISASCIIDIGIT (*p); p++) - { - if (dfa->lexstate.minrep < 0) - dfa->lexstate.minrep = *p - '0'; - else - dfa->lexstate.minrep = MIN (RE_DUP_MAX + 1, - (dfa->lexstate.minrep - * 10 + *p - '0')); - } + dfa->lex.minrep = (dfa->lex.minrep < 0 + ? *p - '0' + : MIN (RE_DUP_MAX + 1, + dfa->lex.minrep * 10 + *p - '0')); if (p != lim) { if (*p != ',') - dfa->lexstate.maxrep = dfa->lexstate.minrep; + dfa->lex.maxrep = dfa->lex.minrep; else { - if (dfa->lexstate.minrep < 0) - dfa->lexstate.minrep = 0; + if (dfa->lex.minrep < 0) + dfa->lex.minrep = 0; while (++p != lim && ISASCIIDIGIT (*p)) - { - if (dfa->lexstate.maxrep < 0) - dfa->lexstate.maxrep = *p - '0'; - else - dfa->lexstate.maxrep = MIN (RE_DUP_MAX + 1, - (dfa->lexstate.maxrep - * 10 + *p - '0')); - } + dfa->lex.maxrep + = (dfa->lex.maxrep < 0 + ? *p - '0' + : MIN (RE_DUP_MAX + 1, + dfa->lex.maxrep * 10 + *p - '0')); } } if (! ((! backslash || (p != lim && *p++ == '\\')) && p != lim && *p++ == '}' - && 0 <= dfa->lexstate.minrep - && (dfa->lexstate.maxrep < 0 - || dfa->lexstate.minrep <= dfa->lexstate.maxrep))) + && 0 <= dfa->lex.minrep + && (dfa->lex.maxrep < 0 + || dfa->lex.minrep <= dfa->lex.maxrep))) { if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD) goto normal_char; dfaerror (_("invalid content of \\{\\}")); } - if (RE_DUP_MAX < dfa->lexstate.maxrep) + if (RE_DUP_MAX < dfa->lex.maxrep) dfaerror (_("regular expression too big")); - dfa->lexstate.lexptr = p; - dfa->lexstate.lexleft = lim - p; + dfa->lex.ptr = p; + dfa->lex.left = lim - p; } - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = REPMN; + dfa->lex.laststart = false; + return dfa->lex.lasttok = REPMN; case '|': if (dfa->syntax.syntax_bits & RE_LIMITED_OPS) goto normal_char; if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0)) goto normal_char; - dfa->lexstate.laststart = true; - return dfa->lexstate.lasttok = OR; + dfa->lex.laststart = true; + return dfa->lex.lasttok = OR; case '\n': if (dfa->syntax.syntax_bits & RE_LIMITED_OPS || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT)) goto normal_char; - dfa->lexstate.laststart = true; - return dfa->lexstate.lasttok = OR; + dfa->lex.laststart = true; + return dfa->lex.lasttok = OR; case '(': if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; - ++dfa->lexstate.parens; - dfa->lexstate.laststart = true; - return dfa->lexstate.lasttok = LPAREN; + dfa->lex.parens++; + dfa->lex.laststart = true; + return dfa->lex.lasttok = LPAREN; case ')': if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; - if (dfa->lexstate.parens == 0 + if (dfa->lex.parens == 0 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) goto normal_char; - --dfa->lexstate.parens; - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = RPAREN; + dfa->lex.parens--; + dfa->lex.laststart = false; + return dfa->lex.lasttok = RPAREN; case '.': if (backslash) @@ -1537,8 +1539,8 @@ lex (struct dfa *dfa) { /* In multibyte environment period must match with a single character not a byte. So we use ANYCHAR. */ - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = ANYCHAR; + dfa->lex.laststart = false; + return dfa->lex.lasttok = ANYCHAR; } zeroset (ccl); notset (ccl); @@ -1546,8 +1548,8 @@ lex (struct dfa *dfa) clrbit ('\n', ccl); if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', ccl); - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, ccl); + dfa->lex.laststart = false; + return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); case 's': case 'S': @@ -1561,9 +1563,8 @@ lex (struct dfa *dfa) setbit (c2, ccl); if (c == 'S') notset (ccl); - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, - ccl); + dfa->lex.laststart = false; + return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1572,14 +1573,15 @@ lex (struct dfa *dfa) /* \s and \S are documented to be equivalent to [[:space:]] and [^[:space:]] respectively, so tell the lexer to process those strings, each minus its "already processed" '['. */ - PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]"); - - dfa->lexstate.lasttok = parse_bracket_exp (dfa); - - POP_LEX_STATE (); + { + struct lexptr ls; + push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']); + dfa->lex.lasttok = parse_bracket_exp (dfa); + pop_lex_state (dfa, &ls); + } - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok; + dfa->lex.laststart = false; + return dfa->lex.lasttok; case 'w': case 'W': @@ -1594,9 +1596,8 @@ lex (struct dfa *dfa) setbit (c2, ccl); if (c == 'W') notset (ccl); - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, - ccl); + dfa->lex.laststart = false; + return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1605,38 +1606,38 @@ lex (struct dfa *dfa) /* \w and \W are documented to be equivalent to [_[:alnum:]] and [^_[:alnum:]] respectively, so tell the lexer to process those strings, each minus its "already processed" '['. */ - PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]"); - - dfa->lexstate.lasttok = parse_bracket_exp (dfa); - - POP_LEX_STATE (); + { + struct lexptr ls; + push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']); + dfa->lex.lasttok = parse_bracket_exp (dfa); + pop_lex_state (dfa, &ls); + } - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok; + dfa->lex.laststart = false; + return dfa->lex.lasttok; case '[': if (backslash) goto normal_char; - dfa->lexstate.laststart = false; - return dfa->lexstate.lasttok = parse_bracket_exp (dfa); + dfa->lex.laststart = false; + return dfa->lex.lasttok = parse_bracket_exp (dfa); default: normal_char: - dfa->lexstate.laststart = false; + dfa->lex.laststart = false; /* For multibyte character sets, folding is done in atom. Always return WCHAR. */ if (dfa->multibyte) - return dfa->lexstate.lasttok = WCHAR; + return dfa->lex.lasttok = WCHAR; if (dfa->syntax.case_fold && isalpha (c)) { zeroset (ccl); setbit_case_fold_c (c, ccl); - return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, - ccl); + return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); } - return dfa->lexstate.lasttok = c; + return dfa->lex.lasttok = c; } } @@ -1670,21 +1671,21 @@ addtok_mb (struct dfa *dfa, token t, int mbprop) case CAT: case OR: - --dfa->parsestate.depth; + dfa->parse.depth--; break; case BACKREF: dfa->fast = false; /* fallthrough */ default: - ++dfa->nleaves; + dfa->nleaves++; /* fallthrough */ case EMPTY: - ++dfa->parsestate.depth; + dfa->parse.depth++; break; } - if (dfa->parsestate.depth > dfa->depth) - dfa->depth = dfa->parsestate.depth; + if (dfa->parse.depth > dfa->depth) + dfa->depth = dfa->parse.depth; } static void addtok_wc (struct dfa *dfa, wint_t wc); @@ -1741,19 +1742,19 @@ addtok_wc (struct dfa *dfa, wint_t wc) size_t stored_bytes = wcrtomb ((char *) buf, wc, &s); if (stored_bytes != (size_t) -1) - dfa->lexstate.cur_mb_len = stored_bytes; + dfa->lex.cur_mb_len = stored_bytes; else { /* This is merely stop-gap. buf[0] is undefined, yet skipping the addtok_mb call altogether can corrupt the heap. */ - dfa->lexstate.cur_mb_len = 1; + dfa->lex.cur_mb_len = 1; buf[0] = 0; } - addtok_mb (dfa, buf[0], dfa->lexstate.cur_mb_len == 1 ? 3 : 1); - for (i = 1; i < dfa->lexstate.cur_mb_len; i++) + addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1); + for (i = 1; i < dfa->lex.cur_mb_len; i++) { - addtok_mb (dfa, buf[i], i == dfa->lexstate.cur_mb_len - 1 ? 2 : 0); + addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0); addtok (dfa, CAT); } } @@ -1854,18 +1855,18 @@ add_utf8_anychar (struct dfa *dfa) static void atom (struct dfa *dfa) { - if (dfa->parsestate.tok == WCHAR) + if (dfa->parse.tok == WCHAR) { - if (dfa->lexstate.wctok == WEOF) + if (dfa->lex.wctok == WEOF) addtok (dfa, BACKREF); else { - addtok_wc (dfa, dfa->lexstate.wctok); + addtok_wc (dfa, dfa->lex.wctok); if (dfa->syntax.case_fold) { wchar_t folded[CASE_FOLDED_BUFSIZE]; - unsigned int i, n = case_folded_counterparts (dfa->lexstate.wctok, + unsigned int i, n = case_folded_counterparts (dfa->lex.wctok, folded); for (i = 0; i < n; i++) { @@ -1875,9 +1876,9 @@ atom (struct dfa *dfa) } } - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } - else if (dfa->parsestate.tok == ANYCHAR && using_utf8) + else if (dfa->parse.tok == ANYCHAR && using_utf8) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm @@ -1887,26 +1888,25 @@ atom (struct dfa *dfa) UTF-8: it is the most used, and the structure of the encoding makes the correctness more obvious. */ add_utf8_anychar (dfa); - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } - else if ((dfa->parsestate.tok >= 0 && dfa->parsestate.tok < NOTCHAR) - || dfa->parsestate.tok >= CSET || dfa->parsestate.tok == BACKREF - || dfa->parsestate.tok == BEGLINE || dfa->parsestate.tok == ENDLINE - || dfa->parsestate.tok == BEGWORD || dfa->parsestate.tok == ANYCHAR - || dfa->parsestate.tok == MBCSET || dfa->parsestate.tok == ENDWORD - || dfa->parsestate.tok == LIMWORD - || dfa->parsestate.tok == NOTLIMWORD) + else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR) + || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF + || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE + || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR + || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD + || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD) { - addtok (dfa, dfa->parsestate.tok); - dfa->parsestate.tok = lex (dfa); + addtok (dfa, dfa->parse.tok); + dfa->parse.tok = lex (dfa); } - else if (dfa->parsestate.tok == LPAREN) + else if (dfa->parse.tok == LPAREN) { - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); regexp (dfa); - if (dfa->parsestate.tok != RPAREN) + if (dfa->parse.tok != RPAREN) dfaerror (_("unbalanced (")); - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } else addtok (dfa, EMPTY); @@ -1954,40 +1954,39 @@ closure (struct dfa *dfa) size_t tindex, ntokens; atom (dfa); - while (dfa->parsestate.tok == QMARK || dfa->parsestate.tok == STAR - || dfa->parsestate.tok == PLUS || dfa->parsestate.tok == REPMN) - if (dfa->parsestate.tok == REPMN - && (dfa->lexstate.minrep || dfa->lexstate.maxrep)) + while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR + || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN) + if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep)) { ntokens = nsubtoks (dfa, dfa->tindex); tindex = dfa->tindex - ntokens; - if (dfa->lexstate.maxrep < 0) + if (dfa->lex.maxrep < 0) addtok (dfa, PLUS); - if (dfa->lexstate.minrep == 0) + if (dfa->lex.minrep == 0) addtok (dfa, QMARK); - for (i = 1; i < dfa->lexstate.minrep; ++i) + for (i = 1; i < dfa->lex.minrep; i++) { copytoks (dfa, tindex, ntokens); addtok (dfa, CAT); } - for (; i < dfa->lexstate.maxrep; ++i) + for (; i < dfa->lex.maxrep; i++) { copytoks (dfa, tindex, ntokens); addtok (dfa, QMARK); addtok (dfa, CAT); } - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); } - else if (dfa->parsestate.tok == REPMN) + else if (dfa->parse.tok == REPMN) { dfa->tindex -= nsubtoks (dfa, dfa->tindex); - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); closure (dfa); } else { - addtok (dfa, dfa->parsestate.tok); - dfa->parsestate.tok = lex (dfa); + addtok (dfa, dfa->parse.tok); + dfa->parse.tok = lex (dfa); } } @@ -1995,8 +1994,8 @@ static void branch (struct dfa* dfa) { closure (dfa); - while (dfa->parsestate.tok != RPAREN && dfa->parsestate.tok != OR - && dfa->parsestate.tok >= 0) + while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR + && dfa->parse.tok >= 0) { closure (dfa); addtok (dfa, CAT); @@ -2007,9 +2006,9 @@ static void regexp (struct dfa *dfa) { branch (dfa); - while (dfa->parsestate.tok == OR) + while (dfa->parse.tok == OR) { - dfa->parsestate.tok = lex (dfa); + dfa->parse.tok = lex (dfa); branch (dfa); addtok (dfa, OR); } @@ -2021,26 +2020,26 @@ regexp (struct dfa *dfa) static void dfaparse (char const *s, size_t len, struct dfa *d) { - d->lexstate.lexptr = s; - d->lexstate.lexleft = len; - d->lexstate.lasttok = END; - d->lexstate.laststart = true; - d->lexstate.parens = 0; + d->lex.ptr = s; + d->lex.left = len; + d->lex.lasttok = END; + d->lex.laststart = true; + d->lex.parens = 0; if (d->multibyte) { - d->lexstate.cur_mb_len = 0; + d->lex.cur_mb_len = 0; memset (&d->mbs, 0, sizeof d->mbs); } if (!d->syntax.syntax_bits_set) dfaerror (_("no syntax specified")); - d->parsestate.tok = lex (d); - d->parsestate.depth = d->depth; + d->parse.tok = lex (d); + d->parse.depth = d->depth; regexp (d); - if (d->parsestate.tok != END) + if (d->parse.tok != END) dfaerror (_("unbalanced )")); addtok (d, END - d->nregexps); @@ -3990,11 +3989,9 @@ dfamust (struct dfa const *d) bool exact = false; bool begline = false; bool endline = false; - size_t rj; bool need_begline = false; bool need_endline = false; bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1; - struct dfamust *dm; for (ri = 0; ri < d->tindex; ++ri) { @@ -4171,7 +4168,7 @@ dfamust (struct dfa const *d) } } - rj = ri + 2; + size_t rj = ri + 2; if (d->tokens[ri + 1] == CAT) { for (; rj < d->tindex - 1; rj += 2) @@ -4200,7 +4197,7 @@ dfamust (struct dfa const *d) } done:; - dm = NULL; + struct dfamust *dm = NULL; if (*result) { dm = xmalloc (sizeof *dm); @@ -4230,11 +4227,11 @@ dfamustfree (struct dfamust *dm) struct dfa * dfaalloc (void) { - struct dfa *d = xcalloc (1, sizeof (struct dfa)); + struct dfa *d = xzalloc (sizeof *d); d->multibyte = MB_CUR_MAX > 1; d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb; d->fast = !d->multibyte; - d->lexstate.cur_mb_len = 1; + d->lex.cur_mb_len = 1; return d; } diff --git a/doc/ChangeLog b/doc/ChangeLog index e6d9bf05..efa2b561 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,5 +1,11 @@ 2016-08-25 Arnold D. Robbins <arnold@skeeve.com> + * gawktexi.in (POSIX String Comparison): Update for new + spec where == and != use strcmp, rest use strcoll. Thanks to + Chet Ramey for pointing me at the new rules. + +2016-08-25 Arnold D. Robbins <arnold@skeeve.com> + * 4.1.4: Release tar ball made. 2016-08-24 Arnold D. Robbins <arnold@skeeve.com> diff --git a/doc/gawk.info b/doc/gawk.info index 1766ab94..973af87c 100644 --- a/doc/gawk.info +++ b/doc/gawk.info @@ -8615,18 +8615,18 @@ Constant Regexps::, where this is discussed in more detail. File: gawk.info, Node: POSIX String Comparison, Prev: Comparison Operators, Up: Typing and Comparison -6.3.2.3 String Comparison with POSIX Rules -.......................................... +6.3.2.3 String Comparison Based on Locale Collating Order +......................................................... -The POSIX standard says that string comparison is performed based on the -locale's "collating order". This is the order in which characters sort, -as defined by the locale (for more discussion, *note Locales::). This -order is usually very different from the results obtained when doing -straight character-by-character comparison.(1) +The POSIX standard used to say that all string comparisons are performed +based on the locale's "collating order". This is the order in which +characters sort, as defined by the locale (for more discussion, *note +Locales::). This order is usually very different from the results +obtained when doing straight byte-by-byte comparison.(1) Because this behavior differs considerably from existing practice, -'gawk' only implements it when in POSIX mode (*note Options::). Here is -an example to illustrate the difference, in an 'en_US.UTF-8' locale: +'gawk' only implemented it when in POSIX mode (*note Options::). Here +is an example to illustrate the difference, in an 'en_US.UTF-8' locale: $ gawk 'BEGIN { printf("ABC < abc = %s\n", > ("ABC" < "abc" ? "TRUE" : "FALSE")) }' @@ -8635,11 +8635,28 @@ an example to illustrate the difference, in an 'en_US.UTF-8' locale: > ("ABC" < "abc" ? "TRUE" : "FALSE")) }' -| ABC < abc = FALSE + Fortunately, as of August 2016, comparison based on locale collating +order is no longer required for the '==' and '!=' operators.(2) +However, comparison based on locales is still required for '<', '<=', +'>', and '>='. POSIX thus recommends as follows: + + Since the '==' operator checks whether strings are identical, not + whether they collate equally, applications needing to check whether + strings collate equally can use: + + a <= b && a >= b + + As of version 4.2, 'gawk' continues to use locale collating order for +'<', '<=', '>', and '>=' only in POSIX mode. + ---------- Footnotes ---------- (1) Technically, string comparison is supposed to behave the same way as if the strings were compared with the C 'strcoll()' function. + (2) See the Austin Group website +(http://austingroupbugs.net/view.php?id=1070). + File: gawk.info, Node: Boolean Ops, Next: Conditional Exp, Prev: Typing and Comparison, Up: Truth Values and Conditions @@ -27603,7 +27620,7 @@ ranges, such that outside the '"C"' and '"POSIX"' locales, the meaning of range expressions was _undefined_.(3) By using this lovely technical term, the standard gives license to -implementors to implement ranges in whatever way they choose. The +implementers to implement ranges in whatever way they choose. The 'gawk' maintainer chose to apply the pre-POSIX meaning both with the default regexp matching and when '--traditional' or '--posix' are used. In all cases 'gawk' remains POSIX-compliant. @@ -35427,401 +35444,402 @@ Node: Variable Typing364779 Node: Comparison Operators368403 Ref: table-relational-ops368822 Node: POSIX String Comparison372317 -Ref: POSIX String Comparison-Footnote-1373391 -Node: Boolean Ops373530 -Ref: Boolean Ops-Footnote-1378012 -Node: Conditional Exp378104 -Node: Function Calls379840 -Node: Precedence383717 -Node: Locales387376 -Node: Expressions Summary389008 -Node: Patterns and Actions391581 -Node: Pattern Overview392701 -Node: Regexp Patterns394378 -Node: Expression Patterns394920 -Node: Ranges398701 -Node: BEGIN/END401809 -Node: Using BEGIN/END402570 -Ref: Using BEGIN/END-Footnote-1405306 -Node: I/O And BEGIN/END405412 -Node: BEGINFILE/ENDFILE407726 -Node: Empty410633 -Node: Using Shell Variables410950 -Node: Action Overview413224 -Node: Statements415549 -Node: If Statement417397 -Node: While Statement418892 -Node: Do Statement420920 -Node: For Statement422068 -Node: Switch Statement425226 -Node: Break Statement427612 -Node: Continue Statement429704 -Node: Next Statement431531 -Node: Nextfile Statement433914 -Node: Exit Statement436566 -Node: Built-in Variables438969 -Node: User-modified440102 -Node: Auto-set447688 -Ref: Auto-set-Footnote-1462341 -Ref: Auto-set-Footnote-2462547 -Node: ARGC and ARGV462603 -Node: Pattern Action Summary466816 -Node: Arrays469246 -Node: Array Basics470575 -Node: Array Intro471419 -Ref: figure-array-elements473394 -Ref: Array Intro-Footnote-1476098 -Node: Reference to Elements476226 -Node: Assigning Elements478690 -Node: Array Example479181 -Node: Scanning an Array480940 -Node: Controlling Scanning483962 -Ref: Controlling Scanning-Footnote-1489361 -Node: Numeric Array Subscripts489677 -Node: Uninitialized Subscripts491861 -Node: Delete493480 -Ref: Delete-Footnote-1496232 -Node: Multidimensional496289 -Node: Multiscanning499384 -Node: Arrays of Arrays500975 -Node: Arrays Summary505742 -Node: Functions507835 -Node: Built-in508873 -Node: Calling Built-in509954 -Node: Numeric Functions511950 -Ref: Numeric Functions-Footnote-1516783 -Ref: Numeric Functions-Footnote-2517140 -Ref: Numeric Functions-Footnote-3517188 -Node: String Functions517460 -Ref: String Functions-Footnote-1540964 -Ref: String Functions-Footnote-2541092 -Ref: String Functions-Footnote-3541340 -Node: Gory Details541427 -Ref: table-sub-escapes543218 -Ref: table-sub-proposed544737 -Ref: table-posix-sub546100 -Ref: table-gensub-escapes547641 -Ref: Gory Details-Footnote-1548464 -Node: I/O Functions548618 -Ref: table-system-return-values555200 -Ref: I/O Functions-Footnote-1557180 -Ref: I/O Functions-Footnote-2557328 -Node: Time Functions557448 -Ref: Time Functions-Footnote-1567953 -Ref: Time Functions-Footnote-2568021 -Ref: Time Functions-Footnote-3568179 -Ref: Time Functions-Footnote-4568290 -Ref: Time Functions-Footnote-5568402 -Ref: Time Functions-Footnote-6568629 -Node: Bitwise Functions568895 -Ref: table-bitwise-ops569489 -Ref: Bitwise Functions-Footnote-1573827 -Node: Type Functions574000 -Node: I18N Functions576532 -Node: User-defined578183 -Node: Definition Syntax578988 -Ref: Definition Syntax-Footnote-1584675 -Node: Function Example584746 -Ref: Function Example-Footnote-1587668 -Node: Function Caveats587690 -Node: Calling A Function588208 -Node: Variable Scope589166 -Node: Pass By Value/Reference592160 -Node: Return Statement595659 -Node: Dynamic Typing598638 -Node: Indirect Calls599568 -Ref: Indirect Calls-Footnote-1609819 -Node: Functions Summary609947 -Node: Library Functions612652 -Ref: Library Functions-Footnote-1616259 -Ref: Library Functions-Footnote-2616402 -Node: Library Names616573 -Ref: Library Names-Footnote-1620033 -Ref: Library Names-Footnote-2620256 -Node: General Functions620342 -Node: Strtonum Function621445 -Node: Assert Function624467 -Node: Round Function627793 -Node: Cliff Random Function629334 -Node: Ordinal Functions630350 -Ref: Ordinal Functions-Footnote-1633413 -Ref: Ordinal Functions-Footnote-2633665 -Node: Join Function633875 -Ref: Join Function-Footnote-1635645 -Node: Getlocaltime Function635845 -Node: Readfile Function639587 -Node: Shell Quoting641559 -Node: Data File Management642960 -Node: Filetrans Function643592 -Node: Rewind Function647688 -Node: File Checking649594 -Ref: File Checking-Footnote-1650928 -Node: Empty Files651129 -Node: Ignoring Assigns653108 -Node: Getopt Function654658 -Ref: Getopt Function-Footnote-1666127 -Node: Passwd Functions666327 -Ref: Passwd Functions-Footnote-1675166 -Node: Group Functions675254 -Ref: Group Functions-Footnote-1683151 -Node: Walking Arrays683358 -Node: Library Functions Summary686366 -Node: Library Exercises687772 -Node: Sample Programs688237 -Node: Running Examples689007 -Node: Clones689735 -Node: Cut Program690959 -Node: Egrep Program700888 -Ref: Egrep Program-Footnote-1708400 -Node: Id Program708510 -Node: Split Program712190 -Ref: Split Program-Footnote-1715649 -Node: Tee Program715778 -Node: Uniq Program718568 -Node: Wc Program725994 -Ref: Wc Program-Footnote-1730249 -Node: Miscellaneous Programs730343 -Node: Dupword Program731556 -Node: Alarm Program733586 -Node: Translate Program738441 -Ref: Translate Program-Footnote-1743006 -Node: Labels Program743276 -Ref: Labels Program-Footnote-1746627 -Node: Word Sorting746711 -Node: History Sorting750783 -Node: Extract Program752618 -Node: Simple Sed760147 -Node: Igawk Program763221 -Ref: Igawk Program-Footnote-1777552 -Ref: Igawk Program-Footnote-2777754 -Ref: Igawk Program-Footnote-3777876 -Node: Anagram Program777991 -Node: Signature Program781053 -Node: Programs Summary782300 -Node: Programs Exercises783514 -Ref: Programs Exercises-Footnote-1787643 -Node: Advanced Features787734 -Node: Nondecimal Data789724 -Node: Array Sorting791315 -Node: Controlling Array Traversal792015 -Ref: Controlling Array Traversal-Footnote-1800382 -Node: Array Sorting Functions800500 -Ref: Array Sorting Functions-Footnote-1805591 -Node: Two-way I/O805787 -Ref: Two-way I/O-Footnote-1812337 -Ref: Two-way I/O-Footnote-2812524 -Node: TCP/IP Networking812606 -Node: Profiling815724 -Ref: Profiling-Footnote-1824217 -Node: Advanced Features Summary824540 -Node: Internationalization826384 -Node: I18N and L10N827864 -Node: Explaining gettext828551 -Ref: Explaining gettext-Footnote-1834443 -Ref: Explaining gettext-Footnote-2834628 -Node: Programmer i18n834793 -Ref: Programmer i18n-Footnote-1839648 -Node: Translator i18n839697 -Node: String Extraction840491 -Ref: String Extraction-Footnote-1841623 -Node: Printf Ordering841709 -Ref: Printf Ordering-Footnote-1844495 -Node: I18N Portability844559 -Ref: I18N Portability-Footnote-1847015 -Node: I18N Example847078 -Ref: I18N Example-Footnote-1849884 -Node: Gawk I18N849957 -Node: I18N Summary850602 -Node: Debugger851943 -Node: Debugging852965 -Node: Debugging Concepts853406 -Node: Debugging Terms855215 -Node: Awk Debugging857790 -Node: Sample Debugging Session858696 -Node: Debugger Invocation859230 -Node: Finding The Bug860616 -Node: List of Debugger Commands867094 -Node: Breakpoint Control868427 -Node: Debugger Execution Control872121 -Node: Viewing And Changing Data875483 -Node: Execution Stack878857 -Node: Debugger Info880494 -Node: Miscellaneous Debugger Commands884565 -Node: Readline Support889653 -Node: Limitations890549 -Ref: Limitations-Footnote-1894780 -Node: Debugging Summary894831 -Node: Arbitrary Precision Arithmetic896110 -Node: Computer Arithmetic897526 -Ref: table-numeric-ranges901117 -Ref: Computer Arithmetic-Footnote-1901839 -Node: Math Definitions901896 -Ref: table-ieee-formats905210 -Ref: Math Definitions-Footnote-1905813 -Node: MPFR features905918 -Node: FP Math Caution907635 -Ref: FP Math Caution-Footnote-1908707 -Node: Inexactness of computations909076 -Node: Inexact representation910036 -Node: Comparing FP Values911396 -Node: Errors accumulate912478 -Node: Getting Accuracy913911 -Node: Try To Round916621 -Node: Setting precision917520 -Ref: table-predefined-precision-strings918217 -Node: Setting the rounding mode920047 -Ref: table-gawk-rounding-modes920421 -Ref: Setting the rounding mode-Footnote-1923829 -Node: Arbitrary Precision Integers924008 -Ref: Arbitrary Precision Integers-Footnote-1928925 -Node: POSIX Floating Point Problems929074 -Ref: POSIX Floating Point Problems-Footnote-1932956 -Node: Floating point summary932994 -Node: Dynamic Extensions935184 -Node: Extension Intro936737 -Node: Plugin License938003 -Node: Extension Mechanism Outline938800 -Ref: figure-load-extension939239 -Ref: figure-register-new-function940804 -Ref: figure-call-new-function941896 -Node: Extension API Description943958 -Node: Extension API Functions Introduction945490 -Node: General Data Types950349 -Ref: General Data Types-Footnote-1956304 -Node: Memory Allocation Functions956603 -Ref: Memory Allocation Functions-Footnote-1959448 -Node: Constructor Functions959547 -Node: Registration Functions961292 -Node: Extension Functions961977 -Node: Exit Callback Functions964600 -Node: Extension Version String965850 -Node: Input Parsers966513 -Node: Output Wrappers976395 -Node: Two-way processors980907 -Node: Printing Messages983172 -Ref: Printing Messages-Footnote-1984343 -Node: Updating ERRNO984496 -Node: Requesting Values985235 -Ref: table-value-types-returned985972 -Node: Accessing Parameters986855 -Node: Symbol Table Access988090 -Node: Symbol table by name988602 -Node: Symbol table by cookie990623 -Ref: Symbol table by cookie-Footnote-1994775 -Node: Cached values994839 -Ref: Cached values-Footnote-1998346 -Node: Array Manipulation998437 -Ref: Array Manipulation-Footnote-1999528 -Node: Array Data Types999565 -Ref: Array Data Types-Footnote-11002223 -Node: Array Functions1002315 -Node: Flattening Arrays1006173 -Node: Creating Arrays1013081 -Node: Redirection API1017850 -Node: Extension API Variables1020681 -Node: Extension Versioning1021314 -Ref: gawk-api-version1021751 -Node: Extension API Informational Variables1023507 -Node: Extension API Boilerplate1024571 -Node: Finding Extensions1028385 -Node: Extension Example1028944 -Node: Internal File Description1029742 -Node: Internal File Ops1033822 -Ref: Internal File Ops-Footnote-11045584 -Node: Using Internal File Ops1045724 -Ref: Using Internal File Ops-Footnote-11048107 -Node: Extension Samples1048381 -Node: Extension Sample File Functions1049910 -Node: Extension Sample Fnmatch1057559 -Node: Extension Sample Fork1059046 -Node: Extension Sample Inplace1060264 -Node: Extension Sample Ord1063474 -Node: Extension Sample Readdir1064310 -Ref: table-readdir-file-types1065199 -Node: Extension Sample Revout1066004 -Node: Extension Sample Rev2way1066593 -Node: Extension Sample Read write array1067333 -Node: Extension Sample Readfile1069275 -Node: Extension Sample Time1070370 -Node: Extension Sample API Tests1071718 -Node: gawkextlib1072210 -Node: Extension summary1074657 -Node: Extension Exercises1078359 -Node: Language History1079857 -Node: V7/SVR3.11081513 -Node: SVR41083665 -Node: POSIX1085099 -Node: BTL1086478 -Node: POSIX/GNU1087207 -Node: Feature History1093069 -Node: Common Extensions1107439 -Node: Ranges and Locales1108722 -Ref: Ranges and Locales-Footnote-11113338 -Ref: Ranges and Locales-Footnote-21113365 -Ref: Ranges and Locales-Footnote-31113600 -Node: Contributors1113821 -Node: History summary1119381 -Node: Installation1120761 -Node: Gawk Distribution1121705 -Node: Getting1122189 -Node: Extracting1123150 -Node: Distribution contents1124788 -Node: Unix Installation1130882 -Node: Quick Installation1131564 -Node: Shell Startup Files1133978 -Node: Additional Configuration Options1135056 -Node: Configuration Philosophy1136861 -Node: Non-Unix Installation1139230 -Node: PC Installation1139688 -Node: PC Binary Installation1141008 -Node: PC Compiling1142860 -Ref: PC Compiling-Footnote-11145654 -Node: PC Testing1145763 -Node: PC Using1146943 -Ref: PC Using-Footnote-11151096 -Node: Cygwin1151169 -Node: MSYS1151939 -Node: VMS Installation1152440 -Node: VMS Compilation1153231 -Ref: VMS Compilation-Footnote-11154460 -Node: VMS Dynamic Extensions1154518 -Node: VMS Installation Details1156203 -Node: VMS Running1158456 -Node: VMS GNV1162735 -Node: VMS Old Gawk1163470 -Node: Bugs1163941 -Node: Other Versions1168256 -Node: Installation summary1174840 -Node: Notes1175891 -Node: Compatibility Mode1176756 -Node: Additions1177538 -Node: Accessing The Source1178463 -Node: Adding Code1179898 -Node: New Ports1186117 -Node: Derived Files1190605 -Ref: Derived Files-Footnote-11196090 -Ref: Derived Files-Footnote-21196125 -Ref: Derived Files-Footnote-31196723 -Node: Future Extensions1196837 -Node: Implementation Limitations1197495 -Node: Extension Design1198678 -Node: Old Extension Problems1199832 -Ref: Old Extension Problems-Footnote-11201350 -Node: Extension New Mechanism Goals1201407 -Ref: Extension New Mechanism Goals-Footnote-11204771 -Node: Extension Other Design Decisions1204960 -Node: Extension Future Growth1207073 -Node: Old Extension Mechanism1207909 -Node: Notes summary1209672 -Node: Basic Concepts1210854 -Node: Basic High Level1211535 -Ref: figure-general-flow1211817 -Ref: figure-process-flow1212502 -Ref: Basic High Level-Footnote-11215803 -Node: Basic Data Typing1215988 -Node: Glossary1219316 -Node: Copying1251263 -Node: GNU Free Documentation License1288802 -Node: Index1313920 +Ref: POSIX String Comparison-Footnote-1374012 +Ref: POSIX String Comparison-Footnote-2374151 +Node: Boolean Ops374235 +Ref: Boolean Ops-Footnote-1378717 +Node: Conditional Exp378809 +Node: Function Calls380545 +Node: Precedence384422 +Node: Locales388081 +Node: Expressions Summary389713 +Node: Patterns and Actions392286 +Node: Pattern Overview393406 +Node: Regexp Patterns395083 +Node: Expression Patterns395625 +Node: Ranges399406 +Node: BEGIN/END402514 +Node: Using BEGIN/END403275 +Ref: Using BEGIN/END-Footnote-1406011 +Node: I/O And BEGIN/END406117 +Node: BEGINFILE/ENDFILE408431 +Node: Empty411338 +Node: Using Shell Variables411655 +Node: Action Overview413929 +Node: Statements416254 +Node: If Statement418102 +Node: While Statement419597 +Node: Do Statement421625 +Node: For Statement422773 +Node: Switch Statement425931 +Node: Break Statement428317 +Node: Continue Statement430409 +Node: Next Statement432236 +Node: Nextfile Statement434619 +Node: Exit Statement437271 +Node: Built-in Variables439674 +Node: User-modified440807 +Node: Auto-set448393 +Ref: Auto-set-Footnote-1463046 +Ref: Auto-set-Footnote-2463252 +Node: ARGC and ARGV463308 +Node: Pattern Action Summary467521 +Node: Arrays469951 +Node: Array Basics471280 +Node: Array Intro472124 +Ref: figure-array-elements474099 +Ref: Array Intro-Footnote-1476803 +Node: Reference to Elements476931 +Node: Assigning Elements479395 +Node: Array Example479886 +Node: Scanning an Array481645 +Node: Controlling Scanning484667 +Ref: Controlling Scanning-Footnote-1490066 +Node: Numeric Array Subscripts490382 +Node: Uninitialized Subscripts492566 +Node: Delete494185 +Ref: Delete-Footnote-1496937 +Node: Multidimensional496994 +Node: Multiscanning500089 +Node: Arrays of Arrays501680 +Node: Arrays Summary506447 +Node: Functions508540 +Node: Built-in509578 +Node: Calling Built-in510659 +Node: Numeric Functions512655 +Ref: Numeric Functions-Footnote-1517488 +Ref: Numeric Functions-Footnote-2517845 +Ref: Numeric Functions-Footnote-3517893 +Node: String Functions518165 +Ref: String Functions-Footnote-1541669 +Ref: String Functions-Footnote-2541797 +Ref: String Functions-Footnote-3542045 +Node: Gory Details542132 +Ref: table-sub-escapes543923 +Ref: table-sub-proposed545442 +Ref: table-posix-sub546805 +Ref: table-gensub-escapes548346 +Ref: Gory Details-Footnote-1549169 +Node: I/O Functions549323 +Ref: table-system-return-values555905 +Ref: I/O Functions-Footnote-1557885 +Ref: I/O Functions-Footnote-2558033 +Node: Time Functions558153 +Ref: Time Functions-Footnote-1568658 +Ref: Time Functions-Footnote-2568726 +Ref: Time Functions-Footnote-3568884 +Ref: Time Functions-Footnote-4568995 +Ref: Time Functions-Footnote-5569107 +Ref: Time Functions-Footnote-6569334 +Node: Bitwise Functions569600 +Ref: table-bitwise-ops570194 +Ref: Bitwise Functions-Footnote-1574532 +Node: Type Functions574705 +Node: I18N Functions577237 +Node: User-defined578888 +Node: Definition Syntax579693 +Ref: Definition Syntax-Footnote-1585380 +Node: Function Example585451 +Ref: Function Example-Footnote-1588373 +Node: Function Caveats588395 +Node: Calling A Function588913 +Node: Variable Scope589871 +Node: Pass By Value/Reference592865 +Node: Return Statement596364 +Node: Dynamic Typing599343 +Node: Indirect Calls600273 +Ref: Indirect Calls-Footnote-1610524 +Node: Functions Summary610652 +Node: Library Functions613357 +Ref: Library Functions-Footnote-1616964 +Ref: Library Functions-Footnote-2617107 +Node: Library Names617278 +Ref: Library Names-Footnote-1620738 +Ref: Library Names-Footnote-2620961 +Node: General Functions621047 +Node: Strtonum Function622150 +Node: Assert Function625172 +Node: Round Function628498 +Node: Cliff Random Function630039 +Node: Ordinal Functions631055 +Ref: Ordinal Functions-Footnote-1634118 +Ref: Ordinal Functions-Footnote-2634370 +Node: Join Function634580 +Ref: Join Function-Footnote-1636350 +Node: Getlocaltime Function636550 +Node: Readfile Function640292 +Node: Shell Quoting642264 +Node: Data File Management643665 +Node: Filetrans Function644297 +Node: Rewind Function648393 +Node: File Checking650299 +Ref: File Checking-Footnote-1651633 +Node: Empty Files651834 +Node: Ignoring Assigns653813 +Node: Getopt Function655363 +Ref: Getopt Function-Footnote-1666832 +Node: Passwd Functions667032 +Ref: Passwd Functions-Footnote-1675871 +Node: Group Functions675959 +Ref: Group Functions-Footnote-1683856 +Node: Walking Arrays684063 +Node: Library Functions Summary687071 +Node: Library Exercises688477 +Node: Sample Programs688942 +Node: Running Examples689712 +Node: Clones690440 +Node: Cut Program691664 +Node: Egrep Program701593 +Ref: Egrep Program-Footnote-1709105 +Node: Id Program709215 +Node: Split Program712895 +Ref: Split Program-Footnote-1716354 +Node: Tee Program716483 +Node: Uniq Program719273 +Node: Wc Program726699 +Ref: Wc Program-Footnote-1730954 +Node: Miscellaneous Programs731048 +Node: Dupword Program732261 +Node: Alarm Program734291 +Node: Translate Program739146 +Ref: Translate Program-Footnote-1743711 +Node: Labels Program743981 +Ref: Labels Program-Footnote-1747332 +Node: Word Sorting747416 +Node: History Sorting751488 +Node: Extract Program753323 +Node: Simple Sed760852 +Node: Igawk Program763926 +Ref: Igawk Program-Footnote-1778257 +Ref: Igawk Program-Footnote-2778459 +Ref: Igawk Program-Footnote-3778581 +Node: Anagram Program778696 +Node: Signature Program781758 +Node: Programs Summary783005 +Node: Programs Exercises784219 +Ref: Programs Exercises-Footnote-1788348 +Node: Advanced Features788439 +Node: Nondecimal Data790429 +Node: Array Sorting792020 +Node: Controlling Array Traversal792720 +Ref: Controlling Array Traversal-Footnote-1801087 +Node: Array Sorting Functions801205 +Ref: Array Sorting Functions-Footnote-1806296 +Node: Two-way I/O806492 +Ref: Two-way I/O-Footnote-1813042 +Ref: Two-way I/O-Footnote-2813229 +Node: TCP/IP Networking813311 +Node: Profiling816429 +Ref: Profiling-Footnote-1824922 +Node: Advanced Features Summary825245 +Node: Internationalization827089 +Node: I18N and L10N828569 +Node: Explaining gettext829256 +Ref: Explaining gettext-Footnote-1835148 +Ref: Explaining gettext-Footnote-2835333 +Node: Programmer i18n835498 +Ref: Programmer i18n-Footnote-1840353 +Node: Translator i18n840402 +Node: String Extraction841196 +Ref: String Extraction-Footnote-1842328 +Node: Printf Ordering842414 +Ref: Printf Ordering-Footnote-1845200 +Node: I18N Portability845264 +Ref: I18N Portability-Footnote-1847720 +Node: I18N Example847783 +Ref: I18N Example-Footnote-1850589 +Node: Gawk I18N850662 +Node: I18N Summary851307 +Node: Debugger852648 +Node: Debugging853670 +Node: Debugging Concepts854111 +Node: Debugging Terms855920 +Node: Awk Debugging858495 +Node: Sample Debugging Session859401 +Node: Debugger Invocation859935 +Node: Finding The Bug861321 +Node: List of Debugger Commands867799 +Node: Breakpoint Control869132 +Node: Debugger Execution Control872826 +Node: Viewing And Changing Data876188 +Node: Execution Stack879562 +Node: Debugger Info881199 +Node: Miscellaneous Debugger Commands885270 +Node: Readline Support890358 +Node: Limitations891254 +Ref: Limitations-Footnote-1895485 +Node: Debugging Summary895536 +Node: Arbitrary Precision Arithmetic896815 +Node: Computer Arithmetic898231 +Ref: table-numeric-ranges901822 +Ref: Computer Arithmetic-Footnote-1902544 +Node: Math Definitions902601 +Ref: table-ieee-formats905915 +Ref: Math Definitions-Footnote-1906518 +Node: MPFR features906623 +Node: FP Math Caution908340 +Ref: FP Math Caution-Footnote-1909412 +Node: Inexactness of computations909781 +Node: Inexact representation910741 +Node: Comparing FP Values912101 +Node: Errors accumulate913183 +Node: Getting Accuracy914616 +Node: Try To Round917326 +Node: Setting precision918225 +Ref: table-predefined-precision-strings918922 +Node: Setting the rounding mode920752 +Ref: table-gawk-rounding-modes921126 +Ref: Setting the rounding mode-Footnote-1924534 +Node: Arbitrary Precision Integers924713 +Ref: Arbitrary Precision Integers-Footnote-1929630 +Node: POSIX Floating Point Problems929779 +Ref: POSIX Floating Point Problems-Footnote-1933661 +Node: Floating point summary933699 +Node: Dynamic Extensions935889 +Node: Extension Intro937442 +Node: Plugin License938708 +Node: Extension Mechanism Outline939505 +Ref: figure-load-extension939944 +Ref: figure-register-new-function941509 +Ref: figure-call-new-function942601 +Node: Extension API Description944663 +Node: Extension API Functions Introduction946195 +Node: General Data Types951054 +Ref: General Data Types-Footnote-1957009 +Node: Memory Allocation Functions957308 +Ref: Memory Allocation Functions-Footnote-1960153 +Node: Constructor Functions960252 +Node: Registration Functions961997 +Node: Extension Functions962682 +Node: Exit Callback Functions965305 +Node: Extension Version String966555 +Node: Input Parsers967218 +Node: Output Wrappers977100 +Node: Two-way processors981612 +Node: Printing Messages983877 +Ref: Printing Messages-Footnote-1985048 +Node: Updating ERRNO985201 +Node: Requesting Values985940 +Ref: table-value-types-returned986677 +Node: Accessing Parameters987560 +Node: Symbol Table Access988795 +Node: Symbol table by name989307 +Node: Symbol table by cookie991328 +Ref: Symbol table by cookie-Footnote-1995480 +Node: Cached values995544 +Ref: Cached values-Footnote-1999051 +Node: Array Manipulation999142 +Ref: Array Manipulation-Footnote-11000233 +Node: Array Data Types1000270 +Ref: Array Data Types-Footnote-11002928 +Node: Array Functions1003020 +Node: Flattening Arrays1006878 +Node: Creating Arrays1013786 +Node: Redirection API1018555 +Node: Extension API Variables1021386 +Node: Extension Versioning1022019 +Ref: gawk-api-version1022456 +Node: Extension API Informational Variables1024212 +Node: Extension API Boilerplate1025276 +Node: Finding Extensions1029090 +Node: Extension Example1029649 +Node: Internal File Description1030447 +Node: Internal File Ops1034527 +Ref: Internal File Ops-Footnote-11046289 +Node: Using Internal File Ops1046429 +Ref: Using Internal File Ops-Footnote-11048812 +Node: Extension Samples1049086 +Node: Extension Sample File Functions1050615 +Node: Extension Sample Fnmatch1058264 +Node: Extension Sample Fork1059751 +Node: Extension Sample Inplace1060969 +Node: Extension Sample Ord1064179 +Node: Extension Sample Readdir1065015 +Ref: table-readdir-file-types1065904 +Node: Extension Sample Revout1066709 +Node: Extension Sample Rev2way1067298 +Node: Extension Sample Read write array1068038 +Node: Extension Sample Readfile1069980 +Node: Extension Sample Time1071075 +Node: Extension Sample API Tests1072423 +Node: gawkextlib1072915 +Node: Extension summary1075362 +Node: Extension Exercises1079064 +Node: Language History1080562 +Node: V7/SVR3.11082218 +Node: SVR41084370 +Node: POSIX1085804 +Node: BTL1087183 +Node: POSIX/GNU1087912 +Node: Feature History1093774 +Node: Common Extensions1108144 +Node: Ranges and Locales1109427 +Ref: Ranges and Locales-Footnote-11114043 +Ref: Ranges and Locales-Footnote-21114070 +Ref: Ranges and Locales-Footnote-31114305 +Node: Contributors1114526 +Node: History summary1120086 +Node: Installation1121466 +Node: Gawk Distribution1122410 +Node: Getting1122894 +Node: Extracting1123855 +Node: Distribution contents1125493 +Node: Unix Installation1131587 +Node: Quick Installation1132269 +Node: Shell Startup Files1134683 +Node: Additional Configuration Options1135761 +Node: Configuration Philosophy1137566 +Node: Non-Unix Installation1139935 +Node: PC Installation1140393 +Node: PC Binary Installation1141713 +Node: PC Compiling1143565 +Ref: PC Compiling-Footnote-11146359 +Node: PC Testing1146468 +Node: PC Using1147648 +Ref: PC Using-Footnote-11151801 +Node: Cygwin1151874 +Node: MSYS1152644 +Node: VMS Installation1153145 +Node: VMS Compilation1153936 +Ref: VMS Compilation-Footnote-11155165 +Node: VMS Dynamic Extensions1155223 +Node: VMS Installation Details1156908 +Node: VMS Running1159161 +Node: VMS GNV1163440 +Node: VMS Old Gawk1164175 +Node: Bugs1164646 +Node: Other Versions1168961 +Node: Installation summary1175545 +Node: Notes1176596 +Node: Compatibility Mode1177461 +Node: Additions1178243 +Node: Accessing The Source1179168 +Node: Adding Code1180603 +Node: New Ports1186822 +Node: Derived Files1191310 +Ref: Derived Files-Footnote-11196795 +Ref: Derived Files-Footnote-21196830 +Ref: Derived Files-Footnote-31197428 +Node: Future Extensions1197542 +Node: Implementation Limitations1198200 +Node: Extension Design1199383 +Node: Old Extension Problems1200537 +Ref: Old Extension Problems-Footnote-11202055 +Node: Extension New Mechanism Goals1202112 +Ref: Extension New Mechanism Goals-Footnote-11205476 +Node: Extension Other Design Decisions1205665 +Node: Extension Future Growth1207778 +Node: Old Extension Mechanism1208614 +Node: Notes summary1210377 +Node: Basic Concepts1211559 +Node: Basic High Level1212240 +Ref: figure-general-flow1212522 +Ref: figure-process-flow1213207 +Ref: Basic High Level-Footnote-11216508 +Node: Basic Data Typing1216693 +Node: Glossary1220021 +Node: Copying1251968 +Node: GNU Free Documentation License1289507 +Node: Index1314625 End Tag Table diff --git a/doc/gawk.texi b/doc/gawk.texi index a4b61895..90f6dcfc 100644 --- a/doc/gawk.texi +++ b/doc/gawk.texi @@ -12577,19 +12577,19 @@ One special place where @code{/foo/} is @emph{not} an abbreviation for where this is discussed in more detail. @node POSIX String Comparison -@subsubsection String Comparison with POSIX Rules +@subsubsection String Comparison Based on Locale Collating Order -The POSIX standard says that string comparison is performed based -on the locale's @dfn{collating order}. This is the order in which -characters sort, as defined by the locale (for more discussion, -@pxref{Locales}). This order is usually very different -from the results obtained when doing straight character-by-character -comparison.@footnote{Technically, string comparison is supposed -to behave the same way as if the strings were compared with the C -@code{strcoll()} function.} +The POSIX standard used to say that all string comparisons are +performed based on the locale's @dfn{collating order}. This +is the order in which characters sort, as defined by the locale +(for more discussion, @pxref{Locales}). This order is usually very +different from the results obtained when doing straight byte-by-byte +comparison.@footnote{Technically, string comparison is supposed to behave +the same way as if the strings were compared with the C @code{strcoll()} +function.} Because this behavior differs considerably from existing practice, -@command{gawk} only implements it when in POSIX mode (@pxref{Options}). +@command{gawk} only implemented it when in POSIX mode (@pxref{Options}). Here is an example to illustrate the difference, in an @code{en_US.UTF-8} locale: @@ -12602,6 +12602,26 @@ $ @kbd{gawk --posix 'BEGIN @{ printf("ABC < abc = %s\n",} @print{} ABC < abc = FALSE @end example +Fortunately, as of August 2016, comparison based on locale +collating order is no longer required for the @code{==} and @code{!=} +operators.@footnote{See @uref{http://austingroupbugs.net/view.php?id=1070, +the Austin Group website}.} However, comparison based on locales is still +required for @code{<}, @code{<=}, @code{>}, and @code{>=}. POSIX thus +recommends as follows: + +@quotation +Since the @code{==} operator checks whether strings are identical, +not whether they collate equally, applications needing to check whether +strings collate equally can use: + +@example +a <= b && a >= b +@end example +@end quotation + +As of @value{PVERSION} 4.2, @command{gawk} continues to use locale +collating order for @code{<}, @code{<=}, @code{>}, and @code{>=} only +in POSIX mode. @node Boolean Ops @subsection Boolean Expressions @@ -37385,7 +37405,7 @@ and @uref{http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap09.html#tag_21_09_03_05, its rationale}.} By using this lovely technical term, the standard gives license -to implementors to implement ranges in whatever way they choose. +to implementers to implement ranges in whatever way they choose. The @command{gawk} maintainer chose to apply the pre-POSIX meaning both with the default regexp matching and when @option{--traditional} or @option{--posix} are used. diff --git a/doc/gawktexi.in b/doc/gawktexi.in index 9c2864cd..782884bb 100644 --- a/doc/gawktexi.in +++ b/doc/gawktexi.in @@ -11896,19 +11896,19 @@ One special place where @code{/foo/} is @emph{not} an abbreviation for where this is discussed in more detail. @node POSIX String Comparison -@subsubsection String Comparison with POSIX Rules +@subsubsection String Comparison Based on Locale Collating Order -The POSIX standard says that string comparison is performed based -on the locale's @dfn{collating order}. This is the order in which -characters sort, as defined by the locale (for more discussion, -@pxref{Locales}). This order is usually very different -from the results obtained when doing straight character-by-character -comparison.@footnote{Technically, string comparison is supposed -to behave the same way as if the strings were compared with the C -@code{strcoll()} function.} +The POSIX standard used to say that all string comparisons are +performed based on the locale's @dfn{collating order}. This +is the order in which characters sort, as defined by the locale +(for more discussion, @pxref{Locales}). This order is usually very +different from the results obtained when doing straight byte-by-byte +comparison.@footnote{Technically, string comparison is supposed to behave +the same way as if the strings were compared with the C @code{strcoll()} +function.} Because this behavior differs considerably from existing practice, -@command{gawk} only implements it when in POSIX mode (@pxref{Options}). +@command{gawk} only implemented it when in POSIX mode (@pxref{Options}). Here is an example to illustrate the difference, in an @code{en_US.UTF-8} locale: @@ -11921,6 +11921,26 @@ $ @kbd{gawk --posix 'BEGIN @{ printf("ABC < abc = %s\n",} @print{} ABC < abc = FALSE @end example +Fortunately, as of August 2016, comparison based on locale +collating order is no longer required for the @code{==} and @code{!=} +operators.@footnote{See @uref{http://austingroupbugs.net/view.php?id=1070, +the Austin Group website}.} However, comparison based on locales is still +required for @code{<}, @code{<=}, @code{>}, and @code{>=}. POSIX thus +recommends as follows: + +@quotation +Since the @code{==} operator checks whether strings are identical, +not whether they collate equally, applications needing to check whether +strings collate equally can use: + +@example +a <= b && a >= b +@end example +@end quotation + +As of @value{PVERSION} 4.2, @command{gawk} continues to use locale +collating order for @code{<}, @code{<=}, @code{>}, and @code{>=} only +in POSIX mode. @node Boolean Ops @subsection Boolean Expressions @@ -36467,7 +36487,7 @@ and @uref{http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap09.html#tag_21_09_03_05, its rationale}.} By using this lovely technical term, the standard gives license -to implementors to implement ranges in whatever way they choose. +to implementers to implement ranges in whatever way they choose. The @command{gawk} maintainer chose to apply the pre-POSIX meaning both with the default regexp matching and when @option{--traditional} or @option{--posix} are used. @@ -573,7 +573,7 @@ posix_compare(NODE *s1, NODE *s2) /* cmp_nodes --- compare two nodes, returning negative, 0, positive */ int -cmp_nodes(NODE *t1, NODE *t2) +cmp_nodes(NODE *t1, NODE *t2, bool use_strcmp) { int ret = 0; size_t len1, len2; @@ -596,7 +596,7 @@ cmp_nodes(NODE *t1, NODE *t2) if (len1 == 0 || len2 == 0) return ldiff; - if (do_posix) + if (do_posix && ! use_strcmp) return posix_compare(t1, t2); l = (ldiff <= 0 ? len1 : len2); @@ -882,7 +882,7 @@ fmt_index(NODE *n) emalloc(fmt_list, NODE **, fmt_num*sizeof(*fmt_list), "fmt_index"); n = force_string(n); while (ix < fmt_hiwater) { - if (cmp_nodes(fmt_list[ix], n) == 0) + if (cmp_nodes(fmt_list[ix], n, true) == 0) return ix; ix++; } @@ -1502,10 +1502,15 @@ eval_condition(NODE *t) return boolval(t); } +typedef enum { + SCALAR_EQ_NEQ, + SCALAR_RELATIONAL +} scalar_cmp_t; + /* cmp_scalars -- compare two nodes on the stack */ static inline int -cmp_scalars() +cmp_scalars(scalar_cmp_t comparison_type) { NODE *t1, *t2; int di; @@ -1516,7 +1521,7 @@ cmp_scalars() DEREF(t2); fatal(_("attempt to use array `%s' in a scalar context"), array_vname(t1)); } - di = cmp_nodes(t1, t2); + di = cmp_nodes(t1, t2, comparison_type == SCALAR_EQ_NEQ); DEREF(t1); DEREF(t2); return di; diff --git a/extension/configure.ac b/extension/configure.ac index b723a3c1..b5b27d03 100644 --- a/extension/configure.ac +++ b/extension/configure.ac @@ -23,7 +23,7 @@ dnl dnl Process this file with autoconf to produce a configure script. -AC_INIT([GNU Awk Bundled Extensions],[4.1.3],[bug-gawk@gnu.org],[gawk-extensions]) +AC_INIT([GNU Awk Bundled Extensions],[4.1.4],[bug-gawk@gnu.org],[gawk-extensions]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) diff --git a/interpret.h b/interpret.h index 2f85049a..46c44cdb 100644 --- a/interpret.h +++ b/interpret.h @@ -446,37 +446,37 @@ uninitialized_scalar: break; case Op_equal: - r = node_Boolean[cmp_scalars() == 0]; + r = node_Boolean[cmp_scalars(SCALAR_EQ_NEQ) == 0]; UPREF(r); REPLACE(r); break; case Op_notequal: - r = node_Boolean[cmp_scalars() != 0]; + r = node_Boolean[cmp_scalars(SCALAR_EQ_NEQ) != 0]; UPREF(r); REPLACE(r); break; case Op_less: - r = node_Boolean[cmp_scalars() < 0]; + r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) < 0]; UPREF(r); REPLACE(r); break; case Op_greater: - r = node_Boolean[cmp_scalars() > 0]; + r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) > 0]; UPREF(r); REPLACE(r); break; case Op_leq: - r = node_Boolean[cmp_scalars() <= 0]; + r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) <= 0]; UPREF(r); REPLACE(r); break; case Op_geq: - r = node_Boolean[cmp_scalars() >= 0]; + r = node_Boolean[cmp_scalars(SCALAR_RELATIONAL) >= 0]; UPREF(r); REPLACE(r); break; @@ -834,12 +834,11 @@ mod: t2 = TOP_SCALAR(); /* switch expression */ t2 = force_string(t2); rp = re_update(m); - di = (research(rp, t2->stptr, 0, t2->stlen, - avoid_dfa(m, t2->stptr, t2->stlen)) >= 0); + di = (research(rp, t2->stptr, 0, t2->stlen, RE_NO_FLAGS) >= 0); } else { t1 = POP_SCALAR(); /* case value */ t2 = TOP_SCALAR(); /* switch expression */ - di = (cmp_nodes(t2, t1) == 0); + di = (cmp_nodes(t2, t1, true) == 0); DEREF(t1); } @@ -999,20 +998,7 @@ arrayfor: t1 = *get_field(0, (Func_ptr *) 0); match_re: rp = re_update(m); - /* - * Any place where research() is called with a last parameter of - * zero, we need to use the avoid_dfa test. This appears here and - * in the code for Op_K_case. - * - * A new or improved dfa that distinguishes beginning/end of - * string from beginning/end of line will allow us to get rid of - * this hack. - * - * The avoid_dfa() function is in re.c; it is not very smart. - */ - - di = research(rp, t1->stptr, 0, t1->stlen, - avoid_dfa(m, t1->stptr, t1->stlen)); + di = research(rp, t1->stptr, 0, t1->stlen, RE_NO_FLAGS); di = (di == -1) ^ (op != Op_nomatch); if (op != Op_match_rec) { decr_sp(); @@ -170,7 +170,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) emalloc(rp, Regexp *, sizeof(*rp), "make_regexp"); memset((char *) rp, 0, sizeof(*rp)); - rp->dfareg = NULL; rp->pat.allocated = 0; /* regex will allocate the buffer */ emalloc(rp->pat.fastmap, char *, 256, "make_regexp"); @@ -223,12 +222,11 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) /* gack. this must be done *after* re_compile_pattern */ rp->pat.newline_anchor = false; /* don't get \n in middle of string */ if (dfa && ! no_dfa) { - rp->dfa = true; rp->dfareg = dfaalloc(); dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n'); dfacomp(buf, len, rp->dfareg, true); } else - rp->dfa = false; + rp->dfareg = NULL; rp->has_anchor = has_anchor; /* Additional flags that help with RS as regexp. */ @@ -278,26 +276,25 @@ research(Regexp *rp, char *str, int start, * starts in the middle of a string, so don't bother trying it * in that case. */ - if (rp->dfa && ! no_bol && start == 0) { - char save; - size_t count = 0; + if (rp->dfareg != NULL && ! no_bol && start == 0) { struct dfa *superset = dfasuperset(rp->dfareg); - /* - * dfa likes to stick a '\n' right after the matched - * text. So we just save and restore the character. - */ - save = str[start+len]; if (superset) ret = dfaexec(superset, str+start, str+start+len, true, NULL, NULL); - if (ret) + + if (ret && ((! need_start && ! rp->has_anchor) + || (! superset && dfaisfast(rp->dfareg)))) ret = dfaexec(rp->dfareg, str+start, str+start+len, - true, &count, &try_backref); - str[start+len] = save; + true, NULL, &try_backref); } if (ret) { - if (need_start || rp->dfa == false || try_backref) { + if ( rp->dfareg == NULL + || start != 0 + || no_bol + || need_start + || rp->has_anchor + || try_backref) { /* * Passing NULL as last arg speeds up search for cases * where we don't need the start/end info. @@ -326,7 +323,7 @@ refree(Regexp *rp) free(rp->regs.start); if (rp->regs.end) free(rp->regs.end); - if (rp->dfa) { + if (rp->dfareg != NULL) { dfafree(rp->dfareg); free(rp->dfareg); } @@ -359,7 +356,7 @@ re_update(NODE *t) t1 = t->re_exp; if (t->re_text != NULL) { /* if contents haven't changed, just return it */ - if (cmp_nodes(t->re_text, t1) == 0) + if (cmp_nodes(t->re_text, t1, true) == 0) return t->re_reg; /* things changed, fall through to recompile */ unref(t->re_text); @@ -425,32 +422,6 @@ resetup() dfa_init(); } -/* avoid_dfa --- return true if we should not use the DFA matcher */ - -int -avoid_dfa(NODE *re, char *str, size_t len) -{ - char *end; - - /* - * f = @/.../ - * if ("foo" ~ f) ... - * - * This creates a Node_dynregex with NULL re_reg. - */ - if (re->re_reg == NULL) - return false; - - if (! re->re_reg->has_anchor) - return false; - - for (end = str + len; str < end; str++) - if (*str == '\n') - return true; - - return false; -} - /* reisstring --- return true if the RE match is a simple string match */ int |