diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2015-08-25 20:30:42 +0300 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2015-08-25 20:30:42 +0300 |
commit | cd2504da556b8bb61b56a34727ef4ce5195ff9a0 (patch) | |
tree | d3c2831436c33756735214b942a8981b6da2ba5e | |
parent | 76406567b762f58129d49b5a9ec4e22cf15499f2 (diff) | |
parent | 243b097279a89d456fda4a400412482d70b3665c (diff) | |
download | egawk-cd2504da556b8bb61b56a34727ef4ce5195ff9a0.tar.gz egawk-cd2504da556b8bb61b56a34727ef4ce5195ff9a0.tar.bz2 egawk-cd2504da556b8bb61b56a34727ef4ce5195ff9a0.zip |
Merge branch 'gawk-4.1-stable' into feature/zOS-try2
-rw-r--r-- | ChangeLog | 18 | ||||
-rw-r--r-- | awk.h | 31 | ||||
-rw-r--r-- | mpfr.c | 4 | ||||
-rw-r--r-- | node.c | 42 | ||||
-rw-r--r-- | test/ChangeLog | 6 | ||||
-rw-r--r-- | test/Makefile.am | 11 | ||||
-rw-r--r-- | test/Makefile.in | 11 | ||||
-rw-r--r-- | test/mbstr1.ok | 1 | ||||
-rw-r--r-- | test/mbstr2.awk | 1 | ||||
-rw-r--r-- | test/mbstr2.in | 4 | ||||
-rw-r--r-- | test/mbstr2.ok | 5 |
11 files changed, 117 insertions, 17 deletions
@@ -1,3 +1,21 @@ +2015-08-25 Arnold D. Robbins <arnold@skeeve.com> + + * node.c (str2wstr): Upon finding an invalid character, if + using UTF-8, use the replacement character instead of skipping + it. Helps match() and other functions work better in the face + of unexpected data. Make the lint warning an unconditional + warning. + + Unrelated: + + * awk.h: Add explanatory comment on the flags related to + types and values. + * mpfr.c (mpg_force_number): If setting NUMBER, clear STRING also + when clearing MAYBE_NUM. + (set_PREC): Check STRCUR instead of STRING. + * node.c (r_force_number): If setting NUMBER, clear STRING also + when clearing MAYBE_NUM. + 2015-08-15 Arnold D. Robbins <arnold@skeeve.com> * dfa.c (dfamust): Restore c90 compat by moving some @@ -395,6 +395,37 @@ typedef struct exp_node { # define MALLOC 0x0001 /* can be free'd */ /* type = Node_val */ + /* + * STRING and NUMBER are mutually exclusive. They represent the + * type of a value as assigned. + * + * STRCUR and NUMCUR are not mutually exclusive. They represent that + * the particular type of value is up to date. For example, + * + * a = 5 # NUMBER | NUMCUR + * b = a "" # Adds STRCUR to a, since a string value + * # is now available. But the type hasn't changed! + * + * a = "42" # STRING | STRCUR + * b = a + 0 # Adds NUMCUR to a, since numeric value + * # is now available. But the type hasn't changed! + * + * MAYBE_NUM is the joker. It means "this is string data, but + * the user may have really wanted it to be a number. If we have + * to guess, like in a comparison, turn it into a number." + * For example, gawk -v a=42 .... + * Here, `a' gets STRING|STRCUR|MAYBE_NUM and then when used where + * a number is needed, it gets turned into a NUMBER and STRING + * is cleared. + * + * WSTRCUR is for efficiency. If in a multibyte locale, and we + * need to do something character based (substr, length, etc.) + * we create the corresponding wide character string and store it, + * and add WSTRCUR to the flags so that we don't have to do the + * conversion more than once. + * + * We hope that the rest of the flags are self-explanatory. :-) + */ # define STRING 0x0002 /* assigned as string */ # define STRCUR 0x0004 /* string value is current */ # define NUMCUR 0x0008 /* numeric value is current */ @@ -347,7 +347,7 @@ mpg_force_number(NODE *n) return n; if ((n->flags & MAYBE_NUM) != 0) { - n->flags &= ~MAYBE_NUM; + n->flags &= ~(MAYBE_NUM|STRING); newflags = NUMBER; } @@ -525,7 +525,7 @@ set_PREC() if ((val->flags & MAYBE_NUM) != 0) force_number(val); - if ((val->flags & (STRING|NUMBER)) == STRING) { + if ((val->flags & STRCUR) != 0) { int i, j; /* emulate IEEE-754 binary format */ @@ -76,7 +76,7 @@ r_force_number(NODE *n) return n; } else if (n->stlen == 4 && is_ieee_magic_val(n->stptr)) { if ((n->flags & MAYBE_NUM) != 0) - n->flags &= ~MAYBE_NUM; + n->flags &= ~(MAYBE_NUM|STRING); n->flags |= NUMBER|NUMCUR; n->numbr = get_ieee_magic_val(n->stptr); @@ -103,7 +103,7 @@ r_force_number(NODE *n) if ((n->flags & MAYBE_NUM) != 0) { newflags = NUMBER; - n->flags &= ~MAYBE_NUM; + n->flags &= ~(MAYBE_NUM|STRING); } else newflags = 0; @@ -717,22 +717,37 @@ str2wstr(NODE *n, size_t **ptr) case (size_t) -2: case (size_t) -1: /* - * Just skip the bad byte and keep going, so that - * we get a more-or-less full string, instead of - * stopping early. This is particularly important - * for match() where we need to build the indices. - */ - sp++; - src_count--; - /* * mbrtowc(3) says the state of mbs becomes undefined * after a bad character, so reset it. */ memset(& mbs, 0, sizeof(mbs)); - /* And warn the user something's wrong */ - if (do_lint && ! warned) { + + /* Warn the user something's wrong */ + if (! warned) { warned = true; - lintwarn(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale.")); + warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale.")); + } + + /* + * 8/2015: If we're using UTF, then instead of just + * skipping the character, plug in the Unicode + * replacement character. In most cases this gives + * us "better" results, in that character counts + * and string lengths tend to make more sense. + * + * Otherwise, just skip the bad byte and keep going, + * so that we get a more-or-less full string, instead of + * stopping early. This is particularly important + * for match() where we need to build the indices. + */ + if (using_utf8()) { + count = 1; + wc = 0xFFFD; /* unicode replacement character */ + goto set_wc; + } else { + /* skip it and keep going */ + sp++; + src_count--; } break; @@ -740,6 +755,7 @@ str2wstr(NODE *n, size_t **ptr) count = 1; /* fall through */ default: + set_wc: *wsp++ = wc; src_count -= count; while (count--) { diff --git a/test/ChangeLog b/test/ChangeLog index cc7576ef..7b9e273a 100644 --- a/test/ChangeLog +++ b/test/ChangeLog @@ -1,3 +1,9 @@ +2015-08-25 Arnold D. Robbins <arnold@skeeve.com> + + * mbstr1.ok: Updated after code change. + * Makefile.am (mbstr2): New test. + * mbstr2.awk, mbstr2.in, mbstr2.ok: New files. + 2015-06-25 Arnold D. Robbins <arnold@skeeve.com> * Makefile.am (negtime): Fix out-of-tree test run. diff --git a/test/Makefile.am b/test/Makefile.am index c499996c..781d45d4 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -531,6 +531,9 @@ EXTRA_DIST = \ mbprintf4.ok \ mbstr1.awk \ mbstr1.ok \ + mbstr2.awk \ + mbstr2.in \ + mbstr2.ok \ membug1.awk \ membug1.in \ membug1.ok \ @@ -1062,7 +1065,7 @@ GAWK_EXT_TESTS = \ incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \ include include2 indirectbuiltin indirectcall indirectcall2 \ lint lintold lintwarn \ - manyfiles match1 match2 match3 mbstr1 \ + manyfiles match1 match2 match3 mbstr1 mbstr2 \ nastyparm negtime next nondec nondec2 \ patsplit posix printfbad1 printfbad2 printfbad3 printfbad4 printhuge procinfs \ profile0 profile1 profile2 profile3 profile4 profile5 profile6 profile7 pty1 \ @@ -1705,6 +1708,12 @@ mbstr1:: AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ +mbstr2:: + @echo $@ + @[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \ + AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ + printfbad2: printfbad2.ok @echo $@ @$(AWK) --lint -f "$(srcdir)"/$@.awk "$(srcdir)"/$@.in 2>&1 | sed 's;$(srcdir)/;;g' >_$@ || echo EXIT CODE: $$? >>_$@ diff --git a/test/Makefile.in b/test/Makefile.in index e8f0109c..d2776b29 100644 --- a/test/Makefile.in +++ b/test/Makefile.in @@ -788,6 +788,9 @@ EXTRA_DIST = \ mbprintf4.ok \ mbstr1.awk \ mbstr1.ok \ + mbstr2.awk \ + mbstr2.in \ + mbstr2.ok \ membug1.awk \ membug1.in \ membug1.ok \ @@ -1318,7 +1321,7 @@ GAWK_EXT_TESTS = \ incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \ include include2 indirectbuiltin indirectcall indirectcall2 \ lint lintold lintwarn \ - manyfiles match1 match2 match3 mbstr1 \ + manyfiles match1 match2 match3 mbstr1 mbstr2 \ nastyparm negtime next nondec nondec2 \ patsplit posix printfbad1 printfbad2 printfbad3 printfbad4 printhuge procinfs \ profile0 profile1 profile2 profile3 profile4 profile5 profile6 profile7 pty1 \ @@ -2145,6 +2148,12 @@ mbstr1:: AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ +mbstr2:: + @echo $@ + @[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \ + AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@ + @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@ + printfbad2: printfbad2.ok @echo $@ @$(AWK) --lint -f "$(srcdir)"/$@.awk "$(srcdir)"/$@.in 2>&1 | sed 's;$(srcdir)/;;g' >_$@ || echo EXIT CODE: $$? >>_$@ diff --git a/test/mbstr1.ok b/test/mbstr1.ok index dcb4347b..3fd1bf8e 100644 --- a/test/mbstr1.ok +++ b/test/mbstr1.ok @@ -1,2 +1,3 @@ +gawk: mbstr1.awk:2: warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale. 4 1 diff --git a/test/mbstr2.awk b/test/mbstr2.awk new file mode 100644 index 00000000..4f2c8cc6 --- /dev/null +++ b/test/mbstr2.awk @@ -0,0 +1 @@ +match($0,/:deathdate=2007....:/) { print substr($0,RSTART+11,RLENGTH-16) } diff --git a/test/mbstr2.in b/test/mbstr2.in new file mode 100644 index 00000000..36e971a6 --- /dev/null +++ b/test/mbstr2.in @@ -0,0 +1,4 @@ +missile:deathdate=20070306: +P”rr”:deathdate=20070306: +wizard:deathdate=20071103: +Daithí:deathdate=20071103: diff --git a/test/mbstr2.ok b/test/mbstr2.ok new file mode 100644 index 00000000..29ac876a --- /dev/null +++ b/test/mbstr2.ok @@ -0,0 +1,5 @@ +2007 +gawk: mbstr2.awk:1: (FILENAME=- FNR=2) warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale. +2007 +2007 +2007 |