about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Arnold D. Robbins <arnold@skeeve.com> 2015-08-25 20:30:42 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2015-08-25 20:30:42 +0300
commit: cd2504da556b8bb61b56a34727ef4ce5195ff9a0 (patch)
tree: d3c2831436c33756735214b942a8981b6da2ba5e
parent: 76406567b762f58129d49b5a9ec4e22cf15499f2 (diff)
parent: 243b097279a89d456fda4a400412482d70b3665c (diff)
download: egawk-cd2504da556b8bb61b56a34727ef4ce5195ff9a0.tar.gz
egawk-cd2504da556b8bb61b56a34727ef4ce5195ff9a0.tar.bz2
egawk-cd2504da556b8bb61b56a34727ef4ce5195ff9a0.zip
Merge branch 'gawk-4.1-stable' into feature/zOS-try2
-rw-r--r-- ChangeLog 18
-rw-r--r-- awk.h 31
-rw-r--r-- mpfr.c 4
-rw-r--r-- node.c 42
-rw-r--r-- test/ChangeLog 6
-rw-r--r-- test/Makefile.am 11
-rw-r--r-- test/Makefile.in 11
-rw-r--r-- test/mbstr1.ok 1
-rw-r--r-- test/mbstr2.awk 1
-rw-r--r-- test/mbstr2.in 4
-rw-r--r-- test/mbstr2.ok 5
11 files changed, 117 insertions, 17 deletions
diff --git a/ChangeLog b/ChangeLog
index 62647c1a..e685dd6f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2015-08-25 Arnold D. Robbins <arnold@skeeve.com>
+
+ * node.c (str2wstr): Upon finding an invalid character, if
+ using UTF-8, use the replacement character instead of skipping
+ it. Helps match() and other functions work better in the face
+ of unexpected data. Make the lint warning an unconditional
+ warning.
+
+ Unrelated:
+
+ * awk.h: Add explanatory comment on the flags related to
+ types and values.
+ * mpfr.c (mpg_force_number): If setting NUMBER, clear STRING also
+ when clearing MAYBE_NUM.
+ (set_PREC): Check STRCUR instead of STRING.
+ * node.c (r_force_number): If setting NUMBER, clear STRING also
+ when clearing MAYBE_NUM.
+
2015-08-15 Arnold D. Robbins <arnold@skeeve.com>
* dfa.c (dfamust): Restore c90 compat by moving some
diff --git a/awk.h b/awk.h
index 2b823667..1a7dc242 100644
--- a/awk.h
+++ b/awk.h
@@ -395,6 +395,37 @@ typedef struct exp_node {
# define MALLOC 0x0001 /* can be free'd */
/* type = Node_val */
+ /*
+ * STRING and NUMBER are mutually exclusive. They represent the
+ * type of a value as assigned.
+ *
+ * STRCUR and NUMCUR are not mutually exclusive. They represent that
+ * the particular type of value is up to date. For example,
+ *
+ * a = 5 # NUMBER | NUMCUR
+ * b = a "" # Adds STRCUR to a, since a string value
+ * # is now available. But the type hasn't changed!
+ *
+ * a = "42" # STRING | STRCUR
+ * b = a + 0 # Adds NUMCUR to a, since numeric value
+ * # is now available. But the type hasn't changed!
+ *
+ * MAYBE_NUM is the joker. It means "this is string data, but
+ * the user may have really wanted it to be a number. If we have
+ * to guess, like in a comparison, turn it into a number."
+ * For example, gawk -v a=42 ....
+ * Here, `a' gets STRING|STRCUR|MAYBE_NUM and then when used where
+ * a number is needed, it gets turned into a NUMBER and STRING
+ * is cleared.
+ *
+ * WSTRCUR is for efficiency. If in a multibyte locale, and we
+ * need to do something character based (substr, length, etc.)
+ * we create the corresponding wide character string and store it,
+ * and add WSTRCUR to the flags so that we don't have to do the
+ * conversion more than once.
+ *
+ * We hope that the rest of the flags are self-explanatory. :-)
+ */
# define STRING 0x0002 /* assigned as string */
# define STRCUR 0x0004 /* string value is current */
# define NUMCUR 0x0008 /* numeric value is current */
diff --git a/mpfr.c b/mpfr.c
index a89b2bc6..4e4e12dc 100644
--- a/mpfr.c
+++ b/mpfr.c
@@ -347,7 +347,7 @@ mpg_force_number(NODE *n)
return n;
if ((n->flags & MAYBE_NUM) != 0) {
- n->flags &= ~MAYBE_NUM;
+ n->flags &= ~(MAYBE_NUM|STRING);
newflags = NUMBER;
}
@@ -525,7 +525,7 @@ set_PREC()
if ((val->flags & MAYBE_NUM) != 0)
force_number(val);
- if ((val->flags & (STRING|NUMBER)) == STRING) {
+ if ((val->flags & STRCUR) != 0) {
int i, j;
/* emulate IEEE-754 binary format */
diff --git a/node.c b/node.c
index 1741a13b..a7c19db1 100644
--- a/node.c
+++ b/node.c
@@ -76,7 +76,7 @@ r_force_number(NODE *n)
return n;
} else if (n->stlen == 4 && is_ieee_magic_val(n->stptr)) {
if ((n->flags & MAYBE_NUM) != 0)
- n->flags &= ~MAYBE_NUM;
+ n->flags &= ~(MAYBE_NUM|STRING);
n->flags |= NUMBER|NUMCUR;
n->numbr = get_ieee_magic_val(n->stptr);
@@ -103,7 +103,7 @@ r_force_number(NODE *n)
if ((n->flags & MAYBE_NUM) != 0) {
newflags = NUMBER;
- n->flags &= ~MAYBE_NUM;
+ n->flags &= ~(MAYBE_NUM|STRING);
} else
newflags = 0;
@@ -717,22 +717,37 @@ str2wstr(NODE *n, size_t **ptr)
case (size_t) -2:
case (size_t) -1:
/*
- * Just skip the bad byte and keep going, so that
- * we get a more-or-less full string, instead of
- * stopping early. This is particularly important
- * for match() where we need to build the indices.
- */
- sp++;
- src_count--;
- /*
* mbrtowc(3) says the state of mbs becomes undefined
* after a bad character, so reset it.
*/
memset(& mbs, 0, sizeof(mbs));
- /* And warn the user something's wrong */
- if (do_lint && ! warned) {
+
+ /* Warn the user something's wrong */
+ if (! warned) {
warned = true;
- lintwarn(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+ warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+ }
+
+ /*
+ * 8/2015: If we're using UTF, then instead of just
+ * skipping the character, plug in the Unicode
+ * replacement character. In most cases this gives
+ * us "better" results, in that character counts
+ * and string lengths tend to make more sense.
+ *
+ * Otherwise, just skip the bad byte and keep going,
+ * so that we get a more-or-less full string, instead of
+ * stopping early. This is particularly important
+ * for match() where we need to build the indices.
+ */
+ if (using_utf8()) {
+ count = 1;
+ wc = 0xFFFD; /* unicode replacement character */
+ goto set_wc;
+ } else {
+ /* skip it and keep going */
+ sp++;
+ src_count--;
}
break;
@@ -740,6 +755,7 @@ str2wstr(NODE *n, size_t **ptr)
count = 1;
/* fall through */
default:
+ set_wc:
*wsp++ = wc;
src_count -= count;
while (count--) {
diff --git a/test/ChangeLog b/test/ChangeLog
index cc7576ef..7b9e273a 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,9 @@
+2015-08-25 Arnold D. Robbins <arnold@skeeve.com>
+
+ * mbstr1.ok: Updated after code change.
+ * Makefile.am (mbstr2): New test.
+ * mbstr2.awk, mbstr2.in, mbstr2.ok: New files.
+
2015-06-25 Arnold D. Robbins <arnold@skeeve.com>
* Makefile.am (negtime): Fix out-of-tree test run.
diff --git a/test/Makefile.am b/test/Makefile.am
index c499996c..781d45d4 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -531,6 +531,9 @@ EXTRA_DIST = \
mbprintf4.ok \
mbstr1.awk \
mbstr1.ok \
+ mbstr2.awk \
+ mbstr2.in \
+ mbstr2.ok \
membug1.awk \
membug1.in \
membug1.ok \
@@ -1062,7 +1065,7 @@ GAWK_EXT_TESTS = \
incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \
include include2 indirectbuiltin indirectcall indirectcall2 \
lint lintold lintwarn \
- manyfiles match1 match2 match3 mbstr1 \
+ manyfiles match1 match2 match3 mbstr1 mbstr2 \
nastyparm negtime next nondec nondec2 \
patsplit posix printfbad1 printfbad2 printfbad3 printfbad4 printhuge procinfs \
profile0 profile1 profile2 profile3 profile4 profile5 profile6 profile7 pty1 \
@@ -1705,6 +1708,12 @@ mbstr1::
AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+mbstr2::
+ @echo $@
+ @[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \
+ AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
printfbad2: printfbad2.ok
@echo $@
@$(AWK) --lint -f "$(srcdir)"/$@.awk "$(srcdir)"/$@.in 2>&1 | sed 's;$(srcdir)/;;g' >_$@ || echo EXIT CODE: $$? >>_$@
diff --git a/test/Makefile.in b/test/Makefile.in
index e8f0109c..d2776b29 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -788,6 +788,9 @@ EXTRA_DIST = \
mbprintf4.ok \
mbstr1.awk \
mbstr1.ok \
+ mbstr2.awk \
+ mbstr2.in \
+ mbstr2.ok \
membug1.awk \
membug1.in \
membug1.ok \
@@ -1318,7 +1321,7 @@ GAWK_EXT_TESTS = \
incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \
include include2 indirectbuiltin indirectcall indirectcall2 \
lint lintold lintwarn \
- manyfiles match1 match2 match3 mbstr1 \
+ manyfiles match1 match2 match3 mbstr1 mbstr2 \
nastyparm negtime next nondec nondec2 \
patsplit posix printfbad1 printfbad2 printfbad3 printfbad4 printhuge procinfs \
profile0 profile1 profile2 profile3 profile4 profile5 profile6 profile7 pty1 \
@@ -2145,6 +2148,12 @@ mbstr1::
AWKPATH="$(srcdir)" $(AWK) -f $@.awk >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+mbstr2::
+ @echo $@
+ @[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \
+ AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
printfbad2: printfbad2.ok
@echo $@
@$(AWK) --lint -f "$(srcdir)"/$@.awk "$(srcdir)"/$@.in 2>&1 | sed 's;$(srcdir)/;;g' >_$@ || echo EXIT CODE: $$? >>_$@
diff --git a/test/mbstr1.ok b/test/mbstr1.ok
index dcb4347b..3fd1bf8e 100644
--- a/test/mbstr1.ok
+++ b/test/mbstr1.ok
@@ -1,2 +1,3 @@
+gawk: mbstr1.awk:2: warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale.
4
1
diff --git a/test/mbstr2.awk b/test/mbstr2.awk
new file mode 100644
index 00000000..4f2c8cc6
--- /dev/null
+++ b/test/mbstr2.awk
@@ -0,0 +1 @@
+match($0,/:deathdate=2007....:/) { print substr($0,RSTART+11,RLENGTH-16) }
diff --git a/test/mbstr2.in b/test/mbstr2.in
new file mode 100644
index 00000000..36e971a6
--- /dev/null
+++ b/test/mbstr2.in
@@ -0,0 +1,4 @@
+missile:deathdate=20070306:
+P”rr”:deathdate=20070306:
+wizard:deathdate=20071103:
+Daithí:deathdate=20071103:
diff --git a/test/mbstr2.ok b/test/mbstr2.ok
new file mode 100644
index 00000000..29ac876a
--- /dev/null
+++ b/test/mbstr2.ok
@@ -0,0 +1,5 @@
+2007
+gawk: mbstr2.awk:1: (FILENAME=- FNR=2) warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale.
+2007
+2007
+2007