aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog4
-rw-r--r--dfa.c189
-rw-r--r--doc/ChangeLog4
-rw-r--r--doc/gawk.texi35
-rw-r--r--doc/gawktexi.in35
5 files changed, 169 insertions, 98 deletions
diff --git a/ChangeLog b/ChangeLog
index 57d55fb6..c86de65e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2014-03-30 Arnold D. Robbins <arnold@skeeve.com>
+
+ * dfa.c: Sync with GNU grep.
+
2014-03-28 Arnold D. Robbins <arnold@skeeve.com>
* configure.ac: Remove duplicate AC_HEADER_TIME and rearrange
diff --git a/dfa.c b/dfa.c
index 0fc68cff..378305df 100644
--- a/dfa.c
+++ b/dfa.c
@@ -43,7 +43,11 @@
#include "missing_d/gawkbool.h"
#endif /* HAVE_STDBOOL_H */
-/* Gawk doesn't use Gnulib, so don't assume static_assert is present. */
+/* Gawk doesn't use Gnulib, so don't assume that setlocale and
+ static_assert are present. */
+#ifndef LC_ALL
+# define setlocale(category, locale) NULL
+#endif
#ifndef static_assert
# define static_assert(cond, diagnostic) \
extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
@@ -408,6 +412,14 @@ struct dfa
size_t nmultibyte_prop;
int *multibyte_prop;
+#if MBS_SUPPORT
+ /* A table indexed by byte values that contains the corresponding wide
+ character (if any) for that byte. WEOF means the byte is the
+ leading byte of a multibyte character. Invalid and null bytes are
+ mapped to themselves. */
+ wint_t mbrtowc_cache[NOTCHAR];
+#endif
+
/* Array of the bracket expression in the DFA. */
struct mb_char_classes *mbcsets;
size_t nmbcsets;
@@ -510,6 +522,64 @@ static void regexp (void);
} \
while (false)
+static void
+dfambcache (struct dfa *d)
+{
+#if MBS_SUPPORT
+ int i;
+ for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+ {
+ char c = i;
+ unsigned char uc = i;
+ mbstate_t s = { 0 };
+ wchar_t wc;
+ wint_t wi;
+ switch (mbrtowc (&wc, &c, 1, &s))
+ {
+ default: wi = wc; break;
+ case (size_t) -2: wi = WEOF; break;
+ case (size_t) -1: wi = uc; break;
+ }
+ d->mbrtowc_cache[uc] = wi;
+ }
+#endif
+}
+
+#if MBS_SUPPORT
+/* Given the dfa D, store into *PWC the result of converting the
+ leading bytes of the multibyte buffer S of length N bytes, updating
+ the conversion state in *MBS. On conversion error, convert just a
+ single byte as-is. Return the number of bytes converted.
+
+ This differs from mbrtowc (PWC, S, N, MBS) as follows:
+
+ * Extra arg D, containing an mbrtowc_cache for speed.
+ * N must be at least 1.
+ * S[N - 1] must be a sentinel byte.
+ * Shift encodings are not supported.
+ * The return value is always in the range 1..N.
+ * *MBS is always valid afterwards.
+ * *PWC is always set to something. */
+static size_t
+mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n,
+ mbstate_t *mbs)
+{
+ unsigned char uc = s[0];
+ wint_t wc = d->mbrtowc_cache[uc];
+
+ if (wc == WEOF)
+ {
+ size_t nbytes = mbrtowc (pwc, s, n, mbs);
+ if (0 < nbytes && nbytes < (size_t) -2)
+ return nbytes;
+ memset (mbs, 0, sizeof *mbs);
+ wc = uc;
+ }
+
+ *pwc = wc;
+ return 1;
+}
+#endif
#ifdef DEBUG
@@ -820,13 +890,10 @@ using_simple_locale (void)
static int unibyte_c = -1;
if (unibyte_c < 0)
{
-#ifdef LC_ALL
- char *locale = setlocale (LC_ALL, NULL);
- unibyte_c = (locale && (STREQ (locale, "C")
- || STREQ (locale, "POSIX")));
-#else
- unibyte_c = 1;
-#endif
+ char const *locale = setlocale (LC_ALL, NULL);
+ unibyte_c = (!locale
+ || STREQ (locale, "C")
+ || STREQ (locale, "POSIX"));
}
return unibyte_c;
}
@@ -848,7 +915,7 @@ static int minrep, maxrep; /* Repeat counts for {m,n}. */
static int cur_mb_len = 1; /* Length of the multibyte representation of
wctok. */
/* These variables are used only if (MB_CUR_MAX > 1). */
-static mbstate_t mbs; /* Mbstate for mbrlen. */
+static mbstate_t mbs; /* mbstate for mbrtowc. */
static wchar_t wctok; /* Wide character representation of the current
multibyte character. */
static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec.
@@ -885,32 +952,18 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */
else \
{ \
wchar_t _wc; \
- cur_mb_len = mbrtowc (&_wc, lexptr, lexleft, &mbs); \
- if (cur_mb_len <= 0) \
- { \
- cur_mb_len = 1; \
- --lexleft; \
- (wc) = (c) = to_uchar (*lexptr++); \
- } \
- else \
- { \
- lexptr += cur_mb_len; \
- lexleft -= cur_mb_len; \
- (wc) = _wc; \
- (c) = wctob (wc); \
- } \
+ size_t nbytes = mbs_to_wchar (dfa, &_wc, lexptr, lexleft, &mbs); \
+ cur_mb_len = nbytes; \
+ (wc) = _wc; \
+ (c) = nbytes == 1 ? to_uchar (*lexptr) : EOF; \
+ lexptr += nbytes; \
+ lexleft -= nbytes; \
} \
} while (0)
-# define FETCH(c, eoferr) \
- do { \
- wint_t wc; \
- FETCH_WC (c, wc, eoferr); \
- } while (0)
-
#else
/* Note that characters become unsigned here. */
-# define FETCH(c, eoferr) \
+# define FETCH_WC(c, unused, eoferr) \
do { \
if (! lexleft) \
{ \
@@ -923,8 +976,6 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */
--lexleft; \
} while (0)
-# define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
-
#endif /* MBS_SUPPORT */
#ifndef MIN
@@ -1302,14 +1353,9 @@ lex (void)
"if (backslash) ...". */
for (i = 0; i < 2; ++i)
{
- if (MB_CUR_MAX > 1)
- {
- FETCH_WC (c, wctok, NULL);
- if ((int) c == EOF)
- goto normal_char;
- }
- else
- FETCH (c, NULL);
+ FETCH_WC (c, wctok, NULL);
+ if (c == (unsigned int) EOF)
+ goto normal_char;
switch (c)
{
@@ -1726,16 +1772,19 @@ static void
addtok_wc (wint_t wc)
{
unsigned char buf[MB_LEN_MAX];
- mbstate_t s;
+ mbstate_t s = { 0 };
int i;
- memset (&s, 0, sizeof s);
- cur_mb_len = wcrtomb ((char *) buf, wc, &s);
+ size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
- /* This is merely stop-gap. When cur_mb_len is 0 or negative,
- buf[0] is undefined, yet skipping the addtok_mb call altogether
- can result in heap corruption. */
- if (cur_mb_len <= 0)
- buf[0] = 0;
+ if (stored_bytes != (size_t) -1)
+ cur_mb_len = stored_bytes;
+ else
+ {
+ /* This is merely stop-gap. buf[0] is undefined, yet skipping
+ the addtok_mb call altogether can corrupt the heap. */
+ cur_mb_len = 1;
+ buf[0] = 0;
+ }
addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1);
for (i = 1; i < cur_mb_len; i++)
@@ -3356,43 +3405,26 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
/* Initialize mblen_buf and inputwcs with data from the next line. */
static void
-prepare_wc_buf (const char *begin, const char *end)
+prepare_wc_buf (struct dfa *d, const char *begin, const char *end)
{
#if MBS_SUPPORT
unsigned char eol = eolbyte;
- size_t remain_bytes, i;
+ size_t i;
+ size_t ilim = end - begin + 1;
buf_begin = (unsigned char *) begin;
- remain_bytes = 0;
- for (i = 0; i < end - begin + 1; i++)
+ for (i = 0; i < ilim; i++)
{
- if (remain_bytes == 0)
- {
- remain_bytes
- = mbrtowc (inputwcs + i, begin + i, end - begin - i + 1, &mbs);
- if (remain_bytes < 1
- || remain_bytes == (size_t) -1
- || remain_bytes == (size_t) -2
- || (remain_bytes == 1 && inputwcs[i] == (wchar_t) begin[i]))
- {
- remain_bytes = 0;
- inputwcs[i] = (wchar_t) begin[i];
- mblen_buf[i] = 0;
- if (begin[i] == eol)
- break;
- }
- else
- {
- mblen_buf[i] = remain_bytes;
- remain_bytes--;
- }
- }
- else
+ size_t nbytes = mbs_to_wchar (d, inputwcs + i, begin + i, ilim - i, &mbs);
+ mblen_buf[i] = nbytes - (nbytes == 1);
+ if (begin[i] == eol)
+ break;
+ while (--nbytes != 0)
{
- mblen_buf[i] = remain_bytes;
+ i++;
+ mblen_buf[i] = nbytes;
inputwcs[i] = 0;
- remain_bytes--;
}
}
@@ -3439,7 +3471,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
MALLOC (mblen_buf, end - begin + 2);
MALLOC (inputwcs, end - begin + 2);
memset (&mbs, 0, sizeof (mbstate_t));
- prepare_wc_buf ((const char *) p, end);
+ prepare_wc_buf (d, (const char *) p, end);
}
for (;;)
@@ -3529,7 +3561,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
++*count;
if (d->mb_cur_max > 1)
- prepare_wc_buf ((const char *) p, end);
+ prepare_wc_buf (d, (const char *) p, end);
}
/* Check if we've run off the end of the buffer. */
@@ -3648,6 +3680,7 @@ void
dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
{
dfainit (d);
+ dfambcache (d);
dfaparse (s, len, d);
dfamust (d);
dfaoptimize (d);
diff --git a/doc/ChangeLog b/doc/ChangeLog
index 4ef12ca6..d2763325 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,7 @@
+2014-03-30 Arnold D. Robbins <arnold@skeeve.com>
+
+ * gawktexi.in: Cleanups to docbook, finish math stuff.
+
2014-03-28 Arnold D. Robbins <arnold@skeeve.com>
* gawktexi.in: Minor cleanups to the indexing.
diff --git a/doc/gawk.texi b/doc/gawk.texi
index 2b666d35..47dee34c 100644
--- a/doc/gawk.texi
+++ b/doc/gawk.texi
@@ -169,6 +169,9 @@
@ignore
Some comments on the layout for TeX.
1. Use at least texinfo.tex 2014-01-30.15
+2. When using @docbook, if the last line is part of a paragraph, end
+it with a space and @c so that the lines won't run together. This is a
+quirk of the language / makeinfo, and isn't going to change.
@end ignore
@c merge the function and variable indexes into the concept index
@@ -1061,7 +1064,7 @@ $\sim\! Cn^2$
@end ifnotdocbook
@end ifnottex
@docbook
-<emphasis>&sim; Cn<superscript>2</superscript></emphasis>&thinsp;
+<emphasis>&sim; Cn<superscript>2</superscript></emphasis> @c
@end docbook
performance, while
theory predicted
@@ -1074,7 +1077,7 @@ $\sim\! Cn\log n$
@end ifnotdocbook
@end ifnottex
@docbook
-<emphasis>&sim; Cn log n</emphasis>&thinsp;
+<emphasis>&sim; Cn log n</emphasis> @c
@end docbook
behavior. A few minutes poring
over the @file{awkprof.out} profile pinpointed the problem to
@@ -17311,7 +17314,7 @@ All known POSIX-compliant systems support timestamps from 0 through
@end ifnotdocbook
@end ifnottex
@docbook
-2<superscript>31</superscript> &minus; 1,&thinsp;
+2<superscript>31</superscript> &minus; 1, @c
@end docbook
which is sufficient to represent times through
2038-01-19 03:14:07 UTC. Many systems support a wider range of timestamps,
@@ -28801,7 +28804,7 @@ then the answer is
@end ifnotdocbook
@end ifnottex
@docbook
-2<superscript>53</superscript>.&thinsp;
+2<superscript>53</superscript>. @c
@end docbook
The next representable number is the even number
@iftex
@@ -28813,7 +28816,7 @@ The next representable number is the even number
@end ifnotdocbook
@end ifnottex
@docbook
-2<superscript>53</superscript> &plus; 2,
+2<superscript>53</superscript> &plus; 2, @c
@end docbook
meaning it is unlikely that you will be able to make
@command{gawk} print
@@ -28826,7 +28829,7 @@ meaning it is unlikely that you will be able to make
@end ifnotdocbook
@end ifnottex
@docbook
-2<superscript>53</superscript> &plus; 1&thinsp;
+2<superscript>53</superscript> &plus; 1 @c
@end docbook
in integer format.
The range of integers exactly representable by a 64-bit double
@@ -28840,7 +28843,7 @@ is
@end ifnotdocbook
@end ifnottex
@docbook
-[&minus;2<superscript>53</superscript>, 2<superscript>53</superscript>].&thinsp;
+[&minus;2<superscript>53</superscript>, 2<superscript>53</superscript>]. @c
@end docbook
If you ever see an integer outside this range in @command{awk}
using 64-bit doubles, you have reason to be very suspicious about
@@ -29070,7 +29073,7 @@ number is then
@end ifnotdocbook
@end ifnottex
@docbook
-<emphasis>s &cdot; 2<superscript>e</superscript></emphasis>.&thinsp;
+<emphasis>s &sdot; 2<superscript>e</superscript></emphasis>. @c
@end docbook
The first bit of a non-zero binary significand
is always one, so the significand in an IEEE-754 format only includes the
@@ -29319,7 +29322,7 @@ numbers are not implemented.}
@end ifnotdocbook
@end ifnottex
@docbook
-(<emphasis>emax</emphasis> = 2<superscript>30</superscript> &minus; 1, <emphasis>emin</emphasis> = &minus;<emphasis>emax</emphasis>)&thinsp;
+(<emphasis>emax</emphasis> = 2<superscript>30</superscript> &minus; 1, <emphasis>emin</emphasis> = &minus;<emphasis>emax</emphasis>) @c
@end docbook
for all floating-point contexts.
There is no explicit mechanism to adjust the exponent range.
@@ -29398,7 +29401,7 @@ formula:
@end ifnottex
@docbook
<para>
-<emphasis>prec</emphasis> = 3.322 &cdot; <emphasis>dps</emphasis>
+<emphasis>prec</emphasis> = 3.322 &sdot; <emphasis>dps</emphasis> @c
</para>
@end docbook
@@ -29636,8 +29639,13 @@ For example, the following computes
@math{5^{4^{3^{2}}}},
@end iftex
@ifnottex
+@ifnotdocbook
5^4^3^2,
+@end ifnotdocbook
@end ifnottex
+@docbook
+5<superscript>4<superscript>3<superscript>2</superscript></superscript></superscript>, @c
+@end docbook
the result of which is beyond the
limits of ordinary @command{gawk} numbers:
@@ -29659,9 +29667,16 @@ floating-point values instead, the precision needed for correct output
would be @math{3.322 @cdot 183231},
@end iftex
@ifnottex
+@ifnotdocbook
@samp{prec = 3.322 * dps}),
would be 3.322 x 183231,
+@end ifnotdocbook
@end ifnottex
+@docbook
+<emphasis>prec</emphasis> = 3.322 &sdot; <emphasis>dps</emphasis>),
+would be
+<emphasis>prec</emphasis> = 3.322 &sdot; 183231, @c
+@end docbook
or 608693.
The result from an arithmetic operation with an integer and a floating-point value
diff --git a/doc/gawktexi.in b/doc/gawktexi.in
index 66df3784..446c13d5 100644
--- a/doc/gawktexi.in
+++ b/doc/gawktexi.in
@@ -164,6 +164,9 @@
@ignore
Some comments on the layout for TeX.
1. Use at least texinfo.tex 2014-01-30.15
+2. When using @docbook, if the last line is part of a paragraph, end
+it with a space and @c so that the lines won't run together. This is a
+quirk of the language / makeinfo, and isn't going to change.
@end ignore
@c merge the function and variable indexes into the concept index
@@ -1056,7 +1059,7 @@ $\sim\! Cn^2$
@end ifnotdocbook
@end ifnottex
@docbook
-<emphasis>&sim; Cn<superscript>2</superscript></emphasis>&thinsp;
+<emphasis>&sim; Cn<superscript>2</superscript></emphasis> @c
@end docbook
performance, while
theory predicted
@@ -1069,7 +1072,7 @@ $\sim\! Cn\log n$
@end ifnotdocbook
@end ifnottex
@docbook
-<emphasis>&sim; Cn log n</emphasis>&thinsp;
+<emphasis>&sim; Cn log n</emphasis> @c
@end docbook
behavior. A few minutes poring
over the @file{awkprof.out} profile pinpointed the problem to
@@ -16481,7 +16484,7 @@ All known POSIX-compliant systems support timestamps from 0 through
@end ifnotdocbook
@end ifnottex
@docbook
-2<superscript>31</superscript> &minus; 1,&thinsp;
+2<superscript>31</superscript> &minus; 1, @c
@end docbook
which is sufficient to represent times through
2038-01-19 03:14:07 UTC. Many systems support a wider range of timestamps,
@@ -27942,7 +27945,7 @@ then the answer is
@end ifnotdocbook
@end ifnottex
@docbook
-2<superscript>53</superscript>.&thinsp;
+2<superscript>53</superscript>. @c
@end docbook
The next representable number is the even number
@iftex
@@ -27954,7 +27957,7 @@ The next representable number is the even number
@end ifnotdocbook
@end ifnottex
@docbook
-2<superscript>53</superscript> &plus; 2,
+2<superscript>53</superscript> &plus; 2, @c
@end docbook
meaning it is unlikely that you will be able to make
@command{gawk} print
@@ -27967,7 +27970,7 @@ meaning it is unlikely that you will be able to make
@end ifnotdocbook
@end ifnottex
@docbook
-2<superscript>53</superscript> &plus; 1&thinsp;
+2<superscript>53</superscript> &plus; 1 @c
@end docbook
in integer format.
The range of integers exactly representable by a 64-bit double
@@ -27981,7 +27984,7 @@ is
@end ifnotdocbook
@end ifnottex
@docbook
-[&minus;2<superscript>53</superscript>, 2<superscript>53</superscript>].&thinsp;
+[&minus;2<superscript>53</superscript>, 2<superscript>53</superscript>]. @c
@end docbook
If you ever see an integer outside this range in @command{awk}
using 64-bit doubles, you have reason to be very suspicious about
@@ -28211,7 +28214,7 @@ number is then
@end ifnotdocbook
@end ifnottex
@docbook
-<emphasis>s &cdot; 2<superscript>e</superscript></emphasis>.&thinsp;
+<emphasis>s &sdot; 2<superscript>e</superscript></emphasis>. @c
@end docbook
The first bit of a non-zero binary significand
is always one, so the significand in an IEEE-754 format only includes the
@@ -28460,7 +28463,7 @@ numbers are not implemented.}
@end ifnotdocbook
@end ifnottex
@docbook
-(<emphasis>emax</emphasis> = 2<superscript>30</superscript> &minus; 1, <emphasis>emin</emphasis> = &minus;<emphasis>emax</emphasis>)&thinsp;
+(<emphasis>emax</emphasis> = 2<superscript>30</superscript> &minus; 1, <emphasis>emin</emphasis> = &minus;<emphasis>emax</emphasis>) @c
@end docbook
for all floating-point contexts.
There is no explicit mechanism to adjust the exponent range.
@@ -28539,7 +28542,7 @@ formula:
@end ifnottex
@docbook
<para>
-<emphasis>prec</emphasis> = 3.322 &cdot; <emphasis>dps</emphasis>
+<emphasis>prec</emphasis> = 3.322 &sdot; <emphasis>dps</emphasis> @c
</para>
@end docbook
@@ -28777,8 +28780,13 @@ For example, the following computes
@math{5^{4^{3^{2}}}},
@end iftex
@ifnottex
+@ifnotdocbook
5^4^3^2,
+@end ifnotdocbook
@end ifnottex
+@docbook
+5<superscript>4<superscript>3<superscript>2</superscript></superscript></superscript>, @c
+@end docbook
the result of which is beyond the
limits of ordinary @command{gawk} numbers:
@@ -28800,9 +28808,16 @@ floating-point values instead, the precision needed for correct output
would be @math{3.322 @cdot 183231},
@end iftex
@ifnottex
+@ifnotdocbook
@samp{prec = 3.322 * dps}),
would be 3.322 x 183231,
+@end ifnotdocbook
@end ifnottex
+@docbook
+<emphasis>prec</emphasis> = 3.322 &sdot; <emphasis>dps</emphasis>),
+would be
+<emphasis>prec</emphasis> = 3.322 &sdot; 183231, @c
+@end docbook
or 608693.
The result from an arithmetic operation with an integer and a floating-point value