diff options
-rw-r--r-- | ChangeLog | 9 | ||||
-rw-r--r-- | awk.h | 14 | ||||
-rw-r--r-- | builtin.c | 6 | ||||
-rw-r--r-- | eval.c | 2 | ||||
-rw-r--r-- | field.c | 29 | ||||
-rw-r--r-- | gawkapi.c | 5 | ||||
-rw-r--r-- | int_array.c | 8 | ||||
-rw-r--r-- | io.c | 6 | ||||
-rw-r--r-- | main.c | 8 | ||||
-rw-r--r-- | mpfr.c | 6 | ||||
-rw-r--r-- | nocopy-doc.diff | 238 | ||||
-rw-r--r-- | node.c | 10 |
12 files changed, 294 insertions, 47 deletions
@@ -1,3 +1,9 @@ +2016-11-07 Arnold D. Robbins <arnold@skeeve.com> + + * awk.h [USER_INPUT]: Renamed from MAYBE_NUM. + * builtin.c, eval.c, field.c, int_array.c, io.c, main.c, + mpfr.c, node.c: Change all uses. + 2016-11-04 Eli Zaretskii <eliz@gnu.org> * builtin.c (efwrite) [__MINGW32__]: Call w32_maybe_set_errno if @@ -336,7 +342,7 @@ termination to protect against unterminated field values. (nondec2awknum): Remove comment about unnecessary termination. * eval.c (posix_compare): Restore temporary string termination. - * field.c (databuf): Remove struct no longer needed. + * field.c (databuf): Remove struct, no longer needed. (set_field): Remove memcpy for string termination, since we will support unterminated field string values. (rebuild_record): Ditto. Also no need to allocate space for terminated @@ -434,6 +440,7 @@ Make sure that all field values, and therefore all strings inside gawk, are terminated with a '\0' character! + * field.c (databuf): New static struct to hold info about our buffer to contain the field string values. (allocate_databuf): New function to make sure the databuf is large @@ -408,15 +408,15 @@ typedef struct exp_node { * b = a + 0 # Adds NUMCUR to a, since numeric value * # is now available. But the type hasn't changed! * - * MAYBE_NUM is the joker. When STRING|MAYBE_NUM is set, it means + * USER_INPUT is the joker. When STRING|USER_INPUT is set, it means * "this is string data, but the user may have really wanted it to be a * number. If we have to guess, like in a comparison, turn it into a * number if the string is indeed numeric." * For example, gawk -v a=42 .... - * Here, `a' gets STRING|STRCUR|MAYBE_NUM and then when used where + * Here, `a' gets STRING|STRCUR|USER_INPUT and then when used where * a number is needed, it gets turned into a NUMBER and STRING - * is cleared. In that case, we leave the MAYBE_NUM in place, so - * the combination NUMBER|MAYBE_NUM means it is a strnum a.k.a. a + * is cleared. 
In that case, we leave the USER_INPUT in place, so + * the combination NUMBER|USER_INPUT means it is a strnum a.k.a. a * "numeric string". * * WSTRCUR is for efficiency. If in a multibyte locale, and we @@ -440,7 +440,7 @@ typedef struct exp_node { # define STRCUR 0x0004 /* string value is current */ # define NUMCUR 0x0008 /* numeric value is current */ # define NUMBER 0x0010 /* assigned as number */ -# define MAYBE_NUM 0x0020 /* user input: if NUMERIC then +# define USER_INPUT 0x0020 /* user input: if NUMERIC then * a NUMBER */ # define INTLSTR 0x0040 /* use localized version */ # define NUMINT 0x0080 /* numeric value is an integer */ @@ -1870,7 +1870,7 @@ force_number(NODE *n) * please use this function to resolve the type. * * It is safe to assume that the return value will be the same NODE, - * since force_number on a MAYBE_NUM should always return the same NODE, + * since force_number on a USER_INPUT should always return the same NODE, * and force_string on an INTIND should as well. 
* * There is no way to handle a Node_typedregex correctly, so we ignore @@ -1882,7 +1882,7 @@ fixtype(NODE *n) { assert(n->type == Node_val); if (n->type == Node_val) { - if ((n->flags & (NUMCUR|MAYBE_NUM)) == MAYBE_NUM) + if ((n->flags & (NUMCUR|USER_INPUT)) == USER_INPUT) return force_number(n); if ((n->flags & INTIND) != 0) return force_string(n); @@ -2647,7 +2647,7 @@ do_match(int nargs) } it = make_string(start, len); - it->flags |= MAYBE_NUM; /* user input */ + it->flags |= USER_INPUT; sub = make_number((AWKNUM) (ii)); lhs = assoc_lookup(dest, sub); @@ -3950,14 +3950,14 @@ do_typeof(int nargs) break; case Node_val: case Node_var: - switch (fixtype(arg)->flags & (STRING|NUMBER|MAYBE_NUM)) { + switch (fixtype(arg)->flags & (STRING|NUMBER|USER_INPUT)) { case STRING: res = "string"; break; case NUMBER: res = "number"; break; - case NUMBER|MAYBE_NUM: + case NUMBER|USER_INPUT: res = "strnum"; break; case NUMBER|STRING: @@ -436,7 +436,7 @@ flags2str(int flagval) { STRCUR, "STRCUR" }, { NUMCUR, "NUMCUR" }, { NUMBER, "NUMBER" }, - { MAYBE_NUM, "MAYBE_NUM" }, + { USER_INPUT, "USER_INPUT" }, { INTLSTR, "INTLSTR" }, { NUMINT, "NUMINT" }, { INTIND, "INTIND" }, @@ -132,7 +132,7 @@ set_field(long num, n = fields_arr[num]; n->stptr = str; n->stlen = len; - n->flags = (STRCUR|STRING|MAYBE_NUM); /* do not set MALLOC */ + n->flags = (STRCUR|STRING|USER_INPUT); /* do not set MALLOC */ } /* rebuild_record --- Someone assigned a value to $(something). @@ -231,10 +231,10 @@ rebuild_record() cops += fields_arr[i]->stlen + OFSlen; } -#ifndef NDEBUG - if ((fields_arr[0]->flags & MALLOC) == 0) - assert(fields_arr[0]->valref == 1); -#endif + assert((fields_arr[0]->flags & MALLOC) == 0 + ? 
fields_arr[0]->valref == 1 + : true); + unref(fields_arr[0]); fields_arr[0] = tmp; @@ -293,10 +293,10 @@ set_record(const char *buf, int cnt) databuf[cnt] = '\0'; /* manage field 0: */ -#ifndef NDEBUG - if ((fields_arr[0]->flags & MALLOC) == 0) - assert(fields_arr[0]->valref == 1); -#endif + assert((fields_arr[0]->flags & MALLOC) == 0 + ? fields_arr[0]->valref == 1 + : true); + unref(fields_arr[0]); getnode(n); n->stptr = databuf; @@ -304,7 +304,7 @@ set_record(const char *buf, int cnt) n->valref = 1; n->type = Node_val; n->stfmt = STFMT_UNUSED; - n->flags = (STRING|STRCUR|MAYBE_NUM); /* do not set MALLOC */ + n->flags = (STRING|STRCUR|USER_INPUT); /* do not set MALLOC */ fields_arr[0] = n; #undef INITIAL_SIZE @@ -328,10 +328,9 @@ purge_record() NF = -1; for (i = 1; i <= parse_high_water; i++) { -#ifndef NDEBUG - if ((fields_arr[i]->flags & MALLOC) == 0) - assert(fields_arr[i]->valref == 1); -#endif + assert((fields_arr[i]->flags & MALLOC) == 0 + ? fields_arr[i]->valref == 1 + : true); unref(fields_arr[i]); getnode(n); *n = *Null_field; @@ -883,7 +882,7 @@ set_element(long num, char *s, long len, NODE *n) NODE *sub; it = make_string(s, len); - it->flags |= MAYBE_NUM; + it->flags |= USER_INPUT; sub = make_number((AWKNUM) (num)); lhs = assoc_lookup(n, sub); unref(*lhs); @@ -396,6 +396,8 @@ static struct { size_t i, size; } scopy; +/* free_api_string_copies --- release memory used by string copies */ + void free_api_string_copies() { @@ -406,7 +408,7 @@ free_api_string_copies() scopy.i = 0; } -/* return a node string with nul termination */ +/* assign_string --- return a string node with NUL termination */ static inline void assign_string(NODE *node, awk_value_t *val) @@ -418,6 +420,7 @@ assign_string(NODE *node, awk_value_t *val) * This should happen only for $n where n > 0 and n < NF. 
*/ char *s; + assert((node->flags & MALLOC) == 0); if (scopy.i == scopy.size) { /* expand list */ diff --git a/int_array.c b/int_array.c index 0014a81f..992da4a6 100644 --- a/int_array.c +++ b/int_array.c @@ -183,8 +183,8 @@ is_integer(NODE *symbol, NODE *subs) return NULL; if (len == 1 && *cp != '-') { /* single digit */ subs->numbr = (long) (*cp - '0'); - if ((subs->flags & MAYBE_NUM) != 0) { - /* leave MAYBE_NUM set */ + if ((subs->flags & USER_INPUT) != 0) { + /* leave USER_INPUT set */ subs->flags &= ~STRING; subs->flags |= NUMBER; } @@ -203,8 +203,8 @@ is_integer(NODE *symbol, NODE *subs) return NULL; subs->numbr = l; - if ((subs->flags & MAYBE_NUM) != 0) { - /* leave MAYBE_NUM set */ + if ((subs->flags & USER_INPUT) != 0) { + /* leave USER_INPUT set */ subs->flags &= ~STRING; subs->flags |= NUMBER; } @@ -533,7 +533,7 @@ nextfile(IOBUF **curfile, bool skipping) unref(FILENAME_node->var_value); FILENAME_node->var_value = make_string("-", 1); - FILENAME_node->var_value->flags |= MAYBE_NUM; /* be pedantic */ + FILENAME_node->var_value->flags |= USER_INPUT; /* be pedantic */ fname = "-"; iop = iop_alloc(fileno(stdin), fname, 0); *curfile = iop_finish(iop); @@ -2657,7 +2657,7 @@ do_getline_redir(int into_variable, enum redirval redirtype) else { /* assignment to variable */ unref(*lhs); *lhs = make_string(s, cnt); - (*lhs)->flags |= MAYBE_NUM; + (*lhs)->flags |= USER_INPUT; } return make_number((AWKNUM) 1.0); @@ -2700,7 +2700,7 @@ do_getline(int into_variable, IOBUF *iop) lhs = POP_ADDRESS(); unref(*lhs); *lhs = make_string(s, cnt); - (*lhs)->flags |= MAYBE_NUM; + (*lhs)->flags |= USER_INPUT; } return make_number((AWKNUM) 1.0); } @@ -722,14 +722,14 @@ init_args(int argc0, int argc, const char *argv0, char **argv) unref(tmp); unref(*aptr); *aptr = make_string(argv0, strlen(argv0)); - (*aptr)->flags |= MAYBE_NUM; + (*aptr)->flags |= USER_INPUT; for (i = argc0, j = 1; i < argc; i++, j++) { tmp = make_number((AWKNUM) j); aptr = assoc_lookup(ARGV_node, tmp); 
unref(tmp); unref(*aptr); *aptr = make_string(argv[i], strlen(argv[i])); - (*aptr)->flags |= MAYBE_NUM; + (*aptr)->flags |= USER_INPUT; } ARGC_node = install_symbol(estrdup("ARGC", 4), Node_var); @@ -883,7 +883,7 @@ load_environ() unref(tmp); unref(*aptr); *aptr = make_string(val, strlen(val)); - (*aptr)->flags |= MAYBE_NUM; + (*aptr)->flags |= USER_INPUT; /* restore '=' so that system() gets a valid environment */ if (val != nullstr) @@ -1161,7 +1161,7 @@ arg_assign(char *arg, bool initing) * This makes sense, so we do it too. */ it = make_str_node(cp, strlen(cp), SCAN); - it->flags |= MAYBE_NUM; + it->flags |= USER_INPUT; #ifdef LC_NUMERIC /* * See comment above about locale decimal point. @@ -348,13 +348,13 @@ mpg_force_number(NODE *n) n->flags |= NUMCUR; if (force_mpnum(n, (do_non_decimal_data && ! do_traditional), true)) { - if ((n->flags & MAYBE_NUM) != 0) { - /* leave MAYBE_NUM set to indicate a strnum */ + if ((n->flags & USER_INPUT) != 0) { + /* leave USER_INPUT set to indicate a strnum */ n->flags &= ~STRING; n->flags |= NUMBER; } } else - n->flags &= ~MAYBE_NUM; + n->flags &= ~USER_INPUT; return n; } diff --git a/nocopy-doc.diff b/nocopy-doc.diff new file mode 100644 index 00000000..bc63cff3 --- /dev/null +++ b/nocopy-doc.diff @@ -0,0 +1,238 @@ +diff --git a/doc/gawktexi.in b/doc/gawktexi.in +index efca7b6..76c3a9b 100644 +--- a/doc/gawktexi.in ++++ b/doc/gawktexi.in +@@ -11527,17 +11527,93 @@ compares variables. + @node Variable Typing + @subsubsection String Type versus Numeric Type + ++Scalar objects in @command{awk} (variables, array elements, and fields) ++are @emph{dynamically} typed. This means their type can change as the ++program runs, from @dfn{untyped} before any use,@footnote{@command{gawk} ++calls this @dfn{unassigned}, as the following example shows.} to string ++or number, and then from string to number or number to string, as the ++program progresses. ++ ++You can't do much with untyped variables, other than tell that they ++are untyped. 
The following program tests @code{a} against @code{""} ++and @code{0}; the test succeeds when @code{a} has never been assigned ++a value. It also uses the built-in @code{typeof()} function ++(not presented yet; @pxref{Type Functions}) to show @code{a}'s type: ++ ++@example ++$ @kbd{gawk 'BEGIN @{ print (a == "" && a == 0 ?} ++> @kbd{"a is untyped" : "a has a type!") ; print typeof(a) @}'} ++@print{} a is untyped ++@print{} unassigned ++@end example ++ ++A scalar has numeric type when assigned a numeric value, ++such as from a numeric constant, or from another scalar ++with numeric type: ++ ++@example ++$ @kbd{gawk 'BEGIN @{ a = 42 ; print typeof(a)} ++> @kbd{b = a ; print typeof(b) @}'} ++@print{} number ++@print{} number ++@end example ++ ++Similarly, a scalar has string type when assigned a string ++value, such as from a string constant, or from another scalar ++with string type: ++ ++@example ++$ @kbd{gawk 'BEGIN @{ a = "forty two" ; print typeof(a)} ++> @kbd{b = a ; print typeof(b) @}'} ++@print{} string ++@print{} string ++@end example ++ ++So far, this is all simple and straightforward. What happens, though, ++when @command{awk} has to process data from a user? Let's start with ++field data. What should the following command produce as output? ++ ++@example ++echo hello | awk '@{ printf("%s %s < 42\n", $1, ++ ($1 < 42 ? "is" : "is not")) @}' ++@end example ++ ++@noindent ++Since @samp{hello} is alphabetic data, @command{awk} can only do a string ++comparison. Internally, it converts @code{42} into @code{"42"} and compares ++the two string values @code{"hello"} and @code{"42"}. Here's the result: ++ ++@example ++$ @kbd{echo hello | awk '@{ printf("%s %s < 42\n", $1,} ++> @kbd{ ($1 < 42 ? "is" : "is not")) @}'} ++@print{} hello is not < 42 ++@end example ++ ++However, what happens when data from a user @emph{looks like} a number? ++On the one hand, in reality, the input data consists of characters, not ++binary numeric ++values. 
But, on the other hand, the data looks numeric, and @command{awk} ++really ought to treat it as such. And indeed, it does: ++ ++@example ++$ @kbd{echo 37 | awk '@{ printf("%s %s < 42\n", $1,} ++> @kbd{ ($1 < 42 ? "is" : "is not")) @}'} ++@print{} 37 is < 42 ++@end example ++ ++Here are the rules for when @command{awk} ++treats data as a number, and for when it treats data as a string. ++ + @cindex numeric, strings + @cindex strings, numeric + @cindex POSIX @command{awk}, numeric strings and +-The POSIX standard introduced +-the concept of a @dfn{numeric string}, which is simply a string that looks +-like a number---for example, @code{@w{" +2"}}. This concept is used +-for determining the type of a variable. +-The type of the variable is important because the types of two variables +-determine how they are compared. +-Variable typing follows these rules: ++The POSIX standard uses the term @dfn{numeric string} for input data that ++looks numeric. The @samp{37} in the previous example is a numeric string. ++So what is the type of a numeric string? Answer: numeric. + ++The type of a variable is important because the types of two variables ++determine how they are compared. ++Variable typing follows these definitions and rules: + + @itemize @value{BULLET} + @item +@@ -11552,7 +11628,9 @@ attribute. + Fields, @code{getline} input, @code{FILENAME}, @code{ARGV} elements, + @code{ENVIRON} elements, and the elements of an array created by + @code{match()}, @code{split()}, and @code{patsplit()} that are numeric +-strings have the @dfn{strnum} attribute. Otherwise, they have ++strings have the @dfn{strnum} attribute.@footnote{Thus, a POSIX ++numeric string and @command{gawk}'s strnum are the same thing.} ++Otherwise, they have + the @dfn{string} attribute. Uninitialized variables also have the + @dfn{strnum} attribute. 
+ +@@ -11626,7 +11704,7 @@ STRNUM &&string &numeric &numeric\cr + @end tex + @ifnottex + @ifnotdocbook +-@display ++@verbatim + +---------------------------------------------- + | STRING NUMERIC STRNUM + --------+---------------------------------------------- +@@ -11637,7 +11715,7 @@ NUMERIC | string numeric numeric + | + STRNUM | string numeric numeric + --------+---------------------------------------------- +-@end display ++@end verbatim + @end ifnotdocbook + @end ifnottex + @docbook +@@ -11696,10 +11774,14 @@ purposes. + In short, when one operand is a ``pure'' string, such as a string + constant, then a string comparison is performed. Otherwise, a + numeric comparison is performed. ++(The primary difference between a number and a strnum is that ++for strnums @command{gawk} preserves the original string value that ++the scalar had when it came in.) ++ ++This point bears additional emphasis: ++Input that looks numeric @emph{is} numeric. ++All other input is treated as strings. + +-This point bears additional emphasis: All user input is made of characters, +-and so is first and foremost of string type; input strings +-that look numeric are additionally given the strnum attribute. + Thus, the six-character input string @w{@samp{ +3.14}} receives the + strnum attribute. In contrast, the eight characters + @w{@code{" +3.14"}} appearing in program text comprise a string constant. +@@ -11726,6 +11808,14 @@ $ @kbd{echo ' +3.14' | awk '@{ print($1 == 3.14) @}'} @ii{True} + @print{} 1 + @end example + ++You can see the type of an input field (or other user input) ++using @code{typeof()}: ++ ++@example ++$ @kbd{echo hello 37 | gawk '@{ print typeof($1), typeof($2) @}'} ++@print{} string strnum ++@end example ++ + @node Comparison Operators + @subsubsection Comparison Operators + +@@ -18688,8 +18778,8 @@ Return one of the following strings, depending upon the type of @var{x}: + @var{x} is a string. 
+ + @item "strnum" +-@var{x} is a string that might be a number, such as a field or +-the result of calling @code{split()}. (I.e., @var{x} has the STRNUM ++@var{x} is a number that started life as user input, such as a field or ++the result of calling @code{split()}. (I.e., @var{x} has the strnum + attribute; @pxref{Variable Typing}.) + + @item "unassigned" +@@ -18698,8 +18788,9 @@ For example: + + @example + BEGIN @{ +- a[1] # creates a[1] but it has no assigned value +- print typeof(a[1]) # scalar_u ++ # creates a[1] but it has no assigned value ++ a[1] ++ print typeof(a[1]) # unassigned + @} + @end example + +@@ -29721,6 +29812,8 @@ executing, short programs. + The @command{gawk} debugger only accepts source code supplied with the @option{-f} option. + @end itemize + ++@ignore ++@c 11/2016: This no longer applies after all the type cleanup work that's been done. + One other point is worth discussing. Conventional debuggers run in a + separate process (and thus address space) from the programs that they + debug (the @dfn{debuggee}, if you will). +@@ -29779,6 +29872,7 @@ is indeed a number, and this is reflected in the result of + Cases like this where the debugger is not transparent to the program's + execution should be rare. If you encounter one, please report it + (@pxref{Bugs}). ++@end ignore + + @ignore + Look forward to a future release when these and other missing features may +@@ -31285,14 +31379,26 @@ and is managed by @command{gawk} from then on. + The API defines several simple @code{struct}s that map values as seen + from @command{awk}. A value can be a @code{double}, a string, or an + array (as in multidimensional arrays, or when creating a new array). ++ + String values maintain both pointer and length, because embedded @sc{nul} + characters are allowed. + + @quotation NOTE +-By intent, strings are maintained using the current multibyte encoding (as +-defined by @env{LC_@var{xxx}} environment variables) and not using wide +-characters. 
This matches how @command{gawk} stores strings internally +and also how characters are likely to be input into and output from files. ++By intent, @command{gawk} maintains strings using the current multibyte ++encoding (as defined by @env{LC_@var{xxx}} environment variables) ++and not using wide characters. This matches how @command{gawk} stores ++strings internally and also how characters are likely to be input into ++and output from files. ++@end quotation ++ ++@quotation NOTE ++String values passed to an extension by @command{gawk} are always ++@sc{nul}-terminated. Thus it is safe to pass such string values to ++standard library and system routines. However, because ++@command{gawk} allows embedded @sc{nul} characters in string data, ++you should check that @samp{strlen(@var{some_string})} matches ++the length for that string passed to the extension before using ++it as a regular C string. + @end quotation + + @item @@ -67,9 +67,9 @@ r_force_number(NODE *n) return n; /* - * We should always set NUMCUR. If MAYBE_NUM is set and it's a + * We should always set NUMCUR. If USER_INPUT is set and it's a * numeric string, we clear STRING and enable NUMBER, but if it's not - * numeric, we disable MAYBE_NUM. + * numeric, we disable USER_INPUT. */ /* All the conditionals are an attempt to avoid the expensive strtod */ @@ -159,12 +159,12 @@ r_force_number(NODE *n) /* fall through to badnum */ } badnum: - n->flags &= ~MAYBE_NUM; + n->flags &= ~USER_INPUT; return n; goodnum: - if ((n->flags & MAYBE_NUM) != 0) { - /* leave MAYBE_NUM enabled to indicate that this is a strnum */ + if ((n->flags & USER_INPUT) != 0) { + /* leave USER_INPUT enabled to indicate that this is a strnum */ n->flags &= ~STRING; n->flags |= NUMBER; } |