summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2012-03-13 22:02:48 -0700
committerKaz Kylheku <kaz@kylheku.com>2012-03-13 22:02:48 -0700
commit7f0f22c4e455f457d37ddf542b36c49db20d16af (patch)
treeae4b2ed4afa05a8714c86f80ce5a9d4a8d4afe05
parentbe33de3909100a204726cdde52bf6077e890ca6d (diff)
downloadtxr-7f0f22c4e455f457d37ddf542b36c49db20d16af.tar.gz
txr-7f0f22c4e455f457d37ddf542b36c49db20d16af.tar.bz2
txr-7f0f22c4e455f457d37ddf542b36c49db20d16af.zip
Change: @(block) requires @(end) from now on.
Blocks no longer extend to the end of the surrounding scope. * match.c (v_block): Rewrite for new syntax. * parser.l (BLOCK): New token type handled. * parser.y (BLOCK): New token. (block_clause): New nonterminal grammar symbol. (clause): Collateral fix: replaced a bunch of list(X, nao) forms with cons(X, nil). Introduced block_clause as a constituent of clause. * txr.1: Revamped documentation of block, and wrote about using blocks for reducing nested skips and reducing backtracking in general.
-rw-r--r--ChangeLog20
-rw-r--r--match.c27
-rw-r--r--parser.l6
-rw-r--r--parser.y41
-rw-r--r--txr.1101
5 files changed, 156 insertions, 39 deletions
diff --git a/ChangeLog b/ChangeLog
index f94493b8..772f53ad 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,25 @@
2012-03-13 Kaz Kylheku <kaz@kylheku.com>
+ Change: @(block) requires @(end) from now on.
+ Blocks no longer extend to the end of the surrounding
+ scope.
+
+ * match.c (v_block): Rewrite for new syntax.
+
+ * parser.l (BLOCK): New token type handled.
+
+ * parser.y (BLOCK): New token.
+ (block_clause): New nonterminal grammar symbol.
+ (clause): Collateral fix: replaced a bunch of
+ list(X, nao) forms with cons(X, nil).
+ Introduced block_clause as a constituent of clause.
+
+ * txr.1: Revamped documentation of block, and
+ wrote about using blocks for reducing nested
+ skips and reducing backtracking in general.
+
+2012-03-13 Kaz Kylheku <kaz@kylheku.com>
+
* parser.l (ID_END): Bugfix: ID_END was defined incorrectly
for the current way in which an identifier token is recognized.
As a result @(collect-ing) was being interpreted as @(collect -ing).
diff --git a/match.c b/match.c
index 2b00c5cc..24d7a3f9 100644
--- a/match.c
+++ b/match.c
@@ -2110,20 +2110,35 @@ static val v_block(match_files_ctx *c)
{
spec_bind (specline, first_spec, c->spec);
- val name = first(rest(first_spec));
+ val args = rest(first_spec);
+ val name = first(args);
+ val spec = second(args);
if (rest(specline))
sem_error(specline, lit("unexpected material after block directive"), nao);
- if ((c->spec = rest(c->spec)) != nil)
{
uw_block_begin(name, result);
- result = match_files(*c);
+ result = match_files(mf_spec(*c, spec));
uw_block_end;
- return result;
- }
- return next_spec_k;
+ {
+ cons_bind (new_bindings, success, result);
+
+ if (!success) {
+ return nil;
+ } else if (success == t) {
+ c->data = nil;
+ } else {
+ cons_bind (new_data, new_line, success);
+ c->data = new_data;
+ c->data_lineno = new_line;
+ }
+
+ c->bindings = new_bindings;
+ return next_spec_k;
+ }
+ }
}
static val v_accept_fail(match_files_ctx *c)
diff --git a/parser.l b/parser.l
index 98a243ec..f7a655db 100644
--- a/parser.l
+++ b/parser.l
@@ -257,6 +257,12 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
return CASES;
}
+<SPECIAL>\({WS}block/{ID_END} {
+ yy_push_state(NESTED);
+ yylval.lineno = lineno;
+ return BLOCK;
+}
+
<SPECIAL>\({WS}choose/{ID_END} {
yy_push_state(NESTED);
yylval.lineno = lineno;
diff --git a/parser.y b/parser.y
index 8656de57..9a712d64 100644
--- a/parser.y
+++ b/parser.y
@@ -67,7 +67,7 @@ static val parsed_spec;
}
%token <lexeme> SPACE TEXT IDENT KEYWORD METAVAR
-%token <lineno> ALL SOME NONE MAYBE CASES CHOOSE GATHER
+%token <lineno> ALL SOME NONE MAYBE CASES BLOCK CHOOSE GATHER
%token <lineno> AND OR END COLLECT
%token <lineno> UNTIL COLL OUTPUT REPEAT REP SINGLE FIRST LAST EMPTY
%token <lineno> MOD MODLAST DEFINE TRY CATCH FINALLY
@@ -80,7 +80,7 @@ static val parsed_spec;
%token <chr> METAPAR METABKT SPLICE
%type <val> spec clauses clauses_opt clause
-%type <val> all_clause some_clause none_clause maybe_clause
+%type <val> all_clause some_clause none_clause maybe_clause block_clause
%type <val> cases_clause choose_clause gather_clause collect_clause until_last
%type <val> clause_parts additional_parts gather_parts additional_gather_parts
%type <val> output_clause define_clause try_clause catch_clauses_opt
@@ -128,19 +128,20 @@ clauses_opt : clauses { $$ = $1; }
| /* empty */ { $$ = nil; }
;
-clause : all_clause { $$ = list($1, nao); rlcp($$, $1); }
- | some_clause { $$ = list($1, nao); rlcp($$, $1); }
- | none_clause { $$ = list($1, nao); rlcp($$, $1); }
- | maybe_clause { $$ = list($1, nao); rlcp($$, $1); }
- | cases_clause { $$ = list($1, nao); rlcp($$, $1); }
- | choose_clause { $$ = list($1, nao); rlcp($$, $1); }
- | collect_clause { $$ = list($1, nao); rlcp($$, $1); }
- | gather_clause { $$ = list($1, nao); rlcp($$, $1); }
+clause : all_clause { $$ = cons($1, nil); rlcp($$, $1); }
+ | some_clause { $$ = cons($1, nil); rlcp($$, $1); }
+ | none_clause { $$ = cons($1, nil); rlcp($$, $1); }
+ | maybe_clause { $$ = cons($1, nil); rlcp($$, $1); }
+ | cases_clause { $$ = cons($1, nil); rlcp($$, $1); }
+ | block_clause { $$ = cons($1, nil); rlcp($$, $1); }
+ | choose_clause { $$ = cons($1, nil); rlcp($$, $1); }
+ | collect_clause { $$ = cons($1, nil); rlcp($$, $1); }
+ | gather_clause { $$ = cons($1, nil); rlcp($$, $1); }
| define_clause { $$ = list(define_transform($1), nao);
rlcp(car($$), $1);
rlcp($$, $1); }
- | try_clause { $$ = list($1, nao); rlcp($$, $1); }
- | output_clause { $$ = list($1, nao); rlcp($$, $1); }
+ | try_clause { $$ = cons($1, nil); rlcp($$, $1); }
+ | output_clause { $$ = cons($1, nil); rlcp($$, $1); }
| line { $$ = $1; }
| repeat_clause { $$ = nil;
yyerror("repeat outside of output"); }
@@ -196,6 +197,22 @@ cases_clause : CASES newl clause_parts { $$ = list(cases_s, $3, nao);
yyerror("empty cases clause"); }
;
+block_clause : BLOCK exprs_opt ')'
+ newl clauses_opt
+ END newl { val name = first($2);
+ if (gt(length($2), one))
+ yyerror("block: takes zero or no arguments");
+ if (name && !bindable(name))
+ yyerrorf(lit("block: ~s is not a bindable symbol"),
+ name, nao);
+ $$ = list(block_s, name, $5, nao);
+ rl($$, num($1)); }
+ | BLOCK exprs_opt ')'
+ newl error { $$ = nil;
+ yybadtoken(yychar,
+ lit("block clause")); }
+ ;
+
choose_clause : CHOOSE exprs_opt ')'
newl clause_parts { $$ = list(choose_s, $5, $2, nao);
rl($$, num($1)); }
diff --git a/txr.1 b/txr.1
index e5e63478..b51813d2 100644
--- a/txr.1
+++ b/txr.1
@@ -1136,8 +1136,9 @@ end of a line. Also Fails if no data remains (there is no current line).
Continue matching in another file or other data source.
.IP @(block)
-The remaining query is treated as an anonymous or named block.
-Blocks may be referenced by @(accept) and @(fail) directives.
+Groups to gether a sequence of directives into a logical name block,
+which can be explicitly terminated from within using
+the @(accept) and @(fail) directives.
Blocks are discussed in the section BLOCKS below.
.IP @(skip)
@@ -1524,6 +1525,65 @@ will keep scanning even though it has found the correct match, then backtrack
to the last good match once it runs out of data. The regular skip with explicit
@(eof) will stop when the @(eof) matches.
+.SS Reducing Backtracking with Blocks
+
+Skip can consider considerable CPU time when multiple skips are nested.
+Consider:
+
+ @(skip)
+ A
+ @(skip)
+ B
+ @(skip)
+ C
+
+This is actually nesting: the second a third skips occur within the body of the
+first one, and thus this creates nested iteration. TXR is searching for the
+combination of skips which find match the pattern of lines A, B and C, with
+backtracking behavior. The outermost skip marches through the data until it
+finds A, followed by a pattern match for the second skip. The second skip
+iterates within to find B, followed by the third skip, and the third skip
+iterates to find C. If there is only one line A, and one B, then this is
+reasonably fast. But suppose there are many lines matching A and B,
+giving rise to a large number combinations of skips which match A and B, and
+yet no match for C, triggering backtracking. The nested stepping which tries
+the combinations of A and B can give rise to a considerable running time.
+
+One way to deal with the problem is to unravel the nesting with the help of
+blocks. For example:
+
+ @(block)
+ @ (skip)
+ A
+ @(end)
+ @(block)
+ @ (skip)
+ B
+ @(end)
+ @(skip)
+ C
+
+Now the scope of each skip is just the remainder of the block in which it
+occurs. The first skip finds A, and then the block ends. Control passes to the
+next block, and backtracking will take place to a block which completed (unless
+all these blocks are enclosed in some larger construct which backtracks,
+causing the blocks to be re-executed.
+
+Of course, this rewrite is not equivalent, and cannot be used for instance
+in backreferencing situations such as:
+
+ @;
+ @; Find some three lines which are the same.
+ @;
+ @(skip)
+ @line
+ @(skip)
+ @line
+ @(skip)
+ @line
+
+This example depends on the nested search-within-search semantics.
+
.SS The Trailer Directive
The trailer directive introduces a trailing portion of a query or subquery
@@ -2627,28 +2687,36 @@ to @(block nil).
The @(skip) and @(collect) directives introduce implicit anonymous blocks,
as do function bodies.
+Blocks are useful for terminating parts of a pattern matchin search
+prematurely, and escaping to a higher level. This makes blocks not only
+useful for simplifying the semantics of certain pattern matches,
+but also an optimization tool.
+
+Judicious use of blocks and escapes can reduce or eliminate the amount
+backtracking that TXR performs.
+
.SS Block Scope
The names of blocks are in a distinct namespace from the variable binding
space. So @(block foo) has no interaction with the variable @foo.
A block extends from the @(block ...) directive which introduces it,
-to the end of the subquery in which that directive is contained. For instance:
+until the matching @(end), and may be empty. For instance:
@(some)
abc
@(block foo)
xyz
@(end)
+ @(end)
Here, the block foo occurs in a @(some) clause, and so it extends to the @(end)
-which terminates that clause. After that @(end), the name foo is not
-associated with a block (is not "in scope"). A block which is not contained in
-any subquery extends to the end of the overall query. Blocks are never
-terminated by @(end).
+which terminates the block. After that @(end), the name foo is not
+associated with a block (is not "in scope"). The second @(end) terminates
+the @(some) block.
The implicit anonymous block introduced by @(skip) has the same scope
-as the @(skip): they extends over all of the material which follows the skip,
+as the @(skip): it extends over all of the material which follows the skip,
to the end of the containing subquery.
.SS Block Nesting
@@ -2659,11 +2727,15 @@ which they are nested. For instance:
@(block)
@(block)
...
+ @(end)
+ @(end)
is a nesting of two anonymous blocks, and
@(block foo)
@(block foo)
+ @(end)
+ @(end)
is a nesting of two named blocks which happen to have the same name.
When a nested block has the same name as an outer block, it creates
@@ -2671,19 +2743,6 @@ a block scope in which the outer block is "shadowed"; that is to say,
directives which refer to that block name within the nested block refer to the
inner block, and not to the outer one.
-A more complicated example of nesting is:
-
- @(skip)
- abc
- @(block)
- @(some)
- @(block foo)
- @(end)
-
-Here, the @(skip) introduces an anonymous block. The explicit anonymous
-@(block) is nested within skip's anonymous block and shadows it.
-The foo block is nested within both of these.
-
.SS Block Semantics
A block normally does nothing. The query material in the block is evaluated