summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2016-09-25 10:40:51 -0700
committerKaz Kylheku <kaz@kylheku.com>2016-09-25 10:40:51 -0700
commitb0bbe6e9dfd169f78b4908296d6edba52ed9a707 (patch)
tree9fe148cacf3e5252928b098fe33c41af6e9cf614
parent7656e99c9e1ffb509a6310cadca26c4c1c7008c9 (diff)
downloadtxr-b0bbe6e9dfd169f78b4908296d6edba52ed9a707.tar.gz
txr-b0bbe6e9dfd169f78b4908296d6edba52ed9a707.tar.bz2
txr-b0bbe6e9dfd169f78b4908296d6edba52ed9a707.zip
awk macro: proper fs semantics in paragraph mode.
* share/txr/stdlib/awk.tl (sys:awk-state): New slots: par-mode, par-mode-fs, par-mode-prev-fs. (sys:awk-state rec-to-f): In paragraph mode, detect that fs has changed since the last call. In that case, take the user's fs and add to it a newline match. If it is a regex, take the source, wrap it in an alternation with a newline match, and recompile the regex. If it's a string, build a regex around it and compile. (sys:awk-state loop): Maintain the par-mode slot in the state structure as the rs value triggers transitions into or out of paragraph mode. * txr.1: Updated documentation for rs.
-rw-r--r--share/txr/stdlib/awk.tl40
-rw-r--r--txr.129
2 files changed, 39 insertions, 30 deletions
diff --git a/share/txr/stdlib/awk.tl b/share/txr/stdlib/awk.tl
index 3efd8cc6..e99f62d2 100644
--- a/share/txr/stdlib/awk.tl
+++ b/share/txr/stdlib/awk.tl
@@ -37,6 +37,7 @@
(rec-num 0)
rec orig-rec fields nf
rng-vec (rng-n 0)
+ par-mode par-mode-fs par-mode-prev-fs
(:postinit (self)
(if (plusp self.rng-n)
(set self.rng-vec (vector self.rng-n)))
@@ -61,8 +62,18 @@
(if (and (not self.kfs) (equal self.rec ""))
(set self.fields nil
self.nf 0)
- (set self.fields (split-str self.rec self.fs self.kfs)
- self.nf (length self.fields))))
+ (let ((eff-fs (if self.par-mode
+ (if (equal self.fs self.par-mode-prev-fs)
+ self.par-mode-fs
+ (set self.par-mode-prev-fs self.fs
+ self.par-mode-fs
+ (regex-compile ^(or ,(if (regexp self.fs)
+ (regex-source self.fs)
+ self.fs)
+ "\n"))))
+ self.fs)))
+ (set self.fields (split-str self.rec eff-fs self.kfs)
+ self.nf (length self.fields)))))
(self.ft
(set self.fields (tok-str self.rec self.ft self.kfs)
self.nf (length self.fields)))
@@ -95,20 +106,23 @@
(set cached-rr
(cond
((and (equal aws.rs "\n") (not aws.krs))
+ (set aws.par-mode nil)
(lambda () (get-line stin)))
((null aws.rs)
- (let ((rin (record-adapter #/\n[ \n\t]*\n/))
- (flag t))
- (lambda ()
- (let ((r (get-line rin)))
- (cond
- (flag
- (set flag nil)
- (if (equal r "")
- (get-line rin)
- r))
- (t r))))))
+ (set aws.par-mode t)
+ (let ((rin (record-adapter #/\n[ \n\t]*\n/))
+ (flag t))
+ (lambda ()
+ (let ((r (get-line rin)))
+ (cond
+ (flag
+ (set flag nil)
+ (if (equal r "")
+ (get-line rin)
+ r))
+ (t r))))))
(t
+ (set aws.par-mode nil)
(let ((rin (record-adapter (if (regexp aws.rs)
aws.rs
(regex-compile aws.rs))
diff --git a/txr.1 b/txr.1
index 508b8c72..bea89447 100644
--- a/txr.1
+++ b/txr.1
@@ -38597,27 +38597,22 @@ or more blank lines (empty lines or lines containing only a mixture of
tabs and spaces). This means that, effectively, the record-separating
sequences match the regular expression
.codn "/\en[ \en\et]*\en/" .
-There is a difference between paragraph mode and simply using the above
+
+There are two differences between paragraph mode and simply using the above
regular expression as
.codn rs .
-The difference is that if the first record which is read upon entering
+The first difference is that if the first record which is read upon entering
paragraph mode is empty (because the input begins with a match for the
-separator regex), then that record is thrown away, and the next record
-is read.
-
-Note that the POSIX Awk paragraph mode (which occurs when
-.code RS
-is blank) there is an additional difference: regardless of the value
-of the field separator
-.codn FS ,
-newline characters separate fields. This behavior is not implemented
-in the
-.code awk
-macro. Since newlines are included as separators in under the default field
-separation, the behaviors match in that case. Code using a custom
+separator regex), then that record is thrown away, and the next record is read.
+The second difference is that, if field separation based on the
+.code fs
+variable is in effect, then regardless of the value of
+.codn fs ,
+newline characters separate fields. Therefore, the programmer-defined
.code fs
-must explicitly include a match for newline to obtain that as a field
-separator.
+doesn't have to include a match for newline. Moreover, if it is a simple
+fixed string, it need not be converted to a regular expression which also
+matches a newline.
.coNP Variable @ krs
.desc