Improvements in the random number generator.

author: Arnold D. Robbins <arnold@skeeve.com> 2013-09-25 12:59:21 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2013-09-25 12:59:21 +0300
commit: 026a126c27f648839325c7fadb37c42d2935f467 (patch)
tree: 2657622bcb410d484434804bc900dde0eca53d13
parent: 7234d4a6c1ef8763ab6ac25619f8a225260d60b8 (diff)
download: egawk-026a126c27f648839325c7fadb37c42d2935f467.tar.gz
egawk-026a126c27f648839325c7fadb37c42d2935f467.tar.bz2
egawk-026a126c27f648839325c7fadb37c42d2935f467.zip
9 files changed, 168 insertions, 4 deletions
diff --git a/ChangeLog b/ChangeLog
index 64caea35..fe593aeb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2013-09-25         Arnold D. Robbins     <arnold@skeeve.com>
+
+	* builtin.c (do_rand): Make the result more random by calling
+	random() twice. See the comment in the code. Thanks to
+	Bob Jewett <jewett@bill.scs.agilent.com> for the report and
+	the fix.
+
 2013-09-24         Arnold D. Robbins     <arnold@skeeve.com>
 
 	* debug.c (find_rule): Handle case where lineno is zero. Can happen
diff --git a/NEWS b/NEWS
index b6f97d5e..3284ec1c 100644
--- a/NEWS
+++ b/NEWS
@@ -11,6 +11,11 @@ Changes from 4.1.1 to 4.2.0
    gawk's environment, affecting any programs run by system()
    or for piped redirections.
 
+2. The series of numbers returned by rand() should now be "more
+   random" than previously.  Gawk's rand() remains repeatable; you will
+   get the same series of numbers each time you call rand() repeatedly,
+   but this will be a different series than previously.
+
 Changes from 4.1.0 to 4.1.1
 ---------------------------
 
diff --git a/builtin.c b/builtin.c
index 9258ea60..925a92bd 100644
--- a/builtin.c
+++ b/builtin.c
@@ -2372,6 +2372,8 @@ static char *const state = (char *const) istate;
 NODE *
 do_rand(int nargs ATTRIBUTE_UNUSED)
 {
+	double tmprand;
+#define RAND_DIVISOR ((double)GAWK_RANDOM_MAX+1.0)
 	if (firstrand) {
 		(void) initstate((unsigned) 1, state, SIZEOF_STATE);
 		/* don't need to srandom(1), initstate() does it for us. */
@@ -2383,7 +2385,24 @@ do_rand(int nargs ATTRIBUTE_UNUSED)
 	 *
 	 * 	0 <= n < 1
 	 */
-	return make_number((AWKNUM) (random() % GAWK_RANDOM_MAX) / GAWK_RANDOM_MAX);
+ 	/*
+	 * Date: Wed, 28 Aug 2013 17:52:46 -0700
+	 * From: Bob Jewett <jewett@bill.scs.agilent.com>
+	 *
+ 	 * Call random() twice to fill in more bits in the value
+ 	 * of the double.  Also, there is a bug in random() such
+ 	 * that when the values of successive values are combined
+ 	 * like (rand1*rand2)^2, (rand3*rand4)^2,  ...  the
+ 	 * resulting time series is not white noise.  The
+ 	 * following also seems to fix that bug. 
+ 	 *
+ 	 * The add/subtract 0.5 keeps small bits from filling
+ 	 * below 2^-53 in the double, not that anyone should be
+ 	 * looking down there. 
+ 	 */
+ 
+ 	tmprand = 0.5 + ( (random()/RAND_DIVISOR + random()) / RAND_DIVISOR);
+ 	return make_number((AWKNUM) (tmprand - 0.5));
 }
 
 /* do_srand --- seed the random number generator */
diff --git a/test/ChangeLog b/test/ChangeLog
index ad0366f2..b1c3e7b9 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,9 @@
+2013-09-25         Arnold D. Robbins     <arnold@skeeve.com>
+
+	* Makfile.am (randtest): New test.
+	* randtest.sh, randtest.ok: New files.
+	* rand.ok: Updated to reflect new results based on code change.
+
 2013-09-13         Arnold D. Robbins     <arnold@skeeve.com>
 
 	* Makefile.am: Fix quoting for generation of Maketests file so
diff --git a/test/Makefile.am b/test/Makefile.am
index 8288cf7d..d09e6988 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -691,6 +691,8 @@ EXTRA_DIST = \
 	rand-mpfr.ok \
 	rand.awk \
 	rand.ok \
+	randtest.sh \
+	randtest.ok \
 	range1.awk \
 	range1.in \
 	range1.ok \
@@ -952,7 +954,7 @@ BASIC_TESTS = \
 	paramdup paramres paramtyp paramuninitglobal parse1 parsefld parseme \
 	pcntplus posix2008sub prdupval prec printf0 printf1 prmarscl prmreuse \
 	prt1eval prtoeval \
-	rand range1 rebt8b1 redfilnm regeq regexprange regrange reindops \
+	rand randtest range1 rebt8b1 redfilnm regeq regexprange regrange reindops \
 	reparse resplit rri1 rs rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 \
 	rstest3 rstest4 rstest5 rswhite \
 	scalar sclforin sclifin sortempty splitargv splitarr splitdef \
@@ -1858,6 +1860,11 @@ dfamb1:
 	AWKPATH="$(srcdir)" $(AWK) -f $@.awk "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
 	@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 
+randtest::
+	@echo $@
+	@GAWK="$(AWKPROG)" "$(srcdir)"/randtest.sh >_$@
+	@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
 # Targets generated for other tests:
 include Maketests
 
diff --git a/test/Makefile.in b/test/Makefile.in
index 31a22602..0b7f1f46 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -909,6 +909,8 @@ EXTRA_DIST = \
 	rand-mpfr.ok \
 	rand.awk \
 	rand.ok \
+	randtest.sh \
+	randtest.ok \
 	range1.awk \
 	range1.in \
 	range1.ok \
@@ -1169,7 +1171,7 @@ BASIC_TESTS = \
 	paramdup paramres paramtyp paramuninitglobal parse1 parsefld parseme \
 	pcntplus posix2008sub prdupval prec printf0 printf1 prmarscl prmreuse \
 	prt1eval prtoeval \
-	rand range1 rebt8b1 redfilnm regeq regexprange regrange reindops \
+	rand randtest range1 rebt8b1 redfilnm regeq regexprange regrange reindops \
 	reparse resplit rri1 rs rsnul1nl rsnulbig rsnulbig2 rstest1 rstest2 \
 	rstest3 rstest4 rstest5 rswhite \
 	scalar sclforin sclifin sortempty splitargv splitarr splitdef \
@@ -2251,6 +2253,11 @@ dfamb1:
 	@[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \
 	AWKPATH="$(srcdir)" $(AWK) -f $@.awk "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
 	@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
+randtest::
+	@echo $@
+	@GAWK="$(AWKPROG)" "$(srcdir)"/randtest.sh >_$@
+	@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 Gt-dummy:
 # file Maketests, generated from Makefile.am by the Gentests program
 addcomma:
diff --git a/test/rand.ok b/test/rand.ok
index 60432b95..1df4ba39 100644
--- a/test/rand.ok
+++ b/test/rand.ok
@@ -1 +1 @@
- 62  67  88   6  35  77   3  68  30  96  90  26  35   8  88  93  49  53  37 
+ 67   6  77  68  96  26   8  93  53  74  53  95  78  74  96  77  33  58  91 
diff --git a/test/randtest.ok b/test/randtest.ok
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/test/randtest.ok
diff --git a/test/randtest.sh b/test/randtest.sh
new file mode 100755
index 00000000..b17fda73
--- /dev/null
+++ b/test/randtest.sh
@@ -0,0 +1,113 @@
+# THIS PURPOSELY DOES NOT HAVE A !# LINE !!!!
+#
+# Date: Mon, 9 Sep 2013 14:49:43 -0700
+# From: Bob Jewett <jewett@bill.scs.agilent.com>
+# Message-Id: <201309092149.r89Lnh94010909@bill.scs.agilent.com>
+# To: arnold@skeeve.com
+# Subject: Re: [bug-gawk] Bug in random() in builtin.c
+# 
+# Hi Arnold,
+# 
+# Attached below is a script that tests gawk for this particular
+# rand() problem.  The pair-wise combinations show a strong
+# autocorrelation for a delay of 31 pairs of rand() samples. 
+# 
+# The script prints out the measured autocorrelation for a record
+# of NSAMPLES pairs.  It also prints a fail message at the end if
+# it fails. 
+# 
+# If you want to see the autocorrelation values, there is a print
+# statement that if uncommented will save them to a file.
+# 
+# Please let me know if the mailer screws up the transfer or
+# if you have any questions about the test.
+# 
+# Best regards,
+# Bob
+# 
+# -------------- test_pair_power_autocorrelation -----------------------
+# 
+#!/bin/ksh
+
+#GAWK=/bin/gawk
+
+# ADR: Get GAWK from the environment.
+# Additional note: This wants ksh/bash for the use of $RANDOM below to
+# seed the generator. However, shells that don't provide it won't be
+# a problem since gawk will then seed the generator with the time of day,
+# as srand() will be called without an argument.
+
+# large NSAMPLES and NRUNS will bring any correlation out of the noise better
+NSAMPLES=1024; MAX_ALLOWED_SIGMA=5; NRUNS=50;
+
+$GAWK 'BEGIN{ 
+    srand('$RANDOM');
+    nsamples=('$NSAMPLES');
+    max_allowed_sigma=('$MAX_ALLOWED_SIGMA');
+    nruns=('$NRUNS');
+    for(tau=0;tau<nsamples/2;tau++) corr[tau]=0;
+
+    for(run=0;run<nruns;run++) {
+	sum=0;
+
+	# Fill an array with a sequence of samples that are a
+	# function of pairs of rand() values.
+
+	for(i=0;i<nsamples;i++) {
+	   samp[i]=((rand()-0.5)*(rand()-0.5))^2;
+	   sum=sum+samp[i];
+	   }
+
+	# Subtract off the mean of the sequence:
+
+	mean=sum/nsamples;
+	for(i=0;i<nsamples;i++) samp[i]=samp[i]-mean;
+
+	# Calculate an autocorrelation function on the sequence.
+	# Because the values of rand() should be independent, there
+	# should be no peaks in the autocorrelation.
+
+	for(tau=0;tau<nsamples/2;tau++) {
+	    sum=0;
+	    for(i=0;i<nsamples/2;i++) sum=sum+samp[i]*samp[i+tau];
+	    corr[tau]=corr[tau]+sum;
+	    }
+
+	}
+    # Normalize the autocorrelation to the tau=0 value.
+
+    max_corr=corr[0];
+    for(tau=0;tau<nsamples/2;tau++) corr[tau]=corr[tau]/max_corr;
+
+    # OPTIONALLY Print out the autocorrelation values:
+
+    # for(tau=0;tau<nsamples/2;tau++) print tau, corr[tau] > "pairpower_corr.data";
+
+    # Calculate the sigma for the non-zero tau values: 
+
+    power_sum=0;
+
+    for(tau=1;tau<nsamples/2;tau++) power_sum=power_sum+(corr[tau])^2;
+
+    sigma=sqrt(power_sum/(nsamples/2-1));
+
+    # See if any of the correlations exceed a reasonable number of sigma:
+
+    passed=1;
+    for(tau=1;tau<nsamples/2;tau++) {
+	if ( abs(corr[tau])/sigma > max_allowed_sigma ) {
+	    print "Tau=", tau ", Autocorr=", corr[tau]/sigma, "sigma";
+	    passed=0;
+	    }
+        }
+    if(!passed) {
+	print "Test failed."
+	exit(1);
+        }
+    else exit (0);
+    }
+
+function abs(abs_input) { return(sqrt(abs_input^2)) ; }
+'
+
+exit 0
author	Arnold D. Robbins <arnold@skeeve.com>	2013-09-25 12:59:21 +0300
committer	Arnold D. Robbins <arnold@skeeve.com>	2013-09-25 12:59:21 +0300
commit	026a126c27f648839325c7fadb37c42d2935f467 (patch)
tree	2657622bcb410d484434804bc900dde0eca53d13
parent	7234d4a6c1ef8763ab6ac25619f8a225260d60b8 (diff)
download	egawk-026a126c27f648839325c7fadb37c42d2935f467.tar.gz egawk-026a126c27f648839325c7fadb37c42d2935f467.tar.bz2 egawk-026a126c27f648839325c7fadb37c42d2935f467.zip