diff options
author | Nick Lavers <nick.lavers@videoamp.com> | 2016-08-19 10:11:59 +0100 |
---|---|---|
committer | Sean Owen <sowen@cloudera.com> | 2016-08-19 10:11:59 +0100 |
commit | 5377fc62360d5e9b5c94078e41d10a96e0e8a535 (patch) | |
tree | 1998db20af8d7cc93a2b00308c0f5e8e2b3166a9 /core/src/test | |
parent | 287bea13050b8eedc3b8b6b3491f1b5e5bc24d7a (diff) | |
download | spark-5377fc62360d5e9b5c94078e41d10a96e0e8a535.tar.gz spark-5377fc62360d5e9b5c94078e41d10a96e0e8a535.tar.bz2 spark-5377fc62360d5e9b5c94078e41d10a96e0e8a535.zip |
[SPARK-16961][CORE] Fixed off-by-one error that biased randomizeInPlace
JIRA issue link:
https://issues.apache.org/jira/browse/SPARK-16961
Changed one line of Utils.randomizeInPlace to allow elements to stay in place.
Created a unit test that runs a Pearson's chi squared test to determine whether the output diverges significantly from a uniform distribution.
Author: Nick Lavers <nick.lavers@videoamp.com>
Closes #14551 from nicklavers/SPARK-16961-randomizeInPlace.
Diffstat (limited to 'core/src/test')
-rw-r--r-- | core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 35 |
1 files changed, 35 insertions, 0 deletions
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 30952a9458..4715fd2937 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -31,6 +31,7 @@ import scala.util.Random import com.google.common.io.Files import org.apache.commons.lang3.SystemUtils +import org.apache.commons.math3.stat.inference.ChiSquareTest import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -874,4 +875,38 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { } } } + + test("chi square test of randomizeInPlace") { + // Parameters + val arraySize = 10 + val numTrials = 1000 + val threshold = 0.05 + val seed = 1L + + // results(i)(j): how many times Utils.randomize moves an element from position j to position i + val results = Array.ofDim[Long](arraySize, arraySize) + + // This must be seeded because even a fair random process will fail this test with + // probability equal to the value of `threshold`, which is inconvenient for a unit test. + val rand = new java.util.Random(seed) + val range = 0 until arraySize + + for { + _ <- 0 until numTrials + trial = Utils.randomizeInPlace(range.toArray, rand) + i <- range + } results(i)(trial(i)) += 1L + + val chi = new ChiSquareTest() + + // We expect an even distribution; this array will be rescaled by `chiSquareTest` + val expected = Array.fill(arraySize * arraySize)(1.0) + val observed = results.flatten + + // Performs Pearson's chi-squared test. Using the sum-of-squares as the test statistic, gives + // the probability of a uniform distribution producing results as extreme as `observed` + val pValue = chi.chiSquareTest(expected, observed) + + assert(pValue > threshold) + } } |