aboutsummaryrefslogtreecommitdiff
path: root/core/src/test/scala/org/apache
diff options
context:
space:
mode:
author Nick Lavers <nick.lavers@videoamp.com> 2016-08-19 10:11:59 +0100
committer Sean Owen <sowen@cloudera.com> 2016-08-19 10:11:59 +0100
commit 5377fc62360d5e9b5c94078e41d10a96e0e8a535 (patch)
tree 1998db20af8d7cc93a2b00308c0f5e8e2b3166a9 /core/src/test/scala/org/apache
parent 287bea13050b8eedc3b8b6b3491f1b5e5bc24d7a (diff)
download spark-5377fc62360d5e9b5c94078e41d10a96e0e8a535.tar.gz
spark-5377fc62360d5e9b5c94078e41d10a96e0e8a535.tar.bz2
spark-5377fc62360d5e9b5c94078e41d10a96e0e8a535.zip
[SPARK-16961][CORE] Fixed off-by-one error that biased randomizeInPlace
JIRA issue link: https://issues.apache.org/jira/browse/SPARK-16961 Changed one line of Utils.randomizeInPlace to allow elements to stay in place. Created a unit test that runs a Pearson's chi squared test to determine whether the output diverges significantly from a uniform distribution. Author: Nick Lavers <nick.lavers@videoamp.com> Closes #14551 from nicklavers/SPARK-16961-randomizeInPlace.
Diffstat (limited to 'core/src/test/scala/org/apache')
-rw-r--r-- core/src/test/scala/org/apache/spark/util/UtilsSuite.scala 35
1 files changed, 35 insertions, 0 deletions
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index 30952a9458..4715fd2937 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -31,6 +31,7 @@ import scala.util.Random
import com.google.common.io.Files
import org.apache.commons.lang3.SystemUtils
+import org.apache.commons.math3.stat.inference.ChiSquareTest
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
@@ -874,4 +875,38 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
}
}
}
+
+ test("chi square test of randomizeInPlace") {
+ // Parameters
+ val arraySize = 10
+ val numTrials = 1000
+ val threshold = 0.05
+ val seed = 1L
+
+ // results(i)(j): how many times Utils.randomizeInPlace moves the element at position j to position i
+ val results = Array.ofDim[Long](arraySize, arraySize)
+
+ // This must be seeded because even a fair random process will fail this test with
+ // probability equal to the value of `threshold`, which is inconvenient for a unit test.
+ val rand = new java.util.Random(seed)
+ val range = 0 until arraySize
+
+ for {
+ _ <- 0 until numTrials
+ trial = Utils.randomizeInPlace(range.toArray, rand)
+ i <- range
+ } results(i)(trial(i)) += 1L
+
+ val chi = new ChiSquareTest()
+
+ // We expect an even distribution; this array will be rescaled by `chiSquareTest`
+ val expected = Array.fill(arraySize * arraySize)(1.0)
+ val observed = results.flatten
+
+ // Performs Pearson's chi-squared test. Using the sum-of-squares as the test statistic, this
+ // gives the probability of a uniform distribution producing results as extreme as `observed`
+ val pValue = chi.chiSquareTest(expected, observed)
+
+ assert(pValue > threshold)
+ }
}