From 5377fc62360d5e9b5c94078e41d10a96e0e8a535 Mon Sep 17 00:00:00 2001 From: Nick Lavers Date: Fri, 19 Aug 2016 10:11:59 +0100 Subject: [SPARK-16961][CORE] Fixed off-by-one error that biased randomizeInPlace JIRA issue link: https://issues.apache.org/jira/browse/SPARK-16961 Changed one line of Utils.randomizeInPlace to allow elements to stay in place. Created a unit test that runs a Pearson's chi squared test to determine whether the output diverges significantly from a uniform distribution. Author: Nick Lavers Closes #14551 from nicklavers/SPARK-16961-randomizeInPlace. --- .../scala/org/apache/spark/util/UtilsSuite.scala | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'core/src/test/scala/org/apache') diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 30952a9458..4715fd2937 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -31,6 +31,7 @@ import scala.util.Random import com.google.common.io.Files import org.apache.commons.lang3.SystemUtils +import org.apache.commons.math3.stat.inference.ChiSquareTest import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -874,4 +875,38 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { } } } + + test("chi square test of randomizeInPlace") { + // Parameters + val arraySize = 10 + val numTrials = 1000 + val threshold = 0.05 + val seed = 1L + + // results(i)(j): how many times Utils.randomize moves an element from position j to position i + val results = Array.ofDim[Long](arraySize, arraySize) + + // This must be seeded because even a fair random process will fail this test with + // probability equal to the value of `threshold`, which is inconvenient for a unit test. + val rand = new java.util.Random(seed) + val range = 0 until arraySize + + for { + _ <- 0 until numTrials + trial = Utils.randomizeInPlace(range.toArray, rand) + i <- range + } results(i)(trial(i)) += 1L + + val chi = new ChiSquareTest() + + // We expect an even distribution; this array will be rescaled by `chiSquareTest` + val expected = Array.fill(arraySize * arraySize)(1.0) + val observed = results.flatten + + // Performs Pearson's chi-squared test. Using the sum-of-squares as the test statistic, gives + // the probability of a uniform distribution producing results as extreme as `observed` + val pValue = chi.chiSquareTest(expected, observed) + + assert(pValue > threshold) + } } -- cgit v1.2.3