aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--R/pkg/inst/tests/testthat/test_mllib.R12
-rw-r--r--core/src/main/scala/org/apache/spark/util/Utils.scala2
-rw-r--r--core/src/test/scala/org/apache/spark/util/UtilsSuite.scala35
-rw-r--r--python/pyspark/ml/clustering.py12
-rw-r--r--python/pyspark/mllib/clustering.py2
-rw-r--r--python/pyspark/mllib/tests.py2
6 files changed, 50 insertions, 15 deletions
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 8c380fbf15..dfb7a185cd 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -546,15 +546,15 @@ test_that("spark.gaussianMixture", {
df <- createDataFrame(data, c("x1", "x2"))
model <- spark.gaussianMixture(df, ~ x1 + x2, k = 2)
stats <- summary(model)
- rLambda <- c(0.4, 0.6)
- rMu <- c(-0.2614822, 0.5128697, 2.647284, 4.544682)
- rSigma <- c(0.08427399, 0.00548772, 0.00548772, 0.09090715,
- 0.1641373, -0.1673806, -0.1673806, 0.7508951)
- expect_equal(stats$lambda, rLambda)
+ rLambda <- c(0.50861, 0.49139)
+ rMu <- c(0.267, 1.195, 2.743, 4.730)
+ rSigma <- c(1.099, 1.339, 1.339, 1.798,
+ 0.145, -0.309, -0.309, 0.716)
+ expect_equal(stats$lambda, rLambda, tolerance = 1e-3)
expect_equal(unlist(stats$mu), rMu, tolerance = 1e-3)
expect_equal(unlist(stats$sigma), rSigma, tolerance = 1e-3)
p <- collect(select(predict(model, df), "prediction"))
- expect_equal(p$prediction, c(0, 0, 0, 0, 1, 1, 1, 1, 1, 1))
+ expect_equal(p$prediction, c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1))
# Test model save/load
modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 0ae44a2ed7..9b4274a27b 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -824,7 +824,7 @@ private[spark] object Utils extends Logging {
*/
def randomizeInPlace[T](arr: Array[T], rand: Random = new Random): Array[T] = {
for (i <- (arr.length - 1) to 1 by -1) {
- val j = rand.nextInt(i)
+ val j = rand.nextInt(i + 1)
val tmp = arr(j)
arr(j) = arr(i)
arr(i) = tmp
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index 30952a9458..4715fd2937 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -31,6 +31,7 @@ import scala.util.Random
import com.google.common.io.Files
import org.apache.commons.lang3.SystemUtils
+import org.apache.commons.math3.stat.inference.ChiSquareTest
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
@@ -874,4 +875,38 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
}
}
}
+
+ test("chi square test of randomizeInPlace") {
+ // Parameters
+ val arraySize = 10
+ val numTrials = 1000
+ val threshold = 0.05
+ val seed = 1L
+
+ // results(i)(j): how many times Utils.randomizeInPlace moves an element from position j to i
+ val results = Array.ofDim[Long](arraySize, arraySize)
+
+ // This must be seeded because even a fair random process will fail this test with
+ // probability equal to the value of `threshold`, which is inconvenient for a unit test.
+ val rand = new java.util.Random(seed)
+ val range = 0 until arraySize
+
+ for {
+ _ <- 0 until numTrials
+ trial = Utils.randomizeInPlace(range.toArray, rand)
+ i <- range
+ } results(i)(trial(i)) += 1L
+
+ val chi = new ChiSquareTest()
+
+ // We expect an even distribution; this array will be rescaled by `chiSquareTest`
+ val expected = Array.fill(arraySize * arraySize)(1.0)
+ val observed = results.flatten
+
+ // Performs Pearson's chi-squared test. With the sum-of-squares as the test statistic, it gives
+ // the probability of a uniform distribution producing results as extreme as `observed`
+ val pValue = chi.chiSquareTest(expected, observed)
+
+ assert(pValue > threshold)
+ }
}
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 75d9a0e8ca..4dab83362a 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -99,9 +99,9 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
+--------------------+--------------------+
| mean| cov|
+--------------------+--------------------+
- |[-0.0550000000000...|0.002025000000000...|
- |[0.82499999999999...|0.005625000000000...|
- |[-0.87,-0.7200000...|0.001600000000000...|
+ |[0.82500000140229...|0.005625000000006...|
+ |[-0.4777098016092...|0.167969502720916...|
+ |[-0.4472625243352...|0.167304119758233...|
+--------------------+--------------------+
...
>>> transformed = model.transform(df).select("features", "prediction")
@@ -124,9 +124,9 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
+--------------------+--------------------+
| mean| cov|
+--------------------+--------------------+
- |[-0.0550000000000...|0.002025000000000...|
- |[0.82499999999999...|0.005625000000000...|
- |[-0.87,-0.7200000...|0.001600000000000...|
+ |[0.82500000140229...|0.005625000000006...|
+ |[-0.4777098016092...|0.167969502720916...|
+ |[-0.4472625243352...|0.167304119758233...|
+--------------------+--------------------+
...
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index c8c3c42774..29aa615125 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -416,7 +416,7 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader):
... 4.5605, 5.2043, 6.2734])
>>> clusterdata_2 = sc.parallelize(data.reshape(5,3))
>>> model = GaussianMixture.train(clusterdata_2, 2, convergenceTol=0.0001,
- ... maxIterations=150, seed=10)
+ ... maxIterations=150, seed=4)
>>> labels = model.predict(clusterdata_2).collect()
>>> labels[0]==labels[1]
True
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 99bf50b5a1..3f3dfd186c 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -550,7 +550,7 @@ class ListTests(MLlibTestCase):
[-6, -7],
])
clusters = GaussianMixture.train(data, 2, convergenceTol=0.001,
- maxIterations=10, seed=56)
+ maxIterations=10, seed=1)
labels = clusters.predict(data).collect()
self.assertEqual(labels[0], labels[1])
self.assertEqual(labels[2], labels[3])