6 files changed, 50 insertions, 15 deletions
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 8c380fbf15..dfb7a185cd 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -546,15 +546,15 @@ test_that("spark.gaussianMixture", {
   df <- createDataFrame(data, c("x1", "x2"))
   model <- spark.gaussianMixture(df, ~ x1 + x2, k = 2)
   stats <- summary(model)
-  rLambda <- c(0.4, 0.6)
-  rMu <- c(-0.2614822, 0.5128697, 2.647284, 4.544682)
-  rSigma <- c(0.08427399, 0.00548772, 0.00548772, 0.09090715,
-              0.1641373, -0.1673806, -0.1673806, 0.7508951)
-  expect_equal(stats$lambda, rLambda)
+  rLambda <- c(0.50861, 0.49139)
+  rMu <- c(0.267, 1.195, 2.743, 4.730)
+  rSigma <- c(1.099, 1.339, 1.339, 1.798,
+              0.145, -0.309, -0.309, 0.716)
+  expect_equal(stats$lambda, rLambda, tolerance = 1e-3)
   expect_equal(unlist(stats$mu), rMu, tolerance = 1e-3)
   expect_equal(unlist(stats$sigma), rSigma, tolerance = 1e-3)
   p <- collect(select(predict(model, df), "prediction"))
-  expect_equal(p$prediction, c(0, 0, 0, 0, 1, 1, 1, 1, 1, 1))
+  expect_equal(p$prediction, c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1))
 
   # Test model save/load
   modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 0ae44a2ed7..9b4274a27b 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -824,7 +824,7 @@ private[spark] object Utils extends Logging {
    */
   def randomizeInPlace[T](arr: Array[T], rand: Random = new Random): Array[T] = {
     for (i <- (arr.length - 1) to 1 by -1) {
-      val j = rand.nextInt(i)
+      val j = rand.nextInt(i + 1)
       val tmp = arr(j)
       arr(j) = arr(i)
       arr(i) = tmp
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index 30952a9458..4715fd2937 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -31,6 +31,7 @@ import scala.util.Random
 
 import com.google.common.io.Files
 import org.apache.commons.lang3.SystemUtils
+import org.apache.commons.math3.stat.inference.ChiSquareTest
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 
@@ -874,4 +875,38 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
       }
     }
   }
+
+  test("chi square test of randomizeInPlace") {
+    // Parameters
+    val arraySize = 10
+    val numTrials = 1000
+    val threshold = 0.05
+    val seed = 1L
+
+    // results(i)(j): how many times Utils.randomize moves an element from position j to position i
+    val results = Array.ofDim[Long](arraySize, arraySize)
+
+    // This must be seeded because even a fair random process will fail this test with
+    // probability equal to the value of `threshold`, which is inconvenient for a unit test.
+    val rand = new java.util.Random(seed)
+    val range = 0 until arraySize
+
+    for {
+      _ <- 0 until numTrials
+      trial = Utils.randomizeInPlace(range.toArray, rand)
+      i <- range
+    } results(i)(trial(i)) += 1L
+
+    val chi = new ChiSquareTest()
+
+    // We expect an even distribution; this array will be rescaled by `chiSquareTest`
+    val expected = Array.fill(arraySize * arraySize)(1.0)
+    val observed = results.flatten
+
+    // Performs Pearson's chi-squared test. Using the sum-of-squares as the test statistic, gives
+    // the probability of a uniform distribution producing results as extreme as `observed`
+    val pValue = chi.chiSquareTest(expected, observed)
+
+    assert(pValue > threshold)
+  }
 }
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 75d9a0e8ca..4dab83362a 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -99,9 +99,9 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     +--------------------+--------------------+
     |                mean|                 cov|
     +--------------------+--------------------+
-    |[-0.0550000000000...|0.002025000000000...|
-    |[0.82499999999999...|0.005625000000000...|
-    |[-0.87,-0.7200000...|0.001600000000000...|
+    |[0.82500000140229...|0.005625000000006...|
+    |[-0.4777098016092...|0.167969502720916...|
+    |[-0.4472625243352...|0.167304119758233...|
     +--------------------+--------------------+
     ...
     >>> transformed = model.transform(df).select("features", "prediction")
@@ -124,9 +124,9 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     +--------------------+--------------------+
     |                mean|                 cov|
     +--------------------+--------------------+
-    |[-0.0550000000000...|0.002025000000000...|
-    |[0.82499999999999...|0.005625000000000...|
-    |[-0.87,-0.7200000...|0.001600000000000...|
+    |[0.82500000140229...|0.005625000000006...|
+    |[-0.4777098016092...|0.167969502720916...|
+    |[-0.4472625243352...|0.167304119758233...|
     +--------------------+--------------------+
     ...
 
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index c8c3c42774..29aa615125 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -416,7 +416,7 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader):
     ...                 4.5605,  5.2043,  6.2734])
     >>> clusterdata_2 = sc.parallelize(data.reshape(5,3))
     >>> model = GaussianMixture.train(clusterdata_2, 2, convergenceTol=0.0001,
-    ...                               maxIterations=150, seed=10)
+    ...                               maxIterations=150, seed=4)
     >>> labels = model.predict(clusterdata_2).collect()
     >>> labels[0]==labels[1]
     True
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 99bf50b5a1..3f3dfd186c 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -550,7 +550,7 @@ class ListTests(MLlibTestCase):
             [-6, -7],
         ])
         clusters = GaussianMixture.train(data, 2, convergenceTol=0.001,
-                                         maxIterations=10, seed=56)
+                                         maxIterations=10, seed=1)
         labels = clusters.predict(data).collect()
         self.assertEqual(labels[0], labels[1])
         self.assertEqual(labels[2], labels[3])