From 5377fc62360d5e9b5c94078e41d10a96e0e8a535 Mon Sep 17 00:00:00 2001
From: Nick Lavers <nick.lavers@videoamp.com>
Date: Fri, 19 Aug 2016 10:11:59 +0100
Subject: [SPARK-16961][CORE] Fixed off-by-one error that biased
 randomizeInPlace

JIRA issue link:
https://issues.apache.org/jira/browse/SPARK-16961

Changed one line of Utils.randomizeInPlace to allow elements to stay in place.

Created a unit test that runs Pearson's chi-squared test to determine whether the output diverges significantly from a uniform distribution.

Author: Nick Lavers <nick.lavers@videoamp.com>

Closes #14551 from nicklavers/SPARK-16961-randomizeInPlace.
---
 python/pyspark/ml/clustering.py    | 12 ++++++------
 python/pyspark/mllib/clustering.py |  2 +-
 python/pyspark/mllib/tests.py      |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'python')

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 75d9a0e8ca..4dab83362a 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -99,9 +99,9 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     +--------------------+--------------------+
     |                mean|                 cov|
     +--------------------+--------------------+
-    |[-0.0550000000000...|0.002025000000000...|
-    |[0.82499999999999...|0.005625000000000...|
-    |[-0.87,-0.7200000...|0.001600000000000...|
+    |[0.82500000140229...|0.005625000000006...|
+    |[-0.4777098016092...|0.167969502720916...|
+    |[-0.4472625243352...|0.167304119758233...|
     +--------------------+--------------------+
     ...
     >>> transformed = model.transform(df).select("features", "prediction")
@@ -124,9 +124,9 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     +--------------------+--------------------+
     |                mean|                 cov|
     +--------------------+--------------------+
-    |[-0.0550000000000...|0.002025000000000...|
-    |[0.82499999999999...|0.005625000000000...|
-    |[-0.87,-0.7200000...|0.001600000000000...|
+    |[0.82500000140229...|0.005625000000006...|
+    |[-0.4777098016092...|0.167969502720916...|
+    |[-0.4472625243352...|0.167304119758233...|
     +--------------------+--------------------+
     ...
 
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index c8c3c42774..29aa615125 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -416,7 +416,7 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader):
     ...                 4.5605,  5.2043,  6.2734])
     >>> clusterdata_2 = sc.parallelize(data.reshape(5,3))
     >>> model = GaussianMixture.train(clusterdata_2, 2, convergenceTol=0.0001,
-    ...                               maxIterations=150, seed=10)
+    ...                               maxIterations=150, seed=4)
     >>> labels = model.predict(clusterdata_2).collect()
     >>> labels[0]==labels[1]
     True
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 99bf50b5a1..3f3dfd186c 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -550,7 +550,7 @@ class ListTests(MLlibTestCase):
             [-6, -7],
         ])
         clusters = GaussianMixture.train(data, 2, convergenceTol=0.001,
-                                         maxIterations=10, seed=56)
+                                         maxIterations=10, seed=1)
         labels = clusters.predict(data).collect()
         self.assertEqual(labels[0], labels[1])
         self.assertEqual(labels[2], labels[3])
-- 
cgit v1.2.3