aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/mllib
diff options
context:
space:
mode:
author Nick Lavers <nick.lavers@videoamp.com> 2016-08-19 10:11:59 +0100
committer Sean Owen <sowen@cloudera.com> 2016-08-19 10:11:59 +0100
commit 5377fc62360d5e9b5c94078e41d10a96e0e8a535 (patch)
tree 1998db20af8d7cc93a2b00308c0f5e8e2b3166a9 /python/pyspark/mllib
parent 287bea13050b8eedc3b8b6b3491f1b5e5bc24d7a (diff)
download spark-5377fc62360d5e9b5c94078e41d10a96e0e8a535.tar.gz
spark-5377fc62360d5e9b5c94078e41d10a96e0e8a535.tar.bz2
spark-5377fc62360d5e9b5c94078e41d10a96e0e8a535.zip
[SPARK-16961][CORE] Fixed off-by-one error that biased randomizeInPlace
JIRA issue link: https://issues.apache.org/jira/browse/SPARK-16961

Changed one line of Utils.randomizeInPlace to allow elements to stay in place. Created a unit test that runs a Pearson's chi-squared test to determine whether the output diverges significantly from a uniform distribution.

Author: Nick Lavers <nick.lavers@videoamp.com>

Closes #14551 from nicklavers/SPARK-16961-randomizeInPlace.
Diffstat (limited to 'python/pyspark/mllib')
-rw-r--r-- python/pyspark/mllib/clustering.py 2
-rw-r--r-- python/pyspark/mllib/tests.py 2
2 files changed, 2 insertions, 2 deletions
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index c8c3c42774..29aa615125 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -416,7 +416,7 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader):
... 4.5605, 5.2043, 6.2734])
>>> clusterdata_2 = sc.parallelize(data.reshape(5,3))
>>> model = GaussianMixture.train(clusterdata_2, 2, convergenceTol=0.0001,
- ... maxIterations=150, seed=10)
+ ... maxIterations=150, seed=4)
>>> labels = model.predict(clusterdata_2).collect()
>>> labels[0]==labels[1]
True
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 99bf50b5a1..3f3dfd186c 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -550,7 +550,7 @@ class ListTests(MLlibTestCase):
[-6, -7],
])
clusters = GaussianMixture.train(data, 2, convergenceTol=0.001,
- maxIterations=10, seed=56)
+ maxIterations=10, seed=1)
labels = clusters.predict(data).collect()
self.assertEqual(labels[0], labels[1])
self.assertEqual(labels[2], labels[3])