From 5377fc62360d5e9b5c94078e41d10a96e0e8a535 Mon Sep 17 00:00:00 2001 From: Nick Lavers Date: Fri, 19 Aug 2016 10:11:59 +0100 Subject: [SPARK-16961][CORE] Fixed off-by-one error that biased randomizeInPlace JIRA issue link: https://issues.apache.org/jira/browse/SPARK-16961 Changed one line of Utils.randomizeInPlace to allow elements to stay in place. Created a unit test that runs a Pearson's chi squared test to determine whether the output diverges significantly from a uniform distribution. Author: Nick Lavers Closes #14551 from nicklavers/SPARK-16961-randomizeInPlace. --- python/pyspark/ml/clustering.py | 12 ++++++------ python/pyspark/mllib/clustering.py | 2 +- python/pyspark/mllib/tests.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'python') diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 75d9a0e8ca..4dab83362a 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -99,9 +99,9 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte +--------------------+--------------------+ | mean| cov| +--------------------+--------------------+ - |[-0.0550000000000...|0.002025000000000...| - |[0.82499999999999...|0.005625000000000...| - |[-0.87,-0.7200000...|0.001600000000000...| + |[0.82500000140229...|0.005625000000006...| + |[-0.4777098016092...|0.167969502720916...| + |[-0.4472625243352...|0.167304119758233...| +--------------------+--------------------+ ... >>> transformed = model.transform(df).select("features", "prediction") @@ -124,9 +124,9 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte +--------------------+--------------------+ | mean| cov| +--------------------+--------------------+ - |[-0.0550000000000...|0.002025000000000...| - |[0.82499999999999...|0.005625000000000...| - |[-0.87,-0.7200000...|0.001600000000000...| + |[0.82500000140229...|0.005625000000006...| + |[-0.4777098016092...|0.167969502720916...| + |[-0.4472625243352...|0.167304119758233...| +--------------------+--------------------+ ... diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index c8c3c42774..29aa615125 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -416,7 +416,7 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader): ... 4.5605, 5.2043, 6.2734]) >>> clusterdata_2 = sc.parallelize(data.reshape(5,3)) >>> model = GaussianMixture.train(clusterdata_2, 2, convergenceTol=0.0001, - ... maxIterations=150, seed=10) + ... maxIterations=150, seed=4) >>> labels = model.predict(clusterdata_2).collect() >>> labels[0]==labels[1] True diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 99bf50b5a1..3f3dfd186c 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -550,7 +550,7 @@ class ListTests(MLlibTestCase): [-6, -7], ]) clusters = GaussianMixture.train(data, 2, convergenceTol=0.001, - maxIterations=10, seed=56) + maxIterations=10, seed=1) labels = clusters.predict(data).collect() self.assertEqual(labels[0], labels[1]) self.assertEqual(labels[2], labels[3]) -- cgit v1.2.3