diff options
author | Yanbo Liang <ybliang8@gmail.com> | 2016-09-11 13:47:13 +0100 |
---|---|---|
committer | Sean Owen <sowen@cloudera.com> | 2016-09-11 13:47:13 +0100 |
commit | 883c7631847a95684534222c1b6cfed8e62710c8 (patch) | |
tree | b7b007aa8e0891b507a87539b4221e711ce0c9fd /python | |
parent | c76baff0cc4775c2191d075cc9a8176e4915fec8 (diff) | |
download | spark-883c7631847a95684534222c1b6cfed8e62710c8.tar.gz spark-883c7631847a95684534222c1b6cfed8e62710c8.tar.bz2 spark-883c7631847a95684534222c1b6cfed8e62710c8.zip |
[SPARK-17389][FOLLOW-UP][ML] Change KMeans k-means|| default init steps from 5 to 2.
## What changes were proposed in this pull request?
#14956 reduced the default number of k-means|| init steps from 5 to 2, but only for the spark.mllib package; we should make the same change for spark.ml and PySpark.
## How was this patch tested?
Existing tests.
Author: Yanbo Liang <ybliang8@gmail.com>
Closes #15050 from yanboliang/spark-17389.
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/ml/clustering.py | 10 | ||||
-rw-r--r-- | python/pyspark/mllib/clustering.py | 6 |
2 files changed, 8 insertions, 8 deletions
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 4dab83362a..7632f05c3b 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -254,14 +254,14 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol @keyword_only def __init__(self, featuresCol="features", predictionCol="prediction", k=2, - initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None): + initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None): """ __init__(self, featuresCol="features", predictionCol="prediction", k=2, \ - initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None) + initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None) """ super(KMeans, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid) - self._setDefault(k=2, initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20) + self._setDefault(k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -271,10 +271,10 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol @keyword_only @since("1.5.0") def setParams(self, featuresCol="features", predictionCol="prediction", k=2, - initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None): + initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None): """ setParams(self, featuresCol="features", predictionCol="prediction", k=2, \ - initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None) + initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None) Sets params for KMeans. 
""" diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index 29aa615125..2036168e45 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -306,7 +306,7 @@ class KMeans(object): @classmethod @since('0.9.0') def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||", - seed=None, initializationSteps=5, epsilon=1e-4, initialModel=None): + seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None): """ Train a k-means clustering model. @@ -330,9 +330,9 @@ class KMeans(object): (default: None) :param initializationSteps: Number of steps for the k-means|| initialization mode. - This is an advanced setting -- the default of 5 is almost + This is an advanced setting -- the default of 2 is almost always enough. - (default: 5) + (default: 2) :param epsilon: Distance threshold within which a center will be considered to have converged. If all centers move less than this Euclidean |