about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala | 1
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala | 42
-rw-r--r-- python/pyspark/ml/clustering.py | 5
-rw-r--r-- python/pyspark/mllib/clustering.py | 9
4 files changed, 16 insertions, 41 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 32dc16de08..8daee7b3aa 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -357,7 +357,6 @@ private[python] class PythonMLLibAPI extends Serializable {
val kMeansAlg = new KMeans()
.setK(k)
.setMaxIterations(maxIterations)
- .internalSetRuns(runs)
.setInitializationMode(initializationMode)
.setInitializationSteps(initializationSteps)
.setEpsilon(epsilon)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index ff77090990..60f13d27d0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -32,9 +32,8 @@ import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom
/**
- * K-means clustering with support for multiple parallel runs and a k-means++ like initialization
- * mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested,
- * they are executed together with joint passes over the data for efficiency.
+ * K-means clustering with a k-means++ like initialization mode
+ * (the k-means|| algorithm by Bahmani et al).
*
* This is an iterative algorithm that will make multiple passes over the data, so any RDDs given
* to it should be cached by the user.
@@ -109,35 +108,20 @@ class KMeans private (
}
/**
- * :: Experimental ::
- * Number of runs of the algorithm to execute in parallel.
+ * This function has no effect since Spark 2.0.0.
*/
@Since("1.4.0")
- @deprecated("Support for runs is deprecated. This param will have no effect in 2.0.0.", "1.6.0")
- def getRuns: Int = runs
+ def getRuns: Int = {
+ logWarning("Getting number of runs has no effect since Spark 2.0.0.")
+ runs
+ }
/**
- * :: Experimental ::
- * Set the number of runs of the algorithm to execute in parallel. We initialize the algorithm
- * this many times with random starting conditions (configured by the initialization mode), then
- * return the best clustering found over any run. Default: 1.
+ * This function has no effect since Spark 2.0.0.
*/
@Since("0.8.0")
- @deprecated("Support for runs is deprecated. This param will have no effect in 2.0.0.", "1.6.0")
def setRuns(runs: Int): this.type = {
- internalSetRuns(runs)
- }
-
- // Internal version of setRuns for Python API, this should be removed at the same time as setRuns
- // this is done to avoid deprecation warnings in our build.
- private[mllib] def internalSetRuns(runs: Int): this.type = {
- if (runs <= 0) {
- throw new IllegalArgumentException("Number of runs must be positive")
- }
- if (runs != 1) {
- logWarning("Setting number of runs is deprecated and will have no effect in 2.0.0")
- }
- this.runs = runs
+ logWarning("Setting number of runs has no effect since Spark 2.0.0.")
this
}
@@ -511,8 +495,7 @@ object KMeans {
* @param data Training points as an `RDD` of `Vector` types.
* @param k Number of clusters to create.
* @param maxIterations Maximum number of iterations allowed.
- * @param runs Number of runs to execute in parallel. The best model according to the cost
- * function will be returned. (default: 1)
+ * @param runs This param has no effect since Spark 2.0.0.
* @param initializationMode The initialization algorithm. This can either be "random" or
* "k-means||". (default: "k-means||")
* @param seed Random seed for cluster initialization. Default is to generate seed based
@@ -528,7 +511,6 @@ object KMeans {
seed: Long): KMeansModel = {
new KMeans().setK(k)
.setMaxIterations(maxIterations)
- .internalSetRuns(runs)
.setInitializationMode(initializationMode)
.setSeed(seed)
.run(data)
@@ -540,8 +522,7 @@ object KMeans {
* @param data Training points as an `RDD` of `Vector` types.
* @param k Number of clusters to create.
* @param maxIterations Maximum number of iterations allowed.
- * @param runs Number of runs to execute in parallel. The best model according to the cost
- * function will be returned. (default: 1)
+ * @param runs This param has no effect since Spark 2.0.0.
* @param initializationMode The initialization algorithm. This can either be "random" or
* "k-means||". (default: "k-means||")
*/
@@ -554,7 +535,6 @@ object KMeans {
initializationMode: String): KMeansModel = {
new KMeans().setK(k)
.setMaxIterations(maxIterations)
- .internalSetRuns(runs)
.setInitializationMode(initializationMode)
.run(data)
}
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 4ce8012754..9740ec45af 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -194,9 +194,8 @@ class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable):
class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed,
JavaMLWritable, JavaMLReadable):
"""
- K-means clustering with support for multiple parallel runs and a k-means++ like initialization
- mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested,
- they are executed together with joint passes over the data for efficiency.
+ K-means clustering with a k-means++ like initialization mode
+ (the k-means|| algorithm by Bahmani et al).
>>> from pyspark.mllib.linalg import Vectors
>>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index 23d118bd40..95f7278dc6 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -179,7 +179,7 @@ class KMeansModel(Saveable, Loader):
>>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
>>> model = KMeans.train(
- ... sc.parallelize(data), 2, maxIterations=10, runs=30, initializationMode="random",
+ ... sc.parallelize(data), 2, maxIterations=10, initializationMode="random",
... seed=50, initializationSteps=5, epsilon=1e-4)
>>> model.predict(array([0.0, 0.0])) == model.predict(array([1.0, 1.0]))
True
@@ -323,9 +323,7 @@ class KMeans(object):
Maximum number of iterations allowed.
(default: 100)
:param runs:
- Number of runs to execute in parallel. The best model according
- to the cost function will be returned (deprecated in 1.6.0).
- (default: 1)
+ This param has no effect since Spark 2.0.0.
:param initializationMode:
The initialization algorithm. This can be either "random" or
"k-means||".
@@ -350,8 +348,7 @@ class KMeans(object):
(default: None)
"""
if runs != 1:
- warnings.warn(
- "Support for runs is deprecated in 1.6.0. This param will have no effect in 2.0.0.")
+ warnings.warn("The param `runs` has no effect since Spark 2.0.0.")
clusterInitialModel = []
if initialModel is not None:
if not isinstance(initialModel, KMeansModel):