From 29ba9578f44c7caa8451386cee1f03f4e0ed8fc7 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sun, 11 Sep 2016 08:00:55 +0100 Subject: [SPARK-17389][ML][MLLIB] KMeans speedup with better choice of k-means|| init steps = 2 ## What changes were proposed in this pull request? Reduce default k-means|| init steps to 2 from 5. See JIRA for discussion. See also https://github.com/apache/spark/pull/14948 ## How was this patch tested? Existing tests. Author: Sean Owen Closes #14956 from srowen/SPARK-17389.2. --- .../main/scala/org/apache/spark/mllib/clustering/KMeans.scala | 6 +++--- .../spark/mllib/clustering/PowerIterationClusteringSuite.scala | 10 +++------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index de9fa4aebf..23141aaf42 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -51,10 +51,10 @@ class KMeans private ( /** * Constructs a KMeans instance with default parameters: {k: 2, maxIterations: 20, runs: 1, - * initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4, seed: random}. + * initializationMode: "k-means||", initializationSteps: 2, epsilon: 1e-4, seed: random}. */ @Since("0.8.0") - def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4, Utils.random.nextLong()) + def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 2, 1e-4, Utils.random.nextLong()) /** * Number of clusters to create (k). @@ -134,7 +134,7 @@ class KMeans private ( /** * Set the number of steps for the k-means|| initialization mode. This is an advanced - * setting -- the default of 5 is almost always enough. Default: 5. + * setting -- the default of 2 is almost always enough. Default: 2. */ @Since("0.8.0") def setInitializationSteps(initializationSteps: Int): this.type = { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala index 3d81d375c7..b33b86b39a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala @@ -49,7 +49,7 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon val r1 = 1.0 val n1 = 10 val r2 = 4.0 - val n2 = 40 + val n2 = 10 val n = n1 + n2 val points = genCircle(r1, n1) ++ genCircle(r2, n2) val similarities = for (i <- 1 until n; j <- 0 until i) yield { @@ -83,7 +83,7 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon val r1 = 1.0 val n1 = 10 val r2 = 4.0 - val n2 = 40 + val n2 = 10 val n = n1 + n2 val points = genCircle(r1, n1) ++ genCircle(r2, n2) val similarities = for (i <- 1 until n; j <- 0 until i) yield { @@ -91,11 +91,7 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon } val edges = similarities.flatMap { case (i, j, s) => - if (i != j) { - Seq(Edge(i, j, s), Edge(j, i, s)) - } else { - None - } + Seq(Edge(i, j, s), Edge(j, i, s)) } val graph = Graph.fromEdges(sc.parallelize(edges, 2), 0.0) -- cgit v1.2.3