diff options
Diffstat (limited to 'mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala')
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala | 19 |
1 file changed, 19 insertions, 0 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
index fc491cd616..30513c1e27 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
@@ -29,9 +29,12 @@ class BisectingKMeansSuite
   final val k = 5
   @transient var dataset: Dataset[_] = _
 
+  @transient var sparseDataset: Dataset[_] = _
+
   override def beforeAll(): Unit = {
     super.beforeAll()
     dataset = KMeansSuite.generateKMeansData(spark, 50, 3, k)
+    sparseDataset = KMeansSuite.generateSparseData(spark, 10, 1000, 42)
   }
 
   test("default parameters") {
@@ -51,6 +54,22 @@ class BisectingKMeansSuite
     assert(copiedModel.hasSummary)
   }
 
+  test("SPARK-16473: Verify Bisecting K-Means does not fail in edge case where " +
+    "one cluster is empty after split") {
+    val bkm = new BisectingKMeans()
+      .setK(k)
+      .setMinDivisibleClusterSize(4)
+      .setMaxIter(4)
+      .setSeed(123)
+
+    // Verify fit does not fail on very sparse data
+    val model = bkm.fit(sparseDataset)
+    val result = model.transform(sparseDataset)
+    val numClusters = result.select("prediction").distinct().collect().length
+    // Verify we hit the edge case: fewer than k, but more than one, distinct predictions
+    assert(numClusters < k && numClusters > 1)
+  }
+
   test("setter/getter") {
     val bkm = new BisectingKMeans()
       .setK(9)