aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
diff options
context:
space:
mode:
Diffstat (limited to 'mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala')
-rw-r--r--mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala19
1 files changed, 19 insertions, 0 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
index fc491cd616..30513c1e27 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
@@ -29,9 +29,12 @@ class BisectingKMeansSuite
final val k = 5
@transient var dataset: Dataset[_] = _
+ @transient var sparseDataset: Dataset[_] = _
+
override def beforeAll(): Unit = {
super.beforeAll()
dataset = KMeansSuite.generateKMeansData(spark, 50, 3, k)
+ sparseDataset = KMeansSuite.generateSparseData(spark, 10, 1000, 42) // for SPARK-16473 test
}
test("default parameters") {
@@ -51,6 +54,22 @@ class BisectingKMeansSuite
assert(copiedModel.hasSummary)
}
+ test("SPARK-16473: Verify Bisecting K-Means does not fail in edge case where " +
+ "one cluster is empty after split") {
+ val bkm = new BisectingKMeans()
+ .setK(k)
+ .setMinDivisibleClusterSize(4)
+ .setMaxIter(4)
+ .setSeed(123)
+
+ // Verify fit does not fail on very sparse data
+ val model = bkm.fit(sparseDataset)
+ val result = model.transform(sparseDataset)
+ val numClusters = result.select("prediction").distinct().collect().length
+ // Verify we hit the edge case: fewer than k distinct predictions, but more than one
+ assert(numClusters < k && numClusters > 1)
+ }
+
test("setter/getter") {
val bkm = new BisectingKMeans()
.setK(9)