1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
|
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.ml.clustering
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Dataset}
/**
 * Test suite for the ml.clustering [[BisectingKMeans]] estimator and its model.
 * Covers default params, setters/getters (including invalid-argument rejection),
 * fit/transform behavior, and save/load round-tripping via [[DefaultReadWriteTest]].
 */
class BisectingKMeansSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  // Number of clusters used to generate the synthetic dataset (and requested in fit tests).
  final val k = 5
  @transient var dataset: Dataset[_] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    // 50 points in 3 dimensions, drawn around k well-separated centers.
    dataset = KMeansSuite.generateKMeansData(sqlContext, 50, 3, k)
  }

  test("default parameters") {
    val bkm = new BisectingKMeans()
    assert(bkm.getK === 4)
    assert(bkm.getFeaturesCol === "features")
    assert(bkm.getPredictionCol === "prediction")
    assert(bkm.getMaxIter === 20)
    assert(bkm.getMinDivisibleClusterSize === 1.0)
  }

  test("setter/getter") {
    val bkm = new BisectingKMeans()
      .setK(9)
      .setMinDivisibleClusterSize(2.0)
      .setFeaturesCol("test_feature")
      .setPredictionCol("test_prediction")
      .setMaxIter(33)
      .setSeed(123)

    assert(bkm.getK === 9)
    assert(bkm.getFeaturesCol === "test_feature")
    assert(bkm.getPredictionCol === "test_prediction")
    assert(bkm.getMaxIter === 33)
    assert(bkm.getMinDivisibleClusterSize === 2.0)
    assert(bkm.getSeed === 123)

    // k must be > 1 and minDivisibleClusterSize must be positive; both should fail fast.
    intercept[IllegalArgumentException] {
      new BisectingKMeans().setK(1)
    }
    intercept[IllegalArgumentException] {
      new BisectingKMeans().setMinDivisibleClusterSize(0)
    }
  }

  test("fit & transform") {
    val predictionColName = "bisecting_kmeans_prediction"
    val bkm = new BisectingKMeans().setK(k).setPredictionCol(predictionColName).setSeed(1)
    val model = bkm.fit(dataset)
    assert(model.clusterCenters.length === k)

    val transformed = model.transform(dataset)
    val expectedColumns = Array("features", predictionColName)
    expectedColumns.foreach { column =>
      assert(transformed.columns.contains(column))
    }
    val clusters =
      transformed.select(predictionColName).rdd.map(_.getInt(0)).distinct().collect().toSet
    assert(clusters.size === k)
    // Labels must be exactly 0 until k; derive the expected set from `k` rather than
    // hard-coding Set(0, 1, 2, 3, 4), so the assertion tracks any change to `k`.
    assert(clusters === (0 until k).toSet)
    // Generated clusters are tight, so the within-set sum of squares should be near zero.
    assert(model.computeCost(dataset) < 0.1)
    assert(model.hasParent)
  }

  test("read/write") {
    // Models are equal iff their cluster centers match after a save/load round trip.
    def checkModelData(model: BisectingKMeansModel, model2: BisectingKMeansModel): Unit = {
      assert(model.clusterCenters === model2.clusterCenters)
    }
    val bisectingKMeans = new BisectingKMeans()
    testEstimatorAndModelReadWrite(
      bisectingKMeans, dataset, BisectingKMeansSuite.allParamSettings, checkModelData)
  }
}
object BisectingKMeansSuite {

  /**
   * Mapping from each Param name to a valid setting that differs from its default.
   * Used by read/write tests to exercise every Param during save/restore.
   */
  val allParamSettings: Map[String, Any] = Seq[(String, Any)](
    "k" -> 3,
    "maxIter" -> 2,
    "seed" -> -1L,
    "minDivisibleClusterSize" -> 2.0
  ).toMap
}
|