about summary refs log tree commit diff
path: root/examples/src/main/python/ml/bisecting_k_means_example.py
diff options
context:
space:
mode:
Diffstat (limited to 'examples/src/main/python/ml/bisecting_k_means_example.py')
-rw-r--r-- examples/src/main/python/ml/bisecting_k_means_example.py 30
1 files changed, 14 insertions, 16 deletions
diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py
index 540a4bc3e4..ee0399ac5e 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -18,15 +18,14 @@
from __future__ import print_function
# $example on$
-from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel
-from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors
-from pyspark.mllib.linalg import Vectors
-from pyspark.sql.types import Row
+from pyspark.ml.clustering import BisectingKMeans
# $example off$
from pyspark.sql import SparkSession
"""
-A simple example demonstrating a bisecting k-means clustering.
+An example demonstrating bisecting k-means clustering.
+Run with:
+ bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
"""
if __name__ == "__main__":
@@ -36,21 +35,20 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- data = spark.read.text("data/mllib/kmeans_data.txt").rdd
- parsed = data\
- .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')])))
- training = spark.createDataFrame(parsed)
+ # Loads data.
+ dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
- kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")
+ # Trains a bisecting k-means model.
+ bkm = BisectingKMeans().setK(2).setSeed(1)
+ model = bkm.fit(dataset)
- model = kmeans.fit(training)
+ # Evaluate clustering.
+ cost = model.computeCost(dataset)
+ print("Within Set Sum of Squared Errors = " + str(cost))
- # Evaluate clustering
- cost = model.computeCost(training)
- print("Bisecting K-means Cost = " + str(cost))
-
- centers = model.clusterCenters()
+ # Shows the result.
print("Cluster Centers: ")
+ centers = model.clusterCenters()
for center in centers:
print(center)
# $example off$