From cef73b563864d5f8aa1b26e31e3b9af6f0a08a5d Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 11 May 2016 09:56:36 +0200 Subject: [SPARK-14340][EXAMPLE][DOC] Update Examples and User Guide for ml.BisectingKMeans ## What changes were proposed in this pull request? 1. Add BisectingKMeans to ml-clustering.md. 2. Add the missing Scala BisectingKMeansExample. 3. Create a new data file `data/mllib/sample_kmeans_data.txt`. ## How was this patch tested? Manual tests. Author: Zheng RuiFeng Closes #11844 from zhengruifeng/doc_bkm. --- .../main/python/ml/bisecting_k_means_example.py | 30 ++++++++++------------ 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'examples/src/main/python/ml') diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index 540a4bc3e4..ee0399ac5e 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -18,15 +18,14 @@ from __future__ import print_function # $example on$ -from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel -from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors -from pyspark.mllib.linalg import Vectors -from pyspark.sql.types import Row +from pyspark.ml.clustering import BisectingKMeans # $example off$ from pyspark.sql import SparkSession """ -A simple example demonstrating a bisecting k-means clustering. +An example demonstrating bisecting k-means clustering. +Run with: + bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py """ if __name__ == "__main__": @@ -36,21 +35,20 @@ if __name__ == "__main__": .getOrCreate() # $example on$ - data = spark.read.text("data/mllib/kmeans_data.txt").rdd - parsed = data\ - .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')]))) - training = spark.createDataFrame(parsed) + # Loads data. 
+ dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt") - kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features") + # Trains a bisecting k-means model. + bkm = BisectingKMeans().setK(2).setSeed(1) + model = bkm.fit(dataset) - model = kmeans.fit(training) + # Evaluate clustering. + cost = model.computeCost(dataset) + print("Within Set Sum of Squared Errors = " + str(cost)) - # Evaluate clustering - cost = model.computeCost(training) - print("Bisecting K-means Cost = " + str(cost)) - - centers = model.clusterCenters() + # Shows the result. print("Cluster Centers: ") + centers = model.clusterCenters() for center in centers: print(center) # $example off$ -- cgit v1.2.3