diff options
author | Zheng RuiFeng <ruifengz@foxmail.com> | 2016-05-11 09:56:36 +0200 |
---|---|---|
committer | Nick Pentreath <nickp@za.ibm.com> | 2016-05-11 09:56:36 +0200 |
commit | cef73b563864d5f8aa1b26e31e3b9af6f0a08a5d (patch) | |
tree | 425fd9da8e73e5a31fbb0e46be206692c23f64f0 /examples/src/main/python | |
parent | ad1a8466e9c10fbe8b455dba17b16973f92ebc15 (diff) | |
download | spark-cef73b563864d5f8aa1b26e31e3b9af6f0a08a5d.tar.gz spark-cef73b563864d5f8aa1b26e31e3b9af6f0a08a5d.tar.bz2 spark-cef73b563864d5f8aa1b26e31e3b9af6f0a08a5d.zip |
[SPARK-14340][EXAMPLE][DOC] Update Examples and User Guide for ml.BisectingKMeans
## What changes were proposed in this pull request?
1. Add BisectingKMeans to ml-clustering.md
2. Add the missing Scala BisectingKMeansExample
3. Create a new data file `data/mllib/sample_kmeans_data.txt`
## How was this patch tested?
manual tests
Author: Zheng RuiFeng <ruifengz@foxmail.com>
Closes #11844 from zhengruifeng/doc_bkm.
Diffstat (limited to 'examples/src/main/python')
-rw-r--r-- | examples/src/main/python/ml/bisecting_k_means_example.py | 30 |
1 files changed, 14 insertions, 16 deletions
diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py
index 540a4bc3e4..ee0399ac5e 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -18,15 +18,14 @@
 from __future__ import print_function
 
 # $example on$
-from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel
-from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors
-from pyspark.mllib.linalg import Vectors
-from pyspark.sql.types import Row
+from pyspark.ml.clustering import BisectingKMeans
 # $example off$
 from pyspark.sql import SparkSession
 
 """
-A simple example demonstrating a bisecting k-means clustering.
+An example demonstrating bisecting k-means clustering.
+Run with:
+  bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
 """
 
 if __name__ == "__main__":
@@ -36,21 +35,20 @@ if __name__ == "__main__":
         .getOrCreate()
 
     # $example on$
-    data = spark.read.text("data/mllib/kmeans_data.txt").rdd
-    parsed = data\
-        .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')])))
-    training = spark.createDataFrame(parsed)
+    # Loads data.
+    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
 
-    kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")
+    # Trains a bisecting k-means model.
+    bkm = BisectingKMeans().setK(2).setSeed(1)
+    model = bkm.fit(dataset)
 
-    model = kmeans.fit(training)
+    # Evaluate clustering.
+    cost = model.computeCost(dataset)
+    print("Within Set Sum of Squared Errors = " + str(cost))
 
-    # Evaluate clustering
-    cost = model.computeCost(training)
-    print("Bisecting K-means Cost = " + str(cost))
-
-    centers = model.clusterCenters()
+    # Shows the result.
     print("Cluster Centers: ")
+    centers = model.clusterCenters()
     for center in centers:
         print(center)
     # $example off$