about | summary | refs | log | tree | commit | diff
path: root/examples/src/main/python
diff options
context:
space:
mode:
author: Zheng RuiFeng <ruifengz@foxmail.com> 2016-05-11 09:56:36 +0200
committer: Nick Pentreath <nickp@za.ibm.com> 2016-05-11 09:56:36 +0200
commit: cef73b563864d5f8aa1b26e31e3b9af6f0a08a5d (patch)
tree: 425fd9da8e73e5a31fbb0e46be206692c23f64f0 /examples/src/main/python
parent: ad1a8466e9c10fbe8b455dba17b16973f92ebc15 (diff)
download: spark-cef73b563864d5f8aa1b26e31e3b9af6f0a08a5d.tar.gz
          spark-cef73b563864d5f8aa1b26e31e3b9af6f0a08a5d.tar.bz2
          spark-cef73b563864d5f8aa1b26e31e3b9af6f0a08a5d.zip
[SPARK-14340][EXAMPLE][DOC] Update Examples and User Guide for ml.BisectingKMeans
## What changes were proposed in this pull request?

1. Add BisectingKMeans to ml-clustering.md.
2. Add the missing Scala BisectingKMeansExample.
3. Create a new data file `data/mllib/sample_kmeans_data.txt`.

## How was this patch tested?

Manual tests.

Author: Zheng RuiFeng <ruifengz@foxmail.com>

Closes #11844 from zhengruifeng/doc_bkm.
Diffstat (limited to 'examples/src/main/python')
-rw-r--r-- examples/src/main/python/ml/bisecting_k_means_example.py | 30
1 file changed, 14 insertions, 16 deletions
diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py
index 540a4bc3e4..ee0399ac5e 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -18,15 +18,14 @@
from __future__ import print_function
# $example on$
-from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel
-from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors
-from pyspark.mllib.linalg import Vectors
-from pyspark.sql.types import Row
+from pyspark.ml.clustering import BisectingKMeans
# $example off$
from pyspark.sql import SparkSession
"""
-A simple example demonstrating a bisecting k-means clustering.
+An example demonstrating bisecting k-means clustering.
+Run with:
+ bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
"""
if __name__ == "__main__":
@@ -36,21 +35,20 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- data = spark.read.text("data/mllib/kmeans_data.txt").rdd
- parsed = data\
- .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')])))
- training = spark.createDataFrame(parsed)
+ # Loads data.
+ dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
- kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")
+ # Trains a bisecting k-means model.
+ bkm = BisectingKMeans().setK(2).setSeed(1)
+ model = bkm.fit(dataset)
- model = kmeans.fit(training)
+ # Evaluate clustering.
+ cost = model.computeCost(dataset)
+ print("Within Set Sum of Squared Errors = " + str(cost))
- # Evaluate clustering
- cost = model.computeCost(training)
- print("Bisecting K-means Cost = " + str(cost))
-
- centers = model.clusterCenters()
+ # Shows the result.
print("Cluster Centers: ")
+ centers = model.clusterCenters()
for center in centers:
print(center)
# $example off$