From cef73b563864d5f8aa1b26e31e3b9af6f0a08a5d Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Wed, 11 May 2016 09:56:36 +0200
Subject: [SPARK-14340][EXAMPLE][DOC] Update Examples and User Guide for ml.BisectingKMeans

## What changes were proposed in this pull request?

1. Add BisectingKMeans to ml-clustering.md.
2. Add the missing Scala BisectingKMeansExample.
3. Create a new data file `data/mllib/sample_kmeans_data.txt`.

## How was this patch tested?

Manual tests.

Author: Zheng RuiFeng

Closes #11844 from zhengruifeng/doc_bkm.
---
 .../examples/ml/JavaBisectingKMeansExample.java    | 49 ++++++----------
 .../main/python/ml/bisecting_k_means_example.py    | 30 +++++-----
 .../spark/examples/ml/BisectingKMeansExample.scala | 65 ++++++++++++++++++++++
 3 files changed, 97 insertions(+), 47 deletions(-)
 create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
(limited to 'examples/src')

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
index 810ad905c5..62871448e3 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
@@ -17,27 +17,22 @@
 
 package org.apache.spark.examples.ml;
 
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SparkSession;
 // $example on$
 import org.apache.spark.ml.clustering.BisectingKMeans;
 import org.apache.spark.ml.clustering.BisectingKMeansModel;
 import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
 // $example off$
+import org.apache.spark.sql.SparkSession;
 
 /**
- * An example demonstrating a bisecting k-means clustering.
+ * An example demonstrating bisecting k-means clustering.
+ * Run with
+ * <pre>
+ * bin/run-example ml.JavaBisectingKMeansExample
+ * </pre>
  */
 public class JavaBisectingKMeansExample {
 
@@ -48,30 +43,22 @@ public class JavaBisectingKMeansExample {
       .getOrCreate();
 
     // $example on$
-    List<Row> data = Arrays.asList(
-      RowFactory.create(Vectors.dense(0.1, 0.1, 0.1)),
-      RowFactory.create(Vectors.dense(0.3, 0.3, 0.25)),
-      RowFactory.create(Vectors.dense(0.1, 0.1, -0.1)),
-      RowFactory.create(Vectors.dense(20.3, 20.1, 19.9)),
-      RowFactory.create(Vectors.dense(20.2, 20.1, 19.7)),
-      RowFactory.create(Vectors.dense(18.9, 20.0, 19.7))
-    );
-
-    StructType schema = new StructType(new StructField[]{
-      new StructField("features", new VectorUDT(), false, Metadata.empty()),
-    });
-
-    Dataset<Row> dataset = spark.createDataFrame(data, schema);
+    // Loads data.
+    Dataset<Row> dataset = spark.read().format("libsvm").load("data/mllib/sample_kmeans_data.txt");
 
-    BisectingKMeans bkm = new BisectingKMeans().setK(2);
+    // Trains a bisecting k-means model.
+    BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1);
     BisectingKMeansModel model = bkm.fit(dataset);
 
-    System.out.println("Compute Cost: " + model.computeCost(dataset));
+    // Evaluate clustering.
+    double cost = model.computeCost(dataset);
+    System.out.println("Within Set Sum of Squared Errors = " + cost);
 
-    Vector[] clusterCenters = model.clusterCenters();
-    for (int i = 0; i < clusterCenters.length; i++) {
-      Vector clusterCenter = clusterCenters[i];
-      System.out.println("Cluster Center " + i + ": " + clusterCenter);
+    // Shows the result.
+    System.out.println("Cluster Centers: ");
+    Vector[] centers = model.clusterCenters();
+    for (Vector center : centers) {
+      System.out.println(center);
     }
     // $example off$

diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py
index 540a4bc3e4..ee0399ac5e 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -18,15 +18,14 @@
 from __future__ import print_function
 
 # $example on$
-from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel
-from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors
-from pyspark.mllib.linalg import Vectors
-from pyspark.sql.types import Row
+from pyspark.ml.clustering import BisectingKMeans
 # $example off$
 from pyspark.sql import SparkSession
 
 """
-A simple example demonstrating a bisecting k-means clustering.
+An example demonstrating bisecting k-means clustering.
+Run with:
+  bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
 """
 
 if __name__ == "__main__":
@@ -36,21 +35,20 @@ if __name__ == "__main__":
         .getOrCreate()
 
     # $example on$
-    data = spark.read.text("data/mllib/kmeans_data.txt").rdd
-    parsed = data\
-        .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')])))
-    training = spark.createDataFrame(parsed)
+    # Loads data.
+    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
 
-    kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")
+    # Trains a bisecting k-means model.
+    bkm = BisectingKMeans().setK(2).setSeed(1)
+    model = bkm.fit(dataset)
 
-    model = kmeans.fit(training)
+    # Evaluate clustering.
+    cost = model.computeCost(dataset)
+    print("Within Set Sum of Squared Errors = " + str(cost))
 
-    # Evaluate clustering
-    cost = model.computeCost(training)
-    print("Bisecting K-means Cost = " + str(cost))
-
-    centers = model.clusterCenters()
+    # Shows the result.
     print("Cluster Centers: ")
+    centers = model.clusterCenters()
     for center in centers:
         print(center)
     # $example off$

diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
new file mode 100644
index 0000000000..5f8f2c99cb
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+// scalastyle:off println
+
+// $example on$
+import org.apache.spark.ml.clustering.BisectingKMeans
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+/**
+ * An example demonstrating bisecting k-means clustering.
+ * Run with
+ * {{{
+ * bin/run-example ml.BisectingKMeansExample
+ * }}}
+ */
+object BisectingKMeansExample {
+
+  def main(args: Array[String]): Unit = {
+    // Creates a SparkSession
+    val spark = SparkSession
+      .builder
+      .appName("BisectingKMeansExample")
+      .getOrCreate()
+
+    // $example on$
+    // Loads data.
+    val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
+
+    // Trains a bisecting k-means model.
+    val bkm = new BisectingKMeans().setK(2).setSeed(1)
+    val model = bkm.fit(dataset)
+
+    // Evaluate clustering.
+    val cost = model.computeCost(dataset)
+    println(s"Within Set Sum of Squared Errors = $cost")
+
+    // Shows the result.
+    println("Cluster Centers: ")
+    val centers = model.clusterCenters
+    centers.foreach(println)
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println
+
-- 
cgit v1.2.3