diff options
author | Zheng RuiFeng <ruifengz@foxmail.com> | 2016-05-11 10:01:43 +0200 |
---|---|---|
committer | Nick Pentreath <nickp@za.ibm.com> | 2016-05-11 10:01:43 +0200 |
commit | 8beae59144827d81491eed385dc2aa6aedd6a7b4 (patch) | |
tree | 1905c4caa10c9f432262272e120a948772a2846f /examples/src/main/scala | |
parent | cef73b563864d5f8aa1b26e31e3b9af6f0a08a5d (diff) | |
download | spark-8beae59144827d81491eed385dc2aa6aedd6a7b4.tar.gz spark-8beae59144827d81491eed385dc2aa6aedd6a7b4.tar.bz2 spark-8beae59144827d81491eed385dc2aa6aedd6a7b4.zip |
[SPARK-15149][EXAMPLE][DOC] update kmeans example
## What changes were proposed in this pull request?
A Python example for ml.kmeans already exists, but it is not included in the user guide.
1. Small changes like: `example_on`, `example_off`
2. Add it to the user guide
3. Update the examples to read the data file directly
## How was this patch tested?
manual tests
`./bin/spark-submit examples/src/main/python/ml/kmeans_example.py`
Author: Zheng RuiFeng <ruifengz@foxmail.com>
Closes #12925 from zhengruifeng/km_pe.
Diffstat (limited to 'examples/src/main/scala')
-rw-r--r-- | examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala | 33 |
1 files changed, 13 insertions, 20 deletions
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala index 2abd588c6f..2341b36db2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala @@ -21,12 +21,11 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.clustering.KMeans -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.sql.{DataFrame, SparkSession} // $example off$ +import org.apache.spark.sql.SparkSession /** - * An example demonstrating a k-means clustering. + * An example demonstrating k-means clustering. * Run with * {{{ * bin/run-example ml.KMeansExample @@ -35,32 +34,26 @@ import org.apache.spark.sql.{DataFrame, SparkSession} object KMeansExample { def main(args: Array[String]): Unit = { - // Creates a Spark context and a SQL context + // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") .getOrCreate() // $example on$ - // Crates a DataFrame - val dataset: DataFrame = spark.createDataFrame(Seq( - (1, Vectors.dense(0.0, 0.0, 0.0)), - (2, Vectors.dense(0.1, 0.1, 0.1)), - (3, Vectors.dense(0.2, 0.2, 0.2)), - (4, Vectors.dense(9.0, 9.0, 9.0)), - (5, Vectors.dense(9.1, 9.1, 9.1)), - (6, Vectors.dense(9.2, 9.2, 9.2)) - )).toDF("id", "features") + // Loads data. + val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt") - // Trains a k-means model - val kmeans = new KMeans() - .setK(2) - .setFeaturesCol("features") - .setPredictionCol("prediction") + // Trains a k-means model. + val kmeans = new KMeans().setK(2).setSeed(1L) val model = kmeans.fit(dataset) - // Shows the result - println("Final Centers: ") + // Evaluate clustering by computing Within Set Sum of Squared Errors. 
+ val WSSSE = model.computeCost(dataset) + println(s"Within Set Sum of Squared Errors = $WSSSE") + + // Shows the result. + println("Cluster Centers: ") model.clusterCenters.foreach(println) // $example off$ |