aboutsummaryrefslogtreecommitdiff
path: root/examples/src/main/scala
diff options
context:
space:
mode:
author: Zheng RuiFeng <ruifengz@foxmail.com> 2016-05-11 10:01:43 +0200
committer: Nick Pentreath <nickp@za.ibm.com> 2016-05-11 10:01:43 +0200
commit: 8beae59144827d81491eed385dc2aa6aedd6a7b4 (patch)
tree: 1905c4caa10c9f432262272e120a948772a2846f /examples/src/main/scala
parent: cef73b563864d5f8aa1b26e31e3b9af6f0a08a5d (diff)
download: spark-8beae59144827d81491eed385dc2aa6aedd6a7b4.tar.gz
spark-8beae59144827d81491eed385dc2aa6aedd6a7b4.tar.bz2
spark-8beae59144827d81491eed385dc2aa6aedd6a7b4.zip
[SPARK-15149][EXAMPLE][DOC] update kmeans example
## What changes were proposed in this pull request? A Python example for ml.kmeans already exists, but it is not included in the user guide. 1. Small changes such as `example_on` / `example_off`. 2. Add it to the user guide. 3. Update the examples to read the data file directly. ## How was this patch tested? Manual tests: `./bin/spark-submit examples/src/main/python/ml/kmeans_example.py` Author: Zheng RuiFeng <ruifengz@foxmail.com> Closes #12925 from zhengruifeng/km_pe.
Diffstat (limited to 'examples/src/main/scala')
-rw-r--r-- examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala 33
1 files changed, 13 insertions, 20 deletions
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
index 2abd588c6f..2341b36db2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
@@ -21,12 +21,11 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.clustering.KMeans
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.sql.{DataFrame, SparkSession}
// $example off$
+import org.apache.spark.sql.SparkSession
/**
- * An example demonstrating a k-means clustering.
+ * An example demonstrating k-means clustering.
* Run with
* {{{
* bin/run-example ml.KMeansExample
@@ -35,32 +34,26 @@ import org.apache.spark.sql.{DataFrame, SparkSession}
object KMeansExample {
def main(args: Array[String]): Unit = {
- // Creates a Spark context and a SQL context
+ // Creates a SparkSession.
val spark = SparkSession
.builder
.appName(s"${this.getClass.getSimpleName}")
.getOrCreate()
// $example on$
- // Crates a DataFrame
- val dataset: DataFrame = spark.createDataFrame(Seq(
- (1, Vectors.dense(0.0, 0.0, 0.0)),
- (2, Vectors.dense(0.1, 0.1, 0.1)),
- (3, Vectors.dense(0.2, 0.2, 0.2)),
- (4, Vectors.dense(9.0, 9.0, 9.0)),
- (5, Vectors.dense(9.1, 9.1, 9.1)),
- (6, Vectors.dense(9.2, 9.2, 9.2))
- )).toDF("id", "features")
+ // Loads data.
+ val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
- // Trains a k-means model
- val kmeans = new KMeans()
- .setK(2)
- .setFeaturesCol("features")
- .setPredictionCol("prediction")
+ // Trains a k-means model.
+ val kmeans = new KMeans().setK(2).setSeed(1L)
val model = kmeans.fit(dataset)
- // Shows the result
- println("Final Centers: ")
+ // Evaluate clustering by computing Within Set Sum of Squared Errors.
+ val WSSSE = model.computeCost(dataset)
+ println(s"Within Set Sum of Squared Errors = $WSSSE")
+
+ // Shows the result.
+ println("Cluster Centers: ")
model.clusterCenters.foreach(println)
// $example off$