 docs/mllib-clustering.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index 65ed75b82e..50a8671560 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -48,14 +48,15 @@ optimal *k* is usually one where there is an "elbow" in the WSSSE graph.
{% highlight scala %}
import org.apache.spark.mllib.clustering.KMeans
+import org.apache.spark.mllib.linalg.Vectors
// Load and parse the data
-val data = sc.textFile("kmeans_data.txt")
-val parsedData = data.map( _.split(' ').map(_.toDouble))
+val data = sc.textFile("data/kmeans_data.txt")
+val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
// Cluster the data into two classes using KMeans
-val numIterations = 20
val numClusters = 2
+val numIterations = 20
val clusters = KMeans.train(parsedData, numClusters, numIterations)
// Evaluate clustering by computing Within Set Sum of Squared Errors
@@ -85,12 +86,12 @@ from numpy import array
from math import sqrt
# Load and parse the data
-data = sc.textFile("kmeans_data.txt")
+data = sc.textFile("data/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10,
- runs=30, initialization_mode="random")
+ runs=10, initialization_mode="random")
# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
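
Both hunks break off just before the evaluation code they lead into. For reference, here is a minimal sketch of that step in the Scala spark-shell, assuming the `KMeansModel.computeCost` method and the `data/kmeans_data.txt` path used in the patch (illustration only, not part of this commit):

{% highlight scala %}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

// Load and parse the data, mirroring the patched example above.
val data = sc.textFile("data/kmeans_data.txt")
val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()

// Train a model and evaluate it by computing the Within Set Sum of
// Squared Errors (WSSSE) on the training data.
val clusters = KMeans.train(parsedData, 2, 20)
val WSSSE = clusters.computeCost(parsedData)
println("Within Set Sum of Squared Errors = " + WSSSE)

// To look for the "elbow" mentioned in the hunk header, compute WSSSE for a
// range of k values; the elbow is where the drop in WSSSE levels off.
(1 to 6).foreach { k =>
  val cost = KMeans.train(parsedData, k, 20).computeCost(parsedData)
  println(s"k = $k, WSSSE = $cost")
}
{% endhighlight %}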