aboutsummaryrefslogtreecommitdiff
path: root/examples
diff options
context:
space:
mode:
authorYu ISHIKAWA <yuu.ishikawa@gmail.com>2015-12-16 10:55:42 -0800
committerJoseph K. Bradley <joseph@databricks.com>2015-12-16 10:55:42 -0800
commit7b6dc29d0ebbfb3bb941130f8542120b6bc3e234 (patch)
tree94970c4bfb67f129f2e580542276f623426e8625 /examples
parentad8c1f0b840284d05da737fb2cc5ebf8848f4490 (diff)
downloadspark-7b6dc29d0ebbfb3bb941130f8542120b6bc3e234.tar.gz
spark-7b6dc29d0ebbfb3bb941130f8542120b6bc3e234.tar.bz2
spark-7b6dc29d0ebbfb3bb941130f8542120b6bc3e234.zip
[SPARK-6518][MLLIB][EXAMPLE][DOC] Add example code and user guide for bisecting k-means
This PR includes only an example code in order to finish it quickly. I'll send another PR for the docs soon. Author: Yu ISHIKAWA <yuu.ishikawa@gmail.com> Closes #9952 from yu-iskw/SPARK-6518.
Diffstat (limited to 'examples')
-rw-r--r--examples/src/main/java/org/apache/spark/examples/mllib/JavaBisectingKMeansExample.java69
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/mllib/BisectingKMeansExample.scala60
2 files changed, 129 insertions, 0 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBisectingKMeansExample.java
new file mode 100644
index 0000000000..0001500f4f
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBisectingKMeansExample.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import java.util.ArrayList;
+
+// $example on$
+import com.google.common.collect.Lists;
+// $example off$
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.clustering.BisectingKMeans;
+import org.apache.spark.mllib.clustering.BisectingKMeansModel;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+// $example off$
+
+/**
+ * Java example for bisecting k-means clustering in spark.mllib: asks for four
+ * clusters over eight 2-D points and prints the cost and each cluster center.
+ */
+public class JavaBisectingKMeansExample {
+  public static void main(String[] args) {
+    SparkConf sparkConf = new SparkConf().setAppName("JavaBisectingKMeansExample");
+    JavaSparkContext sc = new JavaSparkContext(sparkConf);
+
+    // $example on$
+    // Eight 2-D points arranged as four well-separated pairs.
+    ArrayList<Vector> localData = Lists.newArrayList(
+      Vectors.dense(0.1, 0.1),   Vectors.dense(0.3, 0.3),
+      Vectors.dense(10.1, 10.1), Vectors.dense(10.3, 10.3),
+      Vectors.dense(20.1, 20.1), Vectors.dense(20.3, 20.3),
+      Vectors.dense(30.1, 30.1), Vectors.dense(30.3, 30.3)
+    );
+    JavaRDD<Vector> data = sc.parallelize(localData, 2);
+
+    BisectingKMeans bkm = new BisectingKMeans().setK(4);
+    BisectingKMeansModel model = bkm.run(data);
+
+    System.out.println("Compute Cost: " + model.computeCost(data));
+
+    // Fetch the centers once, then print each one together with its index.
+    Vector[] clusterCenters = model.clusterCenters();
+    for (int i = 0; i < clusterCenters.length; i++) {
+      Vector clusterCenter = clusterCenters[i];
+      System.out.println("Cluster Center " + i + ": " + clusterCenter);
+    }
+    // $example off$
+
+    sc.stop();
+  }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BisectingKMeansExample.scala
new file mode 100644
index 0000000000..3a596cccb8
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BisectingKMeansExample.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib
+
+// scalastyle:off println
+// $example on$
+import org.apache.spark.mllib.clustering.BisectingKMeans
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
+// $example off$
+import org.apache.spark.{SparkConf, SparkContext}
+
+/**
+ * An example demonstrating bisecting k-means clustering in spark.mllib.
+ *
+ * Run with
+ * {{{
+ * bin/run-example mllib.BisectingKMeansExample
+ * }}}
+ */
+object BisectingKMeansExample {
+
+  def main(args: Array[String]): Unit = {
+    val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample")
+    val sc = new SparkContext(sparkConf)
+
+    // $example on$
+    // Load the data; each line holds space-separated doubles forming one dense vector.
+    def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble))
+    val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache()
+
+    // Cluster the data into 6 clusters with BisectingKMeans.
+    val bkm = new BisectingKMeans().setK(6)
+    val model = bkm.run(data)
+
+    // Show the compute cost and the cluster centers.
+    println(s"Compute Cost: ${model.computeCost(data)}")
+    model.clusterCenters.zipWithIndex.foreach { case (center, idx) =>
+      println(s"Cluster Center ${idx}: ${center}")
+    }
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println