From 0a468a46bf5b905e9b0205e98b862570b2ac556e Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Thu, 2 Jul 2015 09:59:54 -0700
Subject: [SPARK-8758] [MLLIB] Add Python user guide for
 PowerIterationClustering

Add Python user guide for PowerIterationClustering

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7155 from yanboliang/spark-8758 and squashes the following commits:

18d803b [Yanbo Liang] address comments
dd29577 [Yanbo Liang] Add Python user guide for PowerIterationClustering
---
 data/mllib/pic_data.txt  | 19 +++++++++++++++++
 docs/mllib-clustering.md | 54 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 69 insertions(+), 4 deletions(-)
 create mode 100644 data/mllib/pic_data.txt
diff --git a/data/mllib/pic_data.txt b/data/mllib/pic_data.txt
new file mode 100644
index 0000000000..fcfef8cd19
--- /dev/null
+++ b/data/mllib/pic_data.txt
@@ -0,0 +1,19 @@
+0 1 1.0
+0 2 1.0
+0 3 1.0
+1 2 1.0
+1 3 1.0
+2 3 1.0
+3 4 0.1
+4 5 1.0
+4 15 1.0
+5 6 1.0
+6 7 1.0
+7 8 1.0
+8 9 1.0
+9 10 1.0
+10 11 1.0
+11 12 1.0
+12 13 1.0
+13 14 1.0
+14 15 1.0
diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index dcaa3784be..3aad4149f9 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -327,11 +327,17 @@ which contains the computed clustering assignments.
 import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel}
 import org.apache.spark.mllib.linalg.Vectors
 
-val similarities: RDD[(Long, Long, Double)] = ...
+// Load and parse the data
+val data = sc.textFile("data/mllib/pic_data.txt")
+val similarities = data.map { line =>
+  val parts = line.split(' ')
+  (parts(0).toLong, parts(1).toLong, parts(2).toDouble)
+}
 
+// Cluster the data into two classes using PowerIterationClustering
 val pic = new PowerIterationClustering()
-  .setK(3)
-  .setMaxIterations(20)
+  .setK(2)
+  .setMaxIterations(10)
 val model = pic.run(similarities)
 
 model.assignments.foreach { a =>
@@ -363,11 +369,22 @@ import scala.Tuple2;
 import scala.Tuple3;
 
 import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.Function;
 import org.apache.spark.mllib.clustering.PowerIterationClustering;
 import org.apache.spark.mllib.clustering.PowerIterationClusteringModel;
 
-JavaRDD<Tuple3<Long, Long, Double>> similarities = ...
+// Load and parse the data
+JavaRDD<String> data = sc.textFile("data/mllib/pic_data.txt");
+JavaRDD<Tuple3<Long, Long, Double>> similarities = data.map(
+  new Function<String, Tuple3<Long, Long, Double>>() {
+    public Tuple3<Long, Long, Double> call(String line) {
+      String[] parts = line.split(" ");
+      return new Tuple3<>(new Long(parts[0]), new Long(parts[1]), new Double(parts[2]));
+    }
+  }
+);
 
+// Cluster the data into two classes using PowerIterationClustering
 PowerIterationClustering pic = new PowerIterationClustering()
   .setK(2)
   .setMaxIterations(10);
@@ -383,6 +400,35 @@ PowerIterationClusteringModel sameModel = PowerIterationClusteringModel.load(sc.
 {% endhighlight %}
 </div>
 
+<div data-lang="python" markdown="1">
+
+[`PowerIterationClustering`](api/python/pyspark.mllib.html#pyspark.mllib.clustering.PowerIterationClustering)
+implements the PIC algorithm.
+It takes an `RDD` of `(srcId: Long, dstId: Long, similarity: Double)` tuples representing the
+affinity matrix.
+Calling `PowerIterationClustering.train` returns a
+[`PowerIterationClusteringModel`](api/python/pyspark.mllib.html#pyspark.mllib.clustering.PowerIterationClusteringModel),
+which contains the computed clustering assignments.
+
+{% highlight python %}
+from __future__ import print_function
+from pyspark.mllib.clustering import PowerIterationClustering, PowerIterationClusteringModel
+
+# Load and parse the data
+data = sc.textFile("data/mllib/pic_data.txt")
+similarities = data.map(lambda line: tuple([float(x) for x in line.split(' ')]))
+
+# Cluster the data into two classes using PowerIterationClustering
+model = PowerIterationClustering.train(similarities, 2, 10)
+
+model.assignments().foreach(lambda x: print(str(x.id) + " -> " + str(x.cluster)))
+
+# Save and load model
+model.save(sc, "myModelPath")
+sameModel = PowerIterationClusteringModel.load(sc, "myModelPath")
+{% endhighlight %}
+</div>
+
 </div>
 
 ## Latent Dirichlet allocation (LDA)
-- 
cgit v1.2.3