author     Yanbo Liang <ybliang8@gmail.com>       2015-07-02 09:59:54 -0700
committer  Xiangrui Meng <meng@databricks.com>    2015-07-02 09:59:54 -0700
commit     0a468a46bf5b905e9b0205e98b862570b2ac556e (patch)
tree       fe4978bada82dea2cf0904cd02891de3503ef728 /docs
parent     99c40cd0d8465525cac34dfa373b81532ef3d719 (diff)
[SPARK-8758] [MLLIB] Add Python user guide for PowerIterationClustering
Add Python user guide for PowerIterationClustering

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7155 from yanboliang/spark-8758 and squashes the following commits:

18d803b [Yanbo Liang] address comments
dd29577 [Yanbo Liang] Add Python user guide for PowerIterationClustering
Diffstat (limited to 'docs')
-rw-r--r--  docs/mllib-clustering.md  54
1 file changed, 50 insertions, 4 deletions
diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index dcaa3784be..3aad4149f9 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -327,11 +327,17 @@ which contains the computed clustering assignments.
import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel}
import org.apache.spark.mllib.linalg.Vectors
-val similarities: RDD[(Long, Long, Double)] = ...
+// Load and parse the data
+val data = sc.textFile("data/mllib/pic_data.txt")
+val similarities = data.map { line =>
+ val parts = line.split(' ')
+ (parts(0).toLong, parts(1).toLong, parts(2).toDouble)
+}
+// Cluster the data into two classes using PowerIterationClustering
val pic = new PowerIterationClustering()
- .setK(3)
- .setMaxIterations(20)
+ .setK(2)
+ .setMaxIterations(10)
val model = pic.run(similarities)
model.assignments.foreach { a =>
@@ -363,11 +369,22 @@ import scala.Tuple2;
import scala.Tuple3;
import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.clustering.PowerIterationClustering;
import org.apache.spark.mllib.clustering.PowerIterationClusteringModel;
-JavaRDD<Tuple3<Long, Long, Double>> similarities = ...
+// Load and parse the data
+JavaRDD<String> data = sc.textFile("data/mllib/pic_data.txt");
+JavaRDD<Tuple3<Long, Long, Double>> similarities = data.map(
+ new Function<String, Tuple3<Long, Long, Double>>() {
+ public Tuple3<Long, Long, Double> call(String line) {
+ String[] parts = line.split(" ");
+ return new Tuple3<>(new Long(parts[0]), new Long(parts[1]), new Double(parts[2]));
+ }
+ }
+);
+// Cluster the data into two classes using PowerIterationClustering
PowerIterationClustering pic = new PowerIterationClustering()
.setK(2)
.setMaxIterations(10);
@@ -383,6 +400,35 @@ PowerIterationClusteringModel sameModel = PowerIterationClusteringModel.load(sc.
{% endhighlight %}
</div>
+<div data-lang="python" markdown="1">
+
+[`PowerIterationClustering`](api/python/pyspark.mllib.html#pyspark.mllib.clustering.PowerIterationClustering)
+implements the PIC algorithm.
+It takes an `RDD` of `(srcId: Long, dstId: Long, similarity: Double)` tuples representing the
+affinity matrix.
+Calling `PowerIterationClustering.run` returns a
+[`PowerIterationClusteringModel`](api/python/pyspark.mllib.html#pyspark.mllib.clustering.PowerIterationClusteringModel),
+which contains the computed clustering assignments.
+
+{% highlight python %}
+from __future__ import print_function
+from pyspark.mllib.clustering import PowerIterationClustering, PowerIterationClusteringModel
+
+# Load and parse the data
+data = sc.textFile("data/mllib/pic_data.txt")
+similarities = data.map(lambda line: tuple([float(x) for x in line.split(' ')]))
+
+# Cluster the data into two classes using PowerIterationClustering
+model = PowerIterationClustering.train(similarities, 2, 10)
+
+model.assignments().foreach(lambda x: print(str(x.id) + " -> " + str(x.cluster)))
+
+# Save and load model
+model.save(sc, "myModelPath")
+sameModel = PowerIterationClusteringModel.load(sc, "myModelPath")
+{% endhighlight %}
+</div>
+
</div>
## Latent Dirichlet allocation (LDA)
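
For quick local testing of the Python API documented above, here is a minimal, self-contained sketch. It is not part of the commit: the local `SparkContext` setup and the small hand-built edge list (standing in for `data/mllib/pic_data.txt`) are assumptions for illustration only.

{% highlight python %}
from __future__ import print_function

from pyspark import SparkContext
from pyspark.mllib.clustering import PowerIterationClustering

sc = SparkContext("local", "PICSketch")

# Each tuple is (srcId, dstId, similarity), the same triples the guide's
# examples parse out of data/mllib/pic_data.txt.
edges = [
    (0, 1, 1.0), (0, 2, 1.0), (1, 2, 1.0),  # first tightly connected triangle
    (3, 4, 1.0), (4, 5, 1.0), (3, 5, 1.0),  # second tightly connected triangle
    (2, 3, 0.1),                            # weak link joining the two groups
]
similarities = sc.parallelize(edges)

# k = 2 clusters, 10 power iterations, matching the documented example.
model = PowerIterationClustering.train(similarities, 2, 10)

# Collect to the driver before printing so the output is visible locally.
for a in model.assignments().collect():
    print(str(a.id) + " -> " + str(a.cluster))

sc.stop()
{% endhighlight %}

With non-negative similarities like these, PIC should place the two triangles in different clusters; the weak (2, 3, 0.1) edge keeps the affinity graph connected without merging the groups.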