aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- data/mllib/pic_data.txt 19
-rw-r--r-- docs/mllib-clustering.md 54
2 files changed, 69 insertions, 4 deletions
diff --git a/data/mllib/pic_data.txt b/data/mllib/pic_data.txt
new file mode 100644
index 0000000000..fcfef8cd19
--- /dev/null
+++ b/data/mllib/pic_data.txt
@@ -0,0 +1,19 @@
+0 1 1.0
+0 2 1.0
+0 3 1.0
+1 2 1.0
+1 3 1.0
+2 3 1.0
+3 4 0.1
+4 5 1.0
+4 15 1.0
+5 6 1.0
+6 7 1.0
+7 8 1.0
+8 9 1.0
+9 10 1.0
+10 11 1.0
+11 12 1.0
+12 13 1.0
+13 14 1.0
+14 15 1.0
diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index dcaa3784be..3aad4149f9 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -327,11 +327,17 @@ which contains the computed clustering assignments.
import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel}
import org.apache.spark.mllib.linalg.Vectors
-val similarities: RDD[(Long, Long, Double)] = ...
+// Load and parse the data
+val data = sc.textFile("data/mllib/pic_data.txt")
+val similarities = data.map { line =>
+ val parts = line.split(' ')
+ (parts(0).toLong, parts(1).toLong, parts(2).toDouble)
+}
+// Cluster the data into two classes using PowerIterationClustering
val pic = new PowerIterationClustering()
- .setK(3)
- .setMaxIterations(20)
+ .setK(2)
+ .setMaxIterations(10)
val model = pic.run(similarities)
model.assignments.foreach { a =>
@@ -363,11 +369,22 @@ import scala.Tuple2;
import scala.Tuple3;
import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.clustering.PowerIterationClustering;
import org.apache.spark.mllib.clustering.PowerIterationClusteringModel;
-JavaRDD<Tuple3<Long, Long, Double>> similarities = ...
+// Load and parse the data
+JavaRDD<String> data = sc.textFile("data/mllib/pic_data.txt");
+JavaRDD<Tuple3<Long, Long, Double>> similarities = data.map(
+ new Function<String, Tuple3<Long, Long, Double>>() {
+ public Tuple3<Long, Long, Double> call(String line) {
+ String[] parts = line.split(" ");
+ return new Tuple3<>(new Long(parts[0]), new Long(parts[1]), new Double(parts[2]));
+ }
+ }
+);
+// Cluster the data into two classes using PowerIterationClustering
PowerIterationClustering pic = new PowerIterationClustering()
.setK(2)
.setMaxIterations(10);
@@ -383,6 +400,35 @@ PowerIterationClusteringModel sameModel = PowerIterationClusteringModel.load(sc.
{% endhighlight %}
</div>
+<div data-lang="python" markdown="1">
+
+[`PowerIterationClustering`](api/python/pyspark.mllib.html#pyspark.mllib.clustering.PowerIterationClustering)
+implements the PIC algorithm.
+It takes an `RDD` of `(srcId: Long, dstId: Long, similarity: Double)` tuples representing the
+affinity matrix.
+Calling `PowerIterationClustering.train` returns a
+[`PowerIterationClusteringModel`](api/python/pyspark.mllib.html#pyspark.mllib.clustering.PowerIterationClusteringModel),
+which contains the computed clustering assignments.
+
+{% highlight python %}
+from __future__ import print_function
+from pyspark.mllib.clustering import PowerIterationClustering, PowerIterationClusteringModel
+
+# Load and parse the data
+data = sc.textFile("data/mllib/pic_data.txt")
+similarities = data.map(lambda line: tuple([float(x) for x in line.split(' ')]))
+
+# Cluster the data into two classes using PowerIterationClustering
+model = PowerIterationClustering.train(similarities, 2, 10)
+
+model.assignments().foreach(lambda x: print(str(x.id) + " -> " + str(x.cluster)))
+
+# Save and load model
+model.save(sc, "myModelPath")
+sameModel = PowerIterationClusteringModel.load(sc, "myModelPath")
+{% endhighlight %}
+</div>
+
</div>
## Latent Dirichlet allocation (LDA)