From 0a468a46bf5b905e9b0205e98b862570b2ac556e Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 2 Jul 2015 09:59:54 -0700 Subject: [SPARK-8758] [MLLIB] Add Python user guide for PowerIterationClustering Add Python user guide for PowerIterationClustering Author: Yanbo Liang Closes #7155 from yanboliang/spark-8758 and squashes the following commits: 18d803b [Yanbo Liang] address comments dd29577 [Yanbo Liang] Add Python user guide for PowerIterationClustering --- docs/mllib-clustering.md | 54 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 4 deletions(-) (limited to 'docs') diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index dcaa3784be..3aad4149f9 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -327,11 +327,17 @@ which contains the computed clustering assignments. import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel} import org.apache.spark.mllib.linalg.Vectors -val similarities: RDD[(Long, Long, Double)] = ... +// Load and parse the data +val data = sc.textFile("data/mllib/pic_data.txt") +val similarities = data.map { line => + val parts = line.split(' ') + (parts(0).toLong, parts(1).toLong, parts(2).toDouble) +} +// Cluster the data into two classes using PowerIterationClustering val pic = new PowerIterationClustering() - .setK(3) - .setMaxIterations(20) + .setK(2) + .setMaxIterations(10) val model = pic.run(similarities) model.assignments.foreach { a => @@ -363,11 +369,22 @@ import scala.Tuple2; import scala.Tuple3; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.clustering.PowerIterationClustering; import org.apache.spark.mllib.clustering.PowerIterationClusteringModel; -JavaRDD> similarities = ... +// Load and parse the data +JavaRDD data = sc.textFile("data/mllib/pic_data.txt"); +JavaRDD> similarities = data.map( + new Function>() { + public Tuple3 call(String line) { + String[] parts = line.split(" "); + return new Tuple3<>(new Long(parts[0]), new Long(parts[1]), new Double(parts[2])); + } + } +); +// Cluster the data into two classes using PowerIterationClustering PowerIterationClustering pic = new PowerIterationClustering() .setK(2) .setMaxIterations(10); @@ -383,6 +400,35 @@ PowerIterationClusteringModel sameModel = PowerIterationClusteringModel.load(sc. {% endhighlight %} +
+ +[`PowerIterationClustering`](api/python/pyspark.mllib.html#pyspark.mllib.clustering.PowerIterationClustering) +implements the PIC algorithm. +It takes an `RDD` of `(srcId: Long, dstId: Long, similarity: Double)` tuples representing the +affinity matrix. +Calling `PowerIterationClustering.run` returns a +[`PowerIterationClusteringModel`](api/python/pyspark.mllib.html#pyspark.mllib.clustering.PowerIterationClustering), +which contains the computed clustering assignments. + +{% highlight python %} +from __future__ import print_function +from pyspark.mllib.clustering import PowerIterationClustering, PowerIterationClusteringModel + +# Load and parse the data +data = sc.textFile("data/mllib/pic_data.txt") +similarities = data.map(lambda line: tuple([float(x) for x in line.split(' ')])) + +# Cluster the data into two classes using PowerIterationClustering +model = PowerIterationClustering.train(similarities, 2, 10) + +model.assignments().foreach(lambda x: print(str(x.id) + " -> " + str(x.cluster))) + +# Save and load model +model.save(sc, "myModelPath") +sameModel = PowerIterationClusteringModel.load(sc, "myModelPath") +{% endhighlight %} +
+ ## Latent Dirichlet allocation (LDA) -- cgit v1.2.3