[SPARK-5962] [MLLIB] Python support for Power Iteration Clustering

Python support for Power Iteration Clustering
https://issues.apache.org/jira/browse/SPARK-5962

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #6992 from yanboliang/pyspark-pic and squashes the following commits:

6b03d82 [Yanbo Liang] address comments
4be4423 [Yanbo Liang] Python support for Power Iteration Clustering
author: Yanbo Liang <ybliang8@gmail.com> 2015-06-28 22:38:04 -0700
committer: Xiangrui Meng <meng@databricks.com> 2015-06-28 22:38:04 -0700
commit: dfde31da5ce30e0d44cad4fb6618b44d5353d946 (patch)
tree: ae8506a9c2c79756df8479d89b1cc43f8eb933fe /mllib
parent: 25f574eb9a3cb9b93b7d9194a8ec16e00ce2c036 (diff)
download: spark-dfde31da5ce30e0d44cad4fb6618b44d5353d946.tar.gz
spark-dfde31da5ce30e0d44cad4fb6618b44d5353d946.tar.bz2
spark-dfde31da5ce30e0d44cad4fb6618b44d5353d946.zip
2 files changed, 59 insertions, 0 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PowerIterationClusteringModelWrapper.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PowerIterationClusteringModelWrapper.scala
new file mode 100644
index 0000000000..bc6041b221
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PowerIterationClusteringModelWrapper.scala
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.api.python
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.clustering.PowerIterationClusteringModel
+
+/**
+ * A wrapper of PowerIterationClusteringModel that provides helper methods for Python.
+ */
+private[python] class PowerIterationClusteringModelWrapper(model: PowerIterationClusteringModel)
+  extends PowerIterationClusteringModel(model.k, model.assignments) {
+
+  def getAssignments: RDD[Array[Any]] = {
+    model.assignments.map(x => Array(x.id, x.cluster))
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index b16903a8d5..a66a404d5c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -407,6 +407,33 @@ private[python] class PythonMLLibAPI extends Serializable {
   }
 
   /**
+   * Java stub for Python mllib PowerIterationClustering.run(). This stub returns a handle to
+   * the Java object instead of the content of the Java object. Extra care needs to be taken in
+   * the Python code to ensure it gets freed on exit; see the Py4J documentation.
+   * @param data an RDD of (i, j, s,,ij,,) vectors representing the affinity matrix.
+   * @param k number of clusters.
+   * @param maxIterations maximum number of iterations of the power iteration loop.
+   * @param initMode the initialization mode. This can be either "random" to use
+   *                 a random vector as vertex properties, or "degree" to use
+   *                 normalized sum similarities. Default: random.
+   * @return the model returned by run(), wrapped in a PowerIterationClusteringModelWrapper.
+   */
+  def trainPowerIterationClusteringModel(
+      data: JavaRDD[Vector],
+      k: Int,
+      maxIterations: Int,
+      initMode: String): PowerIterationClusteringModel = {
+
+    val pic = new PowerIterationClustering()
+      .setK(k)
+      .setMaxIterations(maxIterations)
+      .setInitializationMode(initMode)
+
+    val model = pic.run(data.rdd.map(v => (v(0).toLong, v(1).toLong, v(2))))
+    new PowerIterationClusteringModelWrapper(model)
+  }
+
+  /**
    * Java stub for Python mllib ALS.train().  This stub returns a handle
    * to the Java object instead of the content of the Java object.  Extra care
    * needs to be taken in the Python code to ensure it gets freed on exit; see
author	Yanbo Liang <ybliang8@gmail.com>	2015-06-28 22:38:04 -0700
committer	Xiangrui Meng <meng@databricks.com>	2015-06-28 22:38:04 -0700
commit	dfde31da5ce30e0d44cad4fb6618b44d5353d946 (patch)
tree	ae8506a9c2c79756df8479d89b1cc43f8eb933fe /mllib
parent	25f574eb9a3cb9b93b7d9194a8ec16e00ce2c036 (diff)
download	spark-dfde31da5ce30e0d44cad4fb6618b44d5353d946.tar.gz spark-dfde31da5ce30e0d44cad4fb6618b44d5353d946.tar.bz2 spark-dfde31da5ce30e0d44cad4fb6618b44d5353d946.zip