aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2015-06-28 22:38:04 -0700
committerXiangrui Meng <meng@databricks.com>2015-06-28 22:38:04 -0700
commitdfde31da5ce30e0d44cad4fb6618b44d5353d946 (patch)
treeae8506a9c2c79756df8479d89b1cc43f8eb933fe /mllib
parent25f574eb9a3cb9b93b7d9194a8ec16e00ce2c036 (diff)
downloadspark-dfde31da5ce30e0d44cad4fb6618b44d5353d946.tar.gz
spark-dfde31da5ce30e0d44cad4fb6618b44d5353d946.tar.bz2
spark-dfde31da5ce30e0d44cad4fb6618b44d5353d946.zip
[SPARK-5962] [MLLIB] Python support for Power Iteration Clustering
Python support for Power Iteration Clustering https://issues.apache.org/jira/browse/SPARK-5962 Author: Yanbo Liang <ybliang8@gmail.com> Closes #6992 from yanboliang/pyspark-pic and squashes the following commits: 6b03d82 [Yanbo Liang] address comments 4be4423 [Yanbo Liang] Python support for Power Iteration Clustering
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/api/python/PowerIterationClusteringModelWrapper.scala32
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala27
2 files changed, 59 insertions, 0 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PowerIterationClusteringModelWrapper.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PowerIterationClusteringModelWrapper.scala
new file mode 100644
index 0000000000..bc6041b221
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PowerIterationClusteringModelWrapper.scala
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.api.python
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.clustering.PowerIterationClusteringModel
+
+/**
+ * Wraps a PowerIterationClusteringModel so that its assignments can be consumed from Python.
+ */
+private[python] class PowerIterationClusteringModelWrapper(model: PowerIterationClusteringModel)
+  extends PowerIterationClusteringModel(model.k, model.assignments) {
+
+  /** Returns every assignment of the wrapped model as an `Array(id, cluster)` row. */
+  def getAssignments: RDD[Array[Any]] =
+    model.assignments.map(assignment => Array[Any](assignment.id, assignment.cluster))
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index b16903a8d5..a66a404d5c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -407,6 +407,33 @@ private[python] class PythonMLLibAPI extends Serializable {
}
/**
+ * Java stub for Python mllib PowerIterationClustering.run(). This stub returns a
+ * handle to the Java object instead of the content of the Java object. Extra care
+ * needs to be taken in the Python code to ensure it gets freed on exit; see the
+ * Py4J documentation.
+ * @param data an RDD of vectors, each holding one (i, j, s,,ij,,) entry of the affinity matrix.
+ * @param k number of clusters.
+ * @param maxIterations maximum number of iterations of the power iteration loop.
+ * @param initMode the initialization mode. This can be either "random" to use
+ * a random vector as vertex properties, or "degree" to use
+ * normalized sum similarities. Default: random.
+ */
+ def trainPowerIterationClusteringModel(
+     data: JavaRDD[Vector],
+     k: Int,
+     maxIterations: Int,
+     initMode: String): PowerIterationClusteringModel = {
+   val clustering = new PowerIterationClustering()
+     .setK(k)
+     .setMaxIterations(maxIterations)
+     .setInitializationMode(initMode)
+
+   // Each vector is unpacked into the (i, j, similarity) triple passed to run().
+   val similarities = data.rdd.map(entry => (entry(0).toLong, entry(1).toLong, entry(2)))
+   new PowerIterationClusteringModelWrapper(clustering.run(similarities))
+ }
+
+ /**
* Java stub for Python mllib ALS.train(). This stub returns a handle
* to the Java object instead of the content of the Java object. Extra care
* needs to be taken in the Python code to ensure it gets freed on exit; see