author     Xiangrui Meng <meng@databricks.com>  2015-02-19 18:06:16 -0800
committer  Xiangrui Meng <meng@databricks.com>  2015-02-19 18:06:16 -0800
commit     0cfd2cebde0b7fac3779eda80d6e42223f8a3d9f (patch)
tree       36bdfdec69a205b85f7b85697c36abf2044d9ff5 /examples
parent     6bddc40353057a562c78e75c5549c79a0d7d5f8b (diff)
download   spark-0cfd2cebde0b7fac3779eda80d6e42223f8a3d9f.tar.gz
           spark-0cfd2cebde0b7fac3779eda80d6e42223f8a3d9f.tar.bz2
           spark-0cfd2cebde0b7fac3779eda80d6e42223f8a3d9f.zip
[SPARK-5900][MLLIB] make PIC and FPGrowth Java-friendly
In the previous version, PIC stores clustering assignments as an `RDD[(Long, Int)]`. This is mapped to `RDD<Tuple2<Object, Object>>` in Java, so Java users have to cast types manually. We should either create a new method called `javaAssignments` that returns `JavaRDD[(java.lang.Long, java.lang.Int)]` or wrap the result pair in a class. I chose the latter approach in this PR. Now assignments are stored as an `RDD[Assignment]`, where `Assignment` is a class with `id` and `cluster`.

Similarly, in FPGrowth, the frequent itemsets are stored as an `RDD[(Array[Item], Long)]`, which is mapped to `RDD<Tuple2<Object, Object>>`. Though we provide a "Java-friendly" method `javaFreqItemsets` that returns `JavaRDD[(Array[Item], java.lang.Long)]`, it doesn't really work because `Array[Item]` is mapped to `Object` in Java. So in this PR I created a class `FreqItemset` to wrap the results. It has `items` and `freq`, as well as a `javaItems` method that returns `List<Item>` in Java.

I'm not certain that the names I chose are proper: `Assignment`/`id`/`cluster` and `FreqItemset`/`items`/`freq`. Please let me know if there are better suggestions.

CC: jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #4695 from mengxr/SPARK-5900 and squashes the following commits:

865b5ca [Xiangrui Meng] make Assignment serializable
cffa96e [Xiangrui Meng] fix test
9c0e590 [Xiangrui Meng] remove unused Tuple2
1b9db3d [Xiangrui Meng] make PIC and FPGrowth Java-friendly
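For context, here is a minimal Scala sketch of what the two wrapper classes described above might look like. The actual definitions live in `PowerIterationClustering` and `FPGrowth` in MLlib; the modifiers and conversion details below are illustrative assumptions, not the committed code.

import scala.collection.JavaConverters._

// Wraps the (id, cluster) pair so Java callers get typed accessors
// instead of Tuple2<Object, Object>; serializable per commit 865b5ca.
case class Assignment(id: Long, cluster: Int) extends Serializable

// Wraps the (items, freq) pair; javaItems exposes the item array as a
// java.util.List so Java callers avoid the Object-typed Array[Item].
class FreqItemset[Item](val items: Array[Item], val freq: Long) extends Serializable {
  def javaItems: java.util.List[Item] = items.toSeq.asJava
}

With wrappers like these, the Java examples below can iterate over `a.id()`/`a.cluster()` and `s.javaItems()`/`s.freq()` directly, with no casts from `Object`.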
Diffstat (limited to 'examples')
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java                | 8
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java | 5
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala                  | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala  | 8
4 files changed, 9 insertions, 16 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java
index 0db572d760..f50e802cf6 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java
@@ -18,10 +18,8 @@
package org.apache.spark.examples.mllib;
import java.util.ArrayList;
-import java.util.Arrays;
-
-import scala.Tuple2;
+import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import org.apache.spark.SparkConf;
@@ -54,8 +52,8 @@ public class JavaFPGrowthExample {
.setMinSupport(0.3);
FPGrowthModel<String> model = fpg.run(transactions);
- for (Tuple2<Object, Long> s: model.javaFreqItemsets().collect()) {
- System.out.println(Arrays.toString((Object[]) s._1()) + ", " + s._2());
+ for (FPGrowth.FreqItemset<String> s: model.freqItemsets().toJavaRDD().collect()) {
+ System.out.println("[" + Joiner.on(",").join(s.javaItems()) + "], " + s.freq());
}
sc.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java
index e9371de39f..6c6f9768f0 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java
@@ -17,7 +17,6 @@
package org.apache.spark.examples.mllib;
-import scala.Tuple2;
import scala.Tuple3;
import com.google.common.collect.Lists;
@@ -49,8 +48,8 @@ public class JavaPowerIterationClusteringExample {
.setMaxIterations(10);
PowerIterationClusteringModel model = pic.run(similarities);
- for (Tuple2<Object, Object> assignment: model.assignments().toJavaRDD().collect()) {
- System.out.println(assignment._1() + " -> " + assignment._2());
+ for (PowerIterationClustering.Assignment a: model.assignments().toJavaRDD().collect()) {
+ System.out.println(a.id() + " -> " + a.cluster());
}
sc.stop();
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala
index ae66107d70..aaae275ec5 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala
@@ -42,8 +42,8 @@ object FPGrowthExample {
.setMinSupport(0.3)
val model = fpg.run(transactions)
- model.freqItemsets.collect().foreach { case (itemset, freq) =>
- println(itemset.mkString("[", ",", "]") + ", " + freq)
+ model.freqItemsets.collect().foreach { itemset =>
+ println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
}
sc.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala
index b2373adba1..91c9772744 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala
@@ -44,8 +44,7 @@ import org.apache.spark.{SparkConf, SparkContext}
*
* Here is a sample run and output:
*
- * ./bin/run-example mllib.PowerIterationClusteringExample
- * -k 3 --n 30 --maxIterations 15
+ * ./bin/run-example mllib.PowerIterationClusteringExample -k 3 --n 30 --maxIterations 15
*
* Cluster assignments: 1 -> [0,1,2,3,4],2 -> [5,6,7,8,9,10,11,12,13,14],
* 0 -> [15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]
@@ -103,7 +102,7 @@ object PowerIterationClusteringExample {
.setMaxIterations(params.maxIterations)
.run(circlesRdd)
- val clusters = model.assignments.collect.groupBy(_._2).mapValues(_.map(_._1))
+ val clusters = model.assignments.collect().groupBy(_.cluster).mapValues(_.map(_.id))
val assignments = clusters.toList.sortBy { case (k, v) => v.length}
val assignmentsStr = assignments
.map { case (k, v) =>
@@ -153,8 +152,5 @@ object PowerIterationClusteringExample {
val expCoeff = -1.0 / 2.0 * math.pow(sigma, 2.0)
val ssquares = (p1._1 - p2._1) * (p1._1 - p2._1) + (p1._2 - p2._2) * (p1._2 - p2._2)
coeff * math.exp(expCoeff * ssquares)
- // math.exp((p1._1 - p2._1) * (p1._1 - p2._1) + (p1._2 - p2._2) * (p1._2 - p2._2))
}
-
-
}