From 0cfd2cebde0b7fac3779eda80d6e42223f8a3d9f Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 19 Feb 2015 18:06:16 -0800
Subject: [SPARK-5900][MLLIB] make PIC and FPGrowth Java-friendly

In the previous version, PIC stores clustering assignments as an `RDD[(Long, Int)]`. This is mapped to `RDD<Tuple2<Object, Object>>` in Java and hence Java users have to cast types manually. We should either create a new method called `javaAssignments` that returns `JavaRDD[(java.lang.Long, java.lang.Int)]` or wrap the result pair in a class. I chose the latter approach in this PR. Now assignments are stored as an `RDD[Assignment]`, where `Assignment` is a class with `id` and `cluster`.

Similarly, in FPGrowth, the frequent itemsets are stored as an `RDD[(Array[Item], Long)]`, which is mapped to `RDD<Tuple2<Object, Object>>`. Though we provide a "Java-friendly" method `javaFreqItemsets` that returns `JavaRDD[(Array[Item], java.lang.Long)]`. It doesn't really work because `Array[Item]` is mapped to `Object` in Java. So in this PR I created a class `FreqItemset` to wrap the results. It has `items` and `freq`, as well as a `javaItems` method that returns `List<Item>` in Java.

I'm not certain that the names I chose are proper: `Assignment`/`id`/`cluster` and `FreqItemset`/`items`/`freq`. Please let me know if there are better suggestions.

CC: jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #4695 from mengxr/SPARK-5900 and squashes the following commits:

865b5ca [Xiangrui Meng] make Assignment serializable
cffa96e [Xiangrui Meng] fix test
9c0e590 [Xiangrui Meng] remove unused Tuple2
1b9db3d [Xiangrui Meng] make PIC and FPGrowth Java-friendly
---
 .../apache/spark/mllib/fpm/JavaFPGrowthSuite.java  | 30 ++++++++--------------
 1 file changed, 10 insertions(+), 20 deletions(-)

(limited to 'mllib/src/test/java/org')
diff --git a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java
index 851707c8a1..bd0edf2b9e 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.fpm;
 
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.List;
 
 import org.junit.After;
 import org.junit.Before;
@@ -28,6 +29,7 @@ import static org.junit.Assert.*;
 
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset;
 
 public class JavaFPGrowthSuite implements Serializable {
   private transient JavaSparkContext sc;
@@ -55,30 +57,18 @@ public class JavaFPGrowthSuite implements Serializable {
       Lists.newArrayList("z".split(" ")),
       Lists.newArrayList("x z y r q t p".split(" "))), 2);
 
-    FPGrowth fpg = new FPGrowth();
-
-    FPGrowthModel<String> model6 = fpg
-      .setMinSupport(0.9)
-      .setNumPartitions(1)
-      .run(rdd);
-    assertEquals(0, model6.javaFreqItemsets().count());
-
-    FPGrowthModel<String> model3 = fpg
+    FPGrowthModel<String> model = new FPGrowth()
       .setMinSupport(0.5)
       .setNumPartitions(2)
       .run(rdd);
-    assertEquals(18, model3.javaFreqItemsets().count());
 
-    FPGrowthModel<String> model2 = fpg
-      .setMinSupport(0.3)
-      .setNumPartitions(4)
-      .run(rdd);
-    assertEquals(54, model2.javaFreqItemsets().count());
+    List<FreqItemset<String>> freqItemsets = model.freqItemsets().toJavaRDD().collect();
+    assertEquals(18, freqItemsets.size());
 
-    FPGrowthModel<String> model1 = fpg
-      .setMinSupport(0.1)
-      .setNumPartitions(8)
-      .run(rdd);
-    assertEquals(625, model1.javaFreqItemsets().count());
+    for (FreqItemset<String> itemset: freqItemsets) {
+      // Test return types.
+      List<String> items = itemset.javaItems();
+      long freq = itemset.freq();
+    }
   }
 }
-- 
cgit v1.2.3