[SPARK-5562] [MLLIB] LDA should handle empty document.

See the JIRA https://issues.apache.org/jira/browse/SPARK-5562 Author: Alok Singh <singhal@Aloks-MacBook-Pro.local> Author: Alok Singh <singhal@aloks-mbp.usca.ibm.com> Author: Alok Singh <“singhal@us.ibm.com”> Closes #7064 from aloknsingh/aloknsingh_SPARK-5562 and squashes the following commits: 259a0a7 [Alok Singh] change as per the comments by @jkbradley be48491 [Alok Singh] [SPARK-5562][MLlib] re-order import in alphabetical order c01311b [Alok Singh] [SPARK-5562][MLlib] fix the newline typo b271c8a [Alok Singh] [SPARK-5562][Mllib] As per github discussion with jkbradley. We would like to simplify things. 7c06251 [Alok Singh] [SPARK-5562][MLlib] modified the JavaLDASuite for test passing c710cb6 [Alok Singh] fix the scala code style to have space after : 2572a08 [Alok Singh] [SPARK-5562][MLlib] change the import xyz._ to the import xyz.{c1, c2} .. ab55fbf [Alok Singh] [SPARK-5562][MLlib] Change as per Sean Owen's comments https://github.com/apache/spark/pull/7064/files#diff-9236d23975e6f5a5608ffc81dfd79146 9f4f9ea [Alok Singh] [SPARK-5562][MLlib] LDA should handle empty document.
author: Alok Singh <singhal@Aloks-MacBook-Pro.local> 2015-07-06 21:53:55 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2015-07-06 21:53:55 -0700
commit: 6718c1eb671faaf5c1d865ad5d01dbf78dae9cd2 (patch)
tree: 2ef634b209ddb16a89bcc0a93bd103a14f662522 /mllib/src/test/java
parent: 1821fc165808143e98b3d9626141b1a55bde90ac (diff)
download: spark-6718c1eb671faaf5c1d865ad5d01dbf78dae9cd2.tar.gz
spark-6718c1eb671faaf5c1d865ad5d01dbf78dae9cd2.tar.bz2
spark-6718c1eb671faaf5c1d865ad5d01dbf78dae9cd2.zip
1 files changed, 11 insertions, 2 deletions
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
index 581c033f08..b48f190f59 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
@@ -28,12 +28,13 @@ import static org.junit.Assert.assertArrayEquals;
 import org.junit.Before;
 import org.junit.Test;
 
+import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.mllib.linalg.Matrix;
 import org.apache.spark.mllib.linalg.Vector;
-
+import org.apache.spark.mllib.linalg.Vectors;
 
 public class JavaLDASuite implements Serializable {
   private transient JavaSparkContext sc;
@@ -110,7 +111,15 @@ public class JavaLDASuite implements Serializable {
 
     // Check: topic distributions
     JavaPairRDD<Long, Vector> topicDistributions = model.javaTopicDistributions();
-    assertEquals(topicDistributions.count(), corpus.count());
+    // SPARK-5562. since the topicDistribution returns the distribution of the non empty docs
+    // over topics. Compare it against nonEmptyCorpus instead of corpus
+    JavaPairRDD<Long, Vector> nonEmptyCorpus = corpus.filter(
+      new Function<Tuple2<Long, Vector>, Boolean>() {
+        public Boolean call(Tuple2<Long, Vector> tuple2) {
+          return Vectors.norm(tuple2._2(), 1.0) != 0.0;
+        }
+    });
+    assertEquals(topicDistributions.count(), nonEmptyCorpus.count());
   }
 
   @Test
author	Alok Singh <singhal@Aloks-MacBook-Pro.local>	2015-07-06 21:53:55 -0700
committer	Joseph K. Bradley <joseph@databricks.com>	2015-07-06 21:53:55 -0700
commit	6718c1eb671faaf5c1d865ad5d01dbf78dae9cd2 (patch)
tree	2ef634b209ddb16a89bcc0a93bd103a14f662522 /mllib/src/test/java
parent	1821fc165808143e98b3d9626141b1a55bde90ac (diff)
download	spark-6718c1eb671faaf5c1d865ad5d01dbf78dae9cd2.tar.gz spark-6718c1eb671faaf5c1d865ad5d01dbf78dae9cd2.tar.bz2 spark-6718c1eb671faaf5c1d865ad5d01dbf78dae9cd2.zip