about | summary | refs | log | tree | commit | diff
path: root/mllib
diff options
context:
space:
mode:
authorMeihua Wu <meihuawu@umich.edu>2015-07-30 08:52:01 -0700
committerXiangrui Meng <meng@databricks.com>2015-07-30 08:52:01 -0700
commita6e53a9c8b24326d1b6dca7a0e36ce6c643daa77 (patch)
tree0d127dcfe7e6589dbf4fda9c5925e1f008a4b084 /mllib
parent9c0501c5d04d83ca25ce433138bf64df6a14dc58 (diff)
downloadspark-a6e53a9c8b24326d1b6dca7a0e36ce6c643daa77.tar.gz
spark-a6e53a9c8b24326d1b6dca7a0e36ce6c643daa77.tar.bz2
spark-a6e53a9c8b24326d1b6dca7a0e36ce6c643daa77.zip
[SPARK-9225] [MLLIB] LDASuite needs unit tests for empty documents
Add unit tests for running LDA with empty documents. Both EMLDAOptimizer and OnlineLDAOptimizer are tested.

feynmanliang

Author: Meihua Wu <meihuawu@umich.edu>

Closes #7620 from rotationsymmetry/SPARK-9225 and squashes the following commits:

3ed7c88 [Meihua Wu] Incorporate reviewer's further comments
f9432e8 [Meihua Wu] Incorporate reviewer's comments
8e1b9ec [Meihua Wu] Merge remote-tracking branch 'upstream/master' into SPARK-9225
ad55665 [Meihua Wu] Add unit tests for running LDA with empty documents
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala | 40
1 file changed, 40 insertions, 0 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index b91c7cefed..61d2edfd9f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -390,6 +390,46 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
}
}
+ test("EMLDAOptimizer with empty docs") {
+ val vocabSize = 6
+ val emptyDocsArray = Array.fill(6)(Vectors.sparse(vocabSize, Array.empty, Array.empty))
+ val emptyDocs = emptyDocsArray
+ .zipWithIndex.map { case (wordCounts, docId) =>
+ (docId.toLong, wordCounts)
+ }
+ val distributedEmptyDocs = sc.parallelize(emptyDocs, 2)
+
+ val op = new EMLDAOptimizer()
+ val lda = new LDA()
+ .setK(3)
+ .setMaxIterations(5)
+ .setSeed(12345)
+ .setOptimizer(op)
+
+ val model = lda.run(distributedEmptyDocs)
+ assert(model.vocabSize === vocabSize)
+ }
+
+ test("OnlineLDAOptimizer with empty docs") {
+ val vocabSize = 6
+ val emptyDocsArray = Array.fill(6)(Vectors.sparse(vocabSize, Array.empty, Array.empty))
+ val emptyDocs = emptyDocsArray
+ .zipWithIndex.map { case (wordCounts, docId) =>
+ (docId.toLong, wordCounts)
+ }
+ val distributedEmptyDocs = sc.parallelize(emptyDocs, 2)
+
+ val op = new OnlineLDAOptimizer()
+ val lda = new LDA()
+ .setK(3)
+ .setMaxIterations(5)
+ .setSeed(12345)
+ .setOptimizer(op)
+
+ val model = lda.run(distributedEmptyDocs)
+ assert(model.vocabSize === vocabSize)
+ }
+
}
private[clustering] object LDASuite {