[SPARK-6793] [MLLIB] OnlineLDAOptimizer LDA perplexity

Implements `logPerplexity` in `OnlineLDAOptimizer`. Also refactors inference code into companion object to enable future reuse (e.g. `predict` method). Author: Feynman Liang <fliang@databricks.com> Closes #7705 from feynmanliang/SPARK-6793-perplexity and squashes the following commits: 6da2c99 [Feynman Liang] Remove get* from LDAModel public API 8381da6 [Feynman Liang] Code review comments 17f7000 [Feynman Liang] Documentation typo fixes 2f452a4 [Feynman Liang] Remove auxillary DistributedLDAModel constructor a275914 [Feynman Liang] Prevent empty counts calls to variationalInference 06d02d9 [Feynman Liang] Remove deprecated LocalLDAModel constructor afecb46 [Feynman Liang] Fix regression bug in sstats accumulator 5a327a0 [Feynman Liang] Code review quick fixes 998c03e [Feynman Liang] Fix style 1cbb67d [Feynman Liang] Fix access modifier bug 4362daa [Feynman Liang] Organize imports 4f171f7 [Feynman Liang] Fix indendation 2f049ce [Feynman Liang] Fix failing save/load tests 7415e96 [Feynman Liang] Pick changes from big PR 11e7c33 [Feynman Liang] Merge remote-tracking branch 'apache/master' into SPARK-6793-perplexity f8adc48 [Feynman Liang] Add logPerplexity, refactor variationalBound into a method cd521d6 [Feynman Liang] Refactor methods into companion class 7f62a55 [Feynman Liang] --amend c62cb1e [Feynman Liang] Outer product for stats, revert Range slicing aead650 [Feynman Liang] Range slice, in-place update, reduce transposes
author: Feynman Liang <fliang@databricks.com> 2015-07-29 16:20:20 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2015-07-29 16:20:20 -0700
commit: 2cc212d56a1d50fe68d5816f71b27803de1f6389 (patch)
tree: 57246612d422ea82e592c4079fe43f16e9bdfb84 /mllib/src/test
parent: 1b0099fc62d02ff6216a76fbfe17a4ec5b2f3536 (diff)
download: spark-2cc212d56a1d50fe68d5816f71b27803de1f6389.tar.gz
spark-2cc212d56a1d50fe68d5816f71b27803de1f6389.tar.bz2
spark-2cc212d56a1d50fe68d5816f71b27803de1f6389.zip
2 files changed, 55 insertions, 4 deletions
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
index b48f190f59..d272a42c85 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.clustering;
 
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.Arrays;
 
 import scala.Tuple2;
 
@@ -59,7 +60,10 @@ public class JavaLDASuite implements Serializable {
 
   @Test
   public void localLDAModel() {
-    LocalLDAModel model = new LocalLDAModel(LDASuite$.MODULE$.tinyTopics());
+    Matrix topics = LDASuite$.MODULE$.tinyTopics();
+    double[] topicConcentration = new double[topics.numRows()];
+    Arrays.fill(topicConcentration, 1.0D / topics.numRows());
+    LocalLDAModel model = new LocalLDAModel(topics, Vectors.dense(topicConcentration), 1D, 100D);
 
     // Check: basic parameters
     assertEquals(model.k(), tinyK);
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 376a87f051..aa36336ebb 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.clustering
 
-import breeze.linalg.{DenseMatrix => BDM}
+import breeze.linalg.{DenseMatrix => BDM, max, argmax}
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.graphx.Edge
@@ -31,7 +31,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
   import LDASuite._
 
   test("LocalLDAModel") {
-    val model = new LocalLDAModel(tinyTopics)
+    val model = new LocalLDAModel(tinyTopics,
+      Vectors.dense(Array.fill(tinyTopics.numRows)(1.0 / tinyTopics.numRows)), 1D, 100D)
 
     // Check: basic parameters
     assert(model.k === tinyK)
@@ -235,6 +236,51 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("LocalLDAModel logPerplexity") {
+    val k = 2
+    val vocabSize = 6
+    val alpha = 0.01
+    val eta = 0.01
+    val gammaShape = 100
+    val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
+      1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
+      0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
+
+    def toydata: Array[(Long, Vector)] = Array(
+      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
+      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
+      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
+      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
+      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
+      Vectors.sparse(6, Array(4, 5), Array(1, 1))
+    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
+    val docs = sc.parallelize(toydata)
+
+
+    val ldaModel: LocalLDAModel = new LocalLDAModel(
+      topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
+
+    /* Verify results using gensim:
+       import numpy as np
+       from gensim import models
+       corpus = [
+          [(0, 1.0), (1, 1.0)],
+          [(1, 1.0), (2, 1.0)],
+          [(0, 1.0), (2, 1.0)],
+          [(3, 1.0), (4, 1.0)],
+          [(3, 1.0), (5, 1.0)],
+          [(4, 1.0), (5, 1.0)]]
+       np.random.seed(2345)
+       lda = models.ldamodel.LdaModel(
+          corpus=corpus, alpha=0.01, eta=0.01, num_topics=2, update_every=0, passes=100,
+          decay=0.51, offset=1024)
+       print(lda.log_perplexity(corpus))
+       > -3.69051285096
+     */
+
+    assert(ldaModel.logPerplexity(docs) ~== -3.690D relTol 1E-3D)
+  }
+
   test("OnlineLDAOptimizer with asymmetric prior") {
     def toydata: Array[(Long, Vector)] = Array(
       Vectors.sparse(6, Array(0, 1), Array(1, 1)),
@@ -287,7 +333,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("model save/load") {
     // Test for LocalLDAModel.
-    val localModel = new LocalLDAModel(tinyTopics)
+    val localModel = new LocalLDAModel(tinyTopics,
+      Vectors.dense(Array.fill(tinyTopics.numRows)(1.0 / tinyTopics.numRows)), 1D, 100D)
     val tempDir1 = Utils.createTempDir()
     val path1 = tempDir1.toURI.toString
author	Feynman Liang <fliang@databricks.com>	2015-07-29 16:20:20 -0700
committer	Joseph K. Bradley <joseph@databricks.com>	2015-07-29 16:20:20 -0700
commit	2cc212d56a1d50fe68d5816f71b27803de1f6389 (patch)
tree	57246612d422ea82e592c4079fe43f16e9bdfb84 /mllib/src/test
parent	1b0099fc62d02ff6216a76fbfe17a4ec5b2f3536 (diff)
download	spark-2cc212d56a1d50fe68d5816f71b27803de1f6389.tar.gz spark-2cc212d56a1d50fe68d5816f71b27803de1f6389.tar.bz2 spark-2cc212d56a1d50fe68d5816f71b27803de1f6389.zip