aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/main
diff options
context:
space:
mode:
authorFeynman Liang <fliang@databricks.com>2015-07-31 12:12:22 -0700
committerJoseph K. Bradley <joseph@databricks.com>2015-07-31 12:12:22 -0700
commita8340fa7df17e3f0a3658f8b8045ab840845a72a (patch)
tree25ba59f08c4976cbf2d640cea9ff1888b3030a16 /mllib/src/main
parentd04634701413410938a133358fe1d9fbc077645e (diff)
downloadspark-a8340fa7df17e3f0a3658f8b8045ab840845a72a.tar.gz
spark-a8340fa7df17e3f0a3658f8b8045ab840845a72a.tar.bz2
spark-a8340fa7df17e3f0a3658f8b8045ab840845a72a.zip
[SPARK-9481] Add logLikelihood to LocalLDAModel
jkbradley Exposes `bound` (variational log likelihood bound) through public API as `logLikelihood`. Also adds unit tests, some DRYing of `LDASuite`, and includes unit tests mentioned in #7760 Author: Feynman Liang <fliang@databricks.com> Closes #7801 from feynmanliang/SPARK-9481-logLikelihood and squashes the following commits: 6d1b2c9 [Feynman Liang] Negate perplexity definition 5f62b20 [Feynman Liang] Add logLikelihood
Diffstat (limited to 'mllib/src/main')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala20
1 file changed, 13 insertions, 7 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 82281a0daf..ff7035d224 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -217,22 +217,28 @@ class LocalLDAModel private[clustering] (
LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration,
gammaShape)
}
- // TODO
- // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
+
+ // TODO: declare in LDAModel and override once implemented in DistributedLDAModel
+ /**
+ * Calculates a lower bound on the log likelihood of the entire corpus.
+ * @param documents test corpus to use for calculating log likelihood
+ * @return variational lower bound on the log likelihood of the entire corpus
+ */
+ def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents,
+ docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k,
+ vocabSize)
/**
- * Calculate the log variational bound on perplexity. See Equation (16) in original Online
+ * Calculate an upper bound on perplexity. See Equation (16) in original Online
* LDA paper.
* @param documents test corpus to use for calculating perplexity
- * @return the log perplexity per word
+ * @return variational upper bound on log perplexity per word
*/
def logPerplexity(documents: RDD[(Long, Vector)]): Double = {
val corpusWords = documents
.map { case (_, termCounts) => termCounts.toArray.sum }
.sum()
- val batchVariationalBound = bound(documents, docConcentration,
- topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize)
- val perWordBound = batchVariationalBound / corpusWords
+ val perWordBound = -logLikelihood(documents) / corpusWords
perWordBound
}