diff options
author | Feynman Liang <fliang@databricks.com> | 2015-07-31 12:12:22 -0700 |
---|---|---|
committer | Joseph K. Bradley <joseph@databricks.com> | 2015-07-31 12:12:22 -0700 |
commit | a8340fa7df17e3f0a3658f8b8045ab840845a72a (patch) | |
tree | 25ba59f08c4976cbf2d640cea9ff1888b3030a16 /mllib/src/main/scala | |
parent | d04634701413410938a133358fe1d9fbc077645e (diff) | |
download | spark-a8340fa7df17e3f0a3658f8b8045ab840845a72a.tar.gz spark-a8340fa7df17e3f0a3658f8b8045ab840845a72a.tar.bz2 spark-a8340fa7df17e3f0a3658f8b8045ab840845a72a.zip |
[SPARK-9481] Add logLikelihood to LocalLDAModel
@jkbradley Exposes `bound` (the variational lower bound on the log likelihood) through the public API as `logLikelihood`. Also adds unit tests, DRYs up `LDASuite`, and includes the unit tests mentioned in #7760.
Author: Feynman Liang <fliang@databricks.com>
Closes #7801 from feynmanliang/SPARK-9481-logLikelihood and squashes the following commits:
6d1b2c9 [Feynman Liang] Negate perplexity definition
5f62b20 [Feynman Liang] Add logLikelihood
Diffstat (limited to 'mllib/src/main/scala')
-rw-r--r-- | mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala | 20 |
1 file changed, 13 insertions, 7 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 82281a0daf..ff7035d224 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -217,22 +217,28 @@ class LocalLDAModel private[clustering] ( LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration, gammaShape) } - // TODO - // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ??? + + // TODO: declare in LDAModel and override once implemented in DistributedLDAModel + /** + * Calculates a lower bound on the log likelihood of the entire corpus. + * @param documents test corpus to use for calculating log likelihood + * @return variational lower bound on the log likelihood of the entire corpus + */ + def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents, + docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, + vocabSize) /** - * Calculate the log variational bound on perplexity. See Equation (16) in original Online + * Calculate an upper bound on perplexity. See Equation (16) in original Online * LDA paper. * @param documents test corpus to use for calculating perplexity - * @return the log perplexity per word + * @return variational upper bound on log perplexity per word */ def logPerplexity(documents: RDD[(Long, Vector)]): Double = { val corpusWords = documents .map { case (_, termCounts) => termCounts.toArray.sum } .sum() - val batchVariationalBound = bound(documents, docConcentration, - topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize) - val perWordBound = batchVariationalBound / corpusWords + val perWordBound = -logLikelihood(documents) / corpusWords perWordBound } |