aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/main
diff options
context:
space:
mode:
authorFeynman Liang <fliang@databricks.com>2015-07-31 12:12:22 -0700
committerJoseph K. Bradley <joseph@databricks.com>2015-07-31 12:12:22 -0700
commita8340fa7df17e3f0a3658f8b8045ab840845a72a (patch)
tree25ba59f08c4976cbf2d640cea9ff1888b3030a16 /mllib/src/main
parentd04634701413410938a133358fe1d9fbc077645e (diff)
downloadspark-a8340fa7df17e3f0a3658f8b8045ab840845a72a.tar.gz
spark-a8340fa7df17e3f0a3658f8b8045ab840845a72a.tar.bz2
spark-a8340fa7df17e3f0a3658f8b8045ab840845a72a.zip
[SPARK-9481] Add logLikelihood to LocalLDAModel
jkbradley Exposes `bound` (variational log likelihood bound) through public API as `logLikelihood`. Also adds unit tests, some DRYing of `LDASuite`, and includes unit tests mentioned in #7760 Author: Feynman Liang <fliang@databricks.com> Closes #7801 from feynmanliang/SPARK-9481-logLikelihood and squashes the following commits: 6d1b2c9 [Feynman Liang] Negate perplexity definition 5f62b20 [Feynman Liang] Add logLikelihood
Diffstat (limited to 'mllib/src/main')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala20
1 file changed, 13 insertions, 7 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 82281a0daf..ff7035d224 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -217,22 +217,28 @@ class LocalLDAModel private[clustering] (
LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration,
gammaShape)
}
- // TODO
- // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
+
+ // TODO: declare in LDAModel and override once implemented in DistributedLDAModel
+ /**
+ * Calculates a lower bound on the log likelihood of the entire corpus.
+ * @param documents test corpus to use for calculating log likelihood
+ * @return variational lower bound on the log likelihood of the entire corpus
+ */
+ def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents,
+ docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k,
+ vocabSize)
/**
- * Calculate the log variational bound on perplexity. See Equation (16) in original Online
+ * Calculate an upper bound on perplexity. See Equation (16) in original Online
* LDA paper.
* @param documents test corpus to use for calculating perplexity
- * @return the log perplexity per word
+ * @return variational upper bound on log perplexity per word
*/
def logPerplexity(documents: RDD[(Long, Vector)]): Double = {
val corpusWords = documents
.map { case (_, termCounts) => termCounts.toArray.sum }
.sum()
- val batchVariationalBound = bound(documents, docConcentration,
- topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize)
- val perWordBound = batchVariationalBound / corpusWords
+ val perWordBound = -logLikelihood(documents) / corpusWords
perWordBound
}