aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/main
diff options
context:
space:
mode:
Diffstat (limited to 'mllib/src/main')
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 20
1 files changed, 13 insertions, 7 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 82281a0daf..ff7035d224 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -217,22 +217,28 @@ class LocalLDAModel private[clustering] (
LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration,
gammaShape)
}
- // TODO
- // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
+
+ // TODO: declare in LDAModel and override once implemented in DistributedLDAModel
+ /**
+ * Calculates a lower bound on the log likelihood of the entire corpus.
+ * @param documents test corpus to use for calculating log likelihood
+ * @return variational lower bound on the log likelihood of the entire corpus
+ */
+ def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents,
+ docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k,
+ vocabSize)
/**
- * Calculate the log variational bound on perplexity. See Equation (16) in original Online
+ * Calculate an upper bound on perplexity. See Equation (16) in original Online
* LDA paper.
* @param documents test corpus to use for calculating perplexity
- * @return the log perplexity per word
+ * @return variational upper bound on log perplexity per word
*/
def logPerplexity(documents: RDD[(Long, Vector)]): Double = {
val corpusWords = documents
.map { case (_, termCounts) => termCounts.toArray.sum }
.sum()
- val batchVariationalBound = bound(documents, docConcentration,
- topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize)
- val perWordBound = batchVariationalBound / corpusWords
+ val perWordBound = -logLikelihood(documents) / corpusWords
perWordBound
}