diff options
author | Feynman Liang <fliang@databricks.com> | 2015-07-30 13:17:54 -0700 |
---|---|---|
committer | Joseph K. Bradley <joseph@databricks.com> | 2015-07-30 13:17:54 -0700 |
commit | d8cfd531c7c50c9b00ab546be458f44f84c386ae (patch) | |
tree | f7af8958a3c038ffedc094695eb20c5c41322262 /mllib/src/test | |
parent | a20e743fb863de809863652931bc982aac2d1f86 (diff) | |
download | spark-d8cfd531c7c50c9b00ab546be458f44f84c386ae.tar.gz spark-d8cfd531c7c50c9b00ab546be458f44f84c386ae.tar.bz2 spark-d8cfd531c7c50c9b00ab546be458f44f84c386ae.zip |
[SPARK-5567] [MLLIB] Add predict method to LocalLDAModel
jkbradley hhbyyh
Adds `topicDistributions` to LocalLDAModel. Please review after #7757 is merged.
Author: Feynman Liang <fliang@databricks.com>
Closes #7760 from feynmanliang/SPARK-5567-predict-in-LDA and squashes the following commits:
0ad1134 [Feynman Liang] Remove println
27b3877 [Feynman Liang] Code review fixes
6bfb87c [Feynman Liang] Remove extra newline
476f788 [Feynman Liang] Fix checks and doc for variationalInference
061780c [Feynman Liang] Code review cleanup
3be2947 [Feynman Liang] Rename topicDistribution -> topicDistributions
2a821a6 [Feynman Liang] Add predict methods to LocalLDAModel
Diffstat (limited to 'mllib/src/test')
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala | 63 |
1 files changed, 63 insertions, 0 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 61d2edfd9f..d74482d3a7 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -242,6 +242,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { val alpha = 0.01 val eta = 0.01 val gammaShape = 100 + // obtained from LDA model trained in gensim, see below val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array( 1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597, 0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124)) @@ -281,6 +282,68 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { assert(ldaModel.logPerplexity(docs) ~== -3.690D relTol 1E-3D) } + test("LocalLDAModel predict") { + val k = 2 + val vocabSize = 6 + val alpha = 0.01 + val eta = 0.01 + val gammaShape = 100 + // obtained from LDA model trained in gensim, see below + val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array( + 1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597, + 0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124)) + + def toydata: Array[(Long, Vector)] = Array( + Vectors.sparse(6, Array(0, 1), Array(1, 1)), + Vectors.sparse(6, Array(1, 2), Array(1, 1)), + Vectors.sparse(6, Array(0, 2), Array(1, 1)), + Vectors.sparse(6, Array(3, 4), Array(1, 1)), + Vectors.sparse(6, Array(3, 5), Array(1, 1)), + Vectors.sparse(6, Array(4, 5), Array(1, 1)) + ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) } + val docs = sc.parallelize(toydata) + + val ldaModel: LocalLDAModel = new LocalLDAModel( + topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape) + + /* Verify results using gensim: + import numpy as np + from gensim import models + corpus = [ + [(0, 1.0), (1, 1.0)], + [(1, 1.0), (2, 1.0)], + [(0, 1.0), (2, 1.0)], + [(3, 1.0), (4, 1.0)], + [(3, 1.0), (5, 1.0)], + [(4, 1.0), (5, 1.0)]] + np.random.seed(2345) + lda = models.ldamodel.LdaModel( + corpus=corpus, alpha=0.01, eta=0.01, num_topics=2, update_every=0, passes=100, + decay=0.51, offset=1024) + print(list(lda.get_document_topics(corpus))) + > [[(0, 0.99504950495049516)], [(0, 0.99504950495049516)], + > [(0, 0.99504950495049516)], [(1, 0.99504950495049516)], + > [(1, 0.99504950495049516)], [(1, 0.99504950495049516)]] + */ + + val expectedPredictions = List( + (0, 0.99504), (0, 0.99504), + (0, 0.99504), (1, 0.99504), + (1, 0.99504), (1, 0.99504)) + + val actualPredictions = ldaModel.topicDistributions(docs).map { case (id, topics) => + // convert results to expectedPredictions format, which only has highest probability topic + val topicsBz = topics.toBreeze.toDenseVector + (id, (argmax(topicsBz), max(topicsBz))) + }.sortByKey() + .values + .collect() + + expectedPredictions.zip(actualPredictions).forall { case (expected, actual) => + expected._1 === actual._1 && (expected._2 ~== actual._2 relTol 1E-3D) + } + } + test("OnlineLDAOptimizer with asymmetric prior") { def toydata: Array[(Long, Vector)] = Array( Vectors.sparse(6, Array(0, 1), Array(1, 1)), |