diff options
author | Feynman Liang <fliang@databricks.com> | 2015-07-29 16:20:20 -0700 |
---|---|---|
committer | Joseph K. Bradley <joseph@databricks.com> | 2015-07-29 16:20:20 -0700 |
commit | 2cc212d56a1d50fe68d5816f71b27803de1f6389 (patch) | |
tree | 57246612d422ea82e592c4079fe43f16e9bdfb84 /mllib/src/test | |
parent | 1b0099fc62d02ff6216a76fbfe17a4ec5b2f3536 (diff) | |
download | spark-2cc212d56a1d50fe68d5816f71b27803de1f6389.tar.gz spark-2cc212d56a1d50fe68d5816f71b27803de1f6389.tar.bz2 spark-2cc212d56a1d50fe68d5816f71b27803de1f6389.zip |
[SPARK-6793] [MLLIB] OnlineLDAOptimizer LDA perplexity
Implements `logPerplexity` in `OnlineLDAOptimizer`. Also refactors inference code into companion object to enable future reuse (e.g. `predict` method).
Author: Feynman Liang <fliang@databricks.com>
Closes #7705 from feynmanliang/SPARK-6793-perplexity and squashes the following commits:
6da2c99 [Feynman Liang] Remove get* from LDAModel public API
8381da6 [Feynman Liang] Code review comments
17f7000 [Feynman Liang] Documentation typo fixes
2f452a4 [Feynman Liang] Remove auxillary DistributedLDAModel constructor
a275914 [Feynman Liang] Prevent empty counts calls to variationalInference
06d02d9 [Feynman Liang] Remove deprecated LocalLDAModel constructor
afecb46 [Feynman Liang] Fix regression bug in sstats accumulator
5a327a0 [Feynman Liang] Code review quick fixes
998c03e [Feynman Liang] Fix style
1cbb67d [Feynman Liang] Fix access modifier bug
4362daa [Feynman Liang] Organize imports
4f171f7 [Feynman Liang] Fix indendation
2f049ce [Feynman Liang] Fix failing save/load tests
7415e96 [Feynman Liang] Pick changes from big PR
11e7c33 [Feynman Liang] Merge remote-tracking branch 'apache/master' into SPARK-6793-perplexity
f8adc48 [Feynman Liang] Add logPerplexity, refactor variationalBound into a method
cd521d6 [Feynman Liang] Refactor methods into companion class
7f62a55 [Feynman Liang] --amend
c62cb1e [Feynman Liang] Outer product for stats, revert Range slicing
aead650 [Feynman Liang] Range slice, in-place update, reduce transposes
Diffstat (limited to 'mllib/src/test')
-rw-r--r-- | mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java | 6 | ||||
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala | 53 |
2 files changed, 55 insertions, 4 deletions
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java index b48f190f59..d272a42c85 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java @@ -19,6 +19,7 @@ package org.apache.spark.mllib.clustering; import java.io.Serializable; import java.util.ArrayList; +import java.util.Arrays; import scala.Tuple2; @@ -59,7 +60,10 @@ public class JavaLDASuite implements Serializable { @Test public void localLDAModel() { - LocalLDAModel model = new LocalLDAModel(LDASuite$.MODULE$.tinyTopics()); + Matrix topics = LDASuite$.MODULE$.tinyTopics(); + double[] topicConcentration = new double[topics.numRows()]; + Arrays.fill(topicConcentration, 1.0D / topics.numRows()); + LocalLDAModel model = new LocalLDAModel(topics, Vectors.dense(topicConcentration), 1D, 100D); // Check: basic parameters assertEquals(model.k(), tinyK); diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 376a87f051..aa36336ebb 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{DenseMatrix => BDM} +import breeze.linalg.{DenseMatrix => BDM, max, argmax} import org.apache.spark.SparkFunSuite import org.apache.spark.graphx.Edge @@ -31,7 +31,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { import LDASuite._ test("LocalLDAModel") { - val model = new LocalLDAModel(tinyTopics) + val model = new LocalLDAModel(tinyTopics, + Vectors.dense(Array.fill(tinyTopics.numRows)(1.0 / tinyTopics.numRows)), 1D, 100D) // Check: basic parameters assert(model.k === tinyK) @@ -235,6 +236,51 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { } } + test("LocalLDAModel logPerplexity") { + val k = 2 + val vocabSize = 6 + val alpha = 0.01 + val eta = 0.01 + val gammaShape = 100 + val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array( + 1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597, + 0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124)) + + def toydata: Array[(Long, Vector)] = Array( + Vectors.sparse(6, Array(0, 1), Array(1, 1)), + Vectors.sparse(6, Array(1, 2), Array(1, 1)), + Vectors.sparse(6, Array(0, 2), Array(1, 1)), + Vectors.sparse(6, Array(3, 4), Array(1, 1)), + Vectors.sparse(6, Array(3, 5), Array(1, 1)), + Vectors.sparse(6, Array(4, 5), Array(1, 1)) + ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) } + val docs = sc.parallelize(toydata) + + + val ldaModel: LocalLDAModel = new LocalLDAModel( + topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape) + + /* Verify results using gensim: + import numpy as np + from gensim import models + corpus = [ + [(0, 1.0), (1, 1.0)], + [(1, 1.0), (2, 1.0)], + [(0, 1.0), (2, 1.0)], + [(3, 1.0), (4, 1.0)], + [(3, 1.0), (5, 1.0)], + [(4, 1.0), (5, 1.0)]] + np.random.seed(2345) + lda = models.ldamodel.LdaModel( + corpus=corpus, alpha=0.01, eta=0.01, num_topics=2, update_every=0, passes=100, + decay=0.51, offset=1024) + print(lda.log_perplexity(corpus)) + > -3.69051285096 + */ + + assert(ldaModel.logPerplexity(docs) ~== -3.690D relTol 1E-3D) + } + test("OnlineLDAOptimizer with asymmetric prior") { def toydata: Array[(Long, Vector)] = Array( Vectors.sparse(6, Array(0, 1), Array(1, 1)), @@ -287,7 +333,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { test("model save/load") { // Test for LocalLDAModel. - val localModel = new LocalLDAModel(tinyTopics) + val localModel = new LocalLDAModel(tinyTopics, + Vectors.dense(Array.fill(tinyTopics.numRows)(1.0 / tinyTopics.numRows)), 1D, 100D) val tempDir1 = Utils.createTempDir() val path1 = tempDir1.toURI.toString |