aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/test
diff options
context:
space:
mode:
authorFeynman Liang <fliang@databricks.com>2015-07-29 16:20:20 -0700
committerJoseph K. Bradley <joseph@databricks.com>2015-07-29 16:20:20 -0700
commit2cc212d56a1d50fe68d5816f71b27803de1f6389 (patch)
tree57246612d422ea82e592c4079fe43f16e9bdfb84 /mllib/src/test
parent1b0099fc62d02ff6216a76fbfe17a4ec5b2f3536 (diff)
downloadspark-2cc212d56a1d50fe68d5816f71b27803de1f6389.tar.gz
spark-2cc212d56a1d50fe68d5816f71b27803de1f6389.tar.bz2
spark-2cc212d56a1d50fe68d5816f71b27803de1f6389.zip
[SPARK-6793] [MLLIB] OnlineLDAOptimizer LDA perplexity
Implements `logPerplexity` in `OnlineLDAOptimizer`. Also refactors inference code into companion object to enable future reuse (e.g. `predict` method). Author: Feynman Liang <fliang@databricks.com> Closes #7705 from feynmanliang/SPARK-6793-perplexity and squashes the following commits: 6da2c99 [Feynman Liang] Remove get* from LDAModel public API 8381da6 [Feynman Liang] Code review comments 17f7000 [Feynman Liang] Documentation typo fixes 2f452a4 [Feynman Liang] Remove auxiliary DistributedLDAModel constructor a275914 [Feynman Liang] Prevent empty counts calls to variationalInference 06d02d9 [Feynman Liang] Remove deprecated LocalLDAModel constructor afecb46 [Feynman Liang] Fix regression bug in sstats accumulator 5a327a0 [Feynman Liang] Code review quick fixes 998c03e [Feynman Liang] Fix style 1cbb67d [Feynman Liang] Fix access modifier bug 4362daa [Feynman Liang] Organize imports 4f171f7 [Feynman Liang] Fix indentation 2f049ce [Feynman Liang] Fix failing save/load tests 7415e96 [Feynman Liang] Pick changes from big PR 11e7c33 [Feynman Liang] Merge remote-tracking branch 'apache/master' into SPARK-6793-perplexity f8adc48 [Feynman Liang] Add logPerplexity, refactor variationalBound into a method cd521d6 [Feynman Liang] Refactor methods into companion class 7f62a55 [Feynman Liang] --amend c62cb1e [Feynman Liang] Outer product for stats, revert Range slicing aead650 [Feynman Liang] Range slice, in-place update, reduce transposes
Diffstat (limited to 'mllib/src/test')
-rw-r--r--mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java6
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala53
2 files changed, 55 insertions, 4 deletions
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
index b48f190f59..d272a42c85 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.clustering;
import java.io.Serializable;
import java.util.ArrayList;
+import java.util.Arrays;
import scala.Tuple2;
@@ -59,7 +60,10 @@ public class JavaLDASuite implements Serializable {
@Test
public void localLDAModel() {
- LocalLDAModel model = new LocalLDAModel(LDASuite$.MODULE$.tinyTopics());
+ Matrix topics = LDASuite$.MODULE$.tinyTopics();
+ double[] topicConcentration = new double[topics.numRows()];
+ Arrays.fill(topicConcentration, 1.0D / topics.numRows());
+ LocalLDAModel model = new LocalLDAModel(topics, Vectors.dense(topicConcentration), 1D, 100D);
// Check: basic parameters
assertEquals(model.k(), tinyK);
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 376a87f051..aa36336ebb 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -17,7 +17,7 @@
package org.apache.spark.mllib.clustering
-import breeze.linalg.{DenseMatrix => BDM}
+import breeze.linalg.{DenseMatrix => BDM, max, argmax}
import org.apache.spark.SparkFunSuite
import org.apache.spark.graphx.Edge
@@ -31,7 +31,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
import LDASuite._
test("LocalLDAModel") {
- val model = new LocalLDAModel(tinyTopics)
+ val model = new LocalLDAModel(tinyTopics,
+ Vectors.dense(Array.fill(tinyTopics.numRows)(1.0 / tinyTopics.numRows)), 1D, 100D)
// Check: basic parameters
assert(model.k === tinyK)
@@ -235,6 +236,51 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
}
}
+ test("LocalLDAModel logPerplexity") {
+ val k = 2
+ val vocabSize = 6
+ val alpha = 0.01
+ val eta = 0.01
+ val gammaShape = 100
+ val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
+ 1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
+ 0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
+
+ def toydata: Array[(Long, Vector)] = Array(
+ Vectors.sparse(6, Array(0, 1), Array(1, 1)),
+ Vectors.sparse(6, Array(1, 2), Array(1, 1)),
+ Vectors.sparse(6, Array(0, 2), Array(1, 1)),
+ Vectors.sparse(6, Array(3, 4), Array(1, 1)),
+ Vectors.sparse(6, Array(3, 5), Array(1, 1)),
+ Vectors.sparse(6, Array(4, 5), Array(1, 1))
+ ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
+ val docs = sc.parallelize(toydata)
+
+
+ val ldaModel: LocalLDAModel = new LocalLDAModel(
+ topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
+
+ /* Verify results using gensim:
+ import numpy as np
+ from gensim import models
+ corpus = [
+ [(0, 1.0), (1, 1.0)],
+ [(1, 1.0), (2, 1.0)],
+ [(0, 1.0), (2, 1.0)],
+ [(3, 1.0), (4, 1.0)],
+ [(3, 1.0), (5, 1.0)],
+ [(4, 1.0), (5, 1.0)]]
+ np.random.seed(2345)
+ lda = models.ldamodel.LdaModel(
+ corpus=corpus, alpha=0.01, eta=0.01, num_topics=2, update_every=0, passes=100,
+ decay=0.51, offset=1024)
+ print(lda.log_perplexity(corpus))
+ > -3.69051285096
+ */
+
+ assert(ldaModel.logPerplexity(docs) ~== -3.690D relTol 1E-3D)
+ }
+
test("OnlineLDAOptimizer with asymmetric prior") {
def toydata: Array[(Long, Vector)] = Array(
Vectors.sparse(6, Array(0, 1), Array(1, 1)),
@@ -287,7 +333,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
test("model save/load") {
// Test for LocalLDAModel.
- val localModel = new LocalLDAModel(tinyTopics)
+ val localModel = new LocalLDAModel(tinyTopics,
+ Vectors.dense(Array.fill(tinyTopics.numRows)(1.0 / tinyTopics.numRows)), 1D, 100D)
val tempDir1 = Utils.createTempDir()
val path1 = tempDir1.toURI.toString