[SPARK-8936] [MLLIB] OnlineLDA document-topic Dirichlet hyperparameter optimization

Adds `alpha` (document-topic Dirichlet parameter) hyperparameter optimization to `OnlineLDAOptimizer` following Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters. Also introduces a private `setSampleWithReplacement` to `OnlineLDAOptimizer` for unit testing purposes. Author: Feynman Liang <fliang@databricks.com> Closes #7836 from feynmanliang/SPARK-8936-alpha-optimize and squashes the following commits: 4bef484 [Feynman Liang] Documentation improvements c3c6c1d [Feynman Liang] Fix docs 151e859 [Feynman Liang] Fix style fa77518 [Feynman Liang] Hyperparameter optimization
author: Feynman Liang <fliang@databricks.com> 2015-07-31 18:36:22 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2015-07-31 18:36:22 -0700
commit: f51fd6fbb4d9822502f98b312251e317d757bc3a (patch)
tree: d69b292785aaeaf1537c2937c24653ee32e1c594 /mllib/src/test
parent: 4d5a6e7b60b315968973e2298eeee5eb174ec721 (diff)
download: spark-f51fd6fbb4d9822502f98b312251e317d757bc3a.tar.gz
spark-f51fd6fbb4d9822502f98b312251e317d757bc3a.tar.bz2
spark-f51fd6fbb4d9822502f98b312251e317d757bc3a.zip
1 files changed, 34 insertions, 0 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index f2b94707fd..fdc2554ab8 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -400,6 +400,40 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("OnlineLDAOptimizer alpha hyperparameter optimization") {
+    val k = 2
+    val docs = sc.parallelize(toyData)
+    val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
+      .setGammaShape(100).setOptimzeAlpha(true).setSampleWithReplacement(false)
+    val lda = new LDA().setK(k)
+      .setDocConcentration(1D / k)
+      .setTopicConcentration(0.01)
+      .setMaxIterations(100)
+      .setOptimizer(op)
+      .setSeed(12345)
+    val ldaModel: LocalLDAModel = lda.run(docs).asInstanceOf[LocalLDAModel]
+
+    /* Verify the results with gensim:
+      import numpy as np
+      from gensim import models
+      corpus = [
+       [(0, 1.0), (1, 1.0)],
+       [(1, 1.0), (2, 1.0)],
+       [(0, 1.0), (2, 1.0)],
+       [(3, 1.0), (4, 1.0)],
+       [(3, 1.0), (5, 1.0)],
+       [(4, 1.0), (5, 1.0)]]
+      np.random.seed(2345)
+      lda = models.ldamodel.LdaModel(
+         corpus=corpus, alpha='auto', eta=0.01, num_topics=2, update_every=0, passes=100,
+         decay=0.51, offset=1024)
+      print(lda.alpha)
+      > [ 0.42582646  0.43511073]
+     */
+
+    assert(ldaModel.docConcentration ~== Vectors.dense(0.42582646, 0.43511073) absTol 0.05)
+  }
+
   test("model save/load") {
     // Test for LocalLDAModel.
     val localModel = new LocalLDAModel(tinyTopics,
author	Feynman Liang <fliang@databricks.com>	2015-07-31 18:36:22 -0700
committer	Joseph K. Bradley <joseph@databricks.com>	2015-07-31 18:36:22 -0700
commit	f51fd6fbb4d9822502f98b312251e317d757bc3a (patch)
tree	d69b292785aaeaf1537c2937c24653ee32e1c594 /mllib/src/test
parent	4d5a6e7b60b315968973e2298eeee5eb174ec721 (diff)
download	spark-f51fd6fbb4d9822502f98b312251e317d757bc3a.tar.gz spark-f51fd6fbb4d9822502f98b312251e317d757bc3a.tar.bz2 spark-f51fd6fbb4d9822502f98b312251e317d757bc3a.zip