author    Joseph K. Bradley <joseph@databricks.com>  2015-05-07 01:12:14 -0700
committer Joseph K. Bradley <joseph@databricks.com>  2015-05-07 01:12:14 -0700
commit    8b6b46e4ff5f19fb7befecaaa0eda63bf29a0e2c (patch)
tree      e2fa4b294374f9b463956f0cefca45a88da174f0 /mllib
parent    fae4e2d6094de57a438ee4188ce47fc5b01b96fe (diff)
[SPARK-7421] [MLLIB] OnlineLDA cleanups
Small changes, primarily to allow us more flexibility in the future:

* Rename "tau_0" to "tau0".
* Mark LDAOptimizer trait sealed and DeveloperApi.
* Mark LDAOptimizer subclasses as final.
* Mark setOptimizer (the one taking an LDAOptimizer) and getOptimizer as
  DeveloperApi, since we may need to change them in the future.

CC: hhbyyh

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #5956 from jkbradley/onlinelda-cleanups and squashes the following commits:

f4be508 [Joseph K. Bradley] added newline
f4003e4 [Joseph K. Bradley] Changes: * Rename "tau_0" to "tau0" * Mark LDAOptimizer trait sealed and DeveloperApi. * Mark LDAOptimizer subclasses as final. * Mark setOptimizer (the one taking an LDAOptimizer) and getOptimizer as DeveloperApi since we may need to change them in the future
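For context, the user-facing effect of the rename is small but source-incompatible: callers that configured the online optimizer via setTau_0/getTau_0 must switch to setTau0/getTau0. A minimal before/after sketch, with the parameter values taken from the tests in this patch:

    import org.apache.spark.mllib.clustering.OnlineLDAOptimizer

    // Before this patch (no longer compiles):
    //   val op = new OnlineLDAOptimizer().setTau_0(1024).setKappa(0.51)

    // After this patch:
    val op = new OnlineLDAOptimizer().setTau0(1024).setKappa(0.51)
    assert(op.getTau0 == 1024)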
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala          | 15
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala | 37
-rw-r--r--  mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java   |  2
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala     |  8
4 files changed, 34 insertions(+), 28 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index c8daa2388e..a410547a72 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -18,8 +18,9 @@
package org.apache.spark.mllib.clustering
import breeze.linalg.{DenseVector => BDV}
+
import org.apache.spark.Logging
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{DeveloperApi, Experimental}
import org.apache.spark.api.java.JavaPairRDD
import org.apache.spark.graphx._
import org.apache.spark.mllib.linalg.Vector
@@ -197,12 +198,20 @@ class LDA private (
}
- /** LDAOptimizer used to perform the actual calculation */
+ /**
+ * :: DeveloperApi ::
+ *
+ * LDAOptimizer used to perform the actual calculation
+ */
+ @DeveloperApi
def getOptimizer: LDAOptimizer = ldaOptimizer
/**
+ * :: DeveloperApi ::
+ *
* LDAOptimizer used to perform the actual calculation (default = EMLDAOptimizer)
*/
+ @DeveloperApi
def setOptimizer(optimizer: LDAOptimizer): this.type = {
this.ldaOptimizer = optimizer
this
@@ -210,7 +219,7 @@ class LDA private (
/**
* Set the LDAOptimizer used to perform the actual calculation by algorithm name.
- * Currently "em", "online" is supported.
+ * Currently "em", "online" are supported.
*/
def setOptimizer(optimizerName: String): this.type = {
this.ldaOptimizer =
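Taken together with the doc fix in this hunk, the string-based overload accepts exactly the two names the corrected comment lists. A minimal sketch of selecting each optimizer by name (the setK value is illustrative):

    import org.apache.spark.mllib.clustering.LDA

    // "em" and "online" are the two supported names.
    val emLDA     = new LDA().setK(10).setOptimizer("em")
    val onlineLDA = new LDA().setK(10).setOptimizer("online")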
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 093aa0f315..6fa2fe053c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -23,7 +23,7 @@ import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM, sum, normalize, kr
import breeze.numerics.{digamma, exp, abs}
import breeze.stats.distributions.{Gamma, RandBasis}
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.graphx._
import org.apache.spark.graphx.impl.GraphImpl
import org.apache.spark.mllib.impl.PeriodicGraphCheckpointer
@@ -31,13 +31,13 @@ import org.apache.spark.mllib.linalg.{Matrices, SparseVector, DenseVector, Vecto
import org.apache.spark.rdd.RDD
/**
- * :: Experimental ::
+ * :: DeveloperApi ::
*
* An LDAOptimizer specifies which optimization/learning/inference algorithm to use, and it can
* hold optimizer-specific parameters for users to set.
*/
-@Experimental
-trait LDAOptimizer {
+@DeveloperApi
+sealed trait LDAOptimizer {
/*
DEVELOPERS NOTE:
@@ -59,7 +59,7 @@ trait LDAOptimizer {
}
/**
- * :: Experimental ::
+ * :: DeveloperApi ::
*
* Optimizer for EM algorithm which stores data + parameter graph, plus algorithm parameters.
*
@@ -75,8 +75,8 @@ trait LDAOptimizer {
* "On Smoothing and Inference for Topic Models." UAI, 2009.
*
*/
-@Experimental
-class EMLDAOptimizer extends LDAOptimizer {
+@DeveloperApi
+final class EMLDAOptimizer extends LDAOptimizer {
import LDA._
@@ -211,7 +211,7 @@ class EMLDAOptimizer extends LDAOptimizer {
/**
- * :: Experimental ::
+ * :: DeveloperApi ::
*
* An online optimizer for LDA. The Optimizer implements the Online variational Bayes LDA
* algorithm, which processes a subset of the corpus on each iteration, and updates the term-topic
@@ -220,8 +220,8 @@ class EMLDAOptimizer extends LDAOptimizer {
* Original Online LDA paper:
* Hoffman, Blei and Bach, "Online Learning for Latent Dirichlet Allocation." NIPS, 2010.
*/
-@Experimental
-class OnlineLDAOptimizer extends LDAOptimizer {
+@DeveloperApi
+final class OnlineLDAOptimizer extends LDAOptimizer {
// LDA common parameters
private var k: Int = 0
@@ -243,8 +243,8 @@ class OnlineLDAOptimizer extends LDAOptimizer {
private var randomGenerator: java.util.Random = null
// Online LDA specific parameters
- // Learning rate is: (tau_0 + t)^{-kappa}
- private var tau_0: Double = 1024
+ // Learning rate is: (tau0 + t)^{-kappa}
+ private var tau0: Double = 1024
private var kappa: Double = 0.51
private var miniBatchFraction: Double = 0.05
@@ -265,16 +265,16 @@ class OnlineLDAOptimizer extends LDAOptimizer {
* A (positive) learning parameter that downweights early iterations. Larger values make early
* iterations count less.
*/
- def getTau_0: Double = this.tau_0
+ def getTau0: Double = this.tau0
/**
* A (positive) learning parameter that downweights early iterations. Larger values make early
* iterations count less.
* Default: 1024, following the original Online LDA paper.
*/
- def setTau_0(tau_0: Double): this.type = {
- require(tau_0 > 0, s"LDA tau_0 must be positive, but was set to $tau_0")
- this.tau_0 = tau_0
+ def setTau0(tau0: Double): this.type = {
+ require(tau0 > 0, s"LDA tau0 must be positive, but was set to $tau0")
+ this.tau0 = tau0
this
}
@@ -434,11 +434,8 @@ class OnlineLDAOptimizer extends LDAOptimizer {
* Update lambda based on the batch submitted. batchSize can be different for each iteration.
*/
private[clustering] def update(stat: BDM[Double], iter: Int, batchSize: Int): Unit = {
- val tau_0 = this.getTau_0
- val kappa = this.getKappa
-
// weight of the mini-batch.
- val weight = math.pow(tau_0 + iter, -kappa)
+ val weight = math.pow(getTau0 + iter, -getKappa)
// Update lambda based on documents.
lambda = lambda * (1 - weight) +
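The refactored update keeps the learning-rate schedule weight = (tau0 + t)^(-kappa) from the Hoffman et al. paper cited above; it just inlines the two getters. With the class defaults tau0 = 1024 and kappa = 0.51, the weight starts small and decays slowly. A standalone sketch of the schedule (the weight helper is hypothetical, not part of the patch):

    // Hypothetical helper reproducing the mini-batch weight computed in
    // OnlineLDAOptimizer.update: (tau0 + iter)^(-kappa), with class defaults.
    def weight(iter: Int, tau0: Double = 1024, kappa: Double = 0.51): Double =
      math.pow(tau0 + iter, -kappa)

    weight(1)      // ~0.029  -- early batches already count little
    weight(100)    // ~0.028
    weight(10000)  // ~0.0087 -- later batches count progressively less

Larger tau0 downweights early iterations further, which is exactly what the setTau0 doc comment above describes.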
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
index f394d90396..96c2da1699 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
@@ -117,7 +117,7 @@ public class JavaLDASuite implements Serializable {
// Train a model
OnlineLDAOptimizer op = new OnlineLDAOptimizer()
- .setTau_0(1024)
+ .setTau0(1024)
.setKappa(0.51)
.setGammaShape(1e40)
.setMiniBatchFraction(0.5);
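The Java-side change is mechanical, since only the setter name changes. The sealing of the trait is the part with compile-time teeth: new optimizers can now only be added inside LDAOptimizer.scala, and both implementations are final. A sketch of what user code can and cannot do after this patch (names here are illustrative, not part of the diff):

    import org.apache.spark.mllib.clustering.{EMLDAOptimizer, LDAOptimizer, OnlineLDAOptimizer}

    // No longer possible outside LDAOptimizer.scala, since the trait is sealed:
    //   class MyOptimizer extends LDAOptimizer   // compile error
    // Nor can the implementations be subclassed, since both are now final.

    // Matches over the sealed hierarchy are exhaustiveness-checked:
    def describe(opt: LDAOptimizer): String = opt match {
      case _: EMLDAOptimizer     => "EM over the data + parameter graph"
      case _: OnlineLDAOptimizer => "online variational Bayes"
    }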
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 2dcc881f5a..d5b7d96335 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -138,12 +138,12 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
val lda = new LDA().setK(2)
val corpus = sc.parallelize(tinyCorpus, 2)
val op = new OnlineLDAOptimizer().initialize(corpus, lda)
- op.setKappa(0.9876).setMiniBatchFraction(0.123).setTau_0(567)
+ op.setKappa(0.9876).setMiniBatchFraction(0.123).setTau0(567)
assert(op.getAlpha == 0.5) // default 1.0 / k
assert(op.getEta == 0.5) // default 1.0 / k
assert(op.getKappa == 0.9876)
assert(op.getMiniBatchFraction == 0.123)
- assert(op.getTau_0 == 567)
+ assert(op.getTau0 == 567)
}
test("OnlineLDAOptimizer one iteration") {
@@ -159,7 +159,7 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
val corpus = sc.parallelize(docs, 2)
// Set GammaShape large to avoid the stochastic impact.
- val op = new OnlineLDAOptimizer().setTau_0(1024).setKappa(0.51).setGammaShape(1e40)
+ val op = new OnlineLDAOptimizer().setTau0(1024).setKappa(0.51).setGammaShape(1e40)
.setMiniBatchFraction(1)
val lda = new LDA().setK(k).setMaxIterations(1).setOptimizer(op).setSeed(12345)
@@ -192,7 +192,7 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
val docs = sc.parallelize(toydata)
- val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau_0(1024).setKappa(0.51)
+ val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
.setGammaShape(1e10)
val lda = new LDA().setK(2)
.setDocConcentration(0.01)
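Putting the pieces together, a minimal end-to-end sketch of the renamed setter plus the now-DeveloperApi setOptimizer(LDAOptimizer) overload, assuming a corpus: RDD[(Long, Vector)] of (docId, termCounts) pairs as in the suites above:

    import org.apache.spark.mllib.clustering.{LDA, OnlineLDAOptimizer}

    val op = new OnlineLDAOptimizer()
      .setTau0(1024)
      .setKappa(0.51)
      .setMiniBatchFraction(0.05)

    val model = new LDA()
      .setK(2)
      .setOptimizer(op)   // the DeveloperApi overload annotated in this patch
      .run(corpus)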