diff options
author | Pravin Gadakh <prgadakh@in.ibm.com> | 2016-04-15 13:08:30 +0100 |
---|---|---|
committer | Sean Owen <sowen@cloudera.com> | 2016-04-15 13:08:30 +0100 |
commit | e24923267f79e7fc03180095fcbb28a91f998f5d (patch) | |
tree | 3895aca19db8640bae1bf269f8a7597672bd47ab /mllib | |
parent | 96534aa47c39e0ec40bc38be566455d11e21adb2 (diff) | |
download | spark-e24923267f79e7fc03180095fcbb28a91f998f5d.tar.gz spark-e24923267f79e7fc03180095fcbb28a91f998f5d.tar.bz2 spark-e24923267f79e7fc03180095fcbb28a91f998f5d.zip |
[SPARK-14370][MLLIB] Removed duplicate generation of ids in OnlineLDAOptimizer
## What changes were proposed in this pull request?
Removed duplicated generation of `ids` in OnlineLDAOptimizer.
## How was this patch tested?
Tested with existing unit tests.
Author: Pravin Gadakh <prgadakh@in.ibm.com>
Closes #12176 from pravingadakh/SPARK-14370.
Diffstat (limited to 'mllib')
-rw-r--r-- | mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala | 8 | ||||
-rw-r--r-- | mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala | 13 |
2 files changed, 10 insertions, 11 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 27b4004927..4913c0287a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -303,7 +303,7 @@ class LocalLDAModel private[spark] ( documents.filter(_._2.numNonzeros > 0).map { case (id: Long, termCounts: Vector) => val localElogbeta = ElogbetaBc.value var docBound = 0.0D - val (gammad: BDV[Double], _) = OnlineLDAOptimizer.variationalTopicInference( + val (gammad: BDV[Double], _, _) = OnlineLDAOptimizer.variationalTopicInference( termCounts, exp(localElogbeta), brzAlpha, gammaShape, k) val Elogthetad: BDV[Double] = LDAUtils.dirichletExpectation(gammad) @@ -354,7 +354,7 @@ class LocalLDAModel private[spark] ( if (termCounts.numNonzeros == 0) { (id, Vectors.zeros(k)) } else { - val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference( + val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference( termCounts, expElogbetaBc.value, docConcentrationBrz, @@ -377,7 +377,7 @@ class LocalLDAModel private[spark] ( if (termCounts.numNonzeros == 0) { Vectors.zeros(k) } else { - val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference( + val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference( termCounts, expElogbetaBc.value, docConcentrationBrz, @@ -403,7 +403,7 @@ class LocalLDAModel private[spark] ( if (document.numNonzeros == 0) { Vectors.zeros(this.k) } else { - val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference( + val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference( document, expElogbeta, this.docConcentration.toBreeze, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 6418f0d3b3..1b3e2f600d 100644 --- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -466,11 +466,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { val stat = BDM.zeros[Double](k, vocabSize) var gammaPart = List[BDV[Double]]() nonEmptyDocs.foreach { case (_, termCounts: Vector) => - val ids: List[Int] = termCounts match { - case v: DenseVector => (0 until v.size).toList - case v: SparseVector => v.indices.toList - } - val (gammad, sstats) = OnlineLDAOptimizer.variationalTopicInference( + val (gammad, sstats, ids) = OnlineLDAOptimizer.variationalTopicInference( termCounts, expElogbetaBc.value, alpha, gammaShape, k) stat(::, ids) := stat(::, ids).toDenseMatrix + sstats gammaPart = gammad :: gammaPart @@ -563,13 +559,16 @@ private[clustering] object OnlineLDAOptimizer { * An optimization (Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001) * avoids explicit computation of variational parameter `phi`. * @see [[http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.7566]] + * + * @return Returns a tuple of `gammad` - estimate of gamma, the topic distribution, `sstatsd` - + * statistics for updating lambda and `ids` - list of termCounts vector indices. */ private[clustering] def variationalTopicInference( termCounts: Vector, expElogbeta: BDM[Double], alpha: breeze.linalg.Vector[Double], gammaShape: Double, - k: Int): (BDV[Double], BDM[Double]) = { + k: Int): (BDV[Double], BDM[Double], List[Int]) = { val (ids: List[Int], cts: Array[Double]) = termCounts match { case v: DenseVector => ((0 until v.size).toList, v.values) case v: SparseVector => (v.indices.toList, v.values) @@ -596,6 +595,6 @@ private[clustering] object OnlineLDAOptimizer { } val sstatsd = expElogthetad.asDenseMatrix.t * (ctsVector :/ phiNorm).asDenseMatrix - (gammad, sstatsd) + (gammad, sstatsd, ids) } } |