aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorPravin Gadakh <prgadakh@in.ibm.com>2016-04-15 13:08:30 +0100
committerSean Owen <sowen@cloudera.com>2016-04-15 13:08:30 +0100
commite24923267f79e7fc03180095fcbb28a91f998f5d (patch)
tree3895aca19db8640bae1bf269f8a7597672bd47ab /mllib
parent96534aa47c39e0ec40bc38be566455d11e21adb2 (diff)
downloadspark-e24923267f79e7fc03180095fcbb28a91f998f5d.tar.gz
spark-e24923267f79e7fc03180095fcbb28a91f998f5d.tar.bz2
spark-e24923267f79e7fc03180095fcbb28a91f998f5d.zip
[SPARK-14370][MLLIB] removed duplicate generation of ids in OnlineLDAOptimizer
## What changes were proposed in this pull request? Removed duplicated generation of `ids` in OnlineLDAOptimizer. ## How was this patch tested? Tested with existing unit tests. Author: Pravin Gadakh <prgadakh@in.ibm.com> Closes #12176 from pravingadakh/SPARK-14370.
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala8
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala13
2 files changed, 10 insertions, 11 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 27b4004927..4913c0287a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -303,7 +303,7 @@ class LocalLDAModel private[spark] (
documents.filter(_._2.numNonzeros > 0).map { case (id: Long, termCounts: Vector) =>
val localElogbeta = ElogbetaBc.value
var docBound = 0.0D
- val (gammad: BDV[Double], _) = OnlineLDAOptimizer.variationalTopicInference(
+ val (gammad: BDV[Double], _, _) = OnlineLDAOptimizer.variationalTopicInference(
termCounts, exp(localElogbeta), brzAlpha, gammaShape, k)
val Elogthetad: BDV[Double] = LDAUtils.dirichletExpectation(gammad)
@@ -354,7 +354,7 @@ class LocalLDAModel private[spark] (
if (termCounts.numNonzeros == 0) {
(id, Vectors.zeros(k))
} else {
- val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+ val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference(
termCounts,
expElogbetaBc.value,
docConcentrationBrz,
@@ -377,7 +377,7 @@ class LocalLDAModel private[spark] (
if (termCounts.numNonzeros == 0) {
Vectors.zeros(k)
} else {
- val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+ val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference(
termCounts,
expElogbetaBc.value,
docConcentrationBrz,
@@ -403,7 +403,7 @@ class LocalLDAModel private[spark] (
if (document.numNonzeros == 0) {
Vectors.zeros(this.k)
} else {
- val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+ val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference(
document,
expElogbeta,
this.docConcentration.toBreeze,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 6418f0d3b3..1b3e2f600d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -466,11 +466,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
val stat = BDM.zeros[Double](k, vocabSize)
var gammaPart = List[BDV[Double]]()
nonEmptyDocs.foreach { case (_, termCounts: Vector) =>
- val ids: List[Int] = termCounts match {
- case v: DenseVector => (0 until v.size).toList
- case v: SparseVector => v.indices.toList
- }
- val (gammad, sstats) = OnlineLDAOptimizer.variationalTopicInference(
+ val (gammad, sstats, ids) = OnlineLDAOptimizer.variationalTopicInference(
termCounts, expElogbetaBc.value, alpha, gammaShape, k)
stat(::, ids) := stat(::, ids).toDenseMatrix + sstats
gammaPart = gammad :: gammaPart
@@ -563,13 +559,16 @@ private[clustering] object OnlineLDAOptimizer {
* An optimization (Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001)
* avoids explicit computation of variational parameter `phi`.
* @see [[http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.7566]]
+ *
+ * @return Returns a tuple of `gammad` - estimate of gamma, the topic distribution, `sstatsd` -
+ * statistics for updating lambda and `ids` - list of termCounts vector indices.
*/
private[clustering] def variationalTopicInference(
termCounts: Vector,
expElogbeta: BDM[Double],
alpha: breeze.linalg.Vector[Double],
gammaShape: Double,
- k: Int): (BDV[Double], BDM[Double]) = {
+ k: Int): (BDV[Double], BDM[Double], List[Int]) = {
val (ids: List[Int], cts: Array[Double]) = termCounts match {
case v: DenseVector => ((0 until v.size).toList, v.values)
case v: SparseVector => (v.indices.toList, v.values)
@@ -596,6 +595,6 @@ private[clustering] object OnlineLDAOptimizer {
}
val sstatsd = expElogthetad.asDenseMatrix.t * (ctsVector :/ phiNorm).asDenseMatrix
- (gammad, sstatsd)
+ (gammad, sstatsd, ids)
}
}