aboutsummaryrefslogtreecommitdiff
path: root/mllib/src
diff options
context:
space:
mode:
authorAsher Krim <krim.asher@gmail.com>2017-03-14 13:08:11 +0000
committerSean Owen <sowen@cloudera.com>2017-03-14 13:08:11 +0000
commit5e96a57b2f383d4b33735681b41cd3ec06570671 (patch)
tree37cb10490d80cb580a9f4bcd99b285fe1203e7c5 /mllib/src
parent1c7275efa7bfaaa92719750e93a7b35cbcb48e45 (diff)
downloadspark-5e96a57b2f383d4b33735681b41cd3ec06570671.tar.gz
spark-5e96a57b2f383d4b33735681b41cd3ec06570671.tar.bz2
spark-5e96a57b2f383d4b33735681b41cd3ec06570671.zip
[SPARK-19922][ML] small speedups to findSynonyms
Currently generating synonyms using a large model (I've tested with 3m words) is very slow. These efficiencies have sped things up for us by ~17%. I wasn't sure if such small changes were worthy of a jira, but the guidelines seemed to suggest that that is the preferred approach.

## What changes were proposed in this pull request?

Address a few small issues in the findSynonyms logic:

1) remove usage of ``Array.fill`` to zero out the ``cosineVec`` array. The default float value in Scala and Java is 0.0f, so explicitly setting the values to zero is not needed
2) use Floats throughout. The conversion to Doubles before doing the ``priorityQueue`` is totally superfluous, since all the similarity computations are done using Floats anyway. Creating a second large array just serves to put extra strain on the GC
3) convert the slow ``for(i <- cosVec.indices)`` to an ugly, but faster, ``while`` loop

These efficiencies are really only apparent when working with a large model

## How was this patch tested?

Existing unit tests + some in-house tests to time the difference

cc jkbradley MLNick srowen

Author: Asher Krim <krim.asher@gmail.com>
Author: Asher Krim <krim.asher@gmail>

Closes #17263 from Krimit/fasterFindSynonyms.
Diffstat (limited to 'mllib/src')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala34
1 files changed, 19 insertions, 15 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 531c8b0791..6f96813497 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -491,8 +491,8 @@ class Word2VecModel private[spark] (
// wordVecNorms: Array of length numWords, each value being the Euclidean norm
// of the wordVector.
- private val wordVecNorms: Array[Double] = {
- val wordVecNorms = new Array[Double](numWords)
+ private val wordVecNorms: Array[Float] = {
+ val wordVecNorms = new Array[Float](numWords)
var i = 0
while (i < numWords) {
val vec = wordVectors.slice(i * vectorSize, i * vectorSize + vectorSize)
@@ -570,7 +570,7 @@ class Word2VecModel private[spark] (
require(num > 0, "Number of similar words should > 0")
val fVector = vector.toArray.map(_.toFloat)
- val cosineVec = Array.fill[Float](numWords)(0)
+ val cosineVec = new Array[Float](numWords)
val alpha: Float = 1
val beta: Float = 0
// Normalize input vector before blas.sgemv to avoid Inf value
@@ -581,22 +581,23 @@ class Word2VecModel private[spark] (
blas.sgemv(
"T", vectorSize, numWords, alpha, wordVectors, vectorSize, fVector, 1, beta, cosineVec, 1)
- val cosVec = cosineVec.map(_.toDouble)
- var ind = 0
- while (ind < numWords) {
- val norm = wordVecNorms(ind)
- if (norm == 0.0) {
- cosVec(ind) = 0.0
+ var i = 0
+ while (i < numWords) {
+ val norm = wordVecNorms(i)
+ if (norm == 0.0f) {
+ cosineVec(i) = 0.0f
} else {
- cosVec(ind) /= norm
+ cosineVec(i) /= norm
}
- ind += 1
+ i += 1
}
- val pq = new BoundedPriorityQueue[(String, Double)](num + 1)(Ordering.by(_._2))
+ val pq = new BoundedPriorityQueue[(String, Float)](num + 1)(Ordering.by(_._2))
- for(i <- cosVec.indices) {
- pq += Tuple2(wordList(i), cosVec(i))
+ var j = 0
+ while (j < numWords) {
+ pq += Tuple2(wordList(j), cosineVec(j))
+ j += 1
}
val scored = pq.toSeq.sortBy(-_._2)
@@ -606,7 +607,10 @@ class Word2VecModel private[spark] (
case None => scored
}
- filtered.take(num).toArray
+ filtered
+ .take(num)
+ .map { case (word, score) => (word, score.toDouble) }
+ .toArray
}
/**