aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
author: Sean Owen <sowen@cloudera.com>, 2016-01-12 11:50:33 +0000
committer: Sean Owen <sowen@cloudera.com>, 2016-01-12 11:50:33 +0000
commit: c48f2a3a5fd714ad2ff19b29337e55583988431e (patch)
tree: 5e833677cc0e0f9a6d6f97efeac26823a01fd1b7 /mllib
parent: 8cfa218f4f1b05f4d076ec15dd0a033ad3e4500d (diff)
download: spark-c48f2a3a5fd714ad2ff19b29337e55583988431e.tar.gz
spark-c48f2a3a5fd714ad2ff19b29337e55583988431e.tar.bz2
spark-c48f2a3a5fd714ad2ff19b29337e55583988431e.zip
[SPARK-7615][MLLIB] MLLIB Word2Vec wordVectors divided by Euclidean Norm equals to zero
Cosine similarity with 0 vector should be 0

Related to https://github.com/apache/spark/pull/10152

Author: Sean Owen <sowen@cloudera.com>

Closes #10696 from srowen/SPARK-7615.
Diffstat (limited to 'mllib')
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala 7
1 files changed, 6 insertions, 1 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index dc5d070890..dee898827f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -543,7 +543,12 @@ class Word2VecModel private[spark] (
val cosVec = cosineVec.map(_.toDouble)
var ind = 0
while (ind < numWords) {
- cosVec(ind) /= wordVecNorms(ind)
+ val norm = wordVecNorms(ind)
+ if (norm == 0.0) {
+ cosVec(ind) = 0.0
+ } else {
+ cosVec(ind) /= norm
+ }
ind += 1
}
wordList.zip(cosVec)