about summary refs log tree commit diff
path: root/python/pyspark/mllib/feature.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/pyspark/mllib/feature.py')
-rw-r--r-- python/pyspark/mllib/feature.py 12
1 files changed, 9 insertions, 3 deletions
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index b32d0c70ec..5d99644fca 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -544,8 +544,7 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader):
@ignore_unicode_prefix
class Word2Vec(object):
- """
- Word2Vec creates vector representation of words in a text corpus.
+ """Word2Vec creates vector representation of words in a text corpus.
The algorithm first constructs a vocabulary from the corpus
and then learns vector representation of words in the vocabulary.
The vector representation can be used as features in
@@ -567,13 +566,19 @@ class Word2Vec(object):
>>> doc = sc.parallelize(localDoc).map(lambda line: line.split(" "))
>>> model = Word2Vec().setVectorSize(10).setSeed(42).fit(doc)
+ Querying for synonyms of a word will not return that word:
+
>>> syms = model.findSynonyms("a", 2)
>>> [s[0] for s in syms]
[u'b', u'c']
+
+ But querying for synonyms of a vector may return the word whose
+ representation is that vector:
+
>>> vec = model.transform("a")
>>> syms = model.findSynonyms(vec, 2)
>>> [s[0] for s in syms]
- [u'b', u'c']
+ [u'a', u'b']
>>> import os, tempfile
>>> path = tempfile.mkdtemp()
@@ -591,6 +596,7 @@ class Word2Vec(object):
... pass
.. versionadded:: 1.2.0
+
"""
def __init__(self):
"""