From e07fb6a41ee949f8dba44d5a3b6c0615f27f0eaf Mon Sep 17 00:00:00 2001 From: Anant Date: Fri, 31 Oct 2014 18:33:19 -0700 Subject: [SPARK-3838][examples][mllib][python] Word2Vec example in python This pull request refers to issue: https://issues.apache.org/jira/browse/SPARK-3838 Python example for word2vec mengxr Author: Anant Closes #2952 from anantasty/SPARK-3838 and squashes the following commits: 87bd723 [Anant] remove stop line 4bd439e [Anant] Changes as per code review. Fized error in word2vec python example, simplified example in docs. 3d3c9ee [Anant] Added empty line after python imports 0c90c31 [Anant] Fixed erroneous code. I was still treating each line to be a single word instead of 16 words ee4f5f6 [Anant] Fixes from code review comments c637bcf [Anant] Added word2vec python example to docs 269f31f [Anant] added example in docs c015b14 [Anant] Added python example for word2vec --- docs/mllib-feature-extraction.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'docs/mllib-feature-extraction.md') diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index 886d71df47..197bc77d50 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -203,6 +203,23 @@ for((synonym, cosineSimilarity) <- synonyms) { } {% endhighlight %} +
+{% highlight python %} +from pyspark import SparkContext +from pyspark.mllib.feature import Word2Vec + +sc = SparkContext(appName='Word2Vec') +inp = sc.textFile("text8_lines").map(lambda row: row.split(" ")) + +word2vec = Word2Vec() +model = word2vec.fit(inp) + +synonyms = model.findSynonyms('china', 40) + +for word, cosine_distance in synonyms: + print "{}: {}".format(word, cosine_distance) +{% endhighlight %} +
## StandardScaler -- cgit v1.2.3