diff options
author | Yanbo Liang <ybliang8@gmail.com> | 2015-08-18 12:56:36 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-08-18 12:56:36 -0700 |
commit | 747c2ba8006d5b86f3be8dfa9ace639042a35628 (patch) | |
tree | f9bc746b077fcca20e398cd0a3d8d9700a388397 /docs | |
parent | f4fa61effe34dae2f0eab0bef57b2dee220cf92f (diff) | |
download | spark-747c2ba8006d5b86f3be8dfa9ace639042a35628.tar.gz spark-747c2ba8006d5b86f3be8dfa9ace639042a35628.tar.bz2 spark-747c2ba8006d5b86f3be8dfa9ace639042a35628.zip |
[SPARK-10032] [PYSPARK] [DOC] Add Python example for mllib LDAModel user guide
Add Python example for mllib LDAModel user guide
Author: Yanbo Liang <ybliang8@gmail.com>
Closes #8227 from yanboliang/spark-10032.
Diffstat (limited to 'docs')
-rw-r--r-- | docs/mllib-clustering.md | 28 |
1 files changed, 28 insertions, 0 deletions
diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index bb875ae2ae..fd9ab258e1 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -564,6 +564,34 @@ public class JavaLDAExample {
 {% endhighlight %}
 </div>
 
+<div data-lang="python" markdown="1">
+{% highlight python %}
+from pyspark.mllib.clustering import LDA, LDAModel
+from pyspark.mllib.linalg import Vectors
+
+# Load and parse the data
+data = sc.textFile("data/mllib/sample_lda_data.txt")
+parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
+# Index documents with unique IDs
+corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
+
+# Cluster the documents into three topics using LDA
+ldaModel = LDA.train(corpus, k=3)
+
+# Output topics. Each is a distribution over words (matching word count vectors)
+print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):")
+topics = ldaModel.topicsMatrix()
+for topic in range(3):
+    print("Topic " + str(topic) + ":")
+    for word in range(0, ldaModel.vocabSize()):
+        print(" " + str(topics[word][topic]))
+
+# Save and load model
+ldaModel.save(sc, "myModelPath")
+sameModel = LDAModel.load(sc, "myModelPath")
+{% endhighlight %}
+</div>
+
 </div>
 
 ## Streaming k-means