diff options
author | Xiangrui Meng <meng@databricks.com> | 2015-02-08 23:40:36 -0800 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-02-08 23:40:44 -0800 |
commit | 5782ee29eb273b1f87a07fd624bbf228d2597b98 (patch) | |
tree | 211ebbc8afa0cf37ecc6f96b6502d7de828ccf91 /examples | |
parent | 955f2863e39a96c0b00ad7d3eac972bb1cfcb594 (diff) | |
download | spark-5782ee29eb273b1f87a07fd624bbf228d2597b98.tar.gz spark-5782ee29eb273b1f87a07fd624bbf228d2597b98.tar.bz2 spark-5782ee29eb273b1f87a07fd624bbf228d2597b98.zip |
[SPARK-5539][MLLIB] LDA guide
This is the LDA user guide from jkbradley with Java and Scala code example.
Author: Xiangrui Meng <meng@databricks.com>
Author: Joseph K. Bradley <joseph@databricks.com>
Closes #4465 from mengxr/lda-guide and squashes the following commits:
6dcb7d1 [Xiangrui Meng] update java example in the user guide
76169ff [Xiangrui Meng] update java example
36c3ae2 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into lda-guide
c2a1efe [Joseph K. Bradley] Added LDA programming guide, plus Java example (which is in the guide and probably should be removed).
(cherry picked from commit 855d12ac0a9cdade4cd2cc64c4e7209478be6690)
Signed-off-by: Xiangrui Meng <meng@databricks.com>
Diffstat (limited to 'examples')
-rw-r--r-- | examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java | 75 |
1 files changed, 75 insertions, 0 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java new file mode 100644 index 0000000000..f394ff2084 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import scala.Tuple2; + +import org.apache.spark.api.java.*; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.clustering.DistributedLDAModel; +import org.apache.spark.mllib.clustering.LDA; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.SparkConf; + +public class JavaLDAExample { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("LDA Example"); + JavaSparkContext sc = new JavaSparkContext(conf); + + // Load and parse the data + String path = "data/mllib/sample_lda_data.txt"; + JavaRDD<String> data = sc.textFile(path); + JavaRDD<Vector> parsedData = data.map( + new Function<String, Vector>() { + public Vector call(String s) { + String[] sarray = s.trim().split(" "); + double[] values = new double[sarray.length]; + for (int i = 0; i < sarray.length; i++) + values[i] = Double.parseDouble(sarray[i]); + return Vectors.dense(values); + } + } + ); + // Index documents with unique IDs + JavaPairRDD<Long, Vector> corpus = JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map( + new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() { + public Tuple2<Long, Vector> call(Tuple2<Vector, Long> doc_id) { + return doc_id.swap(); + } + } + )); + corpus.cache(); + + // Cluster the documents into three topics using LDA + DistributedLDAModel ldaModel = new LDA().setK(3).run(corpus); + + // Output topics. Each is a distribution over words (matching word count vectors) + System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize() + + " words):"); + Matrix topics = ldaModel.topicsMatrix(); + for (int topic = 0; topic < 3; topic++) { + System.out.print("Topic " + topic + ":"); + for (int word = 0; word < ldaModel.vocabSize(); word++) { + System.out.print(" " + topics.apply(word, topic)); + } + System.out.println(); + } + } +} |