[SPARK-5539][MLLIB] LDA guide

This is the LDA user guide from jkbradley with Java and Scala code example. Author: Xiangrui Meng <meng@databricks.com> Author: Joseph K. Bradley <joseph@databricks.com> Closes #4465 from mengxr/lda-guide and squashes the following commits: 6dcb7d1 [Xiangrui Meng] update java example in the user guide 76169ff [Xiangrui Meng] update java example 36c3ae2 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into lda-guide c2a1efe [Joseph K. Bradley] Added LDA programming guide, plus Java example (which is in the guide and probably should be removed). (cherry picked from commit 855d12ac0a9cdade4cd2cc64c4e7209478be6690) Signed-off-by: Xiangrui Meng <meng@databricks.com>
author: Xiangrui Meng <meng@databricks.com> 2015-02-08 23:40:36 -0800
committer: Xiangrui Meng <meng@databricks.com> 2015-02-08 23:40:44 -0800
commit: 5782ee29eb273b1f87a07fd624bbf228d2597b98 (patch)
tree: 211ebbc8afa0cf37ecc6f96b6502d7de828ccf91 /examples
parent: 955f2863e39a96c0b00ad7d3eac972bb1cfcb594 (diff)
download: spark-5782ee29eb273b1f87a07fd624bbf228d2597b98.tar.gz
spark-5782ee29eb273b1f87a07fd624bbf228d2597b98.tar.bz2
spark-5782ee29eb273b1f87a07fd624bbf228d2597b98.zip
1 files changed, 75 insertions, 0 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java
new file mode 100644
index 0000000000..f394ff2084
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import scala.Tuple2;
+
+import org.apache.spark.api.java.*;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.mllib.clustering.DistributedLDAModel;
+import org.apache.spark.mllib.clustering.LDA;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.SparkConf;
+
+/**
+ * Example that fits an LDA topic model to a small corpus and prints the learned topics.
+ *
+ * <p>Each line of the input file is parsed as a space-separated list of numbers and turned
+ * into a dense vector (one vector per document). Documents are keyed by their index in the
+ * file, clustered into {@code numTopics} topics, and every topic is printed as one line of
+ * per-word values read from the model's topics matrix.
+ */
+public class JavaLDAExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("LDA Example");
+    JavaSparkContext sc = new JavaSparkContext(conf);
+
+    // Single source of truth for the topic count: used both for training and for printing,
+    // so the output loop can never disagree with the model that was trained.
+    final int numTopics = 3;
+
+    // Load and parse the data
+    String path = "data/mllib/sample_lda_data.txt";
+    JavaRDD<String> data = sc.textFile(path);
+    JavaRDD<Vector> parsedData = data.map(
+        new Function<String, Vector>() {
+          @Override
+          public Vector call(String s) {
+            String[] sarray = s.trim().split(" ");
+            double[] values = new double[sarray.length];
+            for (int i = 0; i < sarray.length; i++) {
+              values[i] = Double.parseDouble(sarray[i]);
+            }
+            return Vectors.dense(values);
+          }
+        }
+    );
+    // Index documents with unique IDs
+    JavaPairRDD<Long, Vector> corpus = JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map(
+        new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() {
+          @Override
+          public Tuple2<Long, Vector> call(Tuple2<Vector, Long> doc_id) {
+            // zipWithIndex yields (document, id); the corpus is keyed by id, so swap.
+            return doc_id.swap();
+          }
+        }
+    ));
+    corpus.cache();
+
+    // Cluster the documents into numTopics topics using LDA
+    DistributedLDAModel ldaModel = new LDA().setK(numTopics).run(corpus);
+
+    // Output topics. Each is a distribution over words (matching word count vectors)
+    // Read the vocabulary size once instead of on every inner-loop iteration.
+    int vocabSize = ldaModel.vocabSize();
+    System.out.println("Learned topics (as distributions over vocab of " + vocabSize
+        + " words):");
+    Matrix topics = ldaModel.topicsMatrix();
+    for (int topic = 0; topic < numTopics; topic++) {
+      System.out.print("Topic " + topic + ":");
+      for (int word = 0; word < vocabSize; word++) {
+        System.out.print(" " + topics.apply(word, topic));
+      }
+      System.out.println();
+    }
+
+    // Release the Spark context now that the example is done.
+    sc.stop();
+  }
+}
author	Xiangrui Meng <meng@databricks.com>	2015-02-08 23:40:36 -0800
committer	Xiangrui Meng <meng@databricks.com>	2015-02-08 23:40:44 -0800
commit	5782ee29eb273b1f87a07fd624bbf228d2597b98 (patch)
tree	211ebbc8afa0cf37ecc6f96b6502d7de828ccf91 /examples
parent	955f2863e39a96c0b00ad7d3eac972bb1cfcb594 (diff)
download	spark-5782ee29eb273b1f87a07fd624bbf228d2597b98.tar.gz spark-5782ee29eb273b1f87a07fd624bbf228d2597b98.tar.bz2 spark-5782ee29eb273b1f87a07fd624bbf228d2597b98.zip