aboutsummaryrefslogtreecommitdiff
path: root/examples/src/main/scala
diff options
context:
space:
mode:
authorZheng RuiFeng <ruifengz@foxmail.com>2016-05-11 12:49:41 +0200
committerNick Pentreath <nickp@za.ibm.com>2016-05-11 12:49:41 +0200
commitd88afabdfa83be47f36d833105aadd6b818ceeee (patch)
tree224f8a6fc5268e30bf200ab30f4e20f5c0164b34 /examples/src/main/scala
parentfafc95af79fa34f82964a86407c2ee046eda3814 (diff)
downloadspark-d88afabdfa83be47f36d833105aadd6b818ceeee.tar.gz
spark-d88afabdfa83be47f36d833105aadd6b818ceeee.tar.bz2
spark-d88afabdfa83be47f36d833105aadd6b818ceeee.zip
[SPARK-15150][EXAMPLE][DOC] Update LDA examples
## What changes were proposed in this pull request? 1. Create a libsvm-format dataset for LDA: `data/mllib/sample_lda_libsvm_data.txt`. 2. Add a Python example. 3. Read the data file directly in the examples. 4. Additionally, switch `aft_survival_regression.py` to use `SparkSession`. ## How was this patch tested? Manual tests: `./bin/spark-submit examples/src/main/python/ml/lda_example.py` Author: Zheng RuiFeng <ruifengz@foxmail.com> Closes #12927 from zhengruifeng/lda_pe.
Diffstat (limited to 'examples/src/main/scala')
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala6
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala41
2 files changed, 21 insertions, 26 deletions
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
index 2b224d50a0..b44304d810 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
@@ -25,7 +25,11 @@ import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SparkSession
/**
- * An example for AFTSurvivalRegression.
+ * An example demonstrating AFTSurvivalRegression.
+ * Run with
+ * {{{
+ * bin/run-example ml.AFTSurvivalRegressionExample
+ * }}}
*/
object AFTSurvivalRegressionExample {
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala
index c2920f6a5d..22b3b0e3ad 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala
@@ -20,57 +20,48 @@ package org.apache.spark.examples.ml
// scalastyle:off println
// $example on$
import org.apache.spark.ml.clustering.LDA
-import org.apache.spark.mllib.linalg.{Vectors, VectorUDT}
-import org.apache.spark.sql.{Row, SparkSession}
-import org.apache.spark.sql.types.{StructField, StructType}
// $example off$
+import org.apache.spark.sql.SparkSession
/**
- * An example demonstrating a LDA of ML pipeline.
+ * An example demonstrating LDA.
* Run with
* {{{
* bin/run-example ml.LDAExample
* }}}
*/
object LDAExample {
-
- final val FEATURES_COL = "features"
-
def main(args: Array[String]): Unit = {
-
- val input = "data/mllib/sample_lda_data.txt"
- // Creates a Spark context and a SQL context
+ // Creates a SparkSession
val spark = SparkSession
.builder
.appName(s"${this.getClass.getSimpleName}")
.getOrCreate()
// $example on$
- // Loads data
- val rowRDD = spark.read.text(input).rdd.filter(_.nonEmpty)
- .map(_.split(" ").map(_.toDouble)).map(Vectors.dense).map(Row(_))
- val schema = StructType(Array(StructField(FEATURES_COL, new VectorUDT, false)))
- val dataset = spark.createDataFrame(rowRDD, schema)
+ // Loads data.
+ val dataset = spark.read.format("libsvm")
+ .load("data/mllib/sample_lda_libsvm_data.txt")
- // Trains a LDA model
- val lda = new LDA()
- .setK(10)
- .setMaxIter(10)
- .setFeaturesCol(FEATURES_COL)
+ // Trains a LDA model.
+ val lda = new LDA().setK(10).setMaxIter(10)
val model = lda.fit(dataset)
- val transformed = model.transform(dataset)
val ll = model.logLikelihood(dataset)
val lp = model.logPerplexity(dataset)
+ println(s"The lower bound on the log likelihood of the entire corpus: $ll")
+ println(s"The upper bound on perplexity: $lp")
- // describeTopics
+ // Describe topics.
val topics = model.describeTopics(3)
-
- // Shows the result
+ println("The topics described by their top-weighted terms:")
topics.show(false)
- transformed.show(false)
+ // Shows the result.
+ val transformed = model.transform(dataset)
+ transformed.show(false)
// $example off$
+
spark.stop()
}
}