diff options
author | Zheng RuiFeng <ruifengz@foxmail.com> | 2016-05-11 12:49:41 +0200 |
---|---|---|
committer | Nick Pentreath <nickp@za.ibm.com> | 2016-05-11 12:49:41 +0200 |
commit | d88afabdfa83be47f36d833105aadd6b818ceeee (patch) | |
tree | 224f8a6fc5268e30bf200ab30f4e20f5c0164b34 /examples/src/main/scala | |
parent | fafc95af79fa34f82964a86407c2ee046eda3814 (diff) | |
download | spark-d88afabdfa83be47f36d833105aadd6b818ceeee.tar.gz spark-d88afabdfa83be47f36d833105aadd6b818ceeee.tar.bz2 spark-d88afabdfa83be47f36d833105aadd6b818ceeee.zip |
[SPARK-15150][EXAMPLE][DOC] Update LDA examples
## What changes were proposed in this pull request?
1. Create a libsvm-format dataset for LDA: `data/mllib/sample_lda_libsvm_data.txt`
2. Add a Python example.
3. Read the data file directly in the examples.
4. Additionally, switch to `SparkSession` in `aft_survival_regression.py`.
## How was this patch tested?
manual tests
`./bin/spark-submit examples/src/main/python/ml/lda_example.py`
Author: Zheng RuiFeng <ruifengz@foxmail.com>
Closes #12927 from zhengruifeng/lda_pe.
Diffstat (limited to 'examples/src/main/scala')
-rw-r--r-- | examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala | 6 | ||||
-rw-r--r-- | examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala | 41 |
2 files changed, 21 insertions, 26 deletions
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala index 2b224d50a0..b44304d810 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala @@ -25,7 +25,11 @@ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.SparkSession /** - * An example for AFTSurvivalRegression. + * An example demonstrating AFTSurvivalRegression. + * Run with + * {{{ + * bin/run-example ml.AFTSurvivalRegressionExample + * }}} */ object AFTSurvivalRegressionExample { diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala index c2920f6a5d..22b3b0e3ad 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala @@ -20,57 +20,48 @@ package org.apache.spark.examples.ml // scalastyle:off println // $example on$ import org.apache.spark.ml.clustering.LDA -import org.apache.spark.mllib.linalg.{Vectors, VectorUDT} -import org.apache.spark.sql.{Row, SparkSession} -import org.apache.spark.sql.types.{StructField, StructType} // $example off$ +import org.apache.spark.sql.SparkSession /** - * An example demonstrating a LDA of ML pipeline. + * An example demonstrating LDA. 
* Run with * {{{ * bin/run-example ml.LDAExample * }}} */ object LDAExample { - - final val FEATURES_COL = "features" - def main(args: Array[String]): Unit = { - - val input = "data/mllib/sample_lda_data.txt" - // Creates a Spark context and a SQL context + // Creates a SparkSession val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") .getOrCreate() // $example on$ - // Loads data - val rowRDD = spark.read.text(input).rdd.filter(_.nonEmpty) - .map(_.split(" ").map(_.toDouble)).map(Vectors.dense).map(Row(_)) - val schema = StructType(Array(StructField(FEATURES_COL, new VectorUDT, false))) - val dataset = spark.createDataFrame(rowRDD, schema) + // Loads data. + val dataset = spark.read.format("libsvm") + .load("data/mllib/sample_lda_libsvm_data.txt") - // Trains a LDA model - val lda = new LDA() - .setK(10) - .setMaxIter(10) - .setFeaturesCol(FEATURES_COL) + // Trains a LDA model. + val lda = new LDA().setK(10).setMaxIter(10) val model = lda.fit(dataset) - val transformed = model.transform(dataset) val ll = model.logLikelihood(dataset) val lp = model.logPerplexity(dataset) + println(s"The lower bound on the log likelihood of the entire corpus: $ll") + println(s"The upper bound bound on perplexity: $lp") - // describeTopics + // Describe topics. val topics = model.describeTopics(3) - - // Shows the result + println("The topics described by their top-weighted terms:") topics.show(false) - transformed.show(false) + // Shows the result. + val transformed = model.transform(dataset) + transformed.show(false) // $example off$ + spark.stop() } } |