diff options
author | Bryan Cutler <cutlerb@gmail.com> | 2016-08-05 20:57:46 +0100 |
---|---|---|
committer | Sean Owen <sowen@cloudera.com> | 2016-08-05 20:57:46 +0100 |
commit | 180fd3e0a3426db200c97170926afb60751dfd0e (patch) | |
tree | 4d10f86a901a0cfd52121f856f409a2b90ff5404 /examples/src/main/scala/org | |
parent | 2460f03ffe94154b73995e4f16dd799d1a0f56b8 (diff) | |
download | spark-180fd3e0a3426db200c97170926afb60751dfd0e.tar.gz spark-180fd3e0a3426db200c97170926afb60751dfd0e.tar.bz2 spark-180fd3e0a3426db200c97170926afb60751dfd0e.zip |
[SPARK-16421][EXAMPLES][ML] Improve ML Example Outputs
## What changes were proposed in this pull request?
Improve example outputs to better reflect the functionality that is being presented. This mostly consisted of modifying what was printed at the end of the example, such as calling show() with truncate=False, but sometimes required minor tweaks in the example data to get relevant output. Explicitly set parameters when they are used as part of the example. Fixed Java examples that failed to run because they used old-style MLlib Vectors or had problems with their schema. Synced examples between different APIs.
## How was this patch tested?
Ran each example for Scala, Python, and Java and made sure output was legible on a terminal of width 100.
Author: Bryan Cutler <cutlerb@gmail.com>
Closes #14308 from BryanCutler/ml-examples-improve-output-SPARK-16260.
Diffstat (limited to 'examples/src/main/scala/org')
29 files changed, 112 insertions, 47 deletions
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala index d0b874c48d..5d8831265e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala @@ -31,6 +31,11 @@ import org.apache.spark.sql.SparkSession * * This is an example implementation for learning how to use Spark. For more conventional use, * please refer to org.apache.spark.graphx.lib.PageRank + * + * Example Usage: + * {{{ + * bin/run-example SparkPageRank data/mllib/pagerank_data.txt 10 + * }}} */ object SparkPageRank { diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala index b6d7b36916..cdb33f4d6d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala @@ -55,8 +55,9 @@ object AFTSurvivalRegressionExample { val model = aft.fit(training) // Print the coefficients, intercept and scale parameter for AFT survival regression - println(s"Coefficients: ${model.coefficients} Intercept: " + - s"${model.intercept} Scale: ${model.scale}") + println(s"Coefficients: ${model.coefficients}") + println(s"Intercept: ${model.intercept}") + println(s"Scale: ${model.scale}") model.transform(training).show(false) // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala index 5cd13ad64c..a4f62e7871 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala @@ -29,9 +29,10 @@ object BinarizerExample { .builder 
.appName("BinarizerExample") .getOrCreate() + // $example on$ val data = Array((0, 0.1), (1, 0.8), (2, 0.2)) - val dataFrame = spark.createDataFrame(data).toDF("label", "feature") + val dataFrame = spark.createDataFrame(data).toDF("id", "feature") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") @@ -39,8 +40,9 @@ object BinarizerExample { .setThreshold(0.5) val binarizedDataFrame = binarizer.transform(dataFrame) - val binarizedFeatures = binarizedDataFrame.select("binarized_feature") - binarizedFeatures.collect().foreach(println) + + println(s"Binarizer output with Threshold = ${binarizer.getThreshold}") + binarizedDataFrame.show() // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala index 38cce34bb5..04e4eccd43 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala @@ -33,7 +33,7 @@ object BucketizerExample { // $example on$ val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity) - val data = Array(-0.5, -0.3, 0.0, 0.2) + val data = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9) val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") val bucketizer = new Bucketizer() @@ -43,8 +43,11 @@ object BucketizerExample { // Transform original data into its bucket index. 
val bucketedData = bucketizer.transform(dataFrame) + + println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets") bucketedData.show() // $example off$ + spark.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala index c9394dd9c6..5638e66b87 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala @@ -48,8 +48,11 @@ object ChiSqSelectorExample { .setOutputCol("selectedFeatures") val result = selector.fit(df).transform(df) + + println(s"ChiSqSelector output with top ${selector.getNumTopFeatures} features selected") result.show() // $example off$ + spark.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala index 988d8941a4..91d861dd43 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala @@ -49,7 +49,7 @@ object CountVectorizerExample { .setInputCol("words") .setOutputCol("features") - cvModel.transform(df).select("features").show() + cvModel.transform(df).show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala index ddc6717528..3383171303 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala @@ -45,7 +45,7 @@ object DCTExample { .setInverse(false) val dctDf = dct.transform(df) - dctDf.select("featuresDCT").show(3) + dctDf.select("featuresDCT").show(false) // $example off$ spark.stop() diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala index 26095b46f5..5e4bea4c4f 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala @@ -49,8 +49,8 @@ object GaussianMixtureExample { // output parameters of mixture model model for (i <- 0 until model.getK) { - println("weight=%f\nmu=%s\nsigma=\n%s\n" format - (model.weights(i), model.gaussians(i).mean, model.gaussians(i).cov)) + println(s"Gaussian $i:\nweight=${model.weights(i)}\n" + + s"mu=${model.gaussians(i).mean}\nsigma=\n${model.gaussians(i).cov}\n") } // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala index 950733831c..2940682c32 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala @@ -19,6 +19,7 @@ package org.apache.spark.examples.ml // $example on$ +import org.apache.spark.ml.attribute.Attribute import org.apache.spark.ml.feature.{IndexToString, StringIndexer} // $example off$ import org.apache.spark.sql.SparkSession @@ -46,12 +47,23 @@ object IndexToStringExample { .fit(df) val indexed = indexer.transform(df) + println(s"Transformed string column '${indexer.getInputCol}' " + + s"to indexed column '${indexer.getOutputCol}'") + indexed.show() + + val inputColSchema = indexed.schema(indexer.getOutputCol) + println(s"StringIndexer will store labels in output column metadata: " + + s"${Attribute.fromStructField(inputColSchema).toString}\n") + val converter = new IndexToString() .setInputCol("categoryIndex") .setOutputCol("originalCategory") val converted = converter.transform(indexed) - 
converted.select("id", "originalCategory").show() + + println(s"Transformed indexed column '${converter.getInputCol}' back to original string " + + s"column '${converter.getOutputCol}' using labels in metadata") + converted.select("id", "categoryIndex", "originalCategory").show() // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala index a840559d24..9bac16ec76 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala @@ -47,8 +47,8 @@ object IsotonicRegressionExample { val ir = new IsotonicRegression() val model = ir.fit(dataset) - println(s"Boundaries in increasing order: ${model.boundaries}") - println(s"Predictions associated with the boundaries: ${model.predictions}") + println(s"Boundaries in increasing order: ${model.boundaries}\n") + println(s"Predictions associated with the boundaries: ${model.predictions}\n") // Makes predictions. 
model.transform(dataset).show() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala index 94cf286623..4540a8d728 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala @@ -50,7 +50,7 @@ object LinearRegressionWithElasticNetExample { // Summarize the model over the training set and print out some metrics val trainingSummary = lrModel.summary println(s"numIterations: ${trainingSummary.totalIterations}") - println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}") + println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]") trainingSummary.residuals.show() println(s"RMSE: ${trainingSummary.rootMeanSquaredError}") println(s"r2: ${trainingSummary.r2}") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala index cd8775c942..1740a0d3f9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala @@ -51,6 +51,7 @@ object LogisticRegressionSummaryExample { // Obtain the objective per iteration. val objectiveHistory = trainingSummary.objectiveHistory + println("objectiveHistory:") objectiveHistory.foreach(loss => println(loss)) // Obtain the metrics useful to judge performance on test data. @@ -61,7 +62,7 @@ object LogisticRegressionSummaryExample { // Obtain the receiver-operating characteristic as a dataframe and areaUnderROC. 
val roc = binarySummary.roc roc.show() - println(binarySummary.areaUnderROC) + println(s"areaUnderROC: ${binarySummary.areaUnderROC}") // Set the model threshold to maximize F-Measure val fMeasure = binarySummary.fMeasureByThreshold diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala index 572adce657..85d071369d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala @@ -19,6 +19,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.MaxAbsScaler +import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession @@ -30,7 +31,12 @@ object MaxAbsScalerExample { .getOrCreate() // $example on$ - val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val dataFrame = spark.createDataFrame(Seq( + (0, Vectors.dense(1.0, 0.1, -8.0)), + (1, Vectors.dense(2.0, 1.0, -4.0)), + (2, Vectors.dense(4.0, 10.0, 8.0)) + )).toDF("id", "features") + val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaledFeatures") @@ -40,7 +46,7 @@ object MaxAbsScalerExample { // rescale each feature to range [-1, 1] val scaledData = scalerModel.transform(dataFrame) - scaledData.show() + scaledData.select("features", "scaledFeatures").show() // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala index d728019a62..9ee6d9b449 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml // $example on$ import 
org.apache.spark.ml.feature.MinMaxScaler +import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession @@ -31,7 +32,11 @@ object MinMaxScalerExample { .getOrCreate() // $example on$ - val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val dataFrame = spark.createDataFrame(Seq( + (0, Vectors.dense(1.0, 0.1, -1.0)), + (1, Vectors.dense(2.0, 1.1, 1.0)), + (2, Vectors.dense(3.0, 10.1, 3.0)) + )).toDF("id", "features") val scaler = new MinMaxScaler() .setInputCol("features") @@ -42,7 +47,8 @@ object MinMaxScalerExample { // rescale each feature to range [min, max]. val scaledData = scalerModel.transform(dataFrame) - scaledData.show() + println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]") + scaledData.select("features", "scaledFeatures").show() // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala index a39e3202ba..6fce82d294 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala @@ -66,7 +66,7 @@ object MultilayerPerceptronClassifierExample { val evaluator = new MulticlassClassificationEvaluator() .setMetricName("accuracy") - println("Accuracy: " + evaluator.evaluate(predictionAndLabels)) + println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels)) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala index e0b52e7a36..d2183d6b49 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala 
@@ -35,11 +35,12 @@ object NGramExample { (0, Array("Hi", "I", "heard", "about", "Spark")), (1, Array("I", "wish", "Java", "could", "use", "case", "classes")), (2, Array("Logistic", "regression", "models", "are", "neat")) - )).toDF("label", "words") + )).toDF("id", "words") + + val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams") - val ngram = new NGram().setInputCol("words").setOutputCol("ngrams") val ngramDataFrame = ngram.transform(wordDataFrame) - ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println) + ngramDataFrame.select("ngrams").show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala index 3ae0623c4c..bd9fcc420a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala @@ -52,7 +52,7 @@ object NaiveBayesExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Accuracy: " + accuracy) + println("Test set accuracy = " + accuracy) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala index 75ba33a7e7..989d250c17 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.Normalizer +import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession @@ -31,7 +32,11 @@ object NormalizerExample { .getOrCreate() // $example on$ - val dataFrame = 
spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val dataFrame = spark.createDataFrame(Seq( + (0, Vectors.dense(1.0, 0.5, -1.0)), + (1, Vectors.dense(2.0, 1.0, 1.0)), + (2, Vectors.dense(4.0, 10.0, 2.0)) + )).toDF("id", "features") // Normalize each Vector using $L^1$ norm. val normalizer = new Normalizer() @@ -40,10 +45,12 @@ object NormalizerExample { .setP(1.0) val l1NormData = normalizer.transform(dataFrame) + println("Normalized using L^1 norm") l1NormData.show() // Normalize each Vector using $L^\infty$ norm. val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity) + println("Normalized using L^inf norm") lInfNormData.show() // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala index 4aa649b133..274cc1268f 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala @@ -49,8 +49,9 @@ object OneHotEncoderExample { val encoder = new OneHotEncoder() .setInputCol("categoryIndex") .setOutputCol("categoryVec") + val encoded = encoder.transform(indexed) - encoded.select("id", "categoryVec").show() + encoded.show() // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala index acde110683..4ad6c7c3ef 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala @@ -69,7 +69,7 @@ object OneVsRestExample { // compute the classification error on test data. 
val accuracy = evaluator.evaluate(predictions) - println(s"Test Error : ${1 - accuracy}") + println(s"Test Error = ${1 - accuracy}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala index dca96eea2b..4e1d7cdbab 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala @@ -38,14 +38,15 @@ object PCAExample { Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") + val pca = new PCA() .setInputCol("features") .setOutputCol("pcaFeatures") .setK(3) .fit(df) - val pcaDF = pca.transform(df) - val result = pcaDF.select("pcaFeatures") - result.show() + + val result = pca.transform(df).select("pcaFeatures") + result.show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala index 54d2e6b36d..f117b03ab2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala @@ -33,17 +33,19 @@ object PolynomialExpansionExample { // $example on$ val data = Array( - Vectors.dense(-2.0, 2.3), + Vectors.dense(2.0, 1.0), Vectors.dense(0.0, 0.0), - Vectors.dense(0.6, -1.1) + Vectors.dense(3.0, -1.0) ) val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") - val polynomialExpansion = new PolynomialExpansion() + + val polyExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) - val polyDF = polynomialExpansion.transform(df) - polyDF.select("polyFeatures").take(3).foreach(println) + + val polyDF = polyExpansion.transform(df) + 
polyDF.show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala index a56de0856d..369a6fffd7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala @@ -40,7 +40,7 @@ object StopWordsRemoverExample { (1, Seq("Mary", "had", "a", "little", "lamb")) )).toDF("id", "raw") - remover.transform(dataSet).show() + remover.transform(dataSet).show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala index 97f6fcce15..ec2df2ef87 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala @@ -33,9 +33,9 @@ object TfIdfExample { // $example on$ val sentenceData = spark.createDataFrame(Seq( - (0, "Hi I heard about Spark"), - (0, "I wish Java could use case classes"), - (1, "Logistic regression models are neat") + (0.0, "Hi I heard about Spark"), + (0.0, "I wish Java could use case classes"), + (1.0, "Logistic regression models are neat") )).toDF("label", "sentence") val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") @@ -51,7 +51,7 @@ object TfIdfExample { val idfModel = idf.fit(featurizedData) val rescaledData = idfModel.transform(featurizedData) - rescaledData.select("features", "label").take(3).foreach(println) + rescaledData.select("label", "features").show() // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala index 90d0faaf47..0167dc3723 100644 --- 
a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer} +import org.apache.spark.sql.functions._ // $example off$ import org.apache.spark.sql.SparkSession @@ -35,7 +36,7 @@ object TokenizerExample { (0, "Hi I heard about Spark"), (1, "I wish Java could use case classes"), (2, "Logistic,regression,models,are,neat") - )).toDF("label", "sentence") + )).toDF("id", "sentence") val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") val regexTokenizer = new RegexTokenizer() @@ -43,11 +44,15 @@ object TokenizerExample { .setOutputCol("words") .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false) + val countTokens = udf { (words: Seq[String]) => words.length } + val tokenized = tokenizer.transform(sentenceDataFrame) - tokenized.select("words", "label").take(3).foreach(println) + tokenized.select("sentence", "words") + .withColumn("tokens", countTokens(col("words"))).show(false) val regexTokenized = regexTokenizer.transform(sentenceDataFrame) - regexTokenized.select("words", "label").take(3).foreach(println) + regexTokenized.select("sentence", "words") + .withColumn("tokens", countTokens(col("words"))).show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala index 13c72f88cc..13b58d154b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala @@ -100,6 +100,7 @@ object UnaryTransformerExample { val data = spark.range(0, 5).toDF("input") .select(col("input").cast("double").as("input")) val result = 
myTransformer.transform(data) + println("Transformed by adding constant value") result.show() // Save and load the Transformer. @@ -109,6 +110,7 @@ object UnaryTransformerExample { val sameTransformer = MyTransformer.load(dirName) // Transform the data to show the results are identical. + println("Same transform applied from loaded model") val sameResult = sameTransformer.transform(data) sameResult.show() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala index 8910470c1c..3d5c7efb20 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala @@ -41,7 +41,8 @@ object VectorAssemblerExample { .setOutputCol("features") val output = assembler.transform(dataset) - println(output.select("features", "clicked").first()) + println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'") + output.select("features", "clicked").show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala index 85dd5c2776..63a60912de 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala @@ -37,7 +37,10 @@ object VectorSlicerExample { .getOrCreate() // $example on$ - val data = Arrays.asList(Row(Vectors.dense(-2.0, 2.3, 0.0))) + val data = Arrays.asList( + Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))), + Row(Vectors.dense(-2.0, 2.3, 0.0)) + ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName) @@ -51,7 +54,7 @@ object VectorSlicerExample { // or slicer.setIndices(Array(1, 2)), or 
slicer.setNames(Array("f2", "f3")) val output = slicer.transform(dataset) - println(output.select("userFeatures", "features").first()) + output.show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala index 5c8bd19f20..4bcc6ac6a0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala @@ -20,6 +20,8 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.Word2Vec +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.Row // $example off$ import org.apache.spark.sql.SparkSession @@ -47,7 +49,8 @@ object Word2VecExample { val model = word2Vec.fit(documentDF) val result = model.transform(documentDF) - result.select("result").take(3).foreach(println) + result.collect().foreach { case Row(text: Seq[_], features: Vector) => + println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n") } // $example off$ spark.stop() |