path: root/examples/src/main
author  Bryan Cutler <cutlerb@gmail.com>  2016-08-05 20:57:46 +0100
committer  Sean Owen <sowen@cloudera.com>  2016-08-05 20:57:46 +0100
commit  180fd3e0a3426db200c97170926afb60751dfd0e (patch)
tree  4d10f86a901a0cfd52121f856f409a2b90ff5404 /examples/src/main
parent  2460f03ffe94154b73995e4f16dd799d1a0f56b8 (diff)
[SPARK-16421][EXAMPLES][ML] Improve ML Example Outputs
## What changes were proposed in this pull request?

Improve example outputs to better reflect the functionality that is being presented. This mostly consisted of modifying what was printed at the end of the example, such as calling show() with truncate=False, but sometimes required minor tweaks in the example data to get relevant output. Explicitly set parameters when they are used as part of the example. Fixed Java examples that failed to run because they used old-style MLlib Vectors or had a problem with the schema. Synced examples between the different APIs.

## How was this patch tested?

Ran each example for Scala, Python, and Java and made sure the output was legible on a terminal of width 100.

Author: Bryan Cutler <cutlerb@gmail.com>

Closes #14308 from BryanCutler/ml-examples-improve-output-SPARK-16260.
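To illustrate the output pattern this patch applies across the examples, here is a minimal Scala sketch, not part of the commit itself, of replacing a collect-and-print loop with a `show(truncate = false)` call. The DataFrame contents and column names below are hypothetical stand-ins for an example's output.

```scala
import org.apache.spark.sql.SparkSession

object ShowOutputSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("ShowOutputSketch").getOrCreate()

    // Hypothetical stand-in for an example's transformed output.
    val df = spark.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "Logistic regression models are neat")
    )).toDF("id", "sentence")

    // Old style in many examples: collect rows and print them one by one.
    df.select("sentence").collect().foreach(println)

    // New style used throughout this patch: let Spark render a table and
    // disable truncation so long vectors and token arrays stay legible.
    df.show(truncate = false)

    spark.stop()
  }
}
```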
Diffstat (limited to 'examples/src/main')
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/JavaPageRank.java | 5
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java | 5
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java | 11
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java | 7
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java | 2
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java | 6
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java | 15
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java | 28
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java | 30
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java | 8
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java | 18
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java | 13
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java | 23
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java | 2
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java | 2
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java | 14
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java | 2
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java | 3
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java | 12
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java | 33
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java | 6
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java | 9
-rw-r--r--  examples/src/main/python/ml/binarizer_example.py | 10
-rw-r--r--  examples/src/main/python/ml/bucketizer_example.py | 4
-rw-r--r--  examples/src/main/python/ml/chisq_selector_example.py | 2
-rw-r--r--  examples/src/main/python/ml/count_vectorizer_example.py | 4
-rw-r--r--  examples/src/main/python/ml/dct_example.py | 3
-rw-r--r--  examples/src/main/python/ml/gaussian_mixture_example.py | 6
-rw-r--r--  examples/src/main/python/ml/index_to_string_example.py | 14
-rw-r--r--  examples/src/main/python/ml/isotonic_regression_example.py | 4
-rw-r--r--  examples/src/main/python/ml/linear_regression_with_elastic_net.py | 12
-rw-r--r--  examples/src/main/python/ml/max_abs_scaler_example.py | 10
-rw-r--r--  examples/src/main/python/ml/min_max_scaler_example.py | 10
-rw-r--r--  examples/src/main/python/ml/multilayer_perceptron_classification.py | 2
-rw-r--r--  examples/src/main/python/ml/n_gram_example.py | 9
-rw-r--r--  examples/src/main/python/ml/naive_bayes_example.py | 12
-rw-r--r--  examples/src/main/python/ml/normalizer_example.py | 9
-rw-r--r--  examples/src/main/python/ml/onehot_encoder_example.py | 4
-rw-r--r--  examples/src/main/python/ml/pipeline_example.py | 5
-rw-r--r--  examples/src/main/python/ml/polynomial_expansion_example.py | 11
-rw-r--r--  examples/src/main/python/ml/stopwords_remover_example.py | 2
-rw-r--r--  examples/src/main/python/ml/tf_idf_example.py | 9
-rw-r--r--  examples/src/main/python/ml/tokenizer_example.py | 14
-rw-r--r--  examples/src/main/python/ml/train_validation_split.py | 7
-rw-r--r--  examples/src/main/python/ml/vector_assembler_example.py | 3
-rw-r--r--  examples/src/main/python/ml/vector_indexer_example.py | 4
-rw-r--r--  examples/src/main/python/ml/word2vec_example.py | 5
-rwxr-xr-x  examples/src/main/python/pagerank.py | 7
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala | 5
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala | 5
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala | 8
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala | 5
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala | 3
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala | 14
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala | 3
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala | 10
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala | 10
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala | 7
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala | 9
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala | 3
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala | 7
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala | 12
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala | 8
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala | 11
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala | 3
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala | 7
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala | 5
82 files changed, 427 insertions, 188 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java b/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java
index ed0bb87657..bcc493bdcb 100644
--- a/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java
+++ b/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java
@@ -45,6 +45,11 @@ import org.apache.spark.sql.SparkSession;
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.graphx.lib.PageRank
+ *
+ * Example Usage:
+ * <pre>
+ * bin/run-example JavaPageRank data/mllib/pagerank_data.txt 10
+ * </pre>
*/
public final class JavaPageRank {
private static final Pattern SPACES = Pattern.compile("\\s+");
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java
index 3f034588c9..7c741ff56e 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java
@@ -71,8 +71,9 @@ public class JavaAFTSurvivalRegressionExample {
AFTSurvivalRegressionModel model = aft.fit(training);
// Print the coefficients, intercept and scale parameter for AFT survival regression
- System.out.println("Coefficients: " + model.coefficients() + " Intercept: "
- + model.intercept() + " Scale: " + model.scale());
+ System.out.println("Coefficients: " + model.coefficients());
+ System.out.println("Intercept: " + model.intercept());
+ System.out.println("Scale: " + model.scale());
model.transform(training).show(false);
// $example off$
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
index a954dbd20c..3090d8fd14 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
@@ -51,17 +51,18 @@ public class JavaBinarizerExample {
new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
});
Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);
+
Binarizer binarizer = new Binarizer()
.setInputCol("feature")
.setOutputCol("binarized_feature")
.setThreshold(0.5);
+
Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);
- Dataset<Row> binarizedFeatures = binarizedDataFrame.select("binarized_feature");
- for (Row r : binarizedFeatures.collectAsList()) {
- Double binarized_value = r.getDouble(0);
- System.out.println(binarized_value);
- }
+
+ System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
+ binarizedDataFrame.show();
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
index 691df3887a..f009938333 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
@@ -44,10 +44,12 @@ public class JavaBucketizerExample {
double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};
List<Row> data = Arrays.asList(
+ RowFactory.create(-999.9),
RowFactory.create(-0.5),
RowFactory.create(-0.3),
RowFactory.create(0.0),
- RowFactory.create(0.2)
+ RowFactory.create(0.2),
+ RowFactory.create(999.9)
);
StructType schema = new StructType(new StructField[]{
new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
@@ -61,8 +63,11 @@ public class JavaBucketizerExample {
// Transform original data into its bucket index.
Dataset<Row> bucketedData = bucketizer.transform(dataFrame);
+
+ System.out.println("Bucketizer output with " + (bucketizer.getSplits().length-1) + " buckets");
bucketedData.show();
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java
index fcf90d8d18..73738966b1 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java
@@ -63,7 +63,11 @@ public class JavaChiSqSelectorExample {
.setOutputCol("selectedFeatures");
Dataset<Row> result = selector.fit(df).transform(df);
+
+ System.out.println("ChiSqSelector output with top " + selector.getNumTopFeatures()
+ + " features selected");
result.show();
+
// $example off$
spark.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
index 0a6b136014..ac2a86c30b 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
@@ -61,7 +61,7 @@ public class JavaCountVectorizerExample {
.setInputCol("text")
.setOutputCol("feature");
- cvModel.transform(df).show();
+ cvModel.transform(df).show(false);
// $example off$
spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
index 66ce23b49d..04546d29fa 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
@@ -51,13 +51,17 @@ public class JavaDCTExample {
new StructField("features", new VectorUDT(), false, Metadata.empty()),
});
Dataset<Row> df = spark.createDataFrame(data, schema);
+
DCT dct = new DCT()
.setInputCol("features")
.setOutputCol("featuresDCT")
.setInverse(false);
+
Dataset<Row> dctDf = dct.transform(df);
- dctDf.select("featuresDCT").show(3);
+
+ dctDf.select("featuresDCT").show(false);
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java
index 526bed93fb..72bd5d0395 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java
@@ -54,8 +54,8 @@ public class JavaGaussianMixtureExample {
// Output the parameters of the mixture model
for (int i = 0; i < model.getK(); i++) {
- System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
- model.weights()[i], model.gaussians()[i].mean(), model.gaussians()[i].cov());
+ System.out.printf("Gaussian %d:\nweight=%f\nmu=%s\nsigma=\n%s\n\n",
+ i, model.weights()[i], model.gaussians()[i].mean(), model.gaussians()[i].cov());
}
// $example off$
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java
index 0064beb8c8..6965512f93 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java
@@ -24,6 +24,7 @@ import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.List;
+import org.apache.spark.ml.attribute.Attribute;
import org.apache.spark.ml.feature.IndexToString;
import org.apache.spark.ml.feature.StringIndexer;
import org.apache.spark.ml.feature.StringIndexerModel;
@@ -63,11 +64,23 @@ public class JavaIndexToStringExample {
.fit(df);
Dataset<Row> indexed = indexer.transform(df);
+ System.out.println("Transformed string column '" + indexer.getInputCol() + "' " +
+ "to indexed column '" + indexer.getOutputCol() + "'");
+ indexed.show();
+
+ StructField inputColSchema = indexed.schema().apply(indexer.getOutputCol());
+ System.out.println("StringIndexer will store labels in output column metadata: " +
+ Attribute.fromStructField(inputColSchema).toString() + "\n");
+
IndexToString converter = new IndexToString()
.setInputCol("categoryIndex")
.setOutputCol("originalCategory");
Dataset<Row> converted = converter.transform(indexed);
- converted.select("id", "originalCategory").show();
+
+ System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to " +
+ "original string column '" + converter.getOutputCol() + "' using labels in metadata");
+ converted.select("id", "categoryIndex", "originalCategory").show();
+
// $example off$
spark.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java
index 0ec17b0471..a7de8e699c 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java
@@ -50,8 +50,8 @@ public class JavaIsotonicRegressionExample {
IsotonicRegression ir = new IsotonicRegression();
IsotonicRegressionModel model = ir.fit(dataset);
- System.out.println("Boundaries in increasing order: " + model.boundaries());
- System.out.println("Predictions associated with the boundaries: " + model.predictions());
+ System.out.println("Boundaries in increasing order: " + model.boundaries() + "\n");
+ System.out.println("Predictions associated with the boundaries: " + model.predictions() + "\n");
// Makes predictions.
model.transform(dataset).show();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java
index 9a27b0e9e2..9f1ce463cf 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java
@@ -18,10 +18,20 @@
package org.apache.spark.examples.ml;
// $example on$
+import java.util.Arrays;
+import java.util.List;
+
import org.apache.spark.ml.feature.MaxAbsScaler;
import org.apache.spark.ml.feature.MaxAbsScalerModel;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
// $example off$
import org.apache.spark.sql.SparkSession;
@@ -34,10 +44,17 @@ public class JavaMaxAbsScalerExample {
.getOrCreate();
// $example on$
- Dataset<Row> dataFrame = spark
- .read()
- .format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt");
+ List<Row> data = Arrays.asList(
+ RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
+ RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
+ RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
+ );
+ StructType schema = new StructType(new StructField[]{
+ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+ new StructField("features", new VectorUDT(), false, Metadata.empty())
+ });
+ Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
+
MaxAbsScaler scaler = new MaxAbsScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures");
@@ -47,8 +64,9 @@ public class JavaMaxAbsScalerExample {
// rescale each feature to range [-1, 1].
Dataset<Row> scaledData = scalerModel.transform(dataFrame);
- scaledData.show();
+ scaledData.select("features", "scaledFeatures").show();
// $example off$
+
spark.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
index 37fa1c5434..2757af8d24 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
@@ -20,10 +20,20 @@ package org.apache.spark.examples.ml;
import org.apache.spark.sql.SparkSession;
// $example on$
+import java.util.Arrays;
+import java.util.List;
+
import org.apache.spark.ml.feature.MinMaxScaler;
import org.apache.spark.ml.feature.MinMaxScalerModel;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
// $example off$
public class JavaMinMaxScalerExample {
@@ -34,10 +44,17 @@ public class JavaMinMaxScalerExample {
.getOrCreate();
// $example on$
- Dataset<Row> dataFrame = spark
- .read()
- .format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt");
+ List<Row> data = Arrays.asList(
+ RowFactory.create(0, Vectors.dense(1.0, 0.1, -1.0)),
+ RowFactory.create(1, Vectors.dense(2.0, 1.1, 1.0)),
+ RowFactory.create(2, Vectors.dense(3.0, 10.1, 3.0))
+ );
+ StructType schema = new StructType(new StructField[]{
+ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+ new StructField("features", new VectorUDT(), false, Metadata.empty())
+ });
+ Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
+
MinMaxScaler scaler = new MinMaxScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures");
@@ -47,8 +64,11 @@ public class JavaMinMaxScalerExample {
// rescale each feature to range [min, max].
Dataset<Row> scaledData = scalerModel.transform(dataFrame);
- scaledData.show();
+ System.out.println("Features scaled to range: [" + scaler.getMin() + ", "
+ + scaler.getMax() + "]");
+ scaledData.select("features", "scaledFeatures").show();
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
index 0f1d9c2634..43db41ce17 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
@@ -41,28 +41,34 @@ public class JavaMultilayerPerceptronClassifierExample {
// Load training data
String path = "data/mllib/sample_multiclass_classification_data.txt";
Dataset<Row> dataFrame = spark.read().format("libsvm").load(path);
+
// Split the data into train and test
Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
Dataset<Row> train = splits[0];
Dataset<Row> test = splits[1];
+
// specify layers for the neural network:
// input layer of size 4 (features), two intermediate of size 5 and 4
// and output of size 3 (classes)
int[] layers = new int[] {4, 5, 4, 3};
+
// create the trainer and set its parameters
MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier()
.setLayers(layers)
.setBlockSize(128)
.setSeed(1234L)
.setMaxIter(100);
+
// train the model
MultilayerPerceptronClassificationModel model = trainer.fit(train);
+
// compute accuracy on the test set
Dataset<Row> result = model.transform(test);
Dataset<Row> predictionAndLabels = result.select("prediction", "label");
MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
.setMetricName("accuracy");
- System.out.println("Accuracy = " + evaluator.evaluate(predictionAndLabels));
+
+ System.out.println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels));
// $example off$
spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
index 899815f57c..5427e46665 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
@@ -42,29 +42,25 @@ public class JavaNGramExample {
// $example on$
List<Row> data = Arrays.asList(
- RowFactory.create(0.0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
- RowFactory.create(1.0, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
- RowFactory.create(2.0, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
+ RowFactory.create(0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
+ RowFactory.create(1, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
+ RowFactory.create(2, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
);
StructType schema = new StructType(new StructField[]{
- new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField(
"words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
});
Dataset<Row> wordDataFrame = spark.createDataFrame(data, schema);
- NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams");
+ NGram ngramTransformer = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams");
Dataset<Row> ngramDataFrame = ngramTransformer.transform(wordDataFrame);
-
- for (Row r : ngramDataFrame.select("ngrams", "label").takeAsList(3)) {
- java.util.List<String> ngrams = r.getList(0);
- for (String ngram : ngrams) System.out.print(ngram + " --- ");
- System.out.println();
- }
+ ngramDataFrame.select("ngrams").show(false);
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java
index 3226d5d2fa..be578dc811 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java
@@ -48,14 +48,21 @@ public class JavaNaiveBayesExample {
// create the trainer and set its parameters
NaiveBayes nb = new NaiveBayes();
+
// train the model
NaiveBayesModel model = nb.fit(train);
+
+ // Select example rows to display.
+ Dataset<Row> predictions = model.transform(test);
+ predictions.show();
+
// compute accuracy on the test set
- Dataset<Row> result = model.transform(test);
- Dataset<Row> predictionAndLabels = result.select("prediction", "label");
MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
+ .setLabelCol("label")
+ .setPredictionCol("prediction")
.setMetricName("accuracy");
- System.out.println("Accuracy = " + evaluator.evaluate(predictionAndLabels));
+ double accuracy = evaluator.evaluate(predictions);
+ System.out.println("Test set accuracy = " + accuracy);
// $example off$
spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
index abc38f85ea..f878c420d8 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
@@ -20,9 +20,19 @@ package org.apache.spark.examples.ml;
import org.apache.spark.sql.SparkSession;
// $example on$
+import java.util.Arrays;
+import java.util.List;
+
import org.apache.spark.ml.feature.Normalizer;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
// $example off$
public class JavaNormalizerExample {
@@ -33,8 +43,16 @@ public class JavaNormalizerExample {
.getOrCreate();
// $example on$
- Dataset<Row> dataFrame =
- spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ List<Row> data = Arrays.asList(
+ RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
+ RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
+ RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
+ );
+ StructType schema = new StructType(new StructField[]{
+ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+ new StructField("features", new VectorUDT(), false, Metadata.empty())
+ });
+ Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
// Normalize each Vector using $L^1$ norm.
Normalizer normalizer = new Normalizer()
@@ -50,6 +68,7 @@ public class JavaNormalizerExample {
normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
lInfNormData.show();
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
index a15e5f84a1..99af37676b 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
@@ -68,9 +68,11 @@ public class JavaOneHotEncoderExample {
OneHotEncoder encoder = new OneHotEncoder()
.setInputCol("categoryIndex")
.setOutputCol("categoryVec");
+
Dataset<Row> encoded = encoder.transform(indexed);
- encoded.select("id", "categoryVec").show();
+ encoded.show();
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
index c6a083ddc9..82fb540950 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
@@ -75,7 +75,7 @@ public class JavaOneVsRestExample {
// compute the classification error on test data.
double accuracy = evaluator.evaluate(predictions);
- System.out.println("Test Error : " + (1 - accuracy));
+ System.out.println("Test Error = " + (1 - accuracy));
// $example off$
spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
index d597a9a2ed..6951a65553 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
@@ -62,7 +62,7 @@ public class JavaPCAExample {
.fit(df);
Dataset<Row> result = pca.transform(df).select("pcaFeatures");
- result.show();
+ result.show(false);
// $example off$
spark.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
index 67180df65c..43c636c534 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
@@ -48,23 +48,19 @@ public class JavaPolynomialExpansionExample {
.setDegree(3);
List<Row> data = Arrays.asList(
- RowFactory.create(Vectors.dense(-2.0, 2.3)),
+ RowFactory.create(Vectors.dense(2.0, 1.0)),
RowFactory.create(Vectors.dense(0.0, 0.0)),
- RowFactory.create(Vectors.dense(0.6, -1.1))
+ RowFactory.create(Vectors.dense(3.0, -1.0))
);
-
StructType schema = new StructType(new StructField[]{
new StructField("features", new VectorUDT(), false, Metadata.empty()),
});
-
Dataset<Row> df = spark.createDataFrame(data, schema);
- Dataset<Row> polyDF = polyExpansion.transform(df);
- List<Row> rows = polyDF.select("polyFeatures").takeAsList(3);
- for (Row r : rows) {
- System.out.println(r.get(0));
- }
+ Dataset<Row> polyDF = polyExpansion.transform(df);
+ polyDF.show(false);
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
index 278cce0842..94ead625b4 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
@@ -57,7 +57,7 @@ public class JavaStopWordsRemoverExample {
});
Dataset<Row> dataset = spark.createDataFrame(data, schema);
- remover.transform(dataset).show();
+ remover.transform(dataset).show(false);
// $example off$
spark.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
index 7533c1835e..cf9747a994 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
@@ -54,12 +54,15 @@ public class JavaStringIndexerExample {
createStructField("category", StringType, false)
});
Dataset<Row> df = spark.createDataFrame(data, schema);
+
StringIndexer indexer = new StringIndexer()
.setInputCol("category")
.setOutputCol("categoryIndex");
+
Dataset<Row> indexed = indexer.fit(df).transform(df);
indexed.show();
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
index 800e42c949..b740cd097a 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
@@ -25,7 +25,6 @@ import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.IDF;
import org.apache.spark.ml.feature.IDFModel;
import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -54,25 +53,24 @@ public class JavaTfIdfExample {
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
});
Dataset<Row> sentenceData = spark.createDataFrame(data, schema);
+
Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
Dataset<Row> wordsData = tokenizer.transform(sentenceData);
+
int numFeatures = 20;
HashingTF hashingTF = new HashingTF()
.setInputCol("words")
.setOutputCol("rawFeatures")
.setNumFeatures(numFeatures);
+
Dataset<Row> featurizedData = hashingTF.transform(wordsData);
// alternatively, CountVectorizer can also be used to get term frequency vectors
IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
IDFModel idfModel = idf.fit(featurizedData);
+
Dataset<Row> rescaledData = idfModel.transform(featurizedData);
- for (Row r : rescaledData.select("features", "label").takeAsList(3)) {
- Vector features = r.getAs(0);
- Double label = r.getDouble(1);
- System.out.println(features);
- System.out.println(label);
- }
+ rescaledData.select("label", "features").show();
// $example off$
spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
index a206cef4c2..101a4df779 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
@@ -23,8 +23,11 @@ import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.List;
+import scala.collection.mutable.WrappedArray;
+
import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.ml.feature.Tokenizer;
+import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -34,6 +37,12 @@ import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
// $example off$
+// $example on:untyped_ops$
+// col("...") is preferable to df.col("...")
+import static org.apache.spark.sql.functions.callUDF;
+import static org.apache.spark.sql.functions.col;
+// $example off:untyped_ops$
+
public class JavaTokenizerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
@@ -49,7 +58,7 @@ public class JavaTokenizerExample {
);
StructType schema = new StructType(new StructField[]{
- new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
+ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
});
@@ -62,20 +71,22 @@ public class JavaTokenizerExample {
.setOutputCol("words")
.setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
+ spark.udf().register("countTokens", new UDF1<WrappedArray, Integer>() {
+ @Override
+ public Integer call(WrappedArray words) {
+ return words.size();
+ }
+ }, DataTypes.IntegerType);
+
Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
- for (Row r : tokenized.select("words", "label").takeAsList(3)) {
- java.util.List<String> words = r.getList(0);
- for (String word : words) System.out.print(word + " ");
- System.out.println();
- }
+ tokenized.select("sentence", "words")
+ .withColumn("tokens", callUDF("countTokens", col("words"))).show(false);
Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
- for (Row r : regexTokenized.select("words", "label").takeAsList(3)) {
- java.util.List<String> words = r.getList(0);
- for (String word : words) System.out.print(word + " ");
- System.out.println();
- }
+ regexTokenized.select("sentence", "words")
+ .withColumn("tokens", callUDF("countTokens", col("words"))).show(false);
// $example off$
+
spark.stop();
}
}
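The Java and Python tokenizer changes above register a countTokens UDF to report how many tokens each tokenizer produces, and the commit message notes that the examples were synced across APIs. For comparison only, here is a minimal Scala sketch of the same token-counting pattern; it is an illustrative assumption, not the Scala diff from this commit.

```scala
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object TokenizerCountSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("TokenizerCountSketch").getOrCreate()

    val sentenceDataFrame = spark.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes")
    )).toDF("id", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")

    // Count the tokens produced for each sentence, mirroring the
    // countTokens UDF registered in the Java and Python examples.
    val countTokens = udf { (words: Seq[String]) => words.length }

    val tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("sentence", "words")
      .withColumn("tokens", countTokens(col("words")))
      .show(false)

    spark.stop()
  }
}
```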
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
index 9bb0f93d3a..384e09c73b 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
@@ -29,7 +29,6 @@ import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.*;
-
import static org.apache.spark.sql.types.DataTypes.*;
// $example off$
@@ -56,8 +55,11 @@ public class JavaVectorAssemblerExample {
.setOutputCol("features");
Dataset<Row> output = assembler.transform(dataset);
- System.out.println(output.select("features", "clicked").first());
+ System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
+ "'features'");
+ output.select("features", "clicked").show(false);
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
index 19b8bc83be..1922514c87 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
@@ -65,9 +65,9 @@ public class JavaVectorSlicerExample {
// or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"})
Dataset<Row> output = vectorSlicer.transform(dataset);
-
- System.out.println(output.select("userFeatures", "features").first());
+ output.show(false);
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
index 9be6e6353a..fc9b459688 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
@@ -23,6 +23,7 @@ import java.util.List;
import org.apache.spark.ml.feature.Word2Vec;
import org.apache.spark.ml.feature.Word2VecModel;
+import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -55,10 +56,14 @@ public class JavaWord2VecExample {
.setOutputCol("result")
.setVectorSize(3)
.setMinCount(0);
+
Word2VecModel model = word2Vec.fit(documentDF);
Dataset<Row> result = model.transform(documentDF);
- for (Row r : result.select("result").takeAsList(3)) {
- System.out.println(r);
+
+ for (Row row : result.collectAsList()) {
+ List<String> text = row.getList(0);
+ Vector vector = (Vector) row.get(1);
+ System.out.println("Text: " + text + " => \nVector: " + vector + "\n");
}
// $example off$
diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py
index 4224a27dbe..669bb2aeab 100644
--- a/examples/src/main/python/ml/binarizer_example.py
+++ b/examples/src/main/python/ml/binarizer_example.py
@@ -33,12 +33,14 @@ if __name__ == "__main__":
(0, 0.1),
(1, 0.8),
(2, 0.2)
- ], ["label", "feature"])
+ ], ["id", "feature"])
+
binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
+
binarizedDataFrame = binarizer.transform(continuousDataFrame)
- binarizedFeatures = binarizedDataFrame.select("binarized_feature")
- for binarized_feature, in binarizedFeatures.collect():
- print(binarized_feature)
+
+ print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
+ binarizedDataFrame.show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py
index 8177e560dd..742f35093b 100644
--- a/examples/src/main/python/ml/bucketizer_example.py
+++ b/examples/src/main/python/ml/bucketizer_example.py
@@ -31,13 +31,15 @@ if __name__ == "__main__":
# $example on$
splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
- data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
+ data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
dataFrame = spark.createDataFrame(data, ["features"])
bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
# Transform original data into its bucket index.
bucketedData = bucketizer.transform(dataFrame)
+
+ print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
bucketedData.show()
# $example off$
diff --git a/examples/src/main/python/ml/chisq_selector_example.py b/examples/src/main/python/ml/chisq_selector_example.py
index 5e19ef1624..028a9ea9d6 100644
--- a/examples/src/main/python/ml/chisq_selector_example.py
+++ b/examples/src/main/python/ml/chisq_selector_example.py
@@ -39,6 +39,8 @@ if __name__ == "__main__":
outputCol="selectedFeatures", labelCol="clicked")
result = selector.fit(df).transform(df)
+
+ print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show()
# $example off$
diff --git a/examples/src/main/python/ml/count_vectorizer_example.py b/examples/src/main/python/ml/count_vectorizer_example.py
index 38cfac82fb..f2e41db77d 100644
--- a/examples/src/main/python/ml/count_vectorizer_example.py
+++ b/examples/src/main/python/ml/count_vectorizer_example.py
@@ -37,9 +37,11 @@ if __name__ == "__main__":
# fit a CountVectorizerModel from the corpus.
cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
+
model = cv.fit(df)
+
result = model.transform(df)
- result.show()
+ result.show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/dct_example.py b/examples/src/main/python/ml/dct_example.py
index a4f25df784..c0457f8d0f 100644
--- a/examples/src/main/python/ml/dct_example.py
+++ b/examples/src/main/python/ml/dct_example.py
@@ -39,8 +39,7 @@ if __name__ == "__main__":
dctDf = dct.transform(df)
- for dcts in dctDf.select("featuresDCT").take(3):
- print(dcts)
+ dctDf.select("featuresDCT").show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py
index edc258de05..8ad450b669 100644
--- a/examples/src/main/python/ml/gaussian_mixture_example.py
+++ b/examples/src/main/python/ml/gaussian_mixture_example.py
@@ -38,11 +38,11 @@ if __name__ == "__main__":
# loads data
dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
- gmm = GaussianMixture().setK(2)
+ gmm = GaussianMixture().setK(2).setSeed(538009335L)
model = gmm.fit(dataset)
- print("Gaussians: ")
- model.gaussiansDF.show()
+ print("Gaussians shown as a DataFrame: ")
+ model.gaussiansDF.show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/index_to_string_example.py b/examples/src/main/python/ml/index_to_string_example.py
index 523caac00c..33d104e8e3 100644
--- a/examples/src/main/python/ml/index_to_string_example.py
+++ b/examples/src/main/python/ml/index_to_string_example.py
@@ -33,14 +33,22 @@ if __name__ == "__main__":
[(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
["id", "category"])
- stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
- model = stringIndexer.fit(df)
+ indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+ model = indexer.fit(df)
indexed = model.transform(df)
+ print("Transformed string column '%s' to indexed column '%s'"
+ % (indexer.getInputCol(), indexer.getOutputCol()))
+ indexed.show()
+
+ print("StringIndexer will store labels in output column metadata\n")
+
converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)
- converted.select("id", "originalCategory").show()
+ print("Transformed indexed column '%s' back to original string column '%s' using "
+ "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
+ converted.select("id", "categoryIndex", "originalCategory").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py
index a41b8ffacb..6ae15f1b4b 100644
--- a/examples/src/main/python/ml/isotonic_regression_example.py
+++ b/examples/src/main/python/ml/isotonic_regression_example.py
@@ -44,8 +44,8 @@ if __name__ == "__main__":
# Trains an isotonic regression model.
model = IsotonicRegression().fit(dataset)
- print("Boundaries in increasing order: " + str(model.boundaries))
- print("Predictions associated with the boundaries: " + str(model.predictions))
+ print("Boundaries in increasing order: %s\n" % str(model.boundaries))
+ print("Predictions associated with the boundaries: %s\n" % str(model.predictions))
# Makes predictions.
model.transform(dataset).show()
diff --git a/examples/src/main/python/ml/linear_regression_with_elastic_net.py b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
index 620ab5b87e..6639e9160a 100644
--- a/examples/src/main/python/ml/linear_regression_with_elastic_net.py
+++ b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
@@ -39,8 +39,16 @@ if __name__ == "__main__":
lrModel = lr.fit(training)
# Print the coefficients and intercept for linear regression
- print("Coefficients: " + str(lrModel.coefficients))
- print("Intercept: " + str(lrModel.intercept))
+ print("Coefficients: %s" % str(lrModel.coefficients))
+ print("Intercept: %s" % str(lrModel.intercept))
+
+ # Summarize the model over the training set and print out some metrics
+ trainingSummary = lrModel.summary
+ print("numIterations: %d" % trainingSummary.totalIterations)
+ print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
+ trainingSummary.residuals.show()
+ print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
+ print("r2: %f" % trainingSummary.r2)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/max_abs_scaler_example.py b/examples/src/main/python/ml/max_abs_scaler_example.py
index ab91198b08..45eda3cdad 100644
--- a/examples/src/main/python/ml/max_abs_scaler_example.py
+++ b/examples/src/main/python/ml/max_abs_scaler_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import MaxAbsScaler
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession
@@ -29,7 +30,11 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.createDataFrame([
+ (0, Vectors.dense([1.0, 0.1, -8.0]),),
+ (1, Vectors.dense([2.0, 1.0, -4.0]),),
+ (2, Vectors.dense([4.0, 10.0, 8.0]),)
+ ], ["id", "features"])
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
@@ -38,7 +43,8 @@ if __name__ == "__main__":
# rescale each feature to range [-1, 1].
scaledData = scalerModel.transform(dataFrame)
- scaledData.show()
+
+ scaledData.select("features", "scaledFeatures").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/min_max_scaler_example.py b/examples/src/main/python/ml/min_max_scaler_example.py
index e3e7bc205b..b5f272e59b 100644
--- a/examples/src/main/python/ml/min_max_scaler_example.py
+++ b/examples/src/main/python/ml/min_max_scaler_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import MinMaxScaler
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession
@@ -29,7 +30,11 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.createDataFrame([
+ (0, Vectors.dense([1.0, 0.1, -1.0]),),
+ (1, Vectors.dense([2.0, 1.1, 1.0]),),
+ (2, Vectors.dense([3.0, 10.1, 3.0]),)
+ ], ["id", "features"])
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
@@ -38,7 +43,8 @@ if __name__ == "__main__":
# rescale each feature to range [min, max].
scaledData = scalerModel.transform(dataFrame)
- scaledData.show()
+ print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
+ scaledData.select("features", "scaledFeatures").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py
index 2cc38c2855..88fc69f753 100644
--- a/examples/src/main/python/ml/multilayer_perceptron_classification.py
+++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py
@@ -52,7 +52,7 @@ if __name__ == "__main__":
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
- print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
+ print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py
index 55263adb46..31676e076a 100644
--- a/examples/src/main/python/ml/n_gram_example.py
+++ b/examples/src/main/python/ml/n_gram_example.py
@@ -33,13 +33,12 @@ if __name__ == "__main__":
(0, ["Hi", "I", "heard", "about", "Spark"]),
(1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
(2, ["Logistic", "regression", "models", "are", "neat"])
- ], ["label", "words"])
+ ], ["id", "words"])
- ngram = NGram(inputCol="words", outputCol="ngrams")
- ngramDataFrame = ngram.transform(wordDataFrame)
+ ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
- for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
- print(ngrams_label)
+ ngramDataFrame = ngram.transform(wordDataFrame)
+ ngramDataFrame.select("ngrams").show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py
index aa23f298c8..7290ab81cd 100644
--- a/examples/src/main/python/ml/naive_bayes_example.py
+++ b/examples/src/main/python/ml/naive_bayes_example.py
@@ -45,11 +45,15 @@ if __name__ == "__main__":
# train the model
model = nb.fit(train)
+ # select example rows to display.
+ predictions = model.transform(test)
+ predictions.show()
+
# compute accuracy on the test set
- result = model.transform(test)
- predictionAndLabels = result.select("prediction", "label")
- evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
- print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
+ evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
+ metricName="accuracy")
+ accuracy = evaluator.evaluate(predictions)
+ print("Test set accuracy = " + str(accuracy))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py
index 19012f51f4..510bd825fd 100644
--- a/examples/src/main/python/ml/normalizer_example.py
+++ b/examples/src/main/python/ml/normalizer_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import Normalizer
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession
@@ -29,15 +30,21 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.createDataFrame([
+ (0, Vectors.dense([1.0, 0.5, -1.0]),),
+ (1, Vectors.dense([2.0, 1.0, 1.0]),),
+ (2, Vectors.dense([4.0, 10.0, 2.0]),)
+ ], ["id", "features"])
# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)
+ print("Normalized using L^1 norm")
l1NormData.show()
# Normalize each Vector using $L^\infty$ norm.
lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
+ print("Normalized using L^inf norm")
lInfNormData.show()
# $example off$
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py
index 47faf8d202..e1996c7f0a 100644
--- a/examples/src/main/python/ml/onehot_encoder_example.py
+++ b/examples/src/main/python/ml/onehot_encoder_example.py
@@ -42,9 +42,9 @@ if __name__ == "__main__":
model = stringIndexer.fit(df)
indexed = model.transform(df)
- encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryVec")
+ encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
encoded = encoder.transform(indexed)
- encoded.select("id", "categoryVec").show()
+ encoded.show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py
index 2d0865578a..f63e4db434 100644
--- a/examples/src/main/python/ml/pipeline_example.py
+++ b/examples/src/main/python/ml/pipeline_example.py
@@ -60,9 +60,10 @@ if __name__ == "__main__":
# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
- selected = prediction.select("id", "text", "prediction")
+ selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
- print(row)
+ rid, text, prob, prediction = row
+ print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
index b464ee86b6..40bcb7b13a 100644
--- a/examples/src/main/python/ml/polynomial_expansion_example.py
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -31,16 +31,15 @@ if __name__ == "__main__":
# $example on$
df = spark.createDataFrame([
- (Vectors.dense([-2.0, 2.3]),),
+ (Vectors.dense([2.0, 1.0]),),
(Vectors.dense([0.0, 0.0]),),
- (Vectors.dense([0.6, -1.1]),)
+ (Vectors.dense([3.0, -1.0]),)
], ["features"])
- px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
- polyDF = px.transform(df)
+ polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
+ polyDF = polyExpansion.transform(df)
- for expanded in polyDF.select("polyFeatures").take(3):
- print(expanded)
+ polyDF.show(truncate=False)
# $example off$
spark.stop()
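
For context on the switch to show(truncate=False): expanding a d-dimensional vector to degree n yields (d+n choose n) - 1 features, so each 2-dimensional input above becomes a 9-element vector that the default show() would cut off. A quick sketch to confirm the size, assuming a SparkSession named spark is already in scope:

    from pyspark.ml.feature import PolynomialExpansion
    from pyspark.ml.linalg import Vectors

    df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]),)], ["features"])
    polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
    # A 2-dim input at degree 3 expands to (5 choose 3) - 1 = 9 terms.
    print(len(polyExpansion.transform(df).first()["polyFeatures"]))  # 9
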
diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py
index 8a8392cc1f..3b8e7855e3 100644
--- a/examples/src/main/python/ml/stopwords_remover_example.py
+++ b/examples/src/main/python/ml/stopwords_remover_example.py
@@ -32,7 +32,7 @@ if __name__ == "__main__":
sentenceData = spark.createDataFrame([
(0, ["I", "saw", "the", "red", "balloon"]),
(1, ["Mary", "had", "a", "little", "lamb"])
- ], ["label", "raw"])
+ ], ["id", "raw"])
remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
remover.transform(sentenceData).show(truncate=False)
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
index 4ab7eb6964..d43244fa68 100644
--- a/examples/src/main/python/ml/tf_idf_example.py
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -30,9 +30,9 @@ if __name__ == "__main__":
# $example on$
sentenceData = spark.createDataFrame([
- (0, "Hi I heard about Spark"),
- (0, "I wish Java could use case classes"),
- (1, "Logistic regression models are neat")
+ (0.0, "Hi I heard about Spark"),
+ (0.0, "I wish Java could use case classes"),
+ (1.0, "Logistic regression models are neat")
], ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
@@ -46,8 +46,7 @@ if __name__ == "__main__":
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
- for features_label in rescaledData.select("features", "label").take(3):
- print(features_label)
+ rescaledData.select("label", "features").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py
index 89f5060705..5c65c5c9f8 100644
--- a/examples/src/main/python/ml/tokenizer_example.py
+++ b/examples/src/main/python/ml/tokenizer_example.py
@@ -19,6 +19,8 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
+from pyspark.sql.functions import col, udf
+from pyspark.sql.types import IntegerType
# $example off$
from pyspark.sql import SparkSession
@@ -33,20 +35,22 @@ if __name__ == "__main__":
(0, "Hi I heard about Spark"),
(1, "I wish Java could use case classes"),
(2, "Logistic,regression,models,are,neat")
- ], ["label", "sentence"])
+ ], ["id", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps=False
+ countTokens = udf(lambda words: len(words), IntegerType())
+
tokenized = tokenizer.transform(sentenceDataFrame)
- for words_label in tokenized.select("words", "label").take(3):
- print(words_label)
+ tokenized.select("sentence", "words")\
+ .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
- for words_label in regexTokenized.select("words", "label").take(3):
- print(words_label)
+ regexTokenized.select("sentence", "words") \
+ .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
# $example off$
spark.stop()
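
The comment about pattern="\\w+", gaps=False points at the two equivalent ways of configuring RegexTokenizer: splitting on non-word characters versus matching runs of word characters. A small sketch of that equivalence, assuming a SparkSession named spark is in scope:

    from pyspark.ml.feature import RegexTokenizer

    df = spark.createDataFrame([(0, "Logistic,regression,models,are,neat")], ["id", "sentence"])
    # gaps=True (the default) treats the pattern as a delimiter ...
    splitter = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # ... while gaps=False treats it as the tokens themselves; both yield the same words here.
    matcher = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\w+", gaps=False)
    splitter.transform(df).select("words").show(truncate=False)
    matcher.transform(df).select("words").show(truncate=False)
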
diff --git a/examples/src/main/python/ml/train_validation_split.py b/examples/src/main/python/ml/train_validation_split.py
index a92b861f83..d104f7d30a 100644
--- a/examples/src/main/python/ml/train_validation_split.py
+++ b/examples/src/main/python/ml/train_validation_split.py
@@ -66,8 +66,9 @@ if __name__ == "__main__":
# Make predictions on test data. model is the model with combination of parameters
# that performed best.
- prediction = model.transform(test)
- for row in prediction.take(5):
- print(row)
+ model.transform(test)\
+ .select("features", "label", "prediction")\
+ .show()
+
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py
index eac33711ad..98de1d5ea7 100644
--- a/examples/src/main/python/ml/vector_assembler_example.py
+++ b/examples/src/main/python/ml/vector_assembler_example.py
@@ -39,7 +39,8 @@ if __name__ == "__main__":
outputCol="features")
output = assembler.transform(dataset)
- print(output.select("features", "clicked").first())
+ print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+ output.select("features", "clicked").show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py
index 3912c135be..5c2956077d 100644
--- a/examples/src/main/python/ml/vector_indexer_example.py
+++ b/examples/src/main/python/ml/vector_indexer_example.py
@@ -34,6 +34,10 @@ if __name__ == "__main__":
indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
indexerModel = indexer.fit(data)
+ categoricalFeatures = indexerModel.categoryMaps
+ print("Chose %d categorical features: %s" %
+ (len(categoricalFeatures), ", ".join(str(k) for k in categoricalFeatures.keys())))
+
# Create new column "indexed" with categorical values transformed to indices
indexedData = indexerModel.transform(data)
indexedData.show()
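
On the categoryMaps call added above: VectorIndexerModel.categoryMaps is a dict keyed by feature index, mapping each original value to its category index; features with more than maxCategories distinct values are left as continuous. A small sketch with hypothetical data, assuming a SparkSession named spark is in scope:

    from pyspark.ml.feature import VectorIndexer
    from pyspark.ml.linalg import Vectors

    # Feature 0 takes three distinct values, feature 1 only two.
    df = spark.createDataFrame([
        (Vectors.dense([-1.0, 0.0]),),
        (Vectors.dense([0.0, 1.0]),),
        (Vectors.dense([0.5, 1.0]),)
    ], ["features"])
    model = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=2).fit(df)
    # Only feature 1 fits within maxCategories, so only it appears in the map,
    # e.g. {1: {0.0: 0, 1.0: 1}}; feature 0 stays continuous.
    print(model.categoryMaps)
    model.transform(df).show()
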
diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py
index 78a91c92fc..77f8951df0 100644
--- a/examples/src/main/python/ml/word2vec_example.py
+++ b/examples/src/main/python/ml/word2vec_example.py
@@ -41,8 +41,9 @@ if __name__ == "__main__":
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
- for feature in result.select("result").take(3):
- print(feature)
+ for row in result.collect():
+ text, vector = row
+ print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))
# $example off$
spark.stop()
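
A related point on the vectors printed above: the transform averages the vectors of the words in each document, and the learned per-word vectors can be inspected separately. Assuming the fitted model from the example is still in scope:

    # model is the Word2VecModel fitted in the example above.
    model.getVectors().show(truncate=False)  # one row per vocabulary word with its vector
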
diff --git a/examples/src/main/python/pagerank.py b/examples/src/main/python/pagerank.py
index a399a9c37c..0d6c253d39 100755
--- a/examples/src/main/python/pagerank.py
+++ b/examples/src/main/python/pagerank.py
@@ -18,6 +18,9 @@
"""
This is an example implementation of PageRank. For more conventional use,
please refer to the PageRank implementation provided by GraphX
+
+Example Usage:
+bin/spark-submit examples/src/main/python/pagerank.py data/mllib/pagerank_data.txt 10
"""
from __future__ import print_function
@@ -46,8 +49,8 @@ if __name__ == "__main__":
print("Usage: pagerank <file> <iterations>", file=sys.stderr)
exit(-1)
- print("""WARN: This is a naive implementation of PageRank and is
- given as an example! Please refer to PageRank implementation provided by graphx""",
+ print("WARN: This is a naive implementation of PageRank and is given as an example!\n" +
+ "Please refer to PageRank implementation provided by graphx",
file=sys.stderr)
# Initialize the spark context.
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
index d0b874c48d..5d8831265e 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
@@ -31,6 +31,11 @@ import org.apache.spark.sql.SparkSession
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.graphx.lib.PageRank
+ *
+ * Example Usage:
+ * {{{
+ * bin/run-example SparkPageRank data/mllib/pagerank_data.txt 10
+ * }}}
*/
object SparkPageRank {
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
index b6d7b36916..cdb33f4d6d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
@@ -55,8 +55,9 @@ object AFTSurvivalRegressionExample {
val model = aft.fit(training)
// Print the coefficients, intercept and scale parameter for AFT survival regression
- println(s"Coefficients: ${model.coefficients} Intercept: " +
- s"${model.intercept} Scale: ${model.scale}")
+ println(s"Coefficients: ${model.coefficients}")
+ println(s"Intercept: ${model.intercept}")
+ println(s"Scale: ${model.scale}")
model.transform(training).show(false)
// $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
index 5cd13ad64c..a4f62e7871 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
@@ -29,9 +29,10 @@ object BinarizerExample {
.builder
.appName("BinarizerExample")
.getOrCreate()
+
// $example on$
val data = Array((0, 0.1), (1, 0.8), (2, 0.2))
- val dataFrame = spark.createDataFrame(data).toDF("label", "feature")
+ val dataFrame = spark.createDataFrame(data).toDF("id", "feature")
val binarizer: Binarizer = new Binarizer()
.setInputCol("feature")
@@ -39,8 +40,9 @@ object BinarizerExample {
.setThreshold(0.5)
val binarizedDataFrame = binarizer.transform(dataFrame)
- val binarizedFeatures = binarizedDataFrame.select("binarized_feature")
- binarizedFeatures.collect().foreach(println)
+
+ println(s"Binarizer output with Threshold = ${binarizer.getThreshold}")
+ binarizedDataFrame.show()
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
index 38cce34bb5..04e4eccd43 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
@@ -33,7 +33,7 @@ object BucketizerExample {
// $example on$
val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
- val data = Array(-0.5, -0.3, 0.0, 0.2)
+ val data = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9)
val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
val bucketizer = new Bucketizer()
@@ -43,8 +43,11 @@ object BucketizerExample {
// Transform original data into its bucket index.
val bucketedData = bucketizer.transform(dataFrame)
+
+ println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets")
bucketedData.show()
// $example off$
+
spark.stop()
}
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala
index c9394dd9c6..5638e66b87 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala
@@ -48,8 +48,11 @@ object ChiSqSelectorExample {
.setOutputCol("selectedFeatures")
val result = selector.fit(df).transform(df)
+
+ println(s"ChiSqSelector output with top ${selector.getNumTopFeatures} features selected")
result.show()
// $example off$
+
spark.stop()
}
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
index 988d8941a4..91d861dd43 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
@@ -49,7 +49,7 @@ object CountVectorizerExample {
.setInputCol("words")
.setOutputCol("features")
- cvModel.transform(df).select("features").show()
+ cvModel.transform(df).show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
index ddc6717528..3383171303 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
@@ -45,7 +45,7 @@ object DCTExample {
.setInverse(false)
val dctDf = dct.transform(df)
- dctDf.select("featuresDCT").show(3)
+ dctDf.select("featuresDCT").show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
index 26095b46f5..5e4bea4c4f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
@@ -49,8 +49,8 @@ object GaussianMixtureExample {
    // output parameters of the mixture model
for (i <- 0 until model.getK) {
- println("weight=%f\nmu=%s\nsigma=\n%s\n" format
- (model.weights(i), model.gaussians(i).mean, model.gaussians(i).cov))
+ println(s"Gaussian $i:\nweight=${model.weights(i)}\n" +
+ s"mu=${model.gaussians(i).mean}\nsigma=\n${model.gaussians(i).cov}\n")
}
// $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala
index 950733831c..2940682c32 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala
@@ -19,6 +19,7 @@
package org.apache.spark.examples.ml
// $example on$
+import org.apache.spark.ml.attribute.Attribute
import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
// $example off$
import org.apache.spark.sql.SparkSession
@@ -46,12 +47,23 @@ object IndexToStringExample {
.fit(df)
val indexed = indexer.transform(df)
+ println(s"Transformed string column '${indexer.getInputCol}' " +
+ s"to indexed column '${indexer.getOutputCol}'")
+ indexed.show()
+
+ val inputColSchema = indexed.schema(indexer.getOutputCol)
+ println(s"StringIndexer will store labels in output column metadata: " +
+ s"${Attribute.fromStructField(inputColSchema).toString}\n")
+
val converter = new IndexToString()
.setInputCol("categoryIndex")
.setOutputCol("originalCategory")
val converted = converter.transform(indexed)
- converted.select("id", "originalCategory").show()
+
+ println(s"Transformed indexed column '${converter.getInputCol}' back to original string " +
+ s"column '${converter.getOutputCol}' using labels in metadata")
+ converted.select("id", "categoryIndex", "originalCategory").show()
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala
index a840559d24..9bac16ec76 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala
@@ -47,8 +47,8 @@ object IsotonicRegressionExample {
val ir = new IsotonicRegression()
val model = ir.fit(dataset)
- println(s"Boundaries in increasing order: ${model.boundaries}")
- println(s"Predictions associated with the boundaries: ${model.predictions}")
+ println(s"Boundaries in increasing order: ${model.boundaries}\n")
+ println(s"Predictions associated with the boundaries: ${model.predictions}\n")
// Makes predictions.
model.transform(dataset).show()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala
index 94cf286623..4540a8d728 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala
@@ -50,7 +50,7 @@ object LinearRegressionWithElasticNetExample {
// Summarize the model over the training set and print out some metrics
val trainingSummary = lrModel.summary
println(s"numIterations: ${trainingSummary.totalIterations}")
- println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
+ println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
trainingSummary.residuals.show()
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"r2: ${trainingSummary.r2}")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala
index cd8775c942..1740a0d3f9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala
@@ -51,6 +51,7 @@ object LogisticRegressionSummaryExample {
// Obtain the objective per iteration.
val objectiveHistory = trainingSummary.objectiveHistory
+ println("objectiveHistory:")
objectiveHistory.foreach(loss => println(loss))
// Obtain the metrics useful to judge performance on test data.
@@ -61,7 +62,7 @@ object LogisticRegressionSummaryExample {
// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
val roc = binarySummary.roc
roc.show()
- println(binarySummary.areaUnderROC)
+ println(s"areaUnderROC: ${binarySummary.areaUnderROC}")
// Set the model threshold to maximize F-Measure
val fMeasure = binarySummary.fMeasureByThreshold
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala
index 572adce657..85d071369d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala
@@ -19,6 +19,7 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.MaxAbsScaler
+import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession
@@ -30,7 +31,12 @@ object MaxAbsScalerExample {
.getOrCreate()
// $example on$
- val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val dataFrame = spark.createDataFrame(Seq(
+ (0, Vectors.dense(1.0, 0.1, -8.0)),
+ (1, Vectors.dense(2.0, 1.0, -4.0)),
+ (2, Vectors.dense(4.0, 10.0, 8.0))
+ )).toDF("id", "features")
+
val scaler = new MaxAbsScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures")
@@ -40,7 +46,7 @@ object MaxAbsScalerExample {
// rescale each feature to range [-1, 1]
val scaledData = scalerModel.transform(dataFrame)
- scaledData.show()
+ scaledData.select("features", "scaledFeatures").show()
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
index d728019a62..9ee6d9b449 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.MinMaxScaler
+import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession
@@ -31,7 +32,11 @@ object MinMaxScalerExample {
.getOrCreate()
// $example on$
- val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val dataFrame = spark.createDataFrame(Seq(
+ (0, Vectors.dense(1.0, 0.1, -1.0)),
+ (1, Vectors.dense(2.0, 1.1, 1.0)),
+ (2, Vectors.dense(3.0, 10.1, 3.0))
+ )).toDF("id", "features")
val scaler = new MinMaxScaler()
.setInputCol("features")
@@ -42,7 +47,8 @@ object MinMaxScalerExample {
// rescale each feature to range [min, max].
val scaledData = scalerModel.transform(dataFrame)
- scaledData.show()
+ println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]")
+ scaledData.select("features", "scaledFeatures").show()
// $example off$
spark.stop()
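
The min/max printed by the new message are the target range of MinMaxScaler's per-feature rescaling, (x - E_min) / (E_max - E_min) * (max - min) + min, with defaults min = 0.0 and max = 1.0. A quick arithmetic check against the first feature of the data above (values 1.0, 2.0, 3.0), written here as plain Python:

    e_min, e_max, out_min, out_max = 1.0, 3.0, 0.0, 1.0
    for x in (1.0, 2.0, 3.0):
        # Expected scaled values: 0.0, 0.5, 1.0
        print((x - e_min) / (e_max - e_min) * (out_max - out_min) + out_min)
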
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
index a39e3202ba..6fce82d294 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
@@ -66,7 +66,7 @@ object MultilayerPerceptronClassifierExample {
val evaluator = new MulticlassClassificationEvaluator()
.setMetricName("accuracy")
- println("Accuracy: " + evaluator.evaluate(predictionAndLabels))
+ println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels))
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
index e0b52e7a36..d2183d6b49 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
@@ -35,11 +35,12 @@ object NGramExample {
(0, Array("Hi", "I", "heard", "about", "Spark")),
(1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
(2, Array("Logistic", "regression", "models", "are", "neat"))
- )).toDF("label", "words")
+ )).toDF("id", "words")
+
+ val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")
- val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
val ngramDataFrame = ngram.transform(wordDataFrame)
- ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println)
+ ngramDataFrame.select("ngrams").show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
index 3ae0623c4c..bd9fcc420a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
@@ -52,7 +52,7 @@ object NaiveBayesExample {
.setPredictionCol("prediction")
.setMetricName("accuracy")
val accuracy = evaluator.evaluate(predictions)
- println("Accuracy: " + accuracy)
+ println("Test set accuracy = " + accuracy)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
index 75ba33a7e7..989d250c17 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.Normalizer
+import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession
@@ -31,7 +32,11 @@ object NormalizerExample {
.getOrCreate()
// $example on$
- val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val dataFrame = spark.createDataFrame(Seq(
+ (0, Vectors.dense(1.0, 0.5, -1.0)),
+ (1, Vectors.dense(2.0, 1.0, 1.0)),
+ (2, Vectors.dense(4.0, 10.0, 2.0))
+ )).toDF("id", "features")
// Normalize each Vector using $L^1$ norm.
val normalizer = new Normalizer()
@@ -40,10 +45,12 @@ object NormalizerExample {
.setP(1.0)
val l1NormData = normalizer.transform(dataFrame)
+ println("Normalized using L^1 norm")
l1NormData.show()
// Normalize each Vector using $L^\infty$ norm.
val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
+ println("Normalized using L^inf norm")
lInfNormData.show()
// $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
index 4aa649b133..274cc1268f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
@@ -49,8 +49,9 @@ object OneHotEncoderExample {
val encoder = new OneHotEncoder()
.setInputCol("categoryIndex")
.setOutputCol("categoryVec")
+
val encoded = encoder.transform(indexed)
- encoded.select("id", "categoryVec").show()
+ encoded.show()
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
index acde110683..4ad6c7c3ef 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
@@ -69,7 +69,7 @@ object OneVsRestExample {
// compute the classification error on test data.
val accuracy = evaluator.evaluate(predictions)
- println(s"Test Error : ${1 - accuracy}")
+ println(s"Test Error = ${1 - accuracy}")
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
index dca96eea2b..4e1d7cdbab 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
@@ -38,14 +38,15 @@ object PCAExample {
Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
)
val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+
val pca = new PCA()
.setInputCol("features")
.setOutputCol("pcaFeatures")
.setK(3)
.fit(df)
- val pcaDF = pca.transform(df)
- val result = pcaDF.select("pcaFeatures")
- result.show()
+
+ val result = pca.transform(df).select("pcaFeatures")
+ result.show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
index 54d2e6b36d..f117b03ab2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
@@ -33,17 +33,19 @@ object PolynomialExpansionExample {
// $example on$
val data = Array(
- Vectors.dense(-2.0, 2.3),
+ Vectors.dense(2.0, 1.0),
Vectors.dense(0.0, 0.0),
- Vectors.dense(0.6, -1.1)
+ Vectors.dense(3.0, -1.0)
)
val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
- val polynomialExpansion = new PolynomialExpansion()
+
+ val polyExpansion = new PolynomialExpansion()
.setInputCol("features")
.setOutputCol("polyFeatures")
.setDegree(3)
- val polyDF = polynomialExpansion.transform(df)
- polyDF.select("polyFeatures").take(3).foreach(println)
+
+ val polyDF = polyExpansion.transform(df)
+ polyDF.show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
index a56de0856d..369a6fffd7 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
@@ -40,7 +40,7 @@ object StopWordsRemoverExample {
(1, Seq("Mary", "had", "a", "little", "lamb"))
)).toDF("id", "raw")
- remover.transform(dataSet).show()
+ remover.transform(dataSet).show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
index 97f6fcce15..ec2df2ef87 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
@@ -33,9 +33,9 @@ object TfIdfExample {
// $example on$
val sentenceData = spark.createDataFrame(Seq(
- (0, "Hi I heard about Spark"),
- (0, "I wish Java could use case classes"),
- (1, "Logistic regression models are neat")
+ (0.0, "Hi I heard about Spark"),
+ (0.0, "I wish Java could use case classes"),
+ (1.0, "Logistic regression models are neat")
)).toDF("label", "sentence")
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
@@ -51,7 +51,7 @@ object TfIdfExample {
val idfModel = idf.fit(featurizedData)
val rescaledData = idfModel.transform(featurizedData)
- rescaledData.select("features", "label").take(3).foreach(println)
+ rescaledData.select("label", "features").show()
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
index 90d0faaf47..0167dc3723 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
+import org.apache.spark.sql.functions._
// $example off$
import org.apache.spark.sql.SparkSession
@@ -35,7 +36,7 @@ object TokenizerExample {
(0, "Hi I heard about Spark"),
(1, "I wish Java could use case classes"),
(2, "Logistic,regression,models,are,neat")
- )).toDF("label", "sentence")
+ )).toDF("id", "sentence")
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
val regexTokenizer = new RegexTokenizer()
@@ -43,11 +44,15 @@ object TokenizerExample {
.setOutputCol("words")
.setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
+ val countTokens = udf { (words: Seq[String]) => words.length }
+
val tokenized = tokenizer.transform(sentenceDataFrame)
- tokenized.select("words", "label").take(3).foreach(println)
+ tokenized.select("sentence", "words")
+ .withColumn("tokens", countTokens(col("words"))).show(false)
val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
- regexTokenized.select("words", "label").take(3).foreach(println)
+ regexTokenized.select("sentence", "words")
+ .withColumn("tokens", countTokens(col("words"))).show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala
index 13c72f88cc..13b58d154b 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala
@@ -100,6 +100,7 @@ object UnaryTransformerExample {
val data = spark.range(0, 5).toDF("input")
.select(col("input").cast("double").as("input"))
val result = myTransformer.transform(data)
+ println("Transformed by adding constant value")
result.show()
// Save and load the Transformer.
@@ -109,6 +110,7 @@ object UnaryTransformerExample {
val sameTransformer = MyTransformer.load(dirName)
// Transform the data to show the results are identical.
+ println("Same transform applied from loaded model")
val sameResult = sameTransformer.transform(data)
sameResult.show()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
index 8910470c1c..3d5c7efb20 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
@@ -41,7 +41,8 @@ object VectorAssemblerExample {
.setOutputCol("features")
val output = assembler.transform(dataset)
- println(output.select("features", "clicked").first())
+ println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+ output.select("features", "clicked").show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
index 85dd5c2776..63a60912de 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
@@ -37,7 +37,10 @@ object VectorSlicerExample {
.getOrCreate()
// $example on$
- val data = Arrays.asList(Row(Vectors.dense(-2.0, 2.3, 0.0)))
+ val data = Arrays.asList(
+ Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))),
+ Row(Vectors.dense(-2.0, 2.3, 0.0))
+ )
val defaultAttr = NumericAttribute.defaultAttr
val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
@@ -51,7 +54,7 @@ object VectorSlicerExample {
// or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))
val output = slicer.transform(dataset)
- println(output.select("userFeatures", "features").first())
+ output.show(false)
// $example off$
spark.stop()
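
The sparse row added to this example encodes the same point as the dense one; only the storage differs. A small standalone check, written in the Python linalg API for brevity (the Scala Vectors factory offers the same two constructions):

    from pyspark.ml.linalg import Vectors

    sparse = Vectors.sparse(3, [0, 1], [-2.0, 2.3])  # size 3, indices [0, 1], values [-2.0, 2.3]
    dense = Vectors.dense([-2.0, 2.3, 0.0])
    # Same point, different storage: the sparse form simply omits the trailing zero.
    print(sparse.toArray().tolist() == dense.toArray().tolist())  # True
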
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
index 5c8bd19f20..4bcc6ac6a0 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
@@ -20,6 +20,8 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.Word2Vec
+import org.apache.spark.ml.linalg.Vector
+import org.apache.spark.sql.Row
// $example off$
import org.apache.spark.sql.SparkSession
@@ -47,7 +49,8 @@ object Word2VecExample {
val model = word2Vec.fit(documentDF)
val result = model.transform(documentDF)
- result.select("result").take(3).foreach(println)
+ result.collect().foreach { case Row(text: Seq[_], features: Vector) =>
+ println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n") }
// $example off$
spark.stop()