Diffstat (limited to 'examples')
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/JavaPageRank.java | 5
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java | 5
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java | 11
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java | 7
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java | 2
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java | 6
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java | 15
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java | 28
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java | 30
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java | 8
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java | 18
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java | 13
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java | 23
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java | 2
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java | 2
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java | 14
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java | 2
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java | 3
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java | 12
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java | 33
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java | 6
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java | 9
-rw-r--r--  examples/src/main/python/ml/binarizer_example.py | 10
-rw-r--r--  examples/src/main/python/ml/bucketizer_example.py | 4
-rw-r--r--  examples/src/main/python/ml/chisq_selector_example.py | 2
-rw-r--r--  examples/src/main/python/ml/count_vectorizer_example.py | 4
-rw-r--r--  examples/src/main/python/ml/dct_example.py | 3
-rw-r--r--  examples/src/main/python/ml/gaussian_mixture_example.py | 6
-rw-r--r--  examples/src/main/python/ml/index_to_string_example.py | 14
-rw-r--r--  examples/src/main/python/ml/isotonic_regression_example.py | 4
-rw-r--r--  examples/src/main/python/ml/linear_regression_with_elastic_net.py | 12
-rw-r--r--  examples/src/main/python/ml/max_abs_scaler_example.py | 10
-rw-r--r--  examples/src/main/python/ml/min_max_scaler_example.py | 10
-rw-r--r--  examples/src/main/python/ml/multilayer_perceptron_classification.py | 2
-rw-r--r--  examples/src/main/python/ml/n_gram_example.py | 9
-rw-r--r--  examples/src/main/python/ml/naive_bayes_example.py | 12
-rw-r--r--  examples/src/main/python/ml/normalizer_example.py | 9
-rw-r--r--  examples/src/main/python/ml/onehot_encoder_example.py | 4
-rw-r--r--  examples/src/main/python/ml/pipeline_example.py | 5
-rw-r--r--  examples/src/main/python/ml/polynomial_expansion_example.py | 11
-rw-r--r--  examples/src/main/python/ml/stopwords_remover_example.py | 2
-rw-r--r--  examples/src/main/python/ml/tf_idf_example.py | 9
-rw-r--r--  examples/src/main/python/ml/tokenizer_example.py | 14
-rw-r--r--  examples/src/main/python/ml/train_validation_split.py | 7
-rw-r--r--  examples/src/main/python/ml/vector_assembler_example.py | 3
-rw-r--r--  examples/src/main/python/ml/vector_indexer_example.py | 4
-rw-r--r--  examples/src/main/python/ml/word2vec_example.py | 5
-rwxr-xr-x  examples/src/main/python/pagerank.py | 7
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala | 5
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala | 5
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala | 8
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala | 5
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala | 3
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala | 14
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala | 3
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala | 10
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala | 10
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala | 7
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala | 9
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala | 3
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala | 7
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala | 12
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala | 8
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala | 11
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala | 3
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala | 7
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala | 5
82 files changed, 427 insertions(+), 188 deletions(-)
diff --git a/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java b/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java
index ed0bb87657..bcc493bdcb 100644
--- a/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java
+++ b/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java
@@ -45,6 +45,11 @@ import org.apache.spark.sql.SparkSession;
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.graphx.lib.PageRank
+ *
+ * Example Usage:
+ * <pre>
+ * bin/run-example JavaPageRank data/mllib/pagerank_data.txt 10
+ * </pre>
*/
public final class JavaPageRank {
private static final Pattern SPACES = Pattern.compile("\\s+");
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java
index 3f034588c9..7c741ff56e 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java
@@ -71,8 +71,9 @@ public class JavaAFTSurvivalRegressionExample {
AFTSurvivalRegressionModel model = aft.fit(training);
// Print the coefficients, intercept and scale parameter for AFT survival regression
- System.out.println("Coefficients: " + model.coefficients() + " Intercept: "
- + model.intercept() + " Scale: " + model.scale());
+ System.out.println("Coefficients: " + model.coefficients());
+ System.out.println("Intercept: " + model.intercept());
+ System.out.println("Scale: " + model.scale());
model.transform(training).show(false);
// $example off$
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
index a954dbd20c..3090d8fd14 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
@@ -51,17 +51,18 @@ public class JavaBinarizerExample {
new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
});
Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);
+
Binarizer binarizer = new Binarizer()
.setInputCol("feature")
.setOutputCol("binarized_feature")
.setThreshold(0.5);
+
Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);
- Dataset<Row> binarizedFeatures = binarizedDataFrame.select("binarized_feature");
- for (Row r : binarizedFeatures.collectAsList()) {
- Double binarized_value = r.getDouble(0);
- System.out.println(binarized_value);
- }
+
+ System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
+ binarizedDataFrame.show();
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
index 691df3887a..f009938333 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
@@ -44,10 +44,12 @@ public class JavaBucketizerExample {
double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};
List<Row> data = Arrays.asList(
+ RowFactory.create(-999.9),
RowFactory.create(-0.5),
RowFactory.create(-0.3),
RowFactory.create(0.0),
- RowFactory.create(0.2)
+ RowFactory.create(0.2),
+ RowFactory.create(999.9)
);
StructType schema = new StructType(new StructField[]{
new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
@@ -61,8 +63,11 @@ public class JavaBucketizerExample {
// Transform original data into its bucket index.
Dataset<Row> bucketedData = bucketizer.transform(dataFrame);
+
+ System.out.println("Bucketizer output with " + (bucketizer.getSplits().length-1) + " buckets");
bucketedData.show();
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java
index fcf90d8d18..73738966b1 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java
@@ -63,7 +63,11 @@ public class JavaChiSqSelectorExample {
.setOutputCol("selectedFeatures");
Dataset<Row> result = selector.fit(df).transform(df);
+
+ System.out.println("ChiSqSelector output with top " + selector.getNumTopFeatures()
+ + " features selected");
result.show();
+
// $example off$
spark.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
index 0a6b136014..ac2a86c30b 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
@@ -61,7 +61,7 @@ public class JavaCountVectorizerExample {
.setInputCol("text")
.setOutputCol("feature");
- cvModel.transform(df).show();
+ cvModel.transform(df).show(false);
// $example off$
spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
index 66ce23b49d..04546d29fa 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
@@ -51,13 +51,17 @@ public class JavaDCTExample {
new StructField("features", new VectorUDT(), false, Metadata.empty()),
});
Dataset<Row> df = spark.createDataFrame(data, schema);
+
DCT dct = new DCT()
.setInputCol("features")
.setOutputCol("featuresDCT")
.setInverse(false);
+
Dataset<Row> dctDf = dct.transform(df);
- dctDf.select("featuresDCT").show(3);
+
+ dctDf.select("featuresDCT").show(false);
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java
index 526bed93fb..72bd5d0395 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java
@@ -54,8 +54,8 @@ public class JavaGaussianMixtureExample {
// Output the parameters of the mixture model
for (int i = 0; i < model.getK(); i++) {
- System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
- model.weights()[i], model.gaussians()[i].mean(), model.gaussians()[i].cov());
+ System.out.printf("Gaussian %d:\nweight=%f\nmu=%s\nsigma=\n%s\n\n",
+ i, model.weights()[i], model.gaussians()[i].mean(), model.gaussians()[i].cov());
}
// $example off$
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java
index 0064beb8c8..6965512f93 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java
@@ -24,6 +24,7 @@ import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.List;
+import org.apache.spark.ml.attribute.Attribute;
import org.apache.spark.ml.feature.IndexToString;
import org.apache.spark.ml.feature.StringIndexer;
import org.apache.spark.ml.feature.StringIndexerModel;
@@ -63,11 +64,23 @@ public class JavaIndexToStringExample {
.fit(df);
Dataset<Row> indexed = indexer.transform(df);
+ System.out.println("Transformed string column '" + indexer.getInputCol() + "' " +
+ "to indexed column '" + indexer.getOutputCol() + "'");
+ indexed.show();
+
+ StructField inputColSchema = indexed.schema().apply(indexer.getOutputCol());
+ System.out.println("StringIndexer will store labels in output column metadata: " +
+ Attribute.fromStructField(inputColSchema).toString() + "\n");
+
IndexToString converter = new IndexToString()
.setInputCol("categoryIndex")
.setOutputCol("originalCategory");
Dataset<Row> converted = converter.transform(indexed);
- converted.select("id", "originalCategory").show();
+
+ System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to " +
+ "original string column '" + converter.getOutputCol() + "' using labels in metadata");
+ converted.select("id", "categoryIndex", "originalCategory").show();
+
// $example off$
spark.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java
index 0ec17b0471..a7de8e699c 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java
@@ -50,8 +50,8 @@ public class JavaIsotonicRegressionExample {
IsotonicRegression ir = new IsotonicRegression();
IsotonicRegressionModel model = ir.fit(dataset);
- System.out.println("Boundaries in increasing order: " + model.boundaries());
- System.out.println("Predictions associated with the boundaries: " + model.predictions());
+ System.out.println("Boundaries in increasing order: " + model.boundaries() + "\n");
+ System.out.println("Predictions associated with the boundaries: " + model.predictions() + "\n");
// Makes predictions.
model.transform(dataset).show();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java
index 9a27b0e9e2..9f1ce463cf 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java
@@ -18,10 +18,20 @@
package org.apache.spark.examples.ml;
// $example on$
+import java.util.Arrays;
+import java.util.List;
+
import org.apache.spark.ml.feature.MaxAbsScaler;
import org.apache.spark.ml.feature.MaxAbsScalerModel;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
// $example off$
import org.apache.spark.sql.SparkSession;
@@ -34,10 +44,17 @@ public class JavaMaxAbsScalerExample {
.getOrCreate();
// $example on$
- Dataset<Row> dataFrame = spark
- .read()
- .format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt");
+ List<Row> data = Arrays.asList(
+ RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
+ RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
+ RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
+ );
+ StructType schema = new StructType(new StructField[]{
+ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+ new StructField("features", new VectorUDT(), false, Metadata.empty())
+ });
+ Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
+
MaxAbsScaler scaler = new MaxAbsScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures");
@@ -47,8 +64,9 @@ public class JavaMaxAbsScalerExample {
// rescale each feature to range [-1, 1].
Dataset<Row> scaledData = scalerModel.transform(dataFrame);
- scaledData.show();
+ scaledData.select("features", "scaledFeatures").show();
// $example off$
+
spark.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
index 37fa1c5434..2757af8d24 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
@@ -20,10 +20,20 @@ package org.apache.spark.examples.ml;
import org.apache.spark.sql.SparkSession;
// $example on$
+import java.util.Arrays;
+import java.util.List;
+
import org.apache.spark.ml.feature.MinMaxScaler;
import org.apache.spark.ml.feature.MinMaxScalerModel;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
// $example off$
public class JavaMinMaxScalerExample {
@@ -34,10 +44,17 @@ public class JavaMinMaxScalerExample {
.getOrCreate();
// $example on$
- Dataset<Row> dataFrame = spark
- .read()
- .format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt");
+ List<Row> data = Arrays.asList(
+ RowFactory.create(0, Vectors.dense(1.0, 0.1, -1.0)),
+ RowFactory.create(1, Vectors.dense(2.0, 1.1, 1.0)),
+ RowFactory.create(2, Vectors.dense(3.0, 10.1, 3.0))
+ );
+ StructType schema = new StructType(new StructField[]{
+ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+ new StructField("features", new VectorUDT(), false, Metadata.empty())
+ });
+ Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
+
MinMaxScaler scaler = new MinMaxScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures");
@@ -47,8 +64,11 @@ public class JavaMinMaxScalerExample {
// rescale each feature to range [min, max].
Dataset<Row> scaledData = scalerModel.transform(dataFrame);
- scaledData.show();
+ System.out.println("Features scaled to range: [" + scaler.getMin() + ", "
+ + scaler.getMax() + "]");
+ scaledData.select("features", "scaledFeatures").show();
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
index 0f1d9c2634..43db41ce17 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
@@ -41,28 +41,34 @@ public class JavaMultilayerPerceptronClassifierExample {
// Load training data
String path = "data/mllib/sample_multiclass_classification_data.txt";
Dataset<Row> dataFrame = spark.read().format("libsvm").load(path);
+
// Split the data into train and test
Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
Dataset<Row> train = splits[0];
Dataset<Row> test = splits[1];
+
// specify layers for the neural network:
// input layer of size 4 (features), two intermediate of size 5 and 4
// and output of size 3 (classes)
int[] layers = new int[] {4, 5, 4, 3};
+
// create the trainer and set its parameters
MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier()
.setLayers(layers)
.setBlockSize(128)
.setSeed(1234L)
.setMaxIter(100);
+
// train the model
MultilayerPerceptronClassificationModel model = trainer.fit(train);
+
// compute accuracy on the test set
Dataset<Row> result = model.transform(test);
Dataset<Row> predictionAndLabels = result.select("prediction", "label");
MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
.setMetricName("accuracy");
- System.out.println("Accuracy = " + evaluator.evaluate(predictionAndLabels));
+
+ System.out.println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels));
// $example off$
spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
index 899815f57c..5427e46665 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
@@ -42,29 +42,25 @@ public class JavaNGramExample {
// $example on$
List<Row> data = Arrays.asList(
- RowFactory.create(0.0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
- RowFactory.create(1.0, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
- RowFactory.create(2.0, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
+ RowFactory.create(0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
+ RowFactory.create(1, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
+ RowFactory.create(2, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
);
StructType schema = new StructType(new StructField[]{
- new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField(
"words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
});
Dataset<Row> wordDataFrame = spark.createDataFrame(data, schema);
- NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams");
+ NGram ngramTransformer = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams");
Dataset<Row> ngramDataFrame = ngramTransformer.transform(wordDataFrame);
-
- for (Row r : ngramDataFrame.select("ngrams", "label").takeAsList(3)) {
- java.util.List<String> ngrams = r.getList(0);
- for (String ngram : ngrams) System.out.print(ngram + " --- ");
- System.out.println();
- }
+ ngramDataFrame.select("ngrams").show(false);
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java
index 3226d5d2fa..be578dc811 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java
@@ -48,14 +48,21 @@ public class JavaNaiveBayesExample {
// create the trainer and set its parameters
NaiveBayes nb = new NaiveBayes();
+
// train the model
NaiveBayesModel model = nb.fit(train);
+
+ // Select example rows to display.
+ Dataset<Row> predictions = model.transform(test);
+ predictions.show();
+
// compute accuracy on the test set
- Dataset<Row> result = model.transform(test);
- Dataset<Row> predictionAndLabels = result.select("prediction", "label");
MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
+ .setLabelCol("label")
+ .setPredictionCol("prediction")
.setMetricName("accuracy");
- System.out.println("Accuracy = " + evaluator.evaluate(predictionAndLabels));
+ double accuracy = evaluator.evaluate(predictions);
+ System.out.println("Test set accuracy = " + accuracy);
// $example off$
spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
index abc38f85ea..f878c420d8 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
@@ -20,9 +20,19 @@ package org.apache.spark.examples.ml;
import org.apache.spark.sql.SparkSession;
// $example on$
+import java.util.Arrays;
+import java.util.List;
+
import org.apache.spark.ml.feature.Normalizer;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
// $example off$
public class JavaNormalizerExample {
@@ -33,8 +43,16 @@ public class JavaNormalizerExample {
.getOrCreate();
// $example on$
- Dataset<Row> dataFrame =
- spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ List<Row> data = Arrays.asList(
+ RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
+ RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
+ RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
+ );
+ StructType schema = new StructType(new StructField[]{
+ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+ new StructField("features", new VectorUDT(), false, Metadata.empty())
+ });
+ Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
// Normalize each Vector using $L^1$ norm.
Normalizer normalizer = new Normalizer()
@@ -50,6 +68,7 @@ public class JavaNormalizerExample {
normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
lInfNormData.show();
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
index a15e5f84a1..99af37676b 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
@@ -68,9 +68,11 @@ public class JavaOneHotEncoderExample {
OneHotEncoder encoder = new OneHotEncoder()
.setInputCol("categoryIndex")
.setOutputCol("categoryVec");
+
Dataset<Row> encoded = encoder.transform(indexed);
- encoded.select("id", "categoryVec").show();
+ encoded.show();
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
index c6a083ddc9..82fb540950 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
@@ -75,7 +75,7 @@ public class JavaOneVsRestExample {
// compute the classification error on test data.
double accuracy = evaluator.evaluate(predictions);
- System.out.println("Test Error : " + (1 - accuracy));
+ System.out.println("Test Error = " + (1 - accuracy));
// $example off$
spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
index d597a9a2ed..6951a65553 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
@@ -62,7 +62,7 @@ public class JavaPCAExample {
.fit(df);
Dataset<Row> result = pca.transform(df).select("pcaFeatures");
- result.show();
+ result.show(false);
// $example off$
spark.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
index 67180df65c..43c636c534 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
@@ -48,23 +48,19 @@ public class JavaPolynomialExpansionExample {
.setDegree(3);
List<Row> data = Arrays.asList(
- RowFactory.create(Vectors.dense(-2.0, 2.3)),
+ RowFactory.create(Vectors.dense(2.0, 1.0)),
RowFactory.create(Vectors.dense(0.0, 0.0)),
- RowFactory.create(Vectors.dense(0.6, -1.1))
+ RowFactory.create(Vectors.dense(3.0, -1.0))
);
-
StructType schema = new StructType(new StructField[]{
new StructField("features", new VectorUDT(), false, Metadata.empty()),
});
-
Dataset<Row> df = spark.createDataFrame(data, schema);
- Dataset<Row> polyDF = polyExpansion.transform(df);
- List<Row> rows = polyDF.select("polyFeatures").takeAsList(3);
- for (Row r : rows) {
- System.out.println(r.get(0));
- }
+ Dataset<Row> polyDF = polyExpansion.transform(df);
+ polyDF.show(false);
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
index 278cce0842..94ead625b4 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
@@ -57,7 +57,7 @@ public class JavaStopWordsRemoverExample {
});
Dataset<Row> dataset = spark.createDataFrame(data, schema);
- remover.transform(dataset).show();
+ remover.transform(dataset).show(false);
// $example off$
spark.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
index 7533c1835e..cf9747a994 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
@@ -54,12 +54,15 @@ public class JavaStringIndexerExample {
createStructField("category", StringType, false)
});
Dataset<Row> df = spark.createDataFrame(data, schema);
+
StringIndexer indexer = new StringIndexer()
.setInputCol("category")
.setOutputCol("categoryIndex");
+
Dataset<Row> indexed = indexer.fit(df).transform(df);
indexed.show();
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
index 800e42c949..b740cd097a 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
@@ -25,7 +25,6 @@ import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.IDF;
import org.apache.spark.ml.feature.IDFModel;
import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -54,25 +53,24 @@ public class JavaTfIdfExample {
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
});
Dataset<Row> sentenceData = spark.createDataFrame(data, schema);
+
Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
Dataset<Row> wordsData = tokenizer.transform(sentenceData);
+
int numFeatures = 20;
HashingTF hashingTF = new HashingTF()
.setInputCol("words")
.setOutputCol("rawFeatures")
.setNumFeatures(numFeatures);
+
Dataset<Row> featurizedData = hashingTF.transform(wordsData);
// alternatively, CountVectorizer can also be used to get term frequency vectors
IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
IDFModel idfModel = idf.fit(featurizedData);
+
Dataset<Row> rescaledData = idfModel.transform(featurizedData);
- for (Row r : rescaledData.select("features", "label").takeAsList(3)) {
- Vector features = r.getAs(0);
- Double label = r.getDouble(1);
- System.out.println(features);
- System.out.println(label);
- }
+ rescaledData.select("label", "features").show();
// $example off$
spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
index a206cef4c2..101a4df779 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
@@ -23,8 +23,11 @@ import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.List;
+import scala.collection.mutable.WrappedArray;
+
import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.ml.feature.Tokenizer;
+import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -34,6 +37,12 @@ import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
// $example off$
+// $example on:untyped_ops$
+// col("...") is preferable to df.col("...")
+import static org.apache.spark.sql.functions.callUDF;
+import static org.apache.spark.sql.functions.col;
+// $example off:untyped_ops$
+
public class JavaTokenizerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
@@ -49,7 +58,7 @@ public class JavaTokenizerExample {
);
StructType schema = new StructType(new StructField[]{
- new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
+ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
});
@@ -62,20 +71,22 @@ public class JavaTokenizerExample {
.setOutputCol("words")
.setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
+ spark.udf().register("countTokens", new UDF1<WrappedArray, Integer>() {
+ @Override
+ public Integer call(WrappedArray words) {
+ return words.size();
+ }
+ }, DataTypes.IntegerType);
+
Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
- for (Row r : tokenized.select("words", "label").takeAsList(3)) {
- java.util.List<String> words = r.getList(0);
- for (String word : words) System.out.print(word + " ");
- System.out.println();
- }
+ tokenized.select("sentence", "words")
+ .withColumn("tokens", callUDF("countTokens", col("words"))).show(false);
Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
- for (Row r : regexTokenized.select("words", "label").takeAsList(3)) {
- java.util.List<String> words = r.getList(0);
- for (String word : words) System.out.print(word + " ");
- System.out.println();
- }
+ regexTokenized.select("sentence", "words")
+ .withColumn("tokens", callUDF("countTokens", col("words"))).show(false);
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
index 9bb0f93d3a..384e09c73b 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
@@ -29,7 +29,6 @@ import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.*;
-
import static org.apache.spark.sql.types.DataTypes.*;
// $example off$
@@ -56,8 +55,11 @@ public class JavaVectorAssemblerExample {
.setOutputCol("features");
Dataset<Row> output = assembler.transform(dataset);
- System.out.println(output.select("features", "clicked").first());
+ System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
+ "'features'");
+ output.select("features", "clicked").show(false);
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
index 19b8bc83be..1922514c87 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
@@ -65,9 +65,9 @@ public class JavaVectorSlicerExample {
// or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"})
Dataset<Row> output = vectorSlicer.transform(dataset);
-
- System.out.println(output.select("userFeatures", "features").first());
+ output.show(false);
// $example off$
+
spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
index 9be6e6353a..fc9b459688 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
@@ -23,6 +23,7 @@ import java.util.List;
import org.apache.spark.ml.feature.Word2Vec;
import org.apache.spark.ml.feature.Word2VecModel;
+import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -55,10 +56,14 @@ public class JavaWord2VecExample {
.setOutputCol("result")
.setVectorSize(3)
.setMinCount(0);
+
Word2VecModel model = word2Vec.fit(documentDF);
Dataset<Row> result = model.transform(documentDF);
- for (Row r : result.select("result").takeAsList(3)) {
- System.out.println(r);
+
+ for (Row row : result.collectAsList()) {
+ List<String> text = row.getList(0);
+ Vector vector = (Vector) row.get(1);
+ System.out.println("Text: " + text + " => \nVector: " + vector + "\n");
}
// $example off$
diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py
index 4224a27dbe..669bb2aeab 100644
--- a/examples/src/main/python/ml/binarizer_example.py
+++ b/examples/src/main/python/ml/binarizer_example.py
@@ -33,12 +33,14 @@ if __name__ == "__main__":
(0, 0.1),
(1, 0.8),
(2, 0.2)
- ], ["label", "feature"])
+ ], ["id", "feature"])
+
binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
+
binarizedDataFrame = binarizer.transform(continuousDataFrame)
- binarizedFeatures = binarizedDataFrame.select("binarized_feature")
- for binarized_feature, in binarizedFeatures.collect():
- print(binarized_feature)
+
+ print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
+ binarizedDataFrame.show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py
index 8177e560dd..742f35093b 100644
--- a/examples/src/main/python/ml/bucketizer_example.py
+++ b/examples/src/main/python/ml/bucketizer_example.py
@@ -31,13 +31,15 @@ if __name__ == "__main__":
# $example on$
splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
- data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
+ data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
dataFrame = spark.createDataFrame(data, ["features"])
bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
# Transform original data into its bucket index.
bucketedData = bucketizer.transform(dataFrame)
+
+ print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
bucketedData.show()
# $example off$
diff --git a/examples/src/main/python/ml/chisq_selector_example.py b/examples/src/main/python/ml/chisq_selector_example.py
index 5e19ef1624..028a9ea9d6 100644
--- a/examples/src/main/python/ml/chisq_selector_example.py
+++ b/examples/src/main/python/ml/chisq_selector_example.py
@@ -39,6 +39,8 @@ if __name__ == "__main__":
outputCol="selectedFeatures", labelCol="clicked")
result = selector.fit(df).transform(df)
+
+ print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show()
# $example off$
diff --git a/examples/src/main/python/ml/count_vectorizer_example.py b/examples/src/main/python/ml/count_vectorizer_example.py
index 38cfac82fb..f2e41db77d 100644
--- a/examples/src/main/python/ml/count_vectorizer_example.py
+++ b/examples/src/main/python/ml/count_vectorizer_example.py
@@ -37,9 +37,11 @@ if __name__ == "__main__":
# fit a CountVectorizerModel from the corpus.
cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
+
model = cv.fit(df)
+
result = model.transform(df)
- result.show()
+ result.show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/dct_example.py b/examples/src/main/python/ml/dct_example.py
index a4f25df784..c0457f8d0f 100644
--- a/examples/src/main/python/ml/dct_example.py
+++ b/examples/src/main/python/ml/dct_example.py
@@ -39,8 +39,7 @@ if __name__ == "__main__":
dctDf = dct.transform(df)
- for dcts in dctDf.select("featuresDCT").take(3):
- print(dcts)
+ dctDf.select("featuresDCT").show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py
index edc258de05..8ad450b669 100644
--- a/examples/src/main/python/ml/gaussian_mixture_example.py
+++ b/examples/src/main/python/ml/gaussian_mixture_example.py
@@ -38,11 +38,11 @@ if __name__ == "__main__":
# loads data
dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
- gmm = GaussianMixture().setK(2)
+ gmm = GaussianMixture().setK(2).setSeed(538009335L)
model = gmm.fit(dataset)
- print("Gaussians: ")
- model.gaussiansDF.show()
+ print("Gaussians shown as a DataFrame: ")
+ model.gaussiansDF.show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/index_to_string_example.py b/examples/src/main/python/ml/index_to_string_example.py
index 523caac00c..33d104e8e3 100644
--- a/examples/src/main/python/ml/index_to_string_example.py
+++ b/examples/src/main/python/ml/index_to_string_example.py
@@ -33,14 +33,22 @@ if __name__ == "__main__":
[(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
["id", "category"])
- stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
- model = stringIndexer.fit(df)
+ indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+ model = indexer.fit(df)
indexed = model.transform(df)
+ print("Transformed string column '%s' to indexed column '%s'"
+ % (indexer.getInputCol(), indexer.getOutputCol()))
+ indexed.show()
+
+ print("StringIndexer will store labels in output column metadata\n")
+
converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)
- converted.select("id", "originalCategory").show()
+ print("Transformed indexed column '%s' back to original string column '%s' using "
+ "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
+ converted.select("id", "categoryIndex", "originalCategory").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py
index a41b8ffacb..6ae15f1b4b 100644
--- a/examples/src/main/python/ml/isotonic_regression_example.py
+++ b/examples/src/main/python/ml/isotonic_regression_example.py
@@ -44,8 +44,8 @@ if __name__ == "__main__":
# Trains an isotonic regression model.
model = IsotonicRegression().fit(dataset)
- print("Boundaries in increasing order: " + str(model.boundaries))
- print("Predictions associated with the boundaries: " + str(model.predictions))
+ print("Boundaries in increasing order: %s\n" % str(model.boundaries))
+ print("Predictions associated with the boundaries: %s\n" % str(model.predictions))
# Makes predictions.
model.transform(dataset).show()
diff --git a/examples/src/main/python/ml/linear_regression_with_elastic_net.py b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
index 620ab5b87e..6639e9160a 100644
--- a/examples/src/main/python/ml/linear_regression_with_elastic_net.py
+++ b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
@@ -39,8 +39,16 @@ if __name__ == "__main__":
lrModel = lr.fit(training)
# Print the coefficients and intercept for linear regression
- print("Coefficients: " + str(lrModel.coefficients))
- print("Intercept: " + str(lrModel.intercept))
+ print("Coefficients: %s" % str(lrModel.coefficients))
+ print("Intercept: %s" % str(lrModel.intercept))
+
+ # Summarize the model over the training set and print out some metrics
+ trainingSummary = lrModel.summary
+ print("numIterations: %d" % trainingSummary.totalIterations)
+ print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
+ trainingSummary.residuals.show()
+ print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
+ print("r2: %f" % trainingSummary.r2)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/max_abs_scaler_example.py b/examples/src/main/python/ml/max_abs_scaler_example.py
index ab91198b08..45eda3cdad 100644
--- a/examples/src/main/python/ml/max_abs_scaler_example.py
+++ b/examples/src/main/python/ml/max_abs_scaler_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import MaxAbsScaler
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession
@@ -29,7 +30,11 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.createDataFrame([
+ (0, Vectors.dense([1.0, 0.1, -8.0]),),
+ (1, Vectors.dense([2.0, 1.0, -4.0]),),
+ (2, Vectors.dense([4.0, 10.0, 8.0]),)
+ ], ["id", "features"])
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
@@ -38,7 +43,8 @@ if __name__ == "__main__":
# rescale each feature to range [-1, 1].
scaledData = scalerModel.transform(dataFrame)
- scaledData.show()
+
+ scaledData.select("features", "scaledFeatures").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/min_max_scaler_example.py b/examples/src/main/python/ml/min_max_scaler_example.py
index e3e7bc205b..b5f272e59b 100644
--- a/examples/src/main/python/ml/min_max_scaler_example.py
+++ b/examples/src/main/python/ml/min_max_scaler_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import MinMaxScaler
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession
@@ -29,7 +30,11 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.createDataFrame([
+ (0, Vectors.dense([1.0, 0.1, -1.0]),),
+ (1, Vectors.dense([2.0, 1.1, 1.0]),),
+ (2, Vectors.dense([3.0, 10.1, 3.0]),)
+ ], ["id", "features"])
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
@@ -38,7 +43,8 @@ if __name__ == "__main__":
# rescale each feature to range [min, max].
scaledData = scalerModel.transform(dataFrame)
- scaledData.show()
+ print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
+ scaledData.select("features", "scaledFeatures").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py
index 2cc38c2855..88fc69f753 100644
--- a/examples/src/main/python/ml/multilayer_perceptron_classification.py
+++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py
@@ -52,7 +52,7 @@ if __name__ == "__main__":
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
- print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
+ print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py
index 55263adb46..31676e076a 100644
--- a/examples/src/main/python/ml/n_gram_example.py
+++ b/examples/src/main/python/ml/n_gram_example.py
@@ -33,13 +33,12 @@ if __name__ == "__main__":
(0, ["Hi", "I", "heard", "about", "Spark"]),
(1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
(2, ["Logistic", "regression", "models", "are", "neat"])
- ], ["label", "words"])
+ ], ["id", "words"])
- ngram = NGram(inputCol="words", outputCol="ngrams")
- ngramDataFrame = ngram.transform(wordDataFrame)
+ ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
- for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
- print(ngrams_label)
+ ngramDataFrame = ngram.transform(wordDataFrame)
+ ngramDataFrame.select("ngrams").show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py
index aa23f298c8..7290ab81cd 100644
--- a/examples/src/main/python/ml/naive_bayes_example.py
+++ b/examples/src/main/python/ml/naive_bayes_example.py
@@ -45,11 +45,15 @@ if __name__ == "__main__":
# train the model
model = nb.fit(train)
+ # select example rows to display.
+ predictions = model.transform(test)
+ predictions.show()
+
# compute accuracy on the test set
- result = model.transform(test)
- predictionAndLabels = result.select("prediction", "label")
- evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
- print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
+ evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
+ metricName="accuracy")
+ accuracy = evaluator.evaluate(predictions)
+ print("Test set accuracy = " + str(accuracy))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py
index 19012f51f4..510bd825fd 100644
--- a/examples/src/main/python/ml/normalizer_example.py
+++ b/examples/src/main/python/ml/normalizer_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import Normalizer
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession
@@ -29,15 +30,21 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.createDataFrame([
+ (0, Vectors.dense([1.0, 0.5, -1.0]),),
+ (1, Vectors.dense([2.0, 1.0, 1.0]),),
+ (2, Vectors.dense([4.0, 10.0, 2.0]),)
+ ], ["id", "features"])
# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)
+ print("Normalized using L^1 norm")
l1NormData.show()
# Normalize each Vector using $L^\infty$ norm.
lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
+ print("Normalized using L^inf norm")
lInfNormData.show()
# $example off$
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py
index 47faf8d202..e1996c7f0a 100644
--- a/examples/src/main/python/ml/onehot_encoder_example.py
+++ b/examples/src/main/python/ml/onehot_encoder_example.py
@@ -42,9 +42,9 @@ if __name__ == "__main__":
model = stringIndexer.fit(df)
indexed = model.transform(df)
- encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryVec")
+ encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
encoded = encoder.transform(indexed)
- encoded.select("id", "categoryVec").show()
+ encoded.show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py
index 2d0865578a..f63e4db434 100644
--- a/examples/src/main/python/ml/pipeline_example.py
+++ b/examples/src/main/python/ml/pipeline_example.py
@@ -60,9 +60,10 @@ if __name__ == "__main__":
# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
- selected = prediction.select("id", "text", "prediction")
+ selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
- print(row)
+ rid, text, prob, prediction = row
+ print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
index b464ee86b6..40bcb7b13a 100644
--- a/examples/src/main/python/ml/polynomial_expansion_example.py
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -31,16 +31,15 @@ if __name__ == "__main__":
# $example on$
df = spark.createDataFrame([
- (Vectors.dense([-2.0, 2.3]),),
+ (Vectors.dense([2.0, 1.0]),),
(Vectors.dense([0.0, 0.0]),),
- (Vectors.dense([0.6, -1.1]),)
+ (Vectors.dense([3.0, -1.0]),)
], ["features"])
- px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
- polyDF = px.transform(df)
+ polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
+ polyDF = polyExpansion.transform(df)
- for expanded in polyDF.select("polyFeatures").take(3):
- print(expanded)
+ polyDF.show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py
index 8a8392cc1f..3b8e7855e3 100644
--- a/examples/src/main/python/ml/stopwords_remover_example.py
+++ b/examples/src/main/python/ml/stopwords_remover_example.py
@@ -32,7 +32,7 @@ if __name__ == "__main__":
sentenceData = spark.createDataFrame([
(0, ["I", "saw", "the", "red", "balloon"]),
(1, ["Mary", "had", "a", "little", "lamb"])
- ], ["label", "raw"])
+ ], ["id", "raw"])
remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
remover.transform(sentenceData).show(truncate=False)
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
index 4ab7eb6964..d43244fa68 100644
--- a/examples/src/main/python/ml/tf_idf_example.py
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -30,9 +30,9 @@ if __name__ == "__main__":
# $example on$
sentenceData = spark.createDataFrame([
- (0, "Hi I heard about Spark"),
- (0, "I wish Java could use case classes"),
- (1, "Logistic regression models are neat")
+ (0.0, "Hi I heard about Spark"),
+ (0.0, "I wish Java could use case classes"),
+ (1.0, "Logistic regression models are neat")
], ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
@@ -46,8 +46,7 @@ if __name__ == "__main__":
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
- for features_label in rescaledData.select("features", "label").take(3):
- print(features_label)
+ rescaledData.select("label", "features").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py
index 89f5060705..5c65c5c9f8 100644
--- a/examples/src/main/python/ml/tokenizer_example.py
+++ b/examples/src/main/python/ml/tokenizer_example.py
@@ -19,6 +19,8 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
+from pyspark.sql.functions import col, udf
+from pyspark.sql.types import IntegerType
# $example off$
from pyspark.sql import SparkSession
@@ -33,20 +35,22 @@ if __name__ == "__main__":
(0, "Hi I heard about Spark"),
(1, "I wish Java could use case classes"),
(2, "Logistic,regression,models,are,neat")
- ], ["label", "sentence"])
+ ], ["id", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
# alternatively, pattern="\\w+", gaps=False
+ countTokens = udf(lambda words: len(words), IntegerType())
+
tokenized = tokenizer.transform(sentenceDataFrame)
- for words_label in tokenized.select("words", "label").take(3):
- print(words_label)
+ tokenized.select("sentence", "words")\
+ .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
- for words_label in regexTokenized.select("words", "label").take(3):
- print(words_label)
+ regexTokenized.select("sentence", "words") \
+ .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/train_validation_split.py b/examples/src/main/python/ml/train_validation_split.py
index a92b861f83..d104f7d30a 100644
--- a/examples/src/main/python/ml/train_validation_split.py
+++ b/examples/src/main/python/ml/train_validation_split.py
@@ -66,8 +66,9 @@ if __name__ == "__main__":
# Make predictions on test data. model is the model with the combination of parameters
# that performed best.
- prediction = model.transform(test)
- for row in prediction.take(5):
- print(row)
+ model.transform(test)\
+ .select("features", "label", "prediction")\
+ .show()
+
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py
index eac33711ad..98de1d5ea7 100644
--- a/examples/src/main/python/ml/vector_assembler_example.py
+++ b/examples/src/main/python/ml/vector_assembler_example.py
@@ -39,7 +39,8 @@ if __name__ == "__main__":
outputCol="features")
output = assembler.transform(dataset)
- print(output.select("features", "clicked").first())
+ print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+ output.select("features", "clicked").show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py
index 3912c135be..5c2956077d 100644
--- a/examples/src/main/python/ml/vector_indexer_example.py
+++ b/examples/src/main/python/ml/vector_indexer_example.py
@@ -34,6 +34,10 @@ if __name__ == "__main__":
indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
indexerModel = indexer.fit(data)
+ categoricalFeatures = indexerModel.categoryMaps
+ print("Chose %d categorical features: %s" %
+ (len(categoricalFeatures), ", ".join(str(k) for k in categoricalFeatures.keys())))
+
# Create new column "indexed" with categorical values transformed to indices
indexedData = indexerModel.transform(data)
indexedData.show()
diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py
index 78a91c92fc..77f8951df0 100644
--- a/examples/src/main/python/ml/word2vec_example.py
+++ b/examples/src/main/python/ml/word2vec_example.py
@@ -41,8 +41,9 @@ if __name__ == "__main__":
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
- for feature in result.select("result").take(3):
- print(feature)
+ for row in result.collect():
+ text, vector = row
+ print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/pagerank.py b/examples/src/main/python/pagerank.py
index a399a9c37c..0d6c253d39 100755
--- a/examples/src/main/python/pagerank.py
+++ b/examples/src/main/python/pagerank.py
@@ -18,6 +18,9 @@
"""
This is an example implementation of PageRank. For more conventional use,
please refer to the PageRank implementation provided by graphx
+
+Example Usage:
+bin/spark-submit examples/src/main/python/pagerank.py data/mllib/pagerank_data.txt 10
"""
from __future__ import print_function
@@ -46,8 +49,8 @@ if __name__ == "__main__":
print("Usage: pagerank <file> <iterations>", file=sys.stderr)
exit(-1)
- print("""WARN: This is a naive implementation of PageRank and is
- given as an example! Please refer to PageRank implementation provided by graphx""",
+ print("WARN: This is a naive implementation of PageRank and is given as an example!\n" +
+ "Please refer to PageRank implementation provided by graphx",
file=sys.stderr)
# Initialize the spark context.
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
index d0b874c48d..5d8831265e 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
@@ -31,6 +31,11 @@ import org.apache.spark.sql.SparkSession
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.graphx.lib.PageRank
+ *
+ * Example Usage:
+ * {{{
+ * bin/run-example SparkPageRank data/mllib/pagerank_data.txt 10
+ * }}}
*/
object SparkPageRank {
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
index b6d7b36916..cdb33f4d6d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
@@ -55,8 +55,9 @@ object AFTSurvivalRegressionExample {
val model = aft.fit(training)
// Print the coefficients, intercept and scale parameter for AFT survival regression
- println(s"Coefficients: ${model.coefficients} Intercept: " +
- s"${model.intercept} Scale: ${model.scale}")
+ println(s"Coefficients: ${model.coefficients}")
+ println(s"Intercept: ${model.intercept}")
+ println(s"Scale: ${model.scale}")
model.transform(training).show(false)
// $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
index 5cd13ad64c..a4f62e7871 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
@@ -29,9 +29,10 @@ object BinarizerExample {
.builder
.appName("BinarizerExample")
.getOrCreate()
+
// $example on$
val data = Array((0, 0.1), (1, 0.8), (2, 0.2))
- val dataFrame = spark.createDataFrame(data).toDF("label", "feature")
+ val dataFrame = spark.createDataFrame(data).toDF("id", "feature")
val binarizer: Binarizer = new Binarizer()
.setInputCol("feature")
@@ -39,8 +40,9 @@ object BinarizerExample {
.setThreshold(0.5)
val binarizedDataFrame = binarizer.transform(dataFrame)
- val binarizedFeatures = binarizedDataFrame.select("binarized_feature")
- binarizedFeatures.collect().foreach(println)
+
+ println(s"Binarizer output with Threshold = ${binarizer.getThreshold}")
+ binarizedDataFrame.show()
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
index 38cce34bb5..04e4eccd43 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
@@ -33,7 +33,7 @@ object BucketizerExample {
// $example on$
val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
- val data = Array(-0.5, -0.3, 0.0, 0.2)
+ val data = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9)
val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
val bucketizer = new Bucketizer()
@@ -43,8 +43,11 @@ object BucketizerExample {
// Transform original data into its bucket index.
val bucketedData = bucketizer.transform(dataFrame)
+
+ println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets")
bucketedData.show()
// $example off$
+
spark.stop()
}
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala
index c9394dd9c6..5638e66b87 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala
@@ -48,8 +48,11 @@ object ChiSqSelectorExample {
.setOutputCol("selectedFeatures")
val result = selector.fit(df).transform(df)
+
+ println(s"ChiSqSelector output with top ${selector.getNumTopFeatures} features selected")
result.show()
// $example off$
+
spark.stop()
}
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
index 988d8941a4..91d861dd43 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
@@ -49,7 +49,7 @@ object CountVectorizerExample {
.setInputCol("words")
.setOutputCol("features")
- cvModel.transform(df).select("features").show()
+ cvModel.transform(df).show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
index ddc6717528..3383171303 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
@@ -45,7 +45,7 @@ object DCTExample {
.setInverse(false)
val dctDf = dct.transform(df)
- dctDf.select("featuresDCT").show(3)
+ dctDf.select("featuresDCT").show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
index 26095b46f5..5e4bea4c4f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
@@ -49,8 +49,8 @@ object GaussianMixtureExample {
// output parameters of mixture model
for (i <- 0 until model.getK) {
- println("weight=%f\nmu=%s\nsigma=\n%s\n" format
- (model.weights(i), model.gaussians(i).mean, model.gaussians(i).cov))
+ println(s"Gaussian $i:\nweight=${model.weights(i)}\n" +
+ s"mu=${model.gaussians(i).mean}\nsigma=\n${model.gaussians(i).cov}\n")
}
// $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala
index 950733831c..2940682c32 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala
@@ -19,6 +19,7 @@
package org.apache.spark.examples.ml
// $example on$
+import org.apache.spark.ml.attribute.Attribute
import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
// $example off$
import org.apache.spark.sql.SparkSession
@@ -46,12 +47,23 @@ object IndexToStringExample {
.fit(df)
val indexed = indexer.transform(df)
+ println(s"Transformed string column '${indexer.getInputCol}' " +
+ s"to indexed column '${indexer.getOutputCol}'")
+ indexed.show()
+
+ val inputColSchema = indexed.schema(indexer.getOutputCol)
+ println(s"StringIndexer will store labels in output column metadata: " +
+ s"${Attribute.fromStructField(inputColSchema).toString}\n")
+
val converter = new IndexToString()
.setInputCol("categoryIndex")
.setOutputCol("originalCategory")
val converted = converter.transform(indexed)
- converted.select("id", "originalCategory").show()
+
+ println(s"Transformed indexed column '${converter.getInputCol}' back to original string " +
+ s"column '${converter.getOutputCol}' using labels in metadata")
+ converted.select("id", "categoryIndex", "originalCategory").show()
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala
index a840559d24..9bac16ec76 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala
@@ -47,8 +47,8 @@ object IsotonicRegressionExample {
val ir = new IsotonicRegression()
val model = ir.fit(dataset)
- println(s"Boundaries in increasing order: ${model.boundaries}")
- println(s"Predictions associated with the boundaries: ${model.predictions}")
+ println(s"Boundaries in increasing order: ${model.boundaries}\n")
+ println(s"Predictions associated with the boundaries: ${model.predictions}\n")
// Makes predictions.
model.transform(dataset).show()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala
index 94cf286623..4540a8d728 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala
@@ -50,7 +50,7 @@ object LinearRegressionWithElasticNetExample {
// Summarize the model over the training set and print out some metrics
val trainingSummary = lrModel.summary
println(s"numIterations: ${trainingSummary.totalIterations}")
- println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
+ println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
trainingSummary.residuals.show()
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"r2: ${trainingSummary.r2}")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala
index cd8775c942..1740a0d3f9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala
@@ -51,6 +51,7 @@ object LogisticRegressionSummaryExample {
// Obtain the objective per iteration.
val objectiveHistory = trainingSummary.objectiveHistory
+ println("objectiveHistory:")
objectiveHistory.foreach(loss => println(loss))
// Obtain the metrics useful to judge performance on test data.
@@ -61,7 +62,7 @@ object LogisticRegressionSummaryExample {
// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
val roc = binarySummary.roc
roc.show()
- println(binarySummary.areaUnderROC)
+ println(s"areaUnderROC: ${binarySummary.areaUnderROC}")
// Set the model threshold to maximize F-Measure
val fMeasure = binarySummary.fMeasureByThreshold
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala
index 572adce657..85d071369d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala
@@ -19,6 +19,7 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.MaxAbsScaler
+import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession
@@ -30,7 +31,12 @@ object MaxAbsScalerExample {
.getOrCreate()
// $example on$
- val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val dataFrame = spark.createDataFrame(Seq(
+ (0, Vectors.dense(1.0, 0.1, -8.0)),
+ (1, Vectors.dense(2.0, 1.0, -4.0)),
+ (2, Vectors.dense(4.0, 10.0, 8.0))
+ )).toDF("id", "features")
+
val scaler = new MaxAbsScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures")
@@ -40,7 +46,7 @@ object MaxAbsScalerExample {
// rescale each feature to range [-1, 1]
val scaledData = scalerModel.transform(dataFrame)
- scaledData.show()
+ scaledData.select("features", "scaledFeatures").show()
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
index d728019a62..9ee6d9b449 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.MinMaxScaler
+import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession
@@ -31,7 +32,11 @@ object MinMaxScalerExample {
.getOrCreate()
// $example on$
- val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val dataFrame = spark.createDataFrame(Seq(
+ (0, Vectors.dense(1.0, 0.1, -1.0)),
+ (1, Vectors.dense(2.0, 1.1, 1.0)),
+ (2, Vectors.dense(3.0, 10.1, 3.0))
+ )).toDF("id", "features")
val scaler = new MinMaxScaler()
.setInputCol("features")
@@ -42,7 +47,8 @@ object MinMaxScalerExample {
// rescale each feature to range [min, max].
val scaledData = scalerModel.transform(dataFrame)
- scaledData.show()
+ println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]")
+ scaledData.select("features", "scaledFeatures").show()
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
index a39e3202ba..6fce82d294 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
@@ -66,7 +66,7 @@ object MultilayerPerceptronClassifierExample {
val evaluator = new MulticlassClassificationEvaluator()
.setMetricName("accuracy")
- println("Accuracy: " + evaluator.evaluate(predictionAndLabels))
+ println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels))
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
index e0b52e7a36..d2183d6b49 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
@@ -35,11 +35,12 @@ object NGramExample {
(0, Array("Hi", "I", "heard", "about", "Spark")),
(1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
(2, Array("Logistic", "regression", "models", "are", "neat"))
- )).toDF("label", "words")
+ )).toDF("id", "words")
+
+ val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")
- val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
val ngramDataFrame = ngram.transform(wordDataFrame)
- ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println)
+ ngramDataFrame.select("ngrams").show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
index 3ae0623c4c..bd9fcc420a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
@@ -52,7 +52,7 @@ object NaiveBayesExample {
.setPredictionCol("prediction")
.setMetricName("accuracy")
val accuracy = evaluator.evaluate(predictions)
- println("Accuracy: " + accuracy)
+ println("Test set accuracy = " + accuracy)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
index 75ba33a7e7..989d250c17 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.Normalizer
+import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession
@@ -31,7 +32,11 @@ object NormalizerExample {
.getOrCreate()
// $example on$
- val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val dataFrame = spark.createDataFrame(Seq(
+ (0, Vectors.dense(1.0, 0.5, -1.0)),
+ (1, Vectors.dense(2.0, 1.0, 1.0)),
+ (2, Vectors.dense(4.0, 10.0, 2.0))
+ )).toDF("id", "features")
// Normalize each Vector using $L^1$ norm.
val normalizer = new Normalizer()
@@ -40,10 +45,12 @@ object NormalizerExample {
.setP(1.0)
val l1NormData = normalizer.transform(dataFrame)
+ println("Normalized using L^1 norm")
l1NormData.show()
// Normalize each Vector using $L^\infty$ norm.
val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
+ println("Normalized using L^inf norm")
lInfNormData.show()
// $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
index 4aa649b133..274cc1268f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
@@ -49,8 +49,9 @@ object OneHotEncoderExample {
val encoder = new OneHotEncoder()
.setInputCol("categoryIndex")
.setOutputCol("categoryVec")
+
val encoded = encoder.transform(indexed)
- encoded.select("id", "categoryVec").show()
+ encoded.show()
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
index acde110683..4ad6c7c3ef 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
@@ -69,7 +69,7 @@ object OneVsRestExample {
// compute the classification error on test data.
val accuracy = evaluator.evaluate(predictions)
- println(s"Test Error : ${1 - accuracy}")
+ println(s"Test Error = ${1 - accuracy}")
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
index dca96eea2b..4e1d7cdbab 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
@@ -38,14 +38,15 @@ object PCAExample {
Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
)
val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+
val pca = new PCA()
.setInputCol("features")
.setOutputCol("pcaFeatures")
.setK(3)
.fit(df)
- val pcaDF = pca.transform(df)
- val result = pcaDF.select("pcaFeatures")
- result.show()
+
+ val result = pca.transform(df).select("pcaFeatures")
+ result.show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
index 54d2e6b36d..f117b03ab2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
@@ -33,17 +33,19 @@ object PolynomialExpansionExample {
// $example on$
val data = Array(
- Vectors.dense(-2.0, 2.3),
+ Vectors.dense(2.0, 1.0),
Vectors.dense(0.0, 0.0),
- Vectors.dense(0.6, -1.1)
+ Vectors.dense(3.0, -1.0)
)
val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
- val polynomialExpansion = new PolynomialExpansion()
+
+ val polyExpansion = new PolynomialExpansion()
.setInputCol("features")
.setOutputCol("polyFeatures")
.setDegree(3)
- val polyDF = polynomialExpansion.transform(df)
- polyDF.select("polyFeatures").take(3).foreach(println)
+
+ val polyDF = polyExpansion.transform(df)
+ polyDF.show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
index a56de0856d..369a6fffd7 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
@@ -40,7 +40,7 @@ object StopWordsRemoverExample {
(1, Seq("Mary", "had", "a", "little", "lamb"))
)).toDF("id", "raw")
- remover.transform(dataSet).show()
+ remover.transform(dataSet).show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
index 97f6fcce15..ec2df2ef87 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
@@ -33,9 +33,9 @@ object TfIdfExample {
// $example on$
val sentenceData = spark.createDataFrame(Seq(
- (0, "Hi I heard about Spark"),
- (0, "I wish Java could use case classes"),
- (1, "Logistic regression models are neat")
+ (0.0, "Hi I heard about Spark"),
+ (0.0, "I wish Java could use case classes"),
+ (1.0, "Logistic regression models are neat")
)).toDF("label", "sentence")
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
@@ -51,7 +51,7 @@ object TfIdfExample {
val idfModel = idf.fit(featurizedData)
val rescaledData = idfModel.transform(featurizedData)
- rescaledData.select("features", "label").take(3).foreach(println)
+ rescaledData.select("label", "features").show()
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
index 90d0faaf47..0167dc3723 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
+import org.apache.spark.sql.functions._
// $example off$
import org.apache.spark.sql.SparkSession
@@ -35,7 +36,7 @@ object TokenizerExample {
(0, "Hi I heard about Spark"),
(1, "I wish Java could use case classes"),
(2, "Logistic,regression,models,are,neat")
- )).toDF("label", "sentence")
+ )).toDF("id", "sentence")
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
val regexTokenizer = new RegexTokenizer()
@@ -43,11 +44,15 @@ object TokenizerExample {
.setOutputCol("words")
.setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
+ val countTokens = udf { (words: Seq[String]) => words.length }
+
val tokenized = tokenizer.transform(sentenceDataFrame)
- tokenized.select("words", "label").take(3).foreach(println)
+ tokenized.select("sentence", "words")
+ .withColumn("tokens", countTokens(col("words"))).show(false)
val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
- regexTokenized.select("words", "label").take(3).foreach(println)
+ regexTokenized.select("sentence", "words")
+ .withColumn("tokens", countTokens(col("words"))).show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala
index 13c72f88cc..13b58d154b 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala
@@ -100,6 +100,7 @@ object UnaryTransformerExample {
val data = spark.range(0, 5).toDF("input")
.select(col("input").cast("double").as("input"))
val result = myTransformer.transform(data)
+ println("Transformed by adding constant value")
result.show()
// Save and load the Transformer.
@@ -109,6 +110,7 @@ object UnaryTransformerExample {
val sameTransformer = MyTransformer.load(dirName)
// Transform the data to show the results are identical.
+ println("Same transform applied from loaded model")
val sameResult = sameTransformer.transform(data)
sameResult.show()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
index 8910470c1c..3d5c7efb20 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
@@ -41,7 +41,8 @@ object VectorAssemblerExample {
.setOutputCol("features")
val output = assembler.transform(dataset)
- println(output.select("features", "clicked").first())
+ println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+ output.select("features", "clicked").show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
index 85dd5c2776..63a60912de 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
@@ -37,7 +37,10 @@ object VectorSlicerExample {
.getOrCreate()
// $example on$
- val data = Arrays.asList(Row(Vectors.dense(-2.0, 2.3, 0.0)))
+ val data = Arrays.asList(
+ Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))),
+ Row(Vectors.dense(-2.0, 2.3, 0.0))
+ )
val defaultAttr = NumericAttribute.defaultAttr
val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
@@ -51,7 +54,7 @@ object VectorSlicerExample {
// or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))
val output = slicer.transform(dataset)
- println(output.select("userFeatures", "features").first())
+ output.show(false)
// $example off$
spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
index 5c8bd19f20..4bcc6ac6a0 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
@@ -20,6 +20,8 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.Word2Vec
+import org.apache.spark.ml.linalg.Vector
+import org.apache.spark.sql.Row
// $example off$
import org.apache.spark.sql.SparkSession
@@ -47,7 +49,8 @@ object Word2VecExample {
val model = word2Vec.fit(documentDF)
val result = model.transform(documentDF)
- result.select("result").take(3).foreach(println)
+ result.collect().foreach { case Row(text: Seq[_], features: Vector) =>
+ println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n") }
// $example off$
spark.stop()