Diffstat (limited to 'examples')
82 files changed, 427 insertions, 188 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java b/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java
index ed0bb87657..bcc493bdcb 100644
--- a/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java
+++ b/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java
@@ -45,6 +45,11 @@ import org.apache.spark.sql.SparkSession;
 *
 * This is an example implementation for learning how to use Spark. For more conventional use,
 * please refer to org.apache.spark.graphx.lib.PageRank
+ *
+ * Example Usage:
+ * <pre>
+ * bin/run-example JavaPageRank data/mllib/pagerank_data.txt 10
+ * </pre>
 */
public final class JavaPageRank {
  private static final Pattern SPACES = Pattern.compile("\\s+");
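For reference, the sample input that the new usage line points at, data/mllib/pagerank_data.txt, is a plain adjacency list with one "source neighbor" pair per line. The contents below are recalled from the Spark repo and are an assumption, not part of this commit:

    1 2
    1 3
    1 4
    2 1
    3 1
    4 1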
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java
index 3f034588c9..7c741ff56e 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java
@@ -71,8 +71,9 @@ public class JavaAFTSurvivalRegressionExample {
    AFTSurvivalRegressionModel model = aft.fit(training);

    // Print the coefficients, intercept and scale parameter for AFT survival regression
-    System.out.println("Coefficients: " + model.coefficients() + " Intercept: "
-      + model.intercept() + " Scale: " + model.scale());
+    System.out.println("Coefficients: " + model.coefficients());
+    System.out.println("Intercept: " + model.intercept());
+    System.out.println("Scale: " + model.scale());
    model.transform(training).show(false);

    // $example off$
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
index a954dbd20c..3090d8fd14 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
@@ -51,17 +51,18 @@ public class JavaBinarizerExample {
      new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
    });
    Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);
+
    Binarizer binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(0.5);
+
    Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);
-    Dataset<Row> binarizedFeatures = binarizedDataFrame.select("binarized_feature");
-    for (Row r : binarizedFeatures.collectAsList()) {
-      Double binarized_value = r.getDouble(0);
-      System.out.println(binarized_value);
-    }
+
+    System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
+    binarizedDataFrame.show();
    // $example off$
+
    spark.stop();
  }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
index 691df3887a..f009938333 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
@@ -44,10 +44,12 @@ public class JavaBucketizerExample {
    double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};

    List<Row> data = Arrays.asList(
+      RowFactory.create(-999.9),
      RowFactory.create(-0.5),
      RowFactory.create(-0.3),
      RowFactory.create(0.0),
-      RowFactory.create(0.2)
+      RowFactory.create(0.2),
+      RowFactory.create(999.9)
    );
    StructType schema = new StructType(new StructField[]{
      new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
@@ -61,8 +63,11 @@ public class JavaBucketizerExample {

    // Transform original data into its bucket index.
    Dataset<Row> bucketedData = bucketizer.transform(dataFrame);
+
+    System.out.println("Bucketizer output with " + (bucketizer.getSplits().length-1) + " buckets");
    bucketedData.show();
    // $example off$
+
    spark.stop();
  }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java
index fcf90d8d18..73738966b1 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java
@@ -63,7 +63,11 @@ public class JavaChiSqSelectorExample {
      .setOutputCol("selectedFeatures");

    Dataset<Row> result = selector.fit(df).transform(df);
+
+    System.out.println("ChiSqSelector output with top " + selector.getNumTopFeatures() +
+      " features selected");
    result.show();
+
    // $example off$
    spark.stop();
  }
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
index 0a6b136014..ac2a86c30b 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
@@ -61,7 +61,7 @@ public class JavaCountVectorizerExample {
      .setInputCol("text")
      .setOutputCol("feature");

-    cvModel.transform(df).show();
+    cvModel.transform(df).show(false);

    // $example off$
    spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
index 66ce23b49d..04546d29fa 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
@@ -51,13 +51,17 @@ public class JavaDCTExample {
      new StructField("features", new VectorUDT(), false, Metadata.empty()),
    });
    Dataset<Row> df = spark.createDataFrame(data, schema);
+
    DCT dct = new DCT()
      .setInputCol("features")
      .setOutputCol("featuresDCT")
      .setInverse(false);
+
    Dataset<Row> dctDf = dct.transform(df);
-    dctDf.select("featuresDCT").show(3);
+
+    dctDf.select("featuresDCT").show(false);
    // $example off$
+
    spark.stop();
  }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java
index 526bed93fb..72bd5d0395 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java
@@ -54,8 +54,8 @@ public class JavaGaussianMixtureExample {

    // Output the parameters of the mixture model
    for (int i = 0; i < model.getK(); i++) {
-      System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
-        model.weights()[i], model.gaussians()[i].mean(), model.gaussians()[i].cov());
+      System.out.printf("Gaussian %d:\nweight=%f\nmu=%s\nsigma=\n%s\n\n",
+        i, model.weights()[i], model.gaussians()[i].mean(), model.gaussians()[i].cov());
    }

    // $example off$
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java
index 0064beb8c8..6965512f93 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java
@@ -24,6 +24,7 @@ import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.List;

+import org.apache.spark.ml.attribute.Attribute;
import org.apache.spark.ml.feature.IndexToString;
import org.apache.spark.ml.feature.StringIndexer;
import org.apache.spark.ml.feature.StringIndexerModel;
@@ -63,11 +64,23 @@ public class JavaIndexToStringExample {
      .fit(df);
    Dataset<Row> indexed = indexer.transform(df);
+
+    System.out.println("Transformed string column '" + indexer.getInputCol() + "' " +
+      "to indexed column '" + indexer.getOutputCol() + "'");
+    indexed.show();
+
+    StructField inputColSchema = indexed.schema().apply(indexer.getOutputCol());
+    System.out.println("StringIndexer will store labels in output column metadata: " +
+      Attribute.fromStructField(inputColSchema).toString() + "\n");

    IndexToString converter = new IndexToString()
      .setInputCol("categoryIndex")
      .setOutputCol("originalCategory");
    Dataset<Row> converted = converter.transform(indexed);
-    converted.select("id", "originalCategory").show();
+
+    System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to " +
+      "original string column '" + converter.getOutputCol() + "' using labels in metadata");
+    converted.select("id", "categoryIndex", "originalCategory").show();
+
    // $example off$
    spark.stop();
  }
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java
index 0ec17b0471..a7de8e699c 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java
@@ -50,8 +50,8 @@ public class JavaIsotonicRegressionExample {
    IsotonicRegression ir = new IsotonicRegression();
    IsotonicRegressionModel model = ir.fit(dataset);

-    System.out.println("Boundaries in increasing order: " + model.boundaries());
-    System.out.println("Predictions associated with the boundaries: " + model.predictions());
+    System.out.println("Boundaries in increasing order: " + model.boundaries() + "\n");
+    System.out.println("Predictions associated with the boundaries: " + model.predictions() + "\n");

    // Makes predictions.
    model.transform(dataset).show();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java
index 9a27b0e9e2..9f1ce463cf 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java
@@ -18,10 +18,20 @@ package org.apache.spark.examples.ml;

// $example on$
+import java.util.Arrays;
+import java.util.List;
+
import org.apache.spark.ml.feature.MaxAbsScaler;
import org.apache.spark.ml.feature.MaxAbsScalerModel;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
// $example off$
import org.apache.spark.sql.SparkSession;
@@ -34,10 +44,17 @@ public class JavaMaxAbsScalerExample {
      .getOrCreate();

    // $example on$
-    Dataset<Row> dataFrame = spark
-      .read()
-      .format("libsvm")
-      .load("data/mllib/sample_libsvm_data.txt");
+    List<Row> data = Arrays.asList(
+        RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
+        RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
+        RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
+    );
+    StructType schema = new StructType(new StructField[]{
+      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("features", new VectorUDT(), false, Metadata.empty())
+    });
+    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
+
    MaxAbsScaler scaler = new MaxAbsScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures");
@@ -47,8 +64,9 @@ public class JavaMaxAbsScalerExample {

    // rescale each feature to range [-1, 1].
    Dataset<Row> scaledData = scalerModel.transform(dataFrame);
-    scaledData.show();
+    scaledData.select("features", "scaledFeatures").show();

    // $example off$
+
    spark.stop();
  }
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
index 37fa1c5434..2757af8d24 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
@@ -20,10 +20,20 @@ package org.apache.spark.examples.ml;
import org.apache.spark.sql.SparkSession;

// $example on$
+import java.util.Arrays;
+import java.util.List;
+
import org.apache.spark.ml.feature.MinMaxScaler;
import org.apache.spark.ml.feature.MinMaxScalerModel;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
// $example off$

@@ -34,10 +44,17 @@ public class JavaMinMaxScalerExample {
      .getOrCreate();

    // $example on$
-    Dataset<Row> dataFrame = spark
-      .read()
-      .format("libsvm")
-      .load("data/mllib/sample_libsvm_data.txt");
+    List<Row> data = Arrays.asList(
+        RowFactory.create(0, Vectors.dense(1.0, 0.1, -1.0)),
+        RowFactory.create(1, Vectors.dense(2.0, 1.1, 1.0)),
+        RowFactory.create(2, Vectors.dense(3.0, 10.1, 3.0))
+    );
+    StructType schema = new StructType(new StructField[]{
+      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("features", new VectorUDT(), false, Metadata.empty())
+    });
+    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
+
    MinMaxScaler scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures");
@@ -47,8 +64,11 @@ public class JavaMinMaxScalerExample {

    // rescale each feature to range [min, max].
    Dataset<Row> scaledData = scalerModel.transform(dataFrame);
-    scaledData.show();
+    System.out.println("Features scaled to range: [" + scaler.getMin() + ", "
+      + scaler.getMax() + "]");
+    scaledData.select("features", "scaledFeatures").show();

    // $example off$
+
    spark.stop();
  }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
index 0f1d9c2634..43db41ce17 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
@@ -41,28 +41,34 @@ public class JavaMultilayerPerceptronClassifierExample {
    // Load training data
    String path = "data/mllib/sample_multiclass_classification_data.txt";
    Dataset<Row> dataFrame = spark.read().format("libsvm").load(path);
+
    // Split the data into train and test
    Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
    Dataset<Row> train = splits[0];
    Dataset<Row> test = splits[1];
+
    // specify layers for the neural network:
    // input layer of size 4 (features), two intermediate of size 5 and 4
    // and output of size 3 (classes)
    int[] layers = new int[] {4, 5, 4, 3};
+
    // create the trainer and set its parameters
    MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(100);
+
    // train the model
    MultilayerPerceptronClassificationModel model = trainer.fit(train);
+
    // compute accuracy on the test set
    Dataset<Row> result = model.transform(test);
    Dataset<Row> predictionAndLabels = result.select("prediction", "label");
    MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
      .setMetricName("accuracy");
-    System.out.println("Accuracy = " + evaluator.evaluate(predictionAndLabels));
+
+    System.out.println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels));

    // $example off$
    spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
index 899815f57c..5427e46665 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
@@ -42,29 +42,25 @@ public class JavaNGramExample {
    // $example on$
    List<Row> data = Arrays.asList(
-      RowFactory.create(0.0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
-      RowFactory.create(1.0, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
-      RowFactory.create(2.0, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
+      RowFactory.create(0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
+      RowFactory.create(1, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
+      RowFactory.create(2, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
    );

    StructType schema = new StructType(new StructField[]{
-      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField(
        "words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
    });

    Dataset<Row> wordDataFrame = spark.createDataFrame(data, schema);

-    NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams");
+    NGram ngramTransformer = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams");

    Dataset<Row> ngramDataFrame = ngramTransformer.transform(wordDataFrame);
-
-    for (Row r : ngramDataFrame.select("ngrams", "label").takeAsList(3)) {
-      java.util.List<String> ngrams = r.getList(0);
-      for (String ngram : ngrams) System.out.print(ngram + " --- ");
-      System.out.println();
-    }
+    ngramDataFrame.select("ngrams").show(false);
    // $example off$
+
    spark.stop();
  }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java
index 3226d5d2fa..be578dc811 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java
@@ -48,14 +48,21 @@ public class JavaNaiveBayesExample {

    // create the trainer and set its parameters
    NaiveBayes nb = new NaiveBayes();
+
    // train the model
    NaiveBayesModel model = nb.fit(train);
+
+    // Select example rows to display.
+    Dataset<Row> predictions = model.transform(test);
+    predictions.show();
+
    // compute accuracy on the test set
-    Dataset<Row> result = model.transform(test);
-    Dataset<Row> predictionAndLabels = result.select("prediction", "label");
    MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
+      .setLabelCol("label")
+      .setPredictionCol("prediction")
      .setMetricName("accuracy");
-    System.out.println("Accuracy = " + evaluator.evaluate(predictionAndLabels));
+    double accuracy = evaluator.evaluate(predictions);
+    System.out.println("Test set accuracy = " + accuracy);

    // $example off$
    spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
index abc38f85ea..f878c420d8 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
@@ -20,9 +20,19 @@ package org.apache.spark.examples.ml;
import org.apache.spark.sql.SparkSession;

// $example on$
+import java.util.Arrays;
+import java.util.List;
+
import org.apache.spark.ml.feature.Normalizer;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
// $example off$

@@ -33,8 +43,16 @@ public class JavaNormalizerExample {
      .getOrCreate();

    // $example on$
-    Dataset<Row> dataFrame =
-      spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+    List<Row> data = Arrays.asList(
+        RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
+        RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
+        RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
+    );
+    StructType schema = new StructType(new StructField[]{
+      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("features", new VectorUDT(), false, Metadata.empty())
+    });
+    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    // Normalize each Vector using $L^1$ norm.
    Normalizer normalizer = new Normalizer()
@@ -50,6 +68,7 @@ public class JavaNormalizerExample {
      normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
    lInfNormData.show();
    // $example off$
+
    spark.stop();
  }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
index a15e5f84a1..99af37676b 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
@@ -68,9 +68,11 @@ public class JavaOneHotEncoderExample {
    OneHotEncoder encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec");
+
    Dataset<Row> encoded = encoder.transform(indexed);
-    encoded.select("id", "categoryVec").show();
+    encoded.show();
    // $example off$
+
    spark.stop();
  }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
index c6a083ddc9..82fb540950 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
@@ -75,7 +75,7 @@ public class JavaOneVsRestExample {

    // compute the classification error on test data.
    double accuracy = evaluator.evaluate(predictions);
-    System.out.println("Test Error : " + (1 - accuracy));
+    System.out.println("Test Error = " + (1 - accuracy));

    // $example off$
    spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
index d597a9a2ed..6951a65553 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
@@ -62,7 +62,7 @@ public class JavaPCAExample {
      .fit(df);

    Dataset<Row> result = pca.transform(df).select("pcaFeatures");
-    result.show();
+    result.show(false);
    // $example off$
    spark.stop();
  }
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
index 67180df65c..43c636c534 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
@@ -48,23 +48,19 @@ public class JavaPolynomialExpansionExample {
      .setDegree(3);

    List<Row> data = Arrays.asList(
-      RowFactory.create(Vectors.dense(-2.0, 2.3)),
+      RowFactory.create(Vectors.dense(2.0, 1.0)),
      RowFactory.create(Vectors.dense(0.0, 0.0)),
-      RowFactory.create(Vectors.dense(0.6, -1.1))
+      RowFactory.create(Vectors.dense(3.0, -1.0))
    );
-
    StructType schema = new StructType(new StructField[]{
      new StructField("features", new VectorUDT(), false, Metadata.empty()),
    });
-
    Dataset<Row> df = spark.createDataFrame(data, schema);
-    Dataset<Row> polyDF = polyExpansion.transform(df);
-    List<Row> rows = polyDF.select("polyFeatures").takeAsList(3);
-    for (Row r : rows) {
-      System.out.println(r.get(0));
-    }
+
+    Dataset<Row> polyDF = polyExpansion.transform(df);
+    polyDF.show(false);
    // $example off$
+
    spark.stop();
  }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
index 278cce0842..94ead625b4 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
@@ -57,7 +57,7 @@ public class JavaStopWordsRemoverExample {
    });

    Dataset<Row> dataset = spark.createDataFrame(data, schema);
-    remover.transform(dataset).show();
+    remover.transform(dataset).show(false);
    // $example off$
    spark.stop();
  }
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
index 7533c1835e..cf9747a994 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
@@ -54,12 +54,15 @@ public class JavaStringIndexerExample {
      createStructField("category", StringType, false)
    });
    Dataset<Row> df = spark.createDataFrame(data, schema);
+
    StringIndexer indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex");
+
    Dataset<Row> indexed = indexer.fit(df).transform(df);
    indexed.show();
    // $example off$
+
    spark.stop();
  }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
index 800e42c949..b740cd097a 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
@@ -25,7 +25,6 @@ import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.IDF;
import org.apache.spark.ml.feature.IDFModel;
import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -54,25 +53,24 @@ public class JavaTfIdfExample {
      new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    Dataset<Row> sentenceData = spark.createDataFrame(data, schema);
+
    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
    Dataset<Row> wordsData = tokenizer.transform(sentenceData);
+
    int numFeatures = 20;
    HashingTF hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("rawFeatures")
      .setNumFeatures(numFeatures);
+
    Dataset<Row> featurizedData = hashingTF.transform(wordsData);
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
    IDFModel idfModel = idf.fit(featurizedData);
+
    Dataset<Row> rescaledData = idfModel.transform(featurizedData);
-    for (Row r : rescaledData.select("features", "label").takeAsList(3)) {
-      Vector features = r.getAs(0);
-      Double label = r.getDouble(1);
-      System.out.println(features);
-      System.out.println(label);
-    }
+    rescaledData.select("label", "features").show();

    // $example off$
    spark.stop();
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
index a206cef4c2..101a4df779 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
@@ -23,8 +23,11 @@ import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.List;

+import scala.collection.mutable.WrappedArray;
+
import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.ml.feature.Tokenizer;
+import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -34,6 +37,12 @@ import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
// $example off$

+// $example on:untyped_ops$
+// col("...") is preferable to df.col("...")
+import static org.apache.spark.sql.functions.callUDF;
+import static org.apache.spark.sql.functions.col;
+// $example off:untyped_ops$
+
public class JavaTokenizerExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession
@@ -49,7 +58,7 @@ public class JavaTokenizerExample {
    );

    StructType schema = new StructType(new StructField[]{
-      new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });

@@ -62,20 +71,22 @@ public class JavaTokenizerExample {
      .setOutputCol("words")
      .setPattern("\\W");  // alternatively .setPattern("\\w+").setGaps(false);

+    spark.udf().register("countTokens", new UDF1<WrappedArray, Integer>() {
+      @Override
+      public Integer call(WrappedArray words) {
+        return words.size();
+      }
+    }, DataTypes.IntegerType);
+
    Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
-    for (Row r : tokenized.select("words", "label").takeAsList(3)) {
-      java.util.List<String> words = r.getList(0);
-      for (String word : words) System.out.print(word + " ");
-      System.out.println();
-    }
+    tokenized.select("sentence", "words")
+        .withColumn("tokens", callUDF("countTokens", col("words"))).show(false);

    Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
-    for (Row r : regexTokenized.select("words", "label").takeAsList(3)) {
-      java.util.List<String> words = r.getList(0);
-      for (String word : words) System.out.print(word + " ");
-      System.out.println();
-    }
+    regexTokenized.select("sentence", "words")
+        .withColumn("tokens", callUDF("countTokens", col("words"))).show(false);
    // $example off$
+
    spark.stop();
  }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
index 9bb0f93d3a..384e09c73b 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
@@ -29,7 +29,6 @@ import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.*;
-
import static org.apache.spark.sql.types.DataTypes.*;
// $example off$

@@ -56,8 +55,11 @@ public class JavaVectorAssemblerExample {
      .setOutputCol("features");

    Dataset<Row> output = assembler.transform(dataset);
-    System.out.println(output.select("features", "clicked").first());
+    System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
+      "'features'");
+    output.select("features", "clicked").show(false);
    // $example off$
+
    spark.stop();
  }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
index 19b8bc83be..1922514c87 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
@@ -65,9 +65,9 @@ public class JavaVectorSlicerExample {
    // or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"})

    Dataset<Row> output = vectorSlicer.transform(dataset);
-
-    System.out.println(output.select("userFeatures", "features").first());
+    output.show(false);
    // $example off$
+
    spark.stop();
  }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
index 9be6e6353a..fc9b459688 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
@@ -23,6 +23,7 @@ import java.util.List;

import org.apache.spark.ml.feature.Word2Vec;
import org.apache.spark.ml.feature.Word2VecModel;
+import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -55,10 +56,14 @@ public class JavaWord2VecExample {
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0);
+
    Word2VecModel model = word2Vec.fit(documentDF);
    Dataset<Row> result = model.transform(documentDF);
-    for (Row r : result.select("result").takeAsList(3)) {
-      System.out.println(r);
+
+    for (Row row : result.collectAsList()) {
+      List<String> text = row.getList(0);
+      Vector vector = (Vector) row.get(1);
+      System.out.println("Text: " + text + " => \nVector: " + vector + "\n");
    }

    // $example off$
diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py
index 4224a27dbe..669bb2aeab 100644
--- a/examples/src/main/python/ml/binarizer_example.py
+++ b/examples/src/main/python/ml/binarizer_example.py
@@ -33,12 +33,14 @@ if __name__ == "__main__":
        (0, 0.1),
        (1, 0.8),
        (2, 0.2)
-    ], ["label", "feature"])
+    ], ["id", "feature"])
+
    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
+
    binarizedDataFrame = binarizer.transform(continuousDataFrame)
-    binarizedFeatures = binarizedDataFrame.select("binarized_feature")
-    for binarized_feature, in binarizedFeatures.collect():
-        print(binarized_feature)
+
+    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
+    binarizedDataFrame.show()

    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py
index 8177e560dd..742f35093b 100644
--- a/examples/src/main/python/ml/bucketizer_example.py
+++ b/examples/src/main/python/ml/bucketizer_example.py
@@ -31,13 +31,15 @@ if __name__ == "__main__":
    # $example on$
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

-    data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
+    data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
    dataFrame = spark.createDataFrame(data, ["features"])

    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
+
+    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
    bucketedData.show()

    # $example off$
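For readers checking the new -999.9 and 999.9 rows: Bucketizer assigns a value to the index of the half-open interval [splits[i], splits[i+1]) that contains it, so the expected indices can be sketched in plain Python. The bisect-based lookup below is an illustration of that rule, not Bucketizer's actual implementation:

    import bisect

    splits = [float("-inf"), -0.5, 0.0, 0.5, float("inf")]
    for x in [-999.9, -0.5, -0.3, 0.0, 0.2, 999.9]:
        # left edge inclusive, right edge exclusive
        print(x, bisect.bisect_right(splits, x) - 1)
    # expected buckets: 0, 1, 1, 2, 2, 3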
diff --git a/examples/src/main/python/ml/chisq_selector_example.py b/examples/src/main/python/ml/chisq_selector_example.py
index 5e19ef1624..028a9ea9d6 100644
--- a/examples/src/main/python/ml/chisq_selector_example.py
+++ b/examples/src/main/python/ml/chisq_selector_example.py
@@ -39,6 +39,8 @@ if __name__ == "__main__":
                             outputCol="selectedFeatures", labelCol="clicked")

    result = selector.fit(df).transform(df)
+
+    print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
    result.show()

    # $example off$
diff --git a/examples/src/main/python/ml/count_vectorizer_example.py b/examples/src/main/python/ml/count_vectorizer_example.py
index 38cfac82fb..f2e41db77d 100644
--- a/examples/src/main/python/ml/count_vectorizer_example.py
+++ b/examples/src/main/python/ml/count_vectorizer_example.py
@@ -37,9 +37,11 @@ if __name__ == "__main__":

    # fit a CountVectorizerModel from the corpus.
    cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
+
    model = cv.fit(df)
+
    result = model.transform(df)
-    result.show()
+    result.show(truncate=False)

    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/ml/dct_example.py b/examples/src/main/python/ml/dct_example.py
index a4f25df784..c0457f8d0f 100644
--- a/examples/src/main/python/ml/dct_example.py
+++ b/examples/src/main/python/ml/dct_example.py
@@ -39,8 +39,7 @@ if __name__ == "__main__":

    dctDf = dct.transform(df)

-    for dcts in dctDf.select("featuresDCT").take(3):
-        print(dcts)
+    dctDf.select("featuresDCT").show(truncate=False)

    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py
index edc258de05..8ad450b669 100644
--- a/examples/src/main/python/ml/gaussian_mixture_example.py
+++ b/examples/src/main/python/ml/gaussian_mixture_example.py
@@ -38,11 +38,11 @@ if __name__ == "__main__":
    # loads data
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

-    gmm = GaussianMixture().setK(2)
+    gmm = GaussianMixture().setK(2).setSeed(538009335L)
    model = gmm.fit(dataset)

-    print("Gaussians: ")
-    model.gaussiansDF.show()
+    print("Gaussians shown as a DataFrame: ")
+    model.gaussiansDF.show(truncate=False)
    # $example off$

    spark.stop()
diff --git a/examples/src/main/python/ml/index_to_string_example.py b/examples/src/main/python/ml/index_to_string_example.py
index 523caac00c..33d104e8e3 100644
--- a/examples/src/main/python/ml/index_to_string_example.py
+++ b/examples/src/main/python/ml/index_to_string_example.py
@@ -33,14 +33,22 @@ if __name__ == "__main__":
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

-    stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
-    model = stringIndexer.fit(df)
+    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+    model = indexer.fit(df)
    indexed = model.transform(df)

+    print("Transformed string column '%s' to indexed column '%s'"
+          % (indexer.getInputCol(), indexer.getOutputCol()))
+    indexed.show()
+
+    print("StringIndexer will store labels in output column metadata\n")
+
    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)
-    converted.select("id", "originalCategory").show()
+
+    print("Transformed indexed column '%s' back to original string column '%s' using "
+          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
+    converted.select("id", "categoryIndex", "originalCategory").show()

    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py
index a41b8ffacb..6ae15f1b4b 100644
--- a/examples/src/main/python/ml/isotonic_regression_example.py
+++ b/examples/src/main/python/ml/isotonic_regression_example.py
@@ -44,8 +44,8 @@ if __name__ == "__main__":
    # Trains an isotonic regression model.
    model = IsotonicRegression().fit(dataset)
-    print("Boundaries in increasing order: " + str(model.boundaries))
-    print("Predictions associated with the boundaries: " + str(model.predictions))
+    print("Boundaries in increasing order: %s\n" % str(model.boundaries))
+    print("Predictions associated with the boundaries: %s\n" % str(model.predictions))

    # Makes predictions.
    model.transform(dataset).show()
diff --git a/examples/src/main/python/ml/linear_regression_with_elastic_net.py b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
index 620ab5b87e..6639e9160a 100644
--- a/examples/src/main/python/ml/linear_regression_with_elastic_net.py
+++ b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
@@ -39,8 +39,16 @@ if __name__ == "__main__":
    lrModel = lr.fit(training)

    # Print the coefficients and intercept for linear regression
-    print("Coefficients: " + str(lrModel.coefficients))
-    print("Intercept: " + str(lrModel.intercept))
+    print("Coefficients: %s" % str(lrModel.coefficients))
+    print("Intercept: %s" % str(lrModel.intercept))
+
+    # Summarize the model over the training set and print out some metrics
+    trainingSummary = lrModel.summary
+    print("numIterations: %d" % trainingSummary.totalIterations)
+    print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
+    trainingSummary.residuals.show()
+    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
+    print("r2: %f" % trainingSummary.r2)

    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/ml/max_abs_scaler_example.py b/examples/src/main/python/ml/max_abs_scaler_example.py
index ab91198b08..45eda3cdad 100644
--- a/examples/src/main/python/ml/max_abs_scaler_example.py
+++ b/examples/src/main/python/ml/max_abs_scaler_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function

# $example on$
from pyspark.ml.feature import MaxAbsScaler
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

@@ -29,7 +30,11 @@ if __name__ == "__main__":
        .getOrCreate()

    # $example on$
-    dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+    dataFrame = spark.createDataFrame([
+        (0, Vectors.dense([1.0, 0.1, -8.0]),),
+        (1, Vectors.dense([2.0, 1.0, -4.0]),),
+        (2, Vectors.dense([4.0, 10.0, 8.0]),)
+    ], ["id", "features"])

    scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

@@ -38,7 +43,8 @@ if __name__ == "__main__":

    # rescale each feature to range [-1, 1].
    scaledData = scalerModel.transform(dataFrame)
-    scaledData.show()
+
+    scaledData.select("features", "scaledFeatures").show()

    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/ml/min_max_scaler_example.py b/examples/src/main/python/ml/min_max_scaler_example.py
index e3e7bc205b..b5f272e59b 100644
--- a/examples/src/main/python/ml/min_max_scaler_example.py
+++ b/examples/src/main/python/ml/min_max_scaler_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function

# $example on$
from pyspark.ml.feature import MinMaxScaler
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

@@ -29,7 +30,11 @@ if __name__ == "__main__":
        .getOrCreate()

    # $example on$
-    dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+    dataFrame = spark.createDataFrame([
+        (0, Vectors.dense([1.0, 0.1, -1.0]),),
+        (1, Vectors.dense([2.0, 1.1, 1.0]),),
+        (2, Vectors.dense([3.0, 10.1, 3.0]),)
+    ], ["id", "features"])

    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

@@ -38,7 +43,8 @@ if __name__ == "__main__":

    # rescale each feature to range [min, max].
    scaledData = scalerModel.transform(dataFrame)
-    scaledData.show()
+    print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
+    scaledData.select("features", "scaledFeatures").show()

    # $example off$
    spark.stop()
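The select("features", "scaledFeatures") output above can be sanity-checked by hand: MinMaxScaler with the default range [0, 1] rescales each column as (x - min) / (max - min). A minimal plain-Python sketch of that arithmetic on the example rows (an illustration of the formula, not the scaler's implementation):

    rows = [[1.0, 0.1, -1.0], [2.0, 1.1, 1.0], [3.0, 10.1, 3.0]]
    cols = list(zip(*rows))  # per-column views
    scaled = [[(v - min(c)) / (max(c) - min(c)) for v, c in zip(row, cols)]
              for row in rows]
    print(scaled)  # [[0.0, 0.0, 0.0], [0.5, 0.1, 0.5], [1.0, 1.0, 1.0]]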
diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py
index 2cc38c2855..88fc69f753 100644
--- a/examples/src/main/python/ml/multilayer_perceptron_classification.py
+++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py
@@ -52,7 +52,7 @@ if __name__ == "__main__":
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
-    print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
+    print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py
index 55263adb46..31676e076a 100644
--- a/examples/src/main/python/ml/n_gram_example.py
+++ b/examples/src/main/python/ml/n_gram_example.py
@@ -33,13 +33,12 @@ if __name__ == "__main__":
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"])
-    ], ["label", "words"])
+    ], ["id", "words"])

-    ngram = NGram(inputCol="words", outputCol="ngrams")
-    ngramDataFrame = ngram.transform(wordDataFrame)
+    ngram = NGram(n=2, inputCol="words", outputCol="ngrams")

-    for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
-        print(ngrams_label)
+    ngramDataFrame = ngram.transform(wordDataFrame)
+    ngramDataFrame.select("ngrams").show(truncate=False)

    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py
index aa23f298c8..7290ab81cd 100644
--- a/examples/src/main/python/ml/naive_bayes_example.py
+++ b/examples/src/main/python/ml/naive_bayes_example.py
@@ -45,11 +45,15 @@ if __name__ == "__main__":

    # train the model
    model = nb.fit(train)
+
+    # select example rows to display.
+    predictions = model.transform(test)
+    predictions.show()
+
    # compute accuracy on the test set
-    result = model.transform(test)
-    predictionAndLabels = result.select("prediction", "label")
-    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
-    print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
+    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
+                                                  metricName="accuracy")
+    accuracy = evaluator.evaluate(predictions)
+    print("Test set accuracy = " + str(accuracy))

    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py
index 19012f51f4..510bd825fd 100644
--- a/examples/src/main/python/ml/normalizer_example.py
+++ b/examples/src/main/python/ml/normalizer_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function

# $example on$
from pyspark.ml.feature import Normalizer
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

@@ -29,15 +30,21 @@ if __name__ == "__main__":
        .getOrCreate()

    # $example on$
-    dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+    dataFrame = spark.createDataFrame([
+        (0, Vectors.dense([1.0, 0.5, -1.0]),),
+        (1, Vectors.dense([2.0, 1.0, 1.0]),),
+        (2, Vectors.dense([4.0, 10.0, 2.0]),)
+    ], ["id", "features"])

    # Normalize each Vector using $L^1$ norm.
    normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
    l1NormData = normalizer.transform(dataFrame)
+    print("Normalized using L^1 norm")
    l1NormData.show()

    # Normalize each Vector using $L^\infty$ norm.
    lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
+    print("Normalized using L^inf norm")
    lInfNormData.show()

    # $example off$
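The two show() calls above differ only in the norm used. For the last example row, [4.0, 10.0, 2.0], the expected values work out as below in plain Python, assuming the usual definitions of the L^1 and L^inf norms:

    v = [4.0, 10.0, 2.0]
    l1 = [x / sum(abs(y) for y in v) for x in v]    # [0.25, 0.625, 0.125]
    linf = [x / max(abs(y) for y in v) for x in v]  # [0.4, 1.0, 0.2]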
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py
index 47faf8d202..e1996c7f0a 100644
--- a/examples/src/main/python/ml/onehot_encoder_example.py
+++ b/examples/src/main/python/ml/onehot_encoder_example.py
@@ -42,9 +42,9 @@ if __name__ == "__main__":
    model = stringIndexer.fit(df)
    indexed = model.transform(df)

-    encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryVec")
+    encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
    encoded = encoder.transform(indexed)
-    encoded.select("id", "categoryVec").show()
+    encoded.show()
    # $example off$

    spark.stop()
diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py
index 2d0865578a..f63e4db434 100644
--- a/examples/src/main/python/ml/pipeline_example.py
+++ b/examples/src/main/python/ml/pipeline_example.py
@@ -60,9 +60,10 @@ if __name__ == "__main__":

    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(test)
-    selected = prediction.select("id", "text", "prediction")
+    selected = prediction.select("id", "text", "probability", "prediction")
    for row in selected.collect():
-        print(row)
+        rid, text, prob, prediction = row
+        print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))
    # $example off$

    spark.stop()
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
index b464ee86b6..40bcb7b13a 100644
--- a/examples/src/main/python/ml/polynomial_expansion_example.py
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -31,16 +31,15 @@ if __name__ == "__main__":

    # $example on$
    df = spark.createDataFrame([
-        (Vectors.dense([-2.0, 2.3]),),
+        (Vectors.dense([2.0, 1.0]),),
        (Vectors.dense([0.0, 0.0]),),
-        (Vectors.dense([0.6, -1.1]),)
+        (Vectors.dense([3.0, -1.0]),)
    ], ["features"])

-    px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
-    polyDF = px.transform(df)
+    polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
+    polyDF = polyExpansion.transform(df)

-    for expanded in polyDF.select("polyFeatures").take(3):
-        print(expanded)
+    polyDF.show(truncate=False)

    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py
index 8a8392cc1f..3b8e7855e3 100644
--- a/examples/src/main/python/ml/stopwords_remover_example.py
+++ b/examples/src/main/python/ml/stopwords_remover_example.py
@@ -32,7 +32,7 @@ if __name__ == "__main__":
    sentenceData = spark.createDataFrame([
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"])
-    ], ["label", "raw"])
+    ], ["id", "raw"])

    remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
    remover.transform(sentenceData).show(truncate=False)
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
index 4ab7eb6964..d43244fa68 100644
--- a/examples/src/main/python/ml/tf_idf_example.py
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -30,9 +30,9 @@ if __name__ == "__main__":

    # $example on$
    sentenceData = spark.createDataFrame([
-        (0, "Hi I heard about Spark"),
-        (0, "I wish Java could use case classes"),
-        (1, "Logistic regression models are neat")
+        (0.0, "Hi I heard about Spark"),
+        (0.0, "I wish Java could use case classes"),
+        (1.0, "Logistic regression models are neat")
    ], ["label", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
@@ -46,8 +46,7 @@ if __name__ == "__main__":
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

-    for features_label in rescaledData.select("features", "label").take(3):
-        print(features_label)
+    rescaledData.select("label", "features").show()

    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py
index 89f5060705..5c65c5c9f8 100644
--- a/examples/src/main/python/ml/tokenizer_example.py
+++ b/examples/src/main/python/ml/tokenizer_example.py
@@ -19,6 +19,8 @@ from __future__ import print_function

# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
+from pyspark.sql.functions import col, udf
+from pyspark.sql.types import IntegerType
# $example off$
from pyspark.sql import SparkSession

@@ -33,20 +35,22 @@ if __name__ == "__main__":
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
-    ], ["label", "sentence"])
+    ], ["id", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)

+    countTokens = udf(lambda words: len(words), IntegerType())
+
    tokenized = tokenizer.transform(sentenceDataFrame)
-    for words_label in tokenized.select("words", "label").take(3):
-        print(words_label)
+    tokenized.select("sentence", "words")\
+        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
-    for words_label in regexTokenized.select("words", "label").take(3):
-        print(words_label)
+    regexTokenized.select("sentence", "words") \
+        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
    # $example off$
+
    spark.stop()
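The countTokens column makes the difference between the two tokenizers visible on the comma-separated sentence. A plain-Python sketch of the expected counts, approximating Tokenizer's whitespace split and RegexTokenizer's pattern="\\W":

    import re

    s = "Logistic,regression,models,are,neat"
    print(len(s.split()))            # 1 token  (Tokenizer splits on whitespace only)
    print(len(re.split(r"\W+", s)))  # 5 tokens (RegexTokenizer splits on non-word chars)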
diff --git a/examples/src/main/python/ml/train_validation_split.py b/examples/src/main/python/ml/train_validation_split.py
index a92b861f83..d104f7d30a 100644
--- a/examples/src/main/python/ml/train_validation_split.py
+++ b/examples/src/main/python/ml/train_validation_split.py
@@ -66,8 +66,9 @@ if __name__ == "__main__":

    # Make predictions on test data. model is the model with combination of parameters
    # that performed best.
-    prediction = model.transform(test)
-    for row in prediction.take(5):
-        print(row)
+    model.transform(test)\
+        .select("features", "label", "prediction")\
+        .show()
+
    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py
index eac33711ad..98de1d5ea7 100644
--- a/examples/src/main/python/ml/vector_assembler_example.py
+++ b/examples/src/main/python/ml/vector_assembler_example.py
@@ -39,7 +39,8 @@ if __name__ == "__main__":
        outputCol="features")

    output = assembler.transform(dataset)
-    print(output.select("features", "clicked").first())
+    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py
index 3912c135be..5c2956077d 100644
--- a/examples/src/main/python/ml/vector_indexer_example.py
+++ b/examples/src/main/python/ml/vector_indexer_example.py
@@ -34,6 +34,10 @@ if __name__ == "__main__":
    indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
    indexerModel = indexer.fit(data)

+    categoricalFeatures = indexerModel.categoryMaps
+    print("Chose %d categorical features: %s" %
+          (len(categoricalFeatures), ", ".join(str(k) for k in categoricalFeatures.keys())))
+
    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show()
diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py
index 78a91c92fc..77f8951df0 100644
--- a/examples/src/main/python/ml/word2vec_example.py
+++ b/examples/src/main/python/ml/word2vec_example.py
@@ -41,8 +41,9 @@ if __name__ == "__main__":
    model = word2Vec.fit(documentDF)

    result = model.transform(documentDF)
-    for feature in result.select("result").take(3):
-        print(feature)
+    for row in result.collect():
+        text, vector = row
+        print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

    # $example off$
    spark.stop()
diff --git a/examples/src/main/python/pagerank.py b/examples/src/main/python/pagerank.py
index a399a9c37c..0d6c253d39 100755
--- a/examples/src/main/python/pagerank.py
+++ b/examples/src/main/python/pagerank.py
@@ -18,6 +18,9 @@
"""
This is an example implementation of PageRank. For more conventional use,
Please refer to PageRank implementation provided by graphx
+
+Example Usage:
+bin/spark-submit examples/src/main/python/pagerank.py data/mllib/pagerank_data.txt 10
"""
from __future__ import print_function
@@ -46,8 +49,8 @@ if __name__ == "__main__":
        print("Usage: pagerank <file> <iterations>", file=sys.stderr)
        exit(-1)

-    print("""WARN: This is a naive implementation of PageRank and is
-          given as an example! Please refer to PageRank implementation provided by graphx""",
+    print("WARN: This is a naive implementation of PageRank and is given as an example!\n" +
+          "Please refer to PageRank implementation provided by graphx",
          file=sys.stderr)

    # Initialize the spark context.
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
index d0b874c48d..5d8831265e 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
@@ -31,6 +31,11 @@ import org.apache.spark.sql.SparkSession
 *
 * This is an example implementation for learning how to use Spark. For more conventional use,
 * please refer to org.apache.spark.graphx.lib.PageRank
+ *
+ * Example Usage:
+ * {{{
+ * bin/run-example SparkPageRank data/mllib/pagerank_data.txt 10
+ * }}}
 */
object SparkPageRank {
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
index b6d7b36916..cdb33f4d6d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
@@ -55,8 +55,9 @@ object AFTSurvivalRegressionExample {
    val model = aft.fit(training)

    // Print the coefficients, intercept and scale parameter for AFT survival regression
-    println(s"Coefficients: ${model.coefficients} Intercept: " +
-      s"${model.intercept} Scale: ${model.scale}")
+    println(s"Coefficients: ${model.coefficients}")
+    println(s"Intercept: ${model.intercept}")
+    println(s"Scale: ${model.scale}")
    model.transform(training).show(false)

    // $example off$
Threshold = ${binarizer.getThreshold}") + binarizedDataFrame.show() // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala index 38cce34bb5..04e4eccd43 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala @@ -33,7 +33,7 @@ object BucketizerExample { // $example on$ val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity) - val data = Array(-0.5, -0.3, 0.0, 0.2) + val data = Array(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9) val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") val bucketizer = new Bucketizer() @@ -43,8 +43,11 @@ object BucketizerExample { // Transform original data into its bucket index. val bucketedData = bucketizer.transform(dataFrame) + + println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets") bucketedData.show() // $example off$ + spark.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala index c9394dd9c6..5638e66b87 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala @@ -48,8 +48,11 @@ object ChiSqSelectorExample { .setOutputCol("selectedFeatures") val result = selector.fit(df).transform(df) + + println(s"ChiSqSelector output with top ${selector.getNumTopFeatures} features selected") result.show() // $example off$ + spark.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala index 988d8941a4..91d861dd43 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala @@ -49,7 +49,7 @@ object CountVectorizerExample { .setInputCol("words") .setOutputCol("features") - cvModel.transform(df).select("features").show() + cvModel.transform(df).show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala index ddc6717528..3383171303 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala @@ -45,7 +45,7 @@ object DCTExample { .setInverse(false) val dctDf = dct.transform(df) - dctDf.select("featuresDCT").show(3) + dctDf.select("featuresDCT").show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala index 26095b46f5..5e4bea4c4f 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala @@ -49,8 +49,8 @@ object GaussianMixtureExample { // output parameters of mixture model model for (i <- 0 until model.getK) { - println("weight=%f\nmu=%s\nsigma=\n%s\n" format - (model.weights(i), model.gaussians(i).mean, model.gaussians(i).cov)) + 
println(s"Gaussian $i:\nweight=${model.weights(i)}\n" + + s"mu=${model.gaussians(i).mean}\nsigma=\n${model.gaussians(i).cov}\n") } // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala index 950733831c..2940682c32 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala @@ -19,6 +19,7 @@ package org.apache.spark.examples.ml // $example on$ +import org.apache.spark.ml.attribute.Attribute import org.apache.spark.ml.feature.{IndexToString, StringIndexer} // $example off$ import org.apache.spark.sql.SparkSession @@ -46,12 +47,23 @@ object IndexToStringExample { .fit(df) val indexed = indexer.transform(df) + println(s"Transformed string column '${indexer.getInputCol}' " + + s"to indexed column '${indexer.getOutputCol}'") + indexed.show() + + val inputColSchema = indexed.schema(indexer.getOutputCol) + println(s"StringIndexer will store labels in output column metadata: " + + s"${Attribute.fromStructField(inputColSchema).toString}\n") + val converter = new IndexToString() .setInputCol("categoryIndex") .setOutputCol("originalCategory") val converted = converter.transform(indexed) - converted.select("id", "originalCategory").show() + + println(s"Transformed indexed column '${converter.getInputCol}' back to original string " + + s"column '${converter.getOutputCol}' using labels in metadata") + converted.select("id", "categoryIndex", "originalCategory").show() // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala index a840559d24..9bac16ec76 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala @@ -47,8 +47,8 @@ object IsotonicRegressionExample { val ir = new IsotonicRegression() val model = ir.fit(dataset) - println(s"Boundaries in increasing order: ${model.boundaries}") - println(s"Predictions associated with the boundaries: ${model.predictions}") + println(s"Boundaries in increasing order: ${model.boundaries}\n") + println(s"Predictions associated with the boundaries: ${model.predictions}\n") // Makes predictions. 
model.transform(dataset).show() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala index 94cf286623..4540a8d728 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala @@ -50,7 +50,7 @@ object LinearRegressionWithElasticNetExample { // Summarize the model over the training set and print out some metrics val trainingSummary = lrModel.summary println(s"numIterations: ${trainingSummary.totalIterations}") - println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}") + println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]") trainingSummary.residuals.show() println(s"RMSE: ${trainingSummary.rootMeanSquaredError}") println(s"r2: ${trainingSummary.r2}") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala index cd8775c942..1740a0d3f9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala @@ -51,6 +51,7 @@ object LogisticRegressionSummaryExample { // Obtain the objective per iteration. val objectiveHistory = trainingSummary.objectiveHistory + println("objectiveHistory:") objectiveHistory.foreach(loss => println(loss)) // Obtain the metrics useful to judge performance on test data. @@ -61,7 +62,7 @@ object LogisticRegressionSummaryExample { // Obtain the receiver-operating characteristic as a dataframe and areaUnderROC. 
val roc = binarySummary.roc roc.show() - println(binarySummary.areaUnderROC) + println(s"areaUnderROC: ${binarySummary.areaUnderROC}") // Set the model threshold to maximize F-Measure val fMeasure = binarySummary.fMeasureByThreshold diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala index 572adce657..85d071369d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala @@ -19,6 +19,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.MaxAbsScaler +import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession @@ -30,7 +31,12 @@ object MaxAbsScalerExample { .getOrCreate() // $example on$ - val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val dataFrame = spark.createDataFrame(Seq( + (0, Vectors.dense(1.0, 0.1, -8.0)), + (1, Vectors.dense(2.0, 1.0, -4.0)), + (2, Vectors.dense(4.0, 10.0, 8.0)) + )).toDF("id", "features") + val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaledFeatures") @@ -40,7 +46,7 @@ object MaxAbsScalerExample { // rescale each feature to range [-1, 1] val scaledData = scalerModel.transform(dataFrame) - scaledData.show() + scaledData.select("features", "scaledFeatures").show() // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala index d728019a62..9ee6d9b449 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.MinMaxScaler +import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession @@ -31,7 +32,11 @@ object MinMaxScalerExample { .getOrCreate() // $example on$ - val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val dataFrame = spark.createDataFrame(Seq( + (0, Vectors.dense(1.0, 0.1, -1.0)), + (1, Vectors.dense(2.0, 1.1, 1.0)), + (2, Vectors.dense(3.0, 10.1, 3.0)) + )).toDF("id", "features") val scaler = new MinMaxScaler() .setInputCol("features") @@ -42,7 +47,8 @@ object MinMaxScalerExample { // rescale each feature to range [min, max]. 
val scaledData = scalerModel.transform(dataFrame) - scaledData.show() + println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]") + scaledData.select("features", "scaledFeatures").show() // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala index a39e3202ba..6fce82d294 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala @@ -66,7 +66,7 @@ object MultilayerPerceptronClassifierExample { val evaluator = new MulticlassClassificationEvaluator() .setMetricName("accuracy") - println("Accuracy: " + evaluator.evaluate(predictionAndLabels)) + println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels)) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala index e0b52e7a36..d2183d6b49 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala @@ -35,11 +35,12 @@ object NGramExample { (0, Array("Hi", "I", "heard", "about", "Spark")), (1, Array("I", "wish", "Java", "could", "use", "case", "classes")), (2, Array("Logistic", "regression", "models", "are", "neat")) - )).toDF("label", "words") + )).toDF("id", "words") + + val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams") - val ngram = new NGram().setInputCol("words").setOutputCol("ngrams") val ngramDataFrame = ngram.transform(wordDataFrame) - ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println) + ngramDataFrame.select("ngrams").show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala index 3ae0623c4c..bd9fcc420a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala @@ -52,7 +52,7 @@ object NaiveBayesExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Accuracy: " + accuracy) + println("Test set accuracy = " + accuracy) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala index 75ba33a7e7..989d250c17 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.Normalizer +import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession @@ -31,7 +32,11 @@ object NormalizerExample { .getOrCreate() // $example on$ - val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val dataFrame = spark.createDataFrame(Seq( + (0, Vectors.dense(1.0, 0.5, -1.0)), + (1, Vectors.dense(2.0, 1.0, 1.0)), + (2, Vectors.dense(4.0, 10.0, 2.0)) + 
)).toDF("id", "features") // Normalize each Vector using $L^1$ norm. val normalizer = new Normalizer() @@ -40,10 +45,12 @@ object NormalizerExample { .setP(1.0) val l1NormData = normalizer.transform(dataFrame) + println("Normalized using L^1 norm") l1NormData.show() // Normalize each Vector using $L^\infty$ norm. val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity) + println("Normalized using L^inf norm") lInfNormData.show() // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala index 4aa649b133..274cc1268f 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala @@ -49,8 +49,9 @@ object OneHotEncoderExample { val encoder = new OneHotEncoder() .setInputCol("categoryIndex") .setOutputCol("categoryVec") + val encoded = encoder.transform(indexed) - encoded.select("id", "categoryVec").show() + encoded.show() // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala index acde110683..4ad6c7c3ef 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala @@ -69,7 +69,7 @@ object OneVsRestExample { // compute the classification error on test data. val accuracy = evaluator.evaluate(predictions) - println(s"Test Error : ${1 - accuracy}") + println(s"Test Error = ${1 - accuracy}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala index dca96eea2b..4e1d7cdbab 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala @@ -38,14 +38,15 @@ object PCAExample { Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") + val pca = new PCA() .setInputCol("features") .setOutputCol("pcaFeatures") .setK(3) .fit(df) - val pcaDF = pca.transform(df) - val result = pcaDF.select("pcaFeatures") - result.show() + + val result = pca.transform(df).select("pcaFeatures") + result.show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala index 54d2e6b36d..f117b03ab2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala @@ -33,17 +33,19 @@ object PolynomialExpansionExample { // $example on$ val data = Array( - Vectors.dense(-2.0, 2.3), + Vectors.dense(2.0, 1.0), Vectors.dense(0.0, 0.0), - Vectors.dense(0.6, -1.1) + Vectors.dense(3.0, -1.0) ) val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") - val polynomialExpansion = new PolynomialExpansion() + + val polyExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) - val polyDF = polynomialExpansion.transform(df) - polyDF.select("polyFeatures").take(3).foreach(println) + + val 
polyDF = polyExpansion.transform(df) + polyDF.show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala index a56de0856d..369a6fffd7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala @@ -40,7 +40,7 @@ object StopWordsRemoverExample { (1, Seq("Mary", "had", "a", "little", "lamb")) )).toDF("id", "raw") - remover.transform(dataSet).show() + remover.transform(dataSet).show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala index 97f6fcce15..ec2df2ef87 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala @@ -33,9 +33,9 @@ object TfIdfExample { // $example on$ val sentenceData = spark.createDataFrame(Seq( - (0, "Hi I heard about Spark"), - (0, "I wish Java could use case classes"), - (1, "Logistic regression models are neat") + (0.0, "Hi I heard about Spark"), + (0.0, "I wish Java could use case classes"), + (1.0, "Logistic regression models are neat") )).toDF("label", "sentence") val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") @@ -51,7 +51,7 @@ object TfIdfExample { val idfModel = idf.fit(featurizedData) val rescaledData = idfModel.transform(featurizedData) - rescaledData.select("features", "label").take(3).foreach(println) + rescaledData.select("label", "features").show() // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala index 90d0faaf47..0167dc3723 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer} +import org.apache.spark.sql.functions._ // $example off$ import org.apache.spark.sql.SparkSession @@ -35,7 +36,7 @@ object TokenizerExample { (0, "Hi I heard about Spark"), (1, "I wish Java could use case classes"), (2, "Logistic,regression,models,are,neat") - )).toDF("label", "sentence") + )).toDF("id", "sentence") val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") val regexTokenizer = new RegexTokenizer() @@ -43,11 +44,15 @@ object TokenizerExample { .setOutputCol("words") .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false) + val countTokens = udf { (words: Seq[String]) => words.length } + val tokenized = tokenizer.transform(sentenceDataFrame) - tokenized.select("words", "label").take(3).foreach(println) + tokenized.select("sentence", "words") + .withColumn("tokens", countTokens(col("words"))).show(false) val regexTokenized = regexTokenizer.transform(sentenceDataFrame) - regexTokenized.select("words", "label").take(3).foreach(println) + regexTokenized.select("sentence", "words") + .withColumn("tokens", countTokens(col("words"))).show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala 
b/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala index 13c72f88cc..13b58d154b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/UnaryTransformerExample.scala @@ -100,6 +100,7 @@ object UnaryTransformerExample { val data = spark.range(0, 5).toDF("input") .select(col("input").cast("double").as("input")) val result = myTransformer.transform(data) + println("Transformed by adding constant value") result.show() // Save and load the Transformer. @@ -109,6 +110,7 @@ object UnaryTransformerExample { val sameTransformer = MyTransformer.load(dirName) // Transform the data to show the results are identical. + println("Same transform applied from loaded model") val sameResult = sameTransformer.transform(data) sameResult.show() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala index 8910470c1c..3d5c7efb20 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala @@ -41,7 +41,8 @@ object VectorAssemblerExample { .setOutputCol("features") val output = assembler.transform(dataset) - println(output.select("features", "clicked").first()) + println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'") + output.select("features", "clicked").show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala index 85dd5c2776..63a60912de 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala @@ -37,7 +37,10 @@ object VectorSlicerExample { .getOrCreate() // $example on$ - val data = Arrays.asList(Row(Vectors.dense(-2.0, 2.3, 0.0))) + val data = Arrays.asList( + Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))), + Row(Vectors.dense(-2.0, 2.3, 0.0)) + ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName) @@ -51,7 +54,7 @@ object VectorSlicerExample { // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3")) val output = slicer.transform(dataset) - println(output.select("userFeatures", "features").first()) + output.show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala index 5c8bd19f20..4bcc6ac6a0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala @@ -20,6 +20,8 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.Word2Vec +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.Row // $example off$ import org.apache.spark.sql.SparkSession @@ -47,7 +49,8 @@ object Word2VecExample { val model = word2Vec.fit(documentDF) val result = model.transform(documentDF) - result.select("result").take(3).foreach(println) + result.collect().foreach { case Row(text: Seq[_], features: Vector) => + println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n") 
} // $example off$ spark.stop()
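
Note for readers skimming this diff: the recurring change across the Java, Python, and Scala examples is the same, replacing println over raw Row objects with labeled DataFrame.show() output. Below is a minimal, self-contained Scala sketch of the post-patch TokenizerExample flow. The changed lines are taken from the hunks above; the object, main method, and SparkSession scaffolding are not shown in the hunks and are assumed here to match the other examples in this diff, and the object name TokenizerExampleSketch is hypothetical.

// Minimal sketch of the post-patch tokenizer flow (assumed scaffolding, see note above).
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object TokenizerExampleSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("TokenizerExampleSketch").getOrCreate()

    // Column renamed "label" -> "id" in this diff: the value is a row id, not an ML label.
    val sentenceDataFrame = spark.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat")
    )).toDF("id", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    // Count tokens per row so the two outputs are easy to compare side by side.
    val countTokens = udf { (words: Seq[String]) => words.length }

    tokenizer.transform(sentenceDataFrame)
      .select("sentence", "words")
      .withColumn("tokens", countTokens(col("words")))
      .show(false)

    regexTokenizer.transform(sentenceDataFrame)
      .select("sentence", "words")
      .withColumn("tokens", countTokens(col("words")))
      .show(false)

    spark.stop()
  }
}

As with the other examples touched here, this would be run via the run-example launcher, e.g. bin/run-example ml.TokenizerExample; each show(false) call prints an untruncated table of the sentence, words, and tokens columns.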