From 9e266d07a444fd465fe178cdd5c4894cd09cbda3 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 11 May 2016 22:45:30 -0700 Subject: [SPARK-15031][SPARK-15134][EXAMPLE][DOC] Use SparkSession and update indent in examples ## What changes were proposed in this pull request? 1. Use `SparkSession` according to [SPARK-15031](https://issues.apache.org/jira/browse/SPARK-15031) 2. Update indentation for `SparkContext` according to [SPARK-15134](https://issues.apache.org/jira/browse/SPARK-15134) 3. Also remove duplicated spaces and add missing periods. ## How was this patch tested? Manual tests. Author: Zheng RuiFeng Closes #13050 from zhengruifeng/use_sparksession. --- .../ml/JavaDecisionTreeClassificationExample.java | 14 ++++---- .../ml/JavaDecisionTreeRegressionExample.java | 12 ++++--- .../spark/examples/ml/JavaDeveloperApiExample.java | 6 ++-- .../ml/JavaEstimatorTransformerParamExample.java | 4 ++- .../JavaGradientBoostedTreeClassifierExample.java | 6 ++-- .../JavaGradientBoostedTreeRegressorExample.java | 12 ++++--- .../JavaLinearRegressionWithElasticNetExample.java | 12 ++++--- .../ml/JavaLogisticRegressionSummaryExample.java | 4 ++- ...avaLogisticRegressionWithElasticNetExample.java | 4 ++- ...avaModelSelectionViaCrossValidationExample.java | 4 ++- ...delSelectionViaTrainValidationSplitExample.java | 4 ++- .../JavaMultilayerPerceptronClassifierExample.java | 4 ++- .../ml/JavaQuantileDiscretizerExample.java | 4 ++- .../ml/JavaRandomForestClassifierExample.java | 4 ++- .../ml/JavaRandomForestRegressorExample.java | 6 ++-- .../spark/examples/ml/JavaSimpleParamsExample.java | 8 ++--- .../ml/JavaSimpleTextClassificationPipeline.java | 4 ++- .../ml/DecisionTreeClassificationExample.scala | 10 +++--- .../spark/examples/ml/DecisionTreeExample.scala | 39 +++++++++++----------- .../ml/DecisionTreeRegressionExample.scala | 8 ++--- .../spark/examples/ml/DeveloperApiExample.scala | 14 ++++---- .../ml/EstimatorTransformerParamExample.scala | 8 ++--- .../org/apache/spark/examples/ml/GBTExample.scala | 30 +++++++++-------- .../ml/GradientBoostedTreeClassifierExample.scala | 8 ++--- .../ml/GradientBoostedTreeRegressorExample.scala | 8 ++--- .../examples/ml/LinearRegressionExample.scala | 17 +++++----- .../examples/ml/LogisticRegressionExample.scala | 21 ++++++------ .../LogisticRegressionWithElasticNetExample.scala | 4 ++- .../ModelSelectionViaCrossValidationExample.scala | 4 ++- ...elSelectionViaTrainValidationSplitExample.scala | 4 ++- .../ml/RandomForestClassifierExample.scala | 8 ++--- .../spark/examples/ml/RandomForestExample.scala | 32 +++++++++--------- .../examples/ml/RandomForestRegressorExample.scala | 8 ++--- .../spark/examples/ml/SimpleParamsExample.scala | 8 ++--- 34 files changed, 192 insertions(+), 151 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java index 733bc4181c..bdb76f004f 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java @@ -32,7 +32,9 @@ import org.apache.spark.sql.SparkSession; public class JavaDecisionTreeClassificationExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaDecisionTreeClassificationExample").getOrCreate(); + .builder() +
.appName("JavaDecisionTreeClassificationExample") + .getOrCreate(); // $example on$ // Load the data stored in LIBSVM format as a DataFrame. @@ -52,10 +54,10 @@ public class JavaDecisionTreeClassificationExample { VectorIndexerModel featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") - .setMaxCategories(4) // features with > 4 distinct values are treated as continuous + .setMaxCategories(4) // features with > 4 distinct values are treated as continuous. .fit(data); - // Split the data into training and test sets (30% held out for testing) + // Split the data into training and test sets (30% held out for testing). Dataset[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset trainingData = splits[0]; Dataset testData = splits[1]; @@ -71,11 +73,11 @@ public class JavaDecisionTreeClassificationExample { .setOutputCol("predictedLabel") .setLabels(labelIndexer.labels()); - // Chain indexers and tree in a Pipeline + // Chain indexers and tree in a Pipeline. Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter}); - // Train model. This also runs the indexers. + // Train model. This also runs the indexers. PipelineModel model = pipeline.fit(trainingData); // Make predictions. @@ -84,7 +86,7 @@ public class JavaDecisionTreeClassificationExample { // Select example rows to display. predictions.select("predictedLabel", "label", "features").show(5); - // Select (prediction, true label) and compute test error + // Select (prediction, true label) and compute test error. MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() .setLabelCol("indexedLabel") .setPredictionCol("prediction") diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java index bd6dc3edd3..cffb7139ed 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java @@ -33,7 +33,9 @@ import org.apache.spark.sql.SparkSession; public class JavaDecisionTreeRegressionExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaDecisionTreeRegressionExample").getOrCreate(); + .builder() + .appName("JavaDecisionTreeRegressionExample") + .getOrCreate(); // $example on$ // Load the data stored in LIBSVM format as a DataFrame. Dataset data = spark.read().format("libsvm") @@ -47,7 +49,7 @@ public class JavaDecisionTreeRegressionExample { .setMaxCategories(4) .fit(data); - // Split the data into training and test sets (30% held out for testing) + // Split the data into training and test sets (30% held out for testing). Dataset[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset trainingData = splits[0]; Dataset testData = splits[1]; @@ -56,11 +58,11 @@ public class JavaDecisionTreeRegressionExample { DecisionTreeRegressor dt = new DecisionTreeRegressor() .setFeaturesCol("indexedFeatures"); - // Chain indexer and tree in a Pipeline + // Chain indexer and tree in a Pipeline. Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{featureIndexer, dt}); - // Train model. This also runs the indexer. + // Train model. This also runs the indexer. PipelineModel model = pipeline.fit(trainingData); // Make predictions. 
@@ -69,7 +71,7 @@ public class JavaDecisionTreeRegressionExample { // Select example rows to display. predictions.select("label", "features").show(5); - // Select (prediction, true label) and compute test error + // Select (prediction, true label) and compute test error. RegressionEvaluator evaluator = new RegressionEvaluator() .setLabelCol("label") .setPredictionCol("prediction") diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index 49bad0afc0..3265c4d7ec 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -62,7 +62,7 @@ public class JavaDeveloperApiExample { new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); Dataset training = spark.createDataFrame(localTraining, LabeledPoint.class); - // Create a LogisticRegression instance. This instance is an Estimator. + // Create a LogisticRegression instance. This instance is an Estimator. MyJavaLogisticRegression lr = new MyJavaLogisticRegression(); // Print out the parameters, documentation, and any default values. System.out.println("MyJavaLogisticRegression parameters:\n" + lr.explainParams() + "\n"); @@ -70,7 +70,7 @@ public class JavaDeveloperApiExample { // We may set parameters using setter methods. lr.setMaxIter(10); - // Learn a LogisticRegression model. This uses the parameters stored in lr. + // Learn a LogisticRegression model. This uses the parameters stored in lr. MyJavaLogisticRegressionModel model = lr.fit(training); // Prepare test data. @@ -214,7 +214,7 @@ class MyJavaLogisticRegressionModel } /** - * Number of classes the label can take. 2 indicates binary classification. + * Number of classes the label can take. 2 indicates binary classification. */ public int numClasses() { return 2; } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java index 5ba8e6cf44..889f5785df 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java @@ -38,7 +38,9 @@ import org.apache.spark.sql.SparkSession; public class JavaEstimatorTransformerParamExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaEstimatorTransformerParamExample").getOrCreate(); + .builder() + .appName("JavaEstimatorTransformerParamExample") + .getOrCreate(); // $example on$ // Prepare training data. diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java index baacd796a0..5c2e03eda9 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java @@ -75,11 +75,11 @@ public class JavaGradientBoostedTreeClassifierExample { .setOutputCol("predictedLabel") .setLabels(labelIndexer.labels()); - // Chain indexers and GBT in a Pipeline + // Chain indexers and GBT in a Pipeline. 
Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {labelIndexer, featureIndexer, gbt, labelConverter}); - // Train model. This also runs the indexers. + // Train model. This also runs the indexers. PipelineModel model = pipeline.fit(trainingData); // Make predictions. @@ -88,7 +88,7 @@ public class JavaGradientBoostedTreeClassifierExample { // Select example rows to display. predictions.select("predictedLabel", "label", "features").show(5); - // Select (prediction, true label) and compute test error + // Select (prediction, true label) and compute test error. MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() .setLabelCol("indexedLabel") .setPredictionCol("prediction") diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java index 6d3f21fdaf..769b5c3e85 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java @@ -34,7 +34,9 @@ import org.apache.spark.sql.SparkSession; public class JavaGradientBoostedTreeRegressorExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaGradientBoostedTreeRegressorExample").getOrCreate(); + .builder() + .appName("JavaGradientBoostedTreeRegressorExample") + .getOrCreate(); // $example on$ // Load and parse the data file, converting it to a DataFrame. @@ -48,7 +50,7 @@ public class JavaGradientBoostedTreeRegressorExample { .setMaxCategories(4) .fit(data); - // Split the data into training and test sets (30% held out for testing) + // Split the data into training and test sets (30% held out for testing). Dataset[] splits = data.randomSplit(new double[] {0.7, 0.3}); Dataset trainingData = splits[0]; Dataset testData = splits[1]; @@ -59,10 +61,10 @@ public class JavaGradientBoostedTreeRegressorExample { .setFeaturesCol("indexedFeatures") .setMaxIter(10); - // Chain indexer and GBT in a Pipeline + // Chain indexer and GBT in a Pipeline. Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {featureIndexer, gbt}); - // Train model. This also runs the indexer. + // Train model. This also runs the indexer. PipelineModel model = pipeline.fit(trainingData); // Make predictions. @@ -71,7 +73,7 @@ public class JavaGradientBoostedTreeRegressorExample { // Select example rows to display. predictions.select("prediction", "label", "features").show(5); - // Select (prediction, true label) and compute test error + // Select (prediction, true label) and compute test error. 
RegressionEvaluator evaluator = new RegressionEvaluator() .setLabelCol("label") .setPredictionCol("prediction") diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java index b6ea1fed25..dcd209e28e 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java @@ -30,10 +30,12 @@ import org.apache.spark.sql.SparkSession; public class JavaLinearRegressionWithElasticNetExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaLinearRegressionWithElasticNetExample").getOrCreate(); + .builder() + .appName("JavaLinearRegressionWithElasticNetExample") + .getOrCreate(); // $example on$ - // Load training data + // Load training data. Dataset training = spark.read().format("libsvm") .load("data/mllib/sample_linear_regression_data.txt"); @@ -42,14 +44,14 @@ public class JavaLinearRegressionWithElasticNetExample { .setRegParam(0.3) .setElasticNetParam(0.8); - // Fit the model + // Fit the model. LinearRegressionModel lrModel = lr.fit(training); - // Print the coefficients and intercept for linear regression + // Print the coefficients and intercept for linear regression. System.out.println("Coefficients: " + lrModel.coefficients() + " Intercept: " + lrModel.intercept()); - // Summarize the model over the training set and print out some metrics + // Summarize the model over the training set and print out some metrics. LinearRegressionTrainingSummary trainingSummary = lrModel.summary(); System.out.println("numIterations: " + trainingSummary.totalIterations()); System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory())); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java index fd040aead4..dee56799d8 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java @@ -31,7 +31,9 @@ import org.apache.spark.sql.functions; public class JavaLogisticRegressionSummaryExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaLogisticRegressionSummaryExample").getOrCreate(); + .builder() + .appName("JavaLogisticRegressionSummaryExample") + .getOrCreate(); // Load training data Dataset training = spark.read().format("libsvm") diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java index f00c7a05cd..6101c79fb0 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java @@ -28,7 +28,9 @@ import org.apache.spark.sql.SparkSession; public class JavaLogisticRegressionWithElasticNetExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaLogisticRegressionWithElasticNetExample").getOrCreate(); + .builder() + 
.appName("JavaLogisticRegressionWithElasticNetExample") + .getOrCreate(); // $example on$ // Load training data diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java index a4ec4f5815..975c65edc0 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java @@ -43,7 +43,9 @@ import org.apache.spark.sql.SparkSession; public class JavaModelSelectionViaCrossValidationExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaModelSelectionViaCrossValidationExample").getOrCreate(); + .builder() + .appName("JavaModelSelectionViaCrossValidationExample") + .getOrCreate(); // $example on$ // Prepare training documents, which are labeled. diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java index 63a0ad1cb8..0f96293f03 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java @@ -43,7 +43,9 @@ import org.apache.spark.sql.SparkSession; public class JavaModelSelectionViaTrainValidationSplitExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaModelSelectionViaTrainValidationSplitExample").getOrCreate(); + .builder() + .appName("JavaModelSelectionViaTrainValidationSplitExample") + .getOrCreate(); // $example on$ Dataset data = spark.read().format("libsvm") diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java index d547a2a64b..c7d03d8593 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java @@ -33,7 +33,9 @@ public class JavaMultilayerPerceptronClassifierExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaMultilayerPerceptronClassifierExample").getOrCreate(); + .builder() + .appName("JavaMultilayerPerceptronClassifierExample") + .getOrCreate(); // $example on$ // Load training data diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java index 94e3fafcab..16f58a852d 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java @@ -35,7 +35,9 @@ import org.apache.spark.sql.types.StructType; public class JavaQuantileDiscretizerExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaQuantileDiscretizerExample").getOrCreate(); + .builder() + .appName("JavaQuantileDiscretizerExample") + .getOrCreate(); // $example on$ List data = Arrays.asList( diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java index 21e783a968..14af2fbbbb 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java @@ -33,7 +33,9 @@ import org.apache.spark.sql.SparkSession; public class JavaRandomForestClassifierExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaRandomForestClassifierExample").getOrCreate(); + .builder() + .appName("JavaRandomForestClassifierExample") + .getOrCreate(); // $example on$ // Load and parse the data file, converting it to a DataFrame. diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java index ece184a878..a7078453de 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java @@ -34,7 +34,9 @@ import org.apache.spark.sql.SparkSession; public class JavaRandomForestRegressorExample { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaRandomForestRegressorExample").getOrCreate(); + .builder() + .appName("JavaRandomForestRegressorExample") + .getOrCreate(); // $example on$ // Load and parse the data file, converting it to a DataFrame. @@ -62,7 +64,7 @@ public class JavaRandomForestRegressorExample { Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {featureIndexer, rf}); - // Train model. This also runs the indexer. + // Train model. This also runs the indexer. PipelineModel model = pipeline.fit(trainingData); // Make predictions. diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java index 0787079ba4..ff1eb07dc6 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java @@ -46,7 +46,7 @@ public class JavaSimpleParamsExample { .getOrCreate(); // Prepare training data. - // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans + // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans // into DataFrames, where it uses the bean metadata to infer the schema. List localTraining = Lists.newArrayList( new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), @@ -56,7 +56,7 @@ public class JavaSimpleParamsExample { Dataset training = spark.createDataFrame(localTraining, LabeledPoint.class); - // Create a LogisticRegression instance. This instance is an Estimator. + // Create a LogisticRegression instance. This instance is an Estimator. LogisticRegression lr = new LogisticRegression(); // Print out the parameters, documentation, and any default values. System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n"); @@ -65,7 +65,7 @@ public class JavaSimpleParamsExample { lr.setMaxIter(10) .setRegParam(0.01); - // Learn a LogisticRegression model. This uses the parameters stored in lr. + // Learn a LogisticRegression model. This uses the parameters stored in lr. 
LogisticRegressionModel model1 = lr.fit(training); // Since model1 is a Model (i.e., a Transformer produced by an Estimator), // we can view the parameters it used during fit(). @@ -82,7 +82,7 @@ public class JavaSimpleParamsExample { // One can also combine ParamMaps. ParamMap paramMap2 = new ParamMap(); - paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name + paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name. ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2); // Now learn a new model using the paramMapCombined parameters. diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java index 9516ce1f4f..7c24c46d2e 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java @@ -43,7 +43,9 @@ public class JavaSimpleTextClassificationPipeline { public static void main(String[] args) { SparkSession spark = SparkSession - .builder().appName("JavaSimpleTextClassificationPipeline").getOrCreate(); + .builder() + .appName("JavaSimpleTextClassificationPipeline") + .getOrCreate(); // Prepare training documents, which are labeled. List localTraining = Lists.newArrayList( diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala index 7f6c8de967..b3103ced91 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala @@ -47,10 +47,10 @@ object DecisionTreeClassificationExample { val featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") - .setMaxCategories(4) // features with > 4 distinct values are treated as continuous + .setMaxCategories(4) // features with > 4 distinct values are treated as continuous. .fit(data) - // Split the data into training and test sets (30% held out for testing) + // Split the data into training and test sets (30% held out for testing). val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) // Train a DecisionTree model. @@ -64,11 +64,11 @@ object DecisionTreeClassificationExample { .setOutputCol("predictedLabel") .setLabels(labelIndexer.labels) - // Chain indexers and tree in a Pipeline + // Chain indexers and tree in a Pipeline. val pipeline = new Pipeline() .setStages(Array(labelIndexer, featureIndexer, dt, labelConverter)) - // Train model. This also runs the indexers. + // Train model. This also runs the indexers. val model = pipeline.fit(trainingData) // Make predictions. @@ -77,7 +77,7 @@ object DecisionTreeClassificationExample { // Select example rows to display. predictions.select("predictedLabel", "label", "features").show(5) - // Select (prediction, true label) and compute test error + // Select (prediction, true label) and compute test error. 
val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("indexedLabel") .setPredictionCol("prediction") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala index eadb02ab0d..310418008c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala @@ -23,7 +23,6 @@ import scala.language.reflectiveCalls import scopt.OptionParser -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.{Pipeline, PipelineStage, Transformer} import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier} @@ -40,7 +39,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession} * {{{ * ./bin/run-example ml.DecisionTreeExample [options] * }}} - * Note that Decision Trees can take a large amount of memory. If the run-example command above + * Note that Decision Trees can take a large amount of memory. If the run-example command above * fails, try running via spark-submit and specifying the amount of memory as at least 1g. * For local mode, run * {{{ @@ -87,7 +86,7 @@ object DecisionTreeExample { .text(s"min info gain required to create a split, default: ${defaultParams.minInfoGain}") .action((x, c) => c.copy(minInfoGain = x)) opt[Double]("fracTest") - .text(s"fraction of data to hold out for testing. If given option testInput, " + + .text(s"fraction of data to hold out for testing. If given option testInput, " + s"this option is ignored. default: ${defaultParams.fracTest}") .action((x, c) => c.copy(fracTest = x)) opt[Boolean]("cacheNodeIds") @@ -106,7 +105,7 @@ object DecisionTreeExample { s"default: ${defaultParams.checkpointInterval}") .action((x, c) => c.copy(checkpointInterval = x)) opt[String]("testInput") - .text(s"input path to test dataset. If given, option fracTest is ignored." + + .text(s"input path to test dataset. If given, option fracTest is ignored." + s" default: ${defaultParams.testInput}") .action((x, c) => c.copy(testInput = x)) opt[String]("dataFormat") @@ -157,11 +156,10 @@ object DecisionTreeExample { * @param dataFormat "libsvm" or "dense" * @param testInput Path to test dataset. * @param algo Classification or Regression - * @param fracTest Fraction of input data to hold out for testing. Ignored if testInput given. + * @param fracTest Fraction of input data to hold out for testing. Ignored if testInput given. * @return (training dataset, test dataset) */ private[ml] def loadDatasets( - sc: SparkContext, input: String, dataFormat: String, testInput: String, @@ -200,18 +198,21 @@ object DecisionTreeExample { } def run(params: Params) { - val conf = new SparkConf().setAppName(s"DecisionTreeExample with $params") - val sc = new SparkContext(conf) - params.checkpointDir.foreach(sc.setCheckpointDir) + val spark = SparkSession + .builder + .appName(s"DecisionTreeExample with $params") + .getOrCreate() + + params.checkpointDir.foreach(spark.sparkContext.setCheckpointDir) val algo = params.algo.toLowerCase println(s"DecisionTreeExample with parameters:\n$params") // Load training and test data and cache it. 
val (training: DataFrame, test: DataFrame) = - loadDatasets(sc, params.input, params.dataFormat, params.testInput, algo, params.fracTest) + loadDatasets(params.input, params.dataFormat, params.testInput, algo, params.fracTest) - // Set up Pipeline + // Set up Pipeline. val stages = new mutable.ArrayBuffer[PipelineStage]() // (1) For classification, re-index classes. val labelColName = if (algo == "classification") "indexedLabel" else "label" @@ -228,7 +229,7 @@ object DecisionTreeExample { .setOutputCol("indexedFeatures") .setMaxCategories(10) stages += featuresIndexer - // (3) Learn Decision Tree + // (3) Learn Decision Tree. val dt = algo match { case "classification" => new DecisionTreeClassifier() @@ -255,13 +256,13 @@ object DecisionTreeExample { stages += dt val pipeline = new Pipeline().setStages(stages.toArray) - // Fit the Pipeline + // Fit the Pipeline. val startTime = System.nanoTime() val pipelineModel = pipeline.fit(training) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") - // Get the trained Decision Tree from the fitted PipelineModel + // Get the trained Decision Tree from the fitted PipelineModel. algo match { case "classification" => val treeModel = pipelineModel.stages.last.asInstanceOf[DecisionTreeClassificationModel] @@ -280,7 +281,7 @@ object DecisionTreeExample { case _ => throw new IllegalArgumentException("Algo ${params.algo} not supported.") } - // Evaluate model on training, test data + // Evaluate model on training, test data. algo match { case "classification" => println("Training data results:") @@ -296,11 +297,11 @@ object DecisionTreeExample { throw new IllegalArgumentException("Algo ${params.algo} not supported.") } - sc.stop() + spark.stop() } /** - * Evaluate the given ClassificationModel on data. Print the results. + * Evaluate the given ClassificationModel on data. Print the results. * @param model Must fit ClassificationModel abstraction * @param data DataFrame with "prediction" and labelColName columns * @param labelColName Name of the labelCol parameter for the model @@ -314,7 +315,7 @@ object DecisionTreeExample { val fullPredictions = model.transform(data).cache() val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0)) val labels = fullPredictions.select(labelColName).rdd.map(_.getDouble(0)) - // Print number of classes for reference + // Print number of classes for reference. val numClasses = MetadataUtils.getNumClasses(fullPredictions.schema(labelColName)) match { case Some(n) => n case None => throw new RuntimeException( @@ -325,7 +326,7 @@ object DecisionTreeExample { } /** - * Evaluate the given RegressionModel on data. Print the results. + * Evaluate the given RegressionModel on data. Print the results. 
* @param model Must fit RegressionModel abstraction * @param data DataFrame with "prediction" and labelColName columns * @param labelColName Name of the labelCol parameter for the model diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala index 799070ef47..ee61200ad1 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala @@ -46,7 +46,7 @@ object DecisionTreeRegressionExample { .setMaxCategories(4) .fit(data) - // Split the data into training and test sets (30% held out for testing) + // Split the data into training and test sets (30% held out for testing). val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) // Train a DecisionTree model. @@ -54,11 +54,11 @@ object DecisionTreeRegressionExample { .setLabelCol("label") .setFeaturesCol("indexedFeatures") - // Chain indexer and tree in a Pipeline + // Chain indexer and tree in a Pipeline. val pipeline = new Pipeline() .setStages(Array(featureIndexer, dt)) - // Train model. This also runs the indexer. + // Train model. This also runs the indexer. val model = pipeline.fit(trainingData) // Make predictions. @@ -67,7 +67,7 @@ object DecisionTreeRegressionExample { // Select example rows to display. predictions.select("prediction", "label", "features").show(5) - // Select (prediction, true label) and compute test error + // Select (prediction, true label) and compute test error. val evaluator = new RegressionEvaluator() .setLabelCol("label") .setPredictionCol("prediction") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index a522d2127e..b8f47bf12b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -50,7 +50,7 @@ object DeveloperApiExample { LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)))) - // Create a LogisticRegression instance. This instance is an Estimator. + // Create a LogisticRegression instance. This instance is an Estimator. val lr = new MyLogisticRegression() // Print out the parameters, documentation, and any default values. println("MyLogisticRegression parameters:\n" + lr.explainParams() + "\n") @@ -58,7 +58,7 @@ object DeveloperApiExample { // We may set parameters using setter methods. lr.setMaxIter(10) - // Learn a LogisticRegression model. This uses the parameters stored in lr. + // Learn a LogisticRegression model. This uses the parameters stored in lr. val model = lr.fit(training.toDF()) // Prepare test data. @@ -84,7 +84,7 @@ object DeveloperApiExample { /** * Example of defining a parameter trait for a user-defined type of [[Classifier]]. * - * NOTE: This is private since it is an example. In practice, you may not want it to be private. + * NOTE: This is private since it is an example. In practice, you may not want it to be private. 
*/ private trait MyLogisticRegressionParams extends ClassifierParams { @@ -96,7 +96,7 @@ private trait MyLogisticRegressionParams extends ClassifierParams { * - def getMyParamName * - def setMyParamName * Here, we have a trait to be mixed in with the Estimator and Model (MyLogisticRegression - * and MyLogisticRegressionModel). We place the setter (setMaxIter) method in the Estimator + * and MyLogisticRegressionModel). We place the setter (setMaxIter) method in the Estimator * class since the maxIter parameter is only used during training (not in the Model). */ val maxIter: IntParam = new IntParam(this, "maxIter", "max number of iterations") @@ -106,7 +106,7 @@ private trait MyLogisticRegressionParams extends ClassifierParams { /** * Example of defining a type of [[Classifier]]. * - * NOTE: This is private since it is an example. In practice, you may not want it to be private. + * NOTE: This is private since it is an example. In practice, you may not want it to be private. */ private class MyLogisticRegression(override val uid: String) extends Classifier[Vector, MyLogisticRegression, MyLogisticRegressionModel] @@ -138,7 +138,7 @@ private class MyLogisticRegression(override val uid: String) /** * Example of defining a type of [[ClassificationModel]]. * - * NOTE: This is private since it is an example. In practice, you may not want it to be private. + * NOTE: This is private since it is an example. In practice, you may not want it to be private. */ private class MyLogisticRegressionModel( override val uid: String, @@ -169,7 +169,7 @@ private class MyLogisticRegressionModel( Vectors.dense(-margin, margin) } - /** Number of classes the label can take. 2 indicates binary classification. */ + /** Number of classes the label can take. 2 indicates binary classification. */ override val numClasses: Int = 2 /** Number of features the model was trained on. */ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala index 972241e769..a2918d66ea 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala @@ -43,7 +43,7 @@ object EstimatorTransformerParamExample { (1.0, Vectors.dense(0.0, 1.2, -0.5)) )).toDF("label", "features") - // Create a LogisticRegression instance. This instance is an Estimator. + // Create a LogisticRegression instance. This instance is an Estimator. val lr = new LogisticRegression() // Print out the parameters, documentation, and any default values. println("LogisticRegression parameters:\n" + lr.explainParams() + "\n") @@ -52,7 +52,7 @@ object EstimatorTransformerParamExample { lr.setMaxIter(10) .setRegParam(0.01) - // Learn a LogisticRegression model. This uses the parameters stored in lr. + // Learn a LogisticRegression model. This uses the parameters stored in lr. val model1 = lr.fit(training) // Since model1 is a Model (i.e., a Transformer produced by an Estimator), // we can view the parameters it used during fit(). @@ -63,11 +63,11 @@ object EstimatorTransformerParamExample { // We may alternatively specify parameters using a ParamMap, // which supports several methods for specifying parameters. val paramMap = ParamMap(lr.maxIter -> 20) - .put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter. + .put(lr.maxIter, 30) // Specify 1 Param. 
This overwrites the original maxIter. .put(lr.regParam -> 0.1, lr.threshold -> 0.55) // Specify multiple Params. // One can also combine ParamMaps. - val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name + val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name. val paramMapCombined = paramMap ++ paramMap2 // Now learn a new model using the paramMapCombined parameters. diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala index 6b0be0f34e..a4274ae954 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala @@ -23,13 +23,12 @@ import scala.language.reflectiveCalls import scopt.OptionParser -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier} import org.apache.spark.ml.feature.{StringIndexer, VectorIndexer} import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor} -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, SparkSession} /** @@ -37,7 +36,7 @@ import org.apache.spark.sql.DataFrame * {{{ * ./bin/run-example ml.GBTExample [options] * }}} - * Decision Trees and ensembles can take a large amount of memory. If the run-example command + * Decision Trees and ensembles can take a large amount of memory. If the run-example command * above fails, try running via spark-submit and specifying the amount of memory as at least 1g. * For local mode, run * {{{ @@ -88,7 +87,7 @@ object GBTExample { .text(s"number of trees in ensemble, default: ${defaultParams.maxIter}") .action((x, c) => c.copy(maxIter = x)) opt[Double]("fracTest") - .text(s"fraction of data to hold out for testing. If given option testInput, " + + .text(s"fraction of data to hold out for testing. If given option testInput, " + s"this option is ignored. default: ${defaultParams.fracTest}") .action((x, c) => c.copy(fracTest = x)) opt[Boolean]("cacheNodeIds") @@ -109,7 +108,7 @@ object GBTExample { s"default: ${defaultParams.checkpointInterval}") .action((x, c) => c.copy(checkpointInterval = x)) opt[String]("testInput") - .text(s"input path to test dataset. If given, option fracTest is ignored." + + .text(s"input path to test dataset. If given, option fracTest is ignored." + s" default: ${defaultParams.testInput}") .action((x, c) => c.copy(testInput = x)) opt[String]("dataFormat") @@ -136,15 +135,18 @@ object GBTExample { } def run(params: Params) { - val conf = new SparkConf().setAppName(s"GBTExample with $params") - val sc = new SparkContext(conf) - params.checkpointDir.foreach(sc.setCheckpointDir) + val spark = SparkSession + .builder + .appName(s"GBTExample with $params") + .getOrCreate() + + params.checkpointDir.foreach(spark.sparkContext.setCheckpointDir) val algo = params.algo.toLowerCase println(s"GBTExample with parameters:\n$params") // Load training and test data and cache it. 
- val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input, + val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input, params.dataFormat, params.testInput, algo, params.fracTest) // Set up Pipeline @@ -164,7 +166,7 @@ object GBTExample { .setOutputCol("indexedFeatures") .setMaxCategories(10) stages += featuresIndexer - // (3) Learn GBT + // (3) Learn GBT. val dt = algo match { case "classification" => new GBTClassifier() @@ -193,13 +195,13 @@ object GBTExample { stages += dt val pipeline = new Pipeline().setStages(stages.toArray) - // Fit the Pipeline + // Fit the Pipeline. val startTime = System.nanoTime() val pipelineModel = pipeline.fit(training) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") - // Get the trained GBT from the fitted PipelineModel + // Get the trained GBT from the fitted PipelineModel. algo match { case "classification" => val rfModel = pipelineModel.stages.last.asInstanceOf[GBTClassificationModel] @@ -218,7 +220,7 @@ object GBTExample { case _ => throw new IllegalArgumentException("Algo ${params.algo} not supported.") } - // Evaluate model on training, test data + // Evaluate model on training, test data. algo match { case "classification" => println("Training data results:") @@ -234,7 +236,7 @@ object GBTExample { throw new IllegalArgumentException("Algo ${params.algo} not supported.") } - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala index b6a8baba2d..0d1ffbe225 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala @@ -51,7 +51,7 @@ object GradientBoostedTreeClassifierExample { .setMaxCategories(4) .fit(data) - // Split the data into training and test sets (30% held out for testing) + // Split the data into training and test sets (30% held out for testing). val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) // Train a GBT model. @@ -66,11 +66,11 @@ object GradientBoostedTreeClassifierExample { .setOutputCol("predictedLabel") .setLabels(labelIndexer.labels) - // Chain indexers and GBT in a Pipeline + // Chain indexers and GBT in a Pipeline. val pipeline = new Pipeline() .setStages(Array(labelIndexer, featureIndexer, gbt, labelConverter)) - // Train model. This also runs the indexers. + // Train model. This also runs the indexers. val model = pipeline.fit(trainingData) // Make predictions. @@ -79,7 +79,7 @@ object GradientBoostedTreeClassifierExample { // Select example rows to display. predictions.select("predictedLabel", "label", "features").show(5) - // Select (prediction, true label) and compute test error + // Select (prediction, true label) and compute test error. 
val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("indexedLabel") .setPredictionCol("prediction") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala index 62285b83cb..e53aab7f32 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala @@ -45,7 +45,7 @@ object GradientBoostedTreeRegressorExample { .setMaxCategories(4) .fit(data) - // Split the data into training and test sets (30% held out for testing) + // Split the data into training and test sets (30% held out for testing). val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) // Train a GBT model. @@ -54,11 +54,11 @@ object GradientBoostedTreeRegressorExample { .setFeaturesCol("indexedFeatures") .setMaxIter(10) - // Chain indexer and GBT in a Pipeline + // Chain indexer and GBT in a Pipeline. val pipeline = new Pipeline() .setStages(Array(featureIndexer, gbt)) - // Train model. This also runs the indexer. + // Train model. This also runs the indexer. val model = pipeline.fit(trainingData) // Make predictions. @@ -67,7 +67,7 @@ object GradientBoostedTreeRegressorExample { // Select example rows to display. predictions.select("prediction", "label", "features").show(5) - // Select (prediction, true label) and compute test error + // Select (prediction, true label) and compute test error. val evaluator = new RegressionEvaluator() .setLabelCol("label") .setPredictionCol("prediction") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala index 25be87811d..de96fb2979 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala @@ -22,10 +22,9 @@ import scala.language.reflectiveCalls import scopt.OptionParser -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.regression.LinearRegression -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, SparkSession} /** * An example runner for linear regression with elastic-net (mixing L1/L2) regularization. @@ -74,11 +73,11 @@ object LinearRegressionExample { s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}") .action((x, c) => c.copy(tol = x)) opt[Double]("fracTest") - .text(s"fraction of data to hold out for testing. If given option testInput, " + + .text(s"fraction of data to hold out for testing. If given option testInput, " + s"this option is ignored. default: ${defaultParams.fracTest}") .action((x, c) => c.copy(fracTest = x)) opt[String]("testInput") - .text(s"input path to test dataset. If given, option fracTest is ignored." + + .text(s"input path to test dataset. If given, option fracTest is ignored." 
+ s" default: ${defaultParams.testInput}") .action((x, c) => c.copy(testInput = x)) opt[String]("dataFormat") @@ -105,13 +104,15 @@ object LinearRegressionExample { } def run(params: Params) { - val conf = new SparkConf().setAppName(s"LinearRegressionExample with $params") - val sc = new SparkContext(conf) + val spark = SparkSession + .builder + .appName(s"LinearRegressionExample with $params") + .getOrCreate() println(s"LinearRegressionExample with parameters:\n$params") // Load training and test data and cache it. - val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input, + val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input, params.dataFormat, params.testInput, "regression", params.fracTest) val lir = new LinearRegression() @@ -136,7 +137,7 @@ object LinearRegressionExample { println("Test data results:") DecisionTreeExample.evaluateRegressionModel(lirModel, test, "label") - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala index a380c90662..c2a87e1ddf 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala @@ -23,12 +23,11 @@ import scala.language.reflectiveCalls import scopt.OptionParser -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.feature.StringIndexer -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, SparkSession} /** * An example runner for logistic regression with elastic-net (mixing L1/L2) regularization. @@ -81,11 +80,11 @@ object LogisticRegressionExample { s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}") .action((x, c) => c.copy(tol = x)) opt[Double]("fracTest") - .text(s"fraction of data to hold out for testing. If given option testInput, " + + .text(s"fraction of data to hold out for testing. If given option testInput, " + s"this option is ignored. default: ${defaultParams.fracTest}") .action((x, c) => c.copy(fracTest = x)) opt[String]("testInput") - .text(s"input path to test dataset. If given, option fracTest is ignored." + + .text(s"input path to test dataset. If given, option fracTest is ignored." + s" default: ${defaultParams.testInput}") .action((x, c) => c.copy(testInput = x)) opt[String]("dataFormat") @@ -112,16 +111,18 @@ object LogisticRegressionExample { } def run(params: Params) { - val conf = new SparkConf().setAppName(s"LogisticRegressionExample with $params") - val sc = new SparkContext(conf) + val spark = SparkSession + .builder + .appName(s"LogisticRegressionExample with $params") + .getOrCreate() println(s"LogisticRegressionExample with parameters:\n$params") // Load training and test data and cache it. - val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input, + val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input, params.dataFormat, params.testInput, "classification", params.fracTest) - // Set up Pipeline + // Set up Pipeline. 
val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() @@ -141,7 +142,7 @@ object LogisticRegressionExample { stages += lor val pipeline = new Pipeline().setStages(stages.toArray) - // Fit the Pipeline + // Fit the Pipeline. val startTime = System.nanoTime() val pipelineModel = pipeline.fit(training) val elapsedTime = (System.nanoTime() - startTime) / 1e9 @@ -156,7 +157,7 @@ object LogisticRegressionExample { println("Test data results:") DecisionTreeExample.evaluateClassificationModel(pipelineModel, test, "indexedLabel") - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala index fcba813d5b..616263b8e9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala @@ -27,7 +27,9 @@ object LogisticRegressionWithElasticNetExample { def main(args: Array[String]): Unit = { val spark = SparkSession - .builder.appName("LogisticRegressionWithElasticNetExample").getOrCreate() + .builder + .appName("LogisticRegressionWithElasticNetExample") + .getOrCreate() // $example on$ // Load training data diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala index 5fb3536060..c29d36210a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala @@ -42,7 +42,9 @@ object ModelSelectionViaCrossValidationExample { def main(args: Array[String]): Unit = { val spark = SparkSession - .builder.appName("ModelSelectionViaCrossValidationExample").getOrCreate() + .builder + .appName("ModelSelectionViaCrossValidationExample") + .getOrCreate() // $example on$ // Prepare training data from a list of (id, text, label) tuples. diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala index 6bc082982c..75fef2922a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala @@ -36,7 +36,9 @@ object ModelSelectionViaTrainValidationSplitExample { def main(args: Array[String]): Unit = { val spark = SparkSession - .builder.appName("ModelSelectionViaTrainValidationSplitExample").getOrCreate() + .builder + .appName("ModelSelectionViaTrainValidationSplitExample") + .getOrCreate() // $example on$ // Prepare training and test data. 
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala index ae0bd945d8..cccc4a6ea2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala @@ -51,7 +51,7 @@ object RandomForestClassifierExample { .setMaxCategories(4) .fit(data) - // Split the data into training and test sets (30% held out for testing) + // Split the data into training and test sets (30% held out for testing). val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) // Train a RandomForest model. @@ -66,11 +66,11 @@ object RandomForestClassifierExample { .setOutputCol("predictedLabel") .setLabels(labelIndexer.labels) - // Chain indexers and forest in a Pipeline + // Chain indexers and forest in a Pipeline. val pipeline = new Pipeline() .setStages(Array(labelIndexer, featureIndexer, rf, labelConverter)) - // Train model. This also runs the indexers. + // Train model. This also runs the indexers. val model = pipeline.fit(trainingData) // Make predictions. @@ -79,7 +79,7 @@ object RandomForestClassifierExample { // Select example rows to display. predictions.select("predictedLabel", "label", "features").show(5) - // Select (prediction, true label) and compute test error + // Select (prediction, true label) and compute test error. val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("indexedLabel") .setPredictionCol("prediction") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala index 7a00d99dfe..2419dc49cd 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala @@ -23,13 +23,12 @@ import scala.language.reflectiveCalls import scopt.OptionParser -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} import org.apache.spark.ml.feature.{StringIndexer, VectorIndexer} import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor} -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, SparkSession} /** @@ -37,7 +36,7 @@ import org.apache.spark.sql.DataFrame * {{{ * ./bin/run-example ml.RandomForestExample [options] * }}} - * Decision Trees and ensembles can take a large amount of memory. If the run-example command + * Decision Trees and ensembles can take a large amount of memory. If the run-example command * above fails, try running via spark-submit and specifying the amount of memory as at least 1g. * For local mode, run * {{{ @@ -94,7 +93,7 @@ object RandomForestExample { s" default: ${defaultParams.numTrees}") .action((x, c) => c.copy(featureSubsetStrategy = x)) opt[Double]("fracTest") - .text(s"fraction of data to hold out for testing. If given option testInput, " + + .text(s"fraction of data to hold out for testing. If given option testInput, " + s"this option is ignored. 
default: ${defaultParams.fracTest}") .action((x, c) => c.copy(fracTest = x)) opt[Boolean]("cacheNodeIds") @@ -115,7 +114,7 @@ object RandomForestExample { s"default: ${defaultParams.checkpointInterval}") .action((x, c) => c.copy(checkpointInterval = x)) opt[String]("testInput") - .text(s"input path to test dataset.  If given, option fracTest is ignored." + + .text(s"input path to test dataset. If given, option fracTest is ignored." + s" default: ${defaultParams.testInput}") .action((x, c) => c.copy(testInput = x)) opt[String]("dataFormat") @@ -142,18 +141,21 @@ object RandomForestExample { } def run(params: Params) { - val conf = new SparkConf().setAppName(s"RandomForestExample with $params") - val sc = new SparkContext(conf) - params.checkpointDir.foreach(sc.setCheckpointDir) + val spark = SparkSession + .builder + .appName(s"RandomForestExample with $params") + .getOrCreate() + + params.checkpointDir.foreach(spark.sparkContext.setCheckpointDir) val algo = params.algo.toLowerCase println(s"RandomForestExample with parameters:\n$params") // Load training and test data and cache it. - val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input, + val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input, params.dataFormat, params.testInput, algo, params.fracTest) - // Set up Pipeline + // Set up Pipeline. val stages = new mutable.ArrayBuffer[PipelineStage]() // (1) For classification, re-index classes. val labelColName = if (algo == "classification") "indexedLabel" else "label" @@ -170,7 +172,7 @@ object RandomForestExample { .setOutputCol("indexedFeatures") .setMaxCategories(10) stages += featuresIndexer - // (3) Learn Random Forest + // (3) Learn Random Forest. val dt = algo match { case "classification" => new RandomForestClassifier() @@ -201,13 +203,13 @@ object RandomForestExample { stages += dt val pipeline = new Pipeline().setStages(stages.toArray) - // Fit the Pipeline + // Fit the Pipeline. val startTime = System.nanoTime() val pipelineModel = pipeline.fit(training) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") - // Get the trained Random Forest from the fitted PipelineModel + // Get the trained Random Forest from the fitted PipelineModel. algo match { case "classification" => val rfModel = pipelineModel.stages.last.asInstanceOf[RandomForestClassificationModel] @@ -226,7 +228,7 @@ object RandomForestExample { case _ => throw new IllegalArgumentException(s"Algo ${params.algo} not supported.") } - // Evaluate model on training, test data + // Evaluate model on training, test data.
algo match { case "classification" => println("Training data results:") @@ -242,7 +244,7 @@ object RandomForestExample { throw new IllegalArgumentException(s"Algo ${params.algo} not supported.") } - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala index 96dc2f05be..9a0a001c26 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala @@ -45,7 +45,7 @@ object RandomForestRegressorExample { .setMaxCategories(4) .fit(data) - // Split the data into training and test sets (30% held out for testing) + // Split the data into training and test sets (30% held out for testing). val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) // Train a RandomForest model. @@ -53,11 +53,11 @@ object RandomForestRegressorExample { .setLabelCol("label") .setFeaturesCol("indexedFeatures") - // Chain indexer and forest in a Pipeline + // Chain indexer and forest in a Pipeline. val pipeline = new Pipeline() .setStages(Array(featureIndexer, rf)) - // Train model.  This also runs the indexer. + // Train model. This also runs the indexer. val model = pipeline.fit(trainingData) // Make predictions. @@ -66,7 +66,7 @@ object RandomForestRegressorExample { // Select example rows to display. predictions.select("prediction", "label", "features").show(5) - // Select (prediction, true label) and compute test error + // Select (prediction, true label) and compute test error. val evaluator = new RegressionEvaluator() .setLabelCol("label") .setPredictionCol("prediction") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala index 3547dd95bd..83bab5c557 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala @@ -41,7 +41,7 @@ object SimpleParamsExample { import spark.implicits._ // Prepare training data. - // We use LabeledPoint, which is a case class.  Spark SQL can convert RDDs of case classes + // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes // into DataFrames, where it uses the case class metadata to infer the schema. val training = spark.createDataFrame(Seq( LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), @@ -49,7 +49,7 @@ object SimpleParamsExample { LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)))) - // Create a LogisticRegression instance.  This instance is an Estimator. + // Create a LogisticRegression instance. This instance is an Estimator. val lr = new LogisticRegression() // Print out the parameters, documentation, and any default values. println("LogisticRegression parameters:\n" + lr.explainParams() + "\n") @@ -58,7 +58,7 @@ object SimpleParamsExample { lr.setMaxIter(10) .setRegParam(0.01) - // Learn a LogisticRegression model.  This uses the parameters stored in lr. + // Learn a LogisticRegression model. This uses the parameters stored in lr. val model1 = lr.fit(training) // Since model1 is a Model (i.e., a Transformer produced by an Estimator), // we can view the parameters it used during fit().
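The next hunk picks up from that comment; as a minimal sketch of what viewing those parameters looks like (model1 comes from lr.fit(training) above; the println line is illustrative, not a line of this patch):

{{{
// parent is the Estimator that produced model1, so extractParamMap() shows
// the parameter values that were in effect during fit().
println("Model 1 was fit using parameters: " + model1.parent.extractParamMap())
}}}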
@@ -69,7 +69,7 @@ object SimpleParamsExample { // We may alternatively specify parameters using a ParamMap, // which supports several methods for specifying parameters. val paramMap = ParamMap(lr.maxIter -> 20) - paramMap.put(lr.maxIter, 30) // Specify 1 Param.  This overwrites the original maxIter. + paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter. paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.45, 0.55)) // Specify multiple Params. // One can also combine ParamMaps.
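To close the thought in that last comment, a hedged sketch of combining ParamMaps and fitting with the result (paramMap2, paramMapCombined, and model2 are illustrative names; lr, paramMap, and training come from the example above):

{{{
import org.apache.spark.ml.param.ParamMap

// `++` merges two ParamMaps; fit(dataset, paramMap) applies the merged
// overrides on top of the parameters already stored in lr.
val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability")  // rename the output column
val paramMapCombined = paramMap ++ paramMap2
val model2 = lr.fit(training, paramMapCombined)
println("Model 2 was fit using parameters: " + model2.parent.extractParamMap())
}}}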