author     Zheng RuiFeng <ruifengz@foxmail.com>    2016-05-11 22:45:30 -0700
committer  Andrew Or <andrew@databricks.com>       2016-05-11 22:45:30 -0700
commit     9e266d07a444fd465fe178cdd5c4894cd09cbda3 (patch)
tree       e6ddfe9c9f6f6e8c5bc8c4b91b44c5fc5a948fac /examples/src/main
parent     ba5487c061168627b27af2fa9610d53791fcc90d (diff)
[SPARK-15031][SPARK-15134][EXAMPLE][DOC] Use SparkSession and update indent in examples
## What changes were proposed in this pull request?

1. Use `SparkSession` according to [SPARK-15031](https://issues.apache.org/jira/browse/SPARK-15031).
2. Update the indentation of `SparkSession` builder calls according to [SPARK-15134](https://issues.apache.org/jira/browse/SPARK-15134).
3. Also remove some duplicated spaces and add missing periods in comments.

## How was this patch tested?

Manual tests.

Author: Zheng RuiFeng <ruifengz@foxmail.com>

Closes #13050 from zhengruifeng/use_sparksession.
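The core migration applied across these examples replaces direct `SparkConf`/`SparkContext` construction with the `SparkSession` builder, written one call per line. Below is a minimal Scala sketch of the before/after pattern; the object name, app name, and checkpoint path are illustrative placeholders, not taken from any one example file.

```scala
import org.apache.spark.sql.SparkSession

// Old style, removed by this commit:
//   val conf = new SparkConf().setAppName("ExampleApp")
//   val sc = new SparkContext(conf)
//   params.checkpointDir.foreach(sc.setCheckpointDir)

object ExampleApp {  // placeholder object name
  def main(args: Array[String]): Unit = {
    // New style: build (or reuse) a session, one builder call per line.
    val spark = SparkSession
      .builder
      .appName("ExampleApp")
      .getOrCreate()

    // The underlying SparkContext stays reachable where it is still needed,
    // e.g. for setting a checkpoint directory.
    spark.sparkContext.setCheckpointDir("/tmp/checkpoints")  // placeholder path

    spark.stop()  // replaces sc.stop()
  }
}
```

`getOrCreate` returns the active session if one already exists, so examples that previously managed a `SparkContext` lifecycle now simply call `spark.stop()`.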
Diffstat (limited to 'examples/src/main')
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java | 14
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java | 12
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java | 6
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java | 6
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java | 12
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java | 12
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java | 6
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java | 8
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala | 10
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala | 39
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala | 8
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala | 14
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala | 8
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala | 30
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala | 8
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala | 8
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala | 17
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala | 21
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala | 8
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala | 32
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala | 8
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala | 8
34 files changed, 192 insertions, 151 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
index 733bc4181c..bdb76f004f 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
@@ -32,7 +32,9 @@ import org.apache.spark.sql.SparkSession;
public class JavaDecisionTreeClassificationExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaDecisionTreeClassificationExample").getOrCreate();
+ .builder()
+ .appName("JavaDecisionTreeClassificationExample")
+ .getOrCreate();
// $example on$
// Load the data stored in LIBSVM format as a DataFrame.
@@ -52,10 +54,10 @@ public class JavaDecisionTreeClassificationExample {
VectorIndexerModel featureIndexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("indexedFeatures")
- .setMaxCategories(4) // features with > 4 distinct values are treated as continuous
+ .setMaxCategories(4) // features with > 4 distinct values are treated as continuous.
.fit(data);
- // Split the data into training and test sets (30% held out for testing)
+ // Split the data into training and test sets (30% held out for testing).
Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
Dataset<Row> trainingData = splits[0];
Dataset<Row> testData = splits[1];
@@ -71,11 +73,11 @@ public class JavaDecisionTreeClassificationExample {
.setOutputCol("predictedLabel")
.setLabels(labelIndexer.labels());
- // Chain indexers and tree in a Pipeline
+ // Chain indexers and tree in a Pipeline.
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter});
- // Train model.  This also runs the indexers.
+ // Train model. This also runs the indexers.
PipelineModel model = pipeline.fit(trainingData);
// Make predictions.
@@ -84,7 +86,7 @@ public class JavaDecisionTreeClassificationExample {
// Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(5);
- // Select (prediction, true label) and compute test error
+ // Select (prediction, true label) and compute test error.
MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("indexedLabel")
.setPredictionCol("prediction")
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
index bd6dc3edd3..cffb7139ed 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
@@ -33,7 +33,9 @@ import org.apache.spark.sql.SparkSession;
public class JavaDecisionTreeRegressionExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaDecisionTreeRegressionExample").getOrCreate();
+ .builder()
+ .appName("JavaDecisionTreeRegressionExample")
+ .getOrCreate();
// $example on$
// Load the data stored in LIBSVM format as a DataFrame.
Dataset<Row> data = spark.read().format("libsvm")
@@ -47,7 +49,7 @@ public class JavaDecisionTreeRegressionExample {
.setMaxCategories(4)
.fit(data);
- // Split the data into training and test sets (30% held out for testing)
+ // Split the data into training and test sets (30% held out for testing).
Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
Dataset<Row> trainingData = splits[0];
Dataset<Row> testData = splits[1];
@@ -56,11 +58,11 @@ public class JavaDecisionTreeRegressionExample {
DecisionTreeRegressor dt = new DecisionTreeRegressor()
.setFeaturesCol("indexedFeatures");
- // Chain indexer and tree in a Pipeline
+ // Chain indexer and tree in a Pipeline.
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[]{featureIndexer, dt});
- // Train model.  This also runs the indexer.
+ // Train model. This also runs the indexer.
PipelineModel model = pipeline.fit(trainingData);
// Make predictions.
@@ -69,7 +71,7 @@ public class JavaDecisionTreeRegressionExample {
// Select example rows to display.
predictions.select("label", "features").show(5);
- // Select (prediction, true label) and compute test error
+ // Select (prediction, true label) and compute test error.
RegressionEvaluator evaluator = new RegressionEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
index 49bad0afc0..3265c4d7ec 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
@@ -62,7 +62,7 @@ public class JavaDeveloperApiExample {
new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)));
Dataset<Row> training = spark.createDataFrame(localTraining, LabeledPoint.class);
- // Create a LogisticRegression instance.  This instance is an Estimator.
+ // Create a LogisticRegression instance. This instance is an Estimator.
MyJavaLogisticRegression lr = new MyJavaLogisticRegression();
// Print out the parameters, documentation, and any default values.
System.out.println("MyJavaLogisticRegression parameters:\n" + lr.explainParams() + "\n");
@@ -70,7 +70,7 @@ public class JavaDeveloperApiExample {
// We may set parameters using setter methods.
lr.setMaxIter(10);
- // Learn a LogisticRegression model.  This uses the parameters stored in lr.
+ // Learn a LogisticRegression model. This uses the parameters stored in lr.
MyJavaLogisticRegressionModel model = lr.fit(training);
// Prepare test data.
@@ -214,7 +214,7 @@ class MyJavaLogisticRegressionModel
}
/**
- * Number of classes the label can take.  2 indicates binary classification.
+ * Number of classes the label can take. 2 indicates binary classification.
*/
public int numClasses() { return 2; }
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java
index 5ba8e6cf44..889f5785df 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java
@@ -38,7 +38,9 @@ import org.apache.spark.sql.SparkSession;
public class JavaEstimatorTransformerParamExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaEstimatorTransformerParamExample").getOrCreate();
+ .builder()
+ .appName("JavaEstimatorTransformerParamExample")
+ .getOrCreate();
// $example on$
// Prepare training data.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java
index baacd796a0..5c2e03eda9 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java
@@ -75,11 +75,11 @@ public class JavaGradientBoostedTreeClassifierExample {
.setOutputCol("predictedLabel")
.setLabels(labelIndexer.labels());
- // Chain indexers and GBT in a Pipeline
+ // Chain indexers and GBT in a Pipeline.
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[] {labelIndexer, featureIndexer, gbt, labelConverter});
- // Train model.  This also runs the indexers.
+ // Train model. This also runs the indexers.
PipelineModel model = pipeline.fit(trainingData);
// Make predictions.
@@ -88,7 +88,7 @@ public class JavaGradientBoostedTreeClassifierExample {
// Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(5);
- // Select (prediction, true label) and compute test error
+ // Select (prediction, true label) and compute test error.
MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("indexedLabel")
.setPredictionCol("prediction")
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java
index 6d3f21fdaf..769b5c3e85 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java
@@ -34,7 +34,9 @@ import org.apache.spark.sql.SparkSession;
public class JavaGradientBoostedTreeRegressorExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaGradientBoostedTreeRegressorExample").getOrCreate();
+ .builder()
+ .appName("JavaGradientBoostedTreeRegressorExample")
+ .getOrCreate();
// $example on$
// Load and parse the data file, converting it to a DataFrame.
@@ -48,7 +50,7 @@ public class JavaGradientBoostedTreeRegressorExample {
.setMaxCategories(4)
.fit(data);
- // Split the data into training and test sets (30% held out for testing)
+ // Split the data into training and test sets (30% held out for testing).
Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
Dataset<Row> trainingData = splits[0];
Dataset<Row> testData = splits[1];
@@ -59,10 +61,10 @@ public class JavaGradientBoostedTreeRegressorExample {
.setFeaturesCol("indexedFeatures")
.setMaxIter(10);
- // Chain indexer and GBT in a Pipeline
+ // Chain indexer and GBT in a Pipeline.
Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {featureIndexer, gbt});
- // Train model.  This also runs the indexer.
+ // Train model. This also runs the indexer.
PipelineModel model = pipeline.fit(trainingData);
// Make predictions.
@@ -71,7 +73,7 @@ public class JavaGradientBoostedTreeRegressorExample {
// Select example rows to display.
predictions.select("prediction", "label", "features").show(5);
- // Select (prediction, true label) and compute test error
+ // Select (prediction, true label) and compute test error.
RegressionEvaluator evaluator = new RegressionEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java
index b6ea1fed25..dcd209e28e 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java
@@ -30,10 +30,12 @@ import org.apache.spark.sql.SparkSession;
public class JavaLinearRegressionWithElasticNetExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaLinearRegressionWithElasticNetExample").getOrCreate();
+ .builder()
+ .appName("JavaLinearRegressionWithElasticNetExample")
+ .getOrCreate();
// $example on$
- // Load training data
+ // Load training data.
Dataset<Row> training = spark.read().format("libsvm")
.load("data/mllib/sample_linear_regression_data.txt");
@@ -42,14 +44,14 @@ public class JavaLinearRegressionWithElasticNetExample {
.setRegParam(0.3)
.setElasticNetParam(0.8);
- // Fit the model
+ // Fit the model.
LinearRegressionModel lrModel = lr.fit(training);
- // Print the coefficients and intercept for linear regression
+ // Print the coefficients and intercept for linear regression.
System.out.println("Coefficients: "
+ lrModel.coefficients() + " Intercept: " + lrModel.intercept());
- // Summarize the model over the training set and print out some metrics
+ // Summarize the model over the training set and print out some metrics.
LinearRegressionTrainingSummary trainingSummary = lrModel.summary();
System.out.println("numIterations: " + trainingSummary.totalIterations());
System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java
index fd040aead4..dee56799d8 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java
@@ -31,7 +31,9 @@ import org.apache.spark.sql.functions;
public class JavaLogisticRegressionSummaryExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaLogisticRegressionSummaryExample").getOrCreate();
+ .builder()
+ .appName("JavaLogisticRegressionSummaryExample")
+ .getOrCreate();
// Load training data
Dataset<Row> training = spark.read().format("libsvm")
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java
index f00c7a05cd..6101c79fb0 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java
@@ -28,7 +28,9 @@ import org.apache.spark.sql.SparkSession;
public class JavaLogisticRegressionWithElasticNetExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaLogisticRegressionWithElasticNetExample").getOrCreate();
+ .builder()
+ .appName("JavaLogisticRegressionWithElasticNetExample")
+ .getOrCreate();
// $example on$
// Load training data
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java
index a4ec4f5815..975c65edc0 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java
@@ -43,7 +43,9 @@ import org.apache.spark.sql.SparkSession;
public class JavaModelSelectionViaCrossValidationExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaModelSelectionViaCrossValidationExample").getOrCreate();
+ .builder()
+ .appName("JavaModelSelectionViaCrossValidationExample")
+ .getOrCreate();
// $example on$
// Prepare training documents, which are labeled.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java
index 63a0ad1cb8..0f96293f03 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java
@@ -43,7 +43,9 @@ import org.apache.spark.sql.SparkSession;
public class JavaModelSelectionViaTrainValidationSplitExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaModelSelectionViaTrainValidationSplitExample").getOrCreate();
+ .builder()
+ .appName("JavaModelSelectionViaTrainValidationSplitExample")
+ .getOrCreate();
// $example on$
Dataset<Row> data = spark.read().format("libsvm")
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
index d547a2a64b..c7d03d8593 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
@@ -33,7 +33,9 @@ public class JavaMultilayerPerceptronClassifierExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaMultilayerPerceptronClassifierExample").getOrCreate();
+ .builder()
+ .appName("JavaMultilayerPerceptronClassifierExample")
+ .getOrCreate();
// $example on$
// Load training data
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java
index 94e3fafcab..16f58a852d 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java
@@ -35,7 +35,9 @@ import org.apache.spark.sql.types.StructType;
public class JavaQuantileDiscretizerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaQuantileDiscretizerExample").getOrCreate();
+ .builder()
+ .appName("JavaQuantileDiscretizerExample")
+ .getOrCreate();
// $example on$
List<Row> data = Arrays.asList(
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java
index 21e783a968..14af2fbbbb 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java
@@ -33,7 +33,9 @@ import org.apache.spark.sql.SparkSession;
public class JavaRandomForestClassifierExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaRandomForestClassifierExample").getOrCreate();
+ .builder()
+ .appName("JavaRandomForestClassifierExample")
+ .getOrCreate();
// $example on$
// Load and parse the data file, converting it to a DataFrame.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java
index ece184a878..a7078453de 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java
@@ -34,7 +34,9 @@ import org.apache.spark.sql.SparkSession;
public class JavaRandomForestRegressorExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaRandomForestRegressorExample").getOrCreate();
+ .builder()
+ .appName("JavaRandomForestRegressorExample")
+ .getOrCreate();
// $example on$
// Load and parse the data file, converting it to a DataFrame.
@@ -62,7 +64,7 @@ public class JavaRandomForestRegressorExample {
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[] {featureIndexer, rf});
- // Train model.  This also runs the indexer.
+ // Train model. This also runs the indexer.
PipelineModel model = pipeline.fit(trainingData);
// Make predictions.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
index 0787079ba4..ff1eb07dc6 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
@@ -46,7 +46,7 @@ public class JavaSimpleParamsExample {
.getOrCreate();
// Prepare training data.
- // We use LabeledPoint, which is a JavaBean.  Spark SQL can convert RDDs of JavaBeans
+ // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
// into DataFrames, where it uses the bean metadata to infer the schema.
List<LabeledPoint> localTraining = Lists.newArrayList(
new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
@@ -56,7 +56,7 @@ public class JavaSimpleParamsExample {
Dataset<Row> training =
spark.createDataFrame(localTraining, LabeledPoint.class);
- // Create a LogisticRegression instance.  This instance is an Estimator.
+ // Create a LogisticRegression instance. This instance is an Estimator.
LogisticRegression lr = new LogisticRegression();
// Print out the parameters, documentation, and any default values.
System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");
@@ -65,7 +65,7 @@ public class JavaSimpleParamsExample {
lr.setMaxIter(10)
.setRegParam(0.01);
- // Learn a LogisticRegression model.  This uses the parameters stored in lr.
+ // Learn a LogisticRegression model. This uses the parameters stored in lr.
LogisticRegressionModel model1 = lr.fit(training);
// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
// we can view the parameters it used during fit().
@@ -82,7 +82,7 @@ public class JavaSimpleParamsExample {
// One can also combine ParamMaps.
ParamMap paramMap2 = new ParamMap();
- paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name
+ paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name.
ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);
// Now learn a new model using the paramMapCombined parameters.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
index 9516ce1f4f..7c24c46d2e 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
@@ -43,7 +43,9 @@ public class JavaSimpleTextClassificationPipeline {
public static void main(String[] args) {
SparkSession spark = SparkSession
- .builder().appName("JavaSimpleTextClassificationPipeline").getOrCreate();
+ .builder()
+ .appName("JavaSimpleTextClassificationPipeline")
+ .getOrCreate();
// Prepare training documents, which are labeled.
List<LabeledDocument> localTraining = Lists.newArrayList(
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
index 7f6c8de967..b3103ced91 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
@@ -47,10 +47,10 @@ object DecisionTreeClassificationExample {
val featureIndexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("indexedFeatures")
- .setMaxCategories(4) // features with > 4 distinct values are treated as continuous
+ .setMaxCategories(4) // features with > 4 distinct values are treated as continuous.
.fit(data)
- // Split the data into training and test sets (30% held out for testing)
+ // Split the data into training and test sets (30% held out for testing).
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
// Train a DecisionTree model.
@@ -64,11 +64,11 @@ object DecisionTreeClassificationExample {
.setOutputCol("predictedLabel")
.setLabels(labelIndexer.labels)
- // Chain indexers and tree in a Pipeline
+ // Chain indexers and tree in a Pipeline.
val pipeline = new Pipeline()
.setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))
- // Train model.  This also runs the indexers.
+ // Train model. This also runs the indexers.
val model = pipeline.fit(trainingData)
// Make predictions.
@@ -77,7 +77,7 @@ object DecisionTreeClassificationExample {
// Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(5)
- // Select (prediction, true label) and compute test error
+ // Select (prediction, true label) and compute test error.
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("indexedLabel")
.setPredictionCol("prediction")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
index eadb02ab0d..310418008c 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
@@ -23,7 +23,6 @@ import scala.language.reflectiveCalls
import scopt.OptionParser
-import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.{Pipeline, PipelineStage, Transformer}
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}
@@ -40,7 +39,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession}
* {{{
* ./bin/run-example ml.DecisionTreeExample [options]
* }}}
- * Note that Decision Trees can take a large amount of memory.  If the run-example command above
+ * Note that Decision Trees can take a large amount of memory. If the run-example command above
* fails, try running via spark-submit and specifying the amount of memory as at least 1g.
* For local mode, run
* {{{
@@ -87,7 +86,7 @@ object DecisionTreeExample {
.text(s"min info gain required to create a split, default: ${defaultParams.minInfoGain}")
.action((x, c) => c.copy(minInfoGain = x))
opt[Double]("fracTest")
- .text(s"fraction of data to hold out for testing. If given option testInput, " +
+ .text(s"fraction of data to hold out for testing. If given option testInput, " +
s"this option is ignored. default: ${defaultParams.fracTest}")
.action((x, c) => c.copy(fracTest = x))
opt[Boolean]("cacheNodeIds")
@@ -106,7 +105,7 @@ object DecisionTreeExample {
s"default: ${defaultParams.checkpointInterval}")
.action((x, c) => c.copy(checkpointInterval = x))
opt[String]("testInput")
- .text(s"input path to test dataset. If given, option fracTest is ignored." +
+ .text(s"input path to test dataset. If given, option fracTest is ignored." +
s" default: ${defaultParams.testInput}")
.action((x, c) => c.copy(testInput = x))
opt[String]("dataFormat")
@@ -157,11 +156,10 @@ object DecisionTreeExample {
* @param dataFormat "libsvm" or "dense"
* @param testInput Path to test dataset.
* @param algo Classification or Regression
- * @param fracTest Fraction of input data to hold out for testing.  Ignored if testInput given.
+ * @param fracTest Fraction of input data to hold out for testing. Ignored if testInput given.
* @return (training dataset, test dataset)
*/
private[ml] def loadDatasets(
- sc: SparkContext,
input: String,
dataFormat: String,
testInput: String,
@@ -200,18 +198,21 @@ object DecisionTreeExample {
}
def run(params: Params) {
- val conf = new SparkConf().setAppName(s"DecisionTreeExample with $params")
- val sc = new SparkContext(conf)
- params.checkpointDir.foreach(sc.setCheckpointDir)
+ val spark = SparkSession
+ .builder
+ .appName(s"DecisionTreeExample with $params")
+ .getOrCreate()
+
+ params.checkpointDir.foreach(spark.sparkContext.setCheckpointDir)
val algo = params.algo.toLowerCase
println(s"DecisionTreeExample with parameters:\n$params")
// Load training and test data and cache it.
val (training: DataFrame, test: DataFrame) =
- loadDatasets(sc, params.input, params.dataFormat, params.testInput, algo, params.fracTest)
+ loadDatasets(params.input, params.dataFormat, params.testInput, algo, params.fracTest)
- // Set up Pipeline
+ // Set up Pipeline.
val stages = new mutable.ArrayBuffer[PipelineStage]()
// (1) For classification, re-index classes.
val labelColName = if (algo == "classification") "indexedLabel" else "label"
@@ -228,7 +229,7 @@ object DecisionTreeExample {
.setOutputCol("indexedFeatures")
.setMaxCategories(10)
stages += featuresIndexer
- // (3) Learn Decision Tree
+ // (3) Learn Decision Tree.
val dt = algo match {
case "classification" =>
new DecisionTreeClassifier()
@@ -255,13 +256,13 @@ object DecisionTreeExample {
stages += dt
val pipeline = new Pipeline().setStages(stages.toArray)
- // Fit the Pipeline
+ // Fit the Pipeline.
val startTime = System.nanoTime()
val pipelineModel = pipeline.fit(training)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")
- // Get the trained Decision Tree from the fitted PipelineModel
+ // Get the trained Decision Tree from the fitted PipelineModel.
algo match {
case "classification" =>
val treeModel = pipelineModel.stages.last.asInstanceOf[DecisionTreeClassificationModel]
@@ -280,7 +281,7 @@ object DecisionTreeExample {
case _ => throw new IllegalArgumentException("Algo ${params.algo} not supported.")
}
- // Evaluate model on training, test data
+ // Evaluate model on training, test data.
algo match {
case "classification" =>
println("Training data results:")
@@ -296,11 +297,11 @@ object DecisionTreeExample {
throw new IllegalArgumentException("Algo ${params.algo} not supported.")
}
- sc.stop()
+ spark.stop()
}
/**
- * Evaluate the given ClassificationModel on data.  Print the results.
+ * Evaluate the given ClassificationModel on data. Print the results.
* @param model Must fit ClassificationModel abstraction
* @param data DataFrame with "prediction" and labelColName columns
* @param labelColName Name of the labelCol parameter for the model
@@ -314,7 +315,7 @@ object DecisionTreeExample {
val fullPredictions = model.transform(data).cache()
val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0))
val labels = fullPredictions.select(labelColName).rdd.map(_.getDouble(0))
- // Print number of classes for reference
+ // Print number of classes for reference.
val numClasses = MetadataUtils.getNumClasses(fullPredictions.schema(labelColName)) match {
case Some(n) => n
case None => throw new RuntimeException(
@@ -325,7 +326,7 @@ object DecisionTreeExample {
}
/**
- * Evaluate the given RegressionModel on data.  Print the results.
+ * Evaluate the given RegressionModel on data. Print the results.
* @param model Must fit RegressionModel abstraction
* @param data DataFrame with "prediction" and labelColName columns
* @param labelColName Name of the labelCol parameter for the model
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
index 799070ef47..ee61200ad1 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
@@ -46,7 +46,7 @@ object DecisionTreeRegressionExample {
.setMaxCategories(4)
.fit(data)
- // Split the data into training and test sets (30% held out for testing)
+ // Split the data into training and test sets (30% held out for testing).
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
// Train a DecisionTree model.
@@ -54,11 +54,11 @@ object DecisionTreeRegressionExample {
.setLabelCol("label")
.setFeaturesCol("indexedFeatures")
- // Chain indexer and tree in a Pipeline
+ // Chain indexer and tree in a Pipeline.
val pipeline = new Pipeline()
.setStages(Array(featureIndexer, dt))
- // Train model.  This also runs the indexer.
+ // Train model. This also runs the indexer.
val model = pipeline.fit(trainingData)
// Make predictions.
@@ -67,7 +67,7 @@ object DecisionTreeRegressionExample {
// Select example rows to display.
predictions.select("prediction", "label", "features").show(5)
- // Select (prediction, true label) and compute test error
+ // Select (prediction, true label) and compute test error.
val evaluator = new RegressionEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
index a522d2127e..b8f47bf12b 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
@@ -50,7 +50,7 @@ object DeveloperApiExample {
LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))))
- // Create a LogisticRegression instance.  This instance is an Estimator.
+ // Create a LogisticRegression instance. This instance is an Estimator.
val lr = new MyLogisticRegression()
// Print out the parameters, documentation, and any default values.
println("MyLogisticRegression parameters:\n" + lr.explainParams() + "\n")
@@ -58,7 +58,7 @@ object DeveloperApiExample {
// We may set parameters using setter methods.
lr.setMaxIter(10)
- // Learn a LogisticRegression model.  This uses the parameters stored in lr.
+ // Learn a LogisticRegression model. This uses the parameters stored in lr.
val model = lr.fit(training.toDF())
// Prepare test data.
@@ -84,7 +84,7 @@ object DeveloperApiExample {
/**
* Example of defining a parameter trait for a user-defined type of [[Classifier]].
*
- * NOTE: This is private since it is an example.  In practice, you may not want it to be private.
+ * NOTE: This is private since it is an example. In practice, you may not want it to be private.
*/
private trait MyLogisticRegressionParams extends ClassifierParams {
@@ -96,7 +96,7 @@ private trait MyLogisticRegressionParams extends ClassifierParams {
* - def getMyParamName
* - def setMyParamName
* Here, we have a trait to be mixed in with the Estimator and Model (MyLogisticRegression
- * and MyLogisticRegressionModel).  We place the setter (setMaxIter) method in the Estimator
+ * and MyLogisticRegressionModel). We place the setter (setMaxIter) method in the Estimator
* class since the maxIter parameter is only used during training (not in the Model).
*/
val maxIter: IntParam = new IntParam(this, "maxIter", "max number of iterations")
@@ -106,7 +106,7 @@ private trait MyLogisticRegressionParams extends ClassifierParams {
/**
* Example of defining a type of [[Classifier]].
*
- * NOTE: This is private since it is an example.  In practice, you may not want it to be private.
+ * NOTE: This is private since it is an example. In practice, you may not want it to be private.
*/
private class MyLogisticRegression(override val uid: String)
extends Classifier[Vector, MyLogisticRegression, MyLogisticRegressionModel]
@@ -138,7 +138,7 @@ private class MyLogisticRegression(override val uid: String)
/**
* Example of defining a type of [[ClassificationModel]].
*
- * NOTE: This is private since it is an example.  In practice, you may not want it to be private.
+ * NOTE: This is private since it is an example. In practice, you may not want it to be private.
*/
private class MyLogisticRegressionModel(
override val uid: String,
@@ -169,7 +169,7 @@ private class MyLogisticRegressionModel(
Vectors.dense(-margin, margin)
}
- /** Number of classes the label can take.  2 indicates binary classification. */
+ /** Number of classes the label can take. 2 indicates binary classification. */
override val numClasses: Int = 2
/** Number of features the model was trained on. */
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala
index 972241e769..a2918d66ea 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala
@@ -43,7 +43,7 @@ object EstimatorTransformerParamExample {
(1.0, Vectors.dense(0.0, 1.2, -0.5))
)).toDF("label", "features")
- // Create a LogisticRegression instance.  This instance is an Estimator.
+ // Create a LogisticRegression instance. This instance is an Estimator.
val lr = new LogisticRegression()
// Print out the parameters, documentation, and any default values.
println("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
@@ -52,7 +52,7 @@ object EstimatorTransformerParamExample {
lr.setMaxIter(10)
.setRegParam(0.01)
- // Learn a LogisticRegression model.  This uses the parameters stored in lr.
+ // Learn a LogisticRegression model. This uses the parameters stored in lr.
val model1 = lr.fit(training)
// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
// we can view the parameters it used during fit().
@@ -63,11 +63,11 @@ object EstimatorTransformerParamExample {
// We may alternatively specify parameters using a ParamMap,
// which supports several methods for specifying parameters.
val paramMap = ParamMap(lr.maxIter -> 20)
- .put(lr.maxIter, 30) // Specify 1 Param.  This overwrites the original maxIter.
+ .put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter.
.put(lr.regParam -> 0.1, lr.threshold -> 0.55) // Specify multiple Params.
// One can also combine ParamMaps.
- val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name
+ val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name.
val paramMapCombined = paramMap ++ paramMap2
// Now learn a new model using the paramMapCombined parameters.
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
index 6b0be0f34e..a4274ae954 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
@@ -23,13 +23,12 @@ import scala.language.reflectiveCalls
import scopt.OptionParser
-import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier}
import org.apache.spark.ml.feature.{StringIndexer, VectorIndexer}
import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, SparkSession}
/**
@@ -37,7 +36,7 @@ import org.apache.spark.sql.DataFrame
* {{{
* ./bin/run-example ml.GBTExample [options]
* }}}
- * Decision Trees and ensembles can take a large amount of memory.  If the run-example command
+ * Decision Trees and ensembles can take a large amount of memory. If the run-example command
* above fails, try running via spark-submit and specifying the amount of memory as at least 1g.
* For local mode, run
* {{{
@@ -88,7 +87,7 @@ object GBTExample {
.text(s"number of trees in ensemble, default: ${defaultParams.maxIter}")
.action((x, c) => c.copy(maxIter = x))
opt[Double]("fracTest")
- .text(s"fraction of data to hold out for testing. If given option testInput, " +
+ .text(s"fraction of data to hold out for testing. If given option testInput, " +
s"this option is ignored. default: ${defaultParams.fracTest}")
.action((x, c) => c.copy(fracTest = x))
opt[Boolean]("cacheNodeIds")
@@ -109,7 +108,7 @@ object GBTExample {
s"default: ${defaultParams.checkpointInterval}")
.action((x, c) => c.copy(checkpointInterval = x))
opt[String]("testInput")
- .text(s"input path to test dataset. If given, option fracTest is ignored." +
+ .text(s"input path to test dataset. If given, option fracTest is ignored." +
s" default: ${defaultParams.testInput}")
.action((x, c) => c.copy(testInput = x))
opt[String]("dataFormat")
@@ -136,15 +135,18 @@ object GBTExample {
}
def run(params: Params) {
- val conf = new SparkConf().setAppName(s"GBTExample with $params")
- val sc = new SparkContext(conf)
- params.checkpointDir.foreach(sc.setCheckpointDir)
+ val spark = SparkSession
+ .builder
+ .appName(s"GBTExample with $params")
+ .getOrCreate()
+
+ params.checkpointDir.foreach(spark.sparkContext.setCheckpointDir)
val algo = params.algo.toLowerCase
println(s"GBTExample with parameters:\n$params")
// Load training and test data and cache it.
- val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
+ val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
params.dataFormat, params.testInput, algo, params.fracTest)
// Set up Pipeline
@@ -164,7 +166,7 @@ object GBTExample {
.setOutputCol("indexedFeatures")
.setMaxCategories(10)
stages += featuresIndexer
- // (3) Learn GBT
+ // (3) Learn GBT.
val dt = algo match {
case "classification" =>
new GBTClassifier()
@@ -193,13 +195,13 @@ object GBTExample {
stages += dt
val pipeline = new Pipeline().setStages(stages.toArray)
- // Fit the Pipeline
+ // Fit the Pipeline.
val startTime = System.nanoTime()
val pipelineModel = pipeline.fit(training)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")
- // Get the trained GBT from the fitted PipelineModel
+ // Get the trained GBT from the fitted PipelineModel.
algo match {
case "classification" =>
val rfModel = pipelineModel.stages.last.asInstanceOf[GBTClassificationModel]
@@ -218,7 +220,7 @@ object GBTExample {
case _ => throw new IllegalArgumentException("Algo ${params.algo} not supported.")
}
- // Evaluate model on training, test data
+ // Evaluate model on training, test data.
algo match {
case "classification" =>
println("Training data results:")
@@ -234,7 +236,7 @@ object GBTExample {
throw new IllegalArgumentException("Algo ${params.algo} not supported.")
}
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala
index b6a8baba2d..0d1ffbe225 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala
@@ -51,7 +51,7 @@ object GradientBoostedTreeClassifierExample {
.setMaxCategories(4)
.fit(data)
- // Split the data into training and test sets (30% held out for testing)
+ // Split the data into training and test sets (30% held out for testing).
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
// Train a GBT model.
@@ -66,11 +66,11 @@ object GradientBoostedTreeClassifierExample {
.setOutputCol("predictedLabel")
.setLabels(labelIndexer.labels)
- // Chain indexers and GBT in a Pipeline
+ // Chain indexers and GBT in a Pipeline.
val pipeline = new Pipeline()
.setStages(Array(labelIndexer, featureIndexer, gbt, labelConverter))
- // Train model.  This also runs the indexers.
+ // Train model. This also runs the indexers.
val model = pipeline.fit(trainingData)
// Make predictions.
@@ -79,7 +79,7 @@ object GradientBoostedTreeClassifierExample {
// Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(5)
- // Select (prediction, true label) and compute test error
+ // Select (prediction, true label) and compute test error.
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("indexedLabel")
.setPredictionCol("prediction")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala
index 62285b83cb..e53aab7f32 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala
@@ -45,7 +45,7 @@ object GradientBoostedTreeRegressorExample {
.setMaxCategories(4)
.fit(data)
- // Split the data into training and test sets (30% held out for testing)
+ // Split the data into training and test sets (30% held out for testing).
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
// Train a GBT model.
@@ -54,11 +54,11 @@ object GradientBoostedTreeRegressorExample {
.setFeaturesCol("indexedFeatures")
.setMaxIter(10)
- // Chain indexer and GBT in a Pipeline
+ // Chain indexer and GBT in a Pipeline.
val pipeline = new Pipeline()
.setStages(Array(featureIndexer, gbt))
- // Train model.  This also runs the indexer.
+ // Train model. This also runs the indexer.
val model = pipeline.fit(trainingData)
// Make predictions.
@@ -67,7 +67,7 @@ object GradientBoostedTreeRegressorExample {
// Select example rows to display.
predictions.select("prediction", "label", "features").show(5)
- // Select (prediction, true label) and compute test error
+ // Select (prediction, true label) and compute test error.
val evaluator = new RegressionEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
index 25be87811d..de96fb2979 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
@@ -22,10 +22,9 @@ import scala.language.reflectiveCalls
import scopt.OptionParser
-import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.regression.LinearRegression
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, SparkSession}
/**
* An example runner for linear regression with elastic-net (mixing L1/L2) regularization.
@@ -74,11 +73,11 @@ object LinearRegressionExample {
s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
.action((x, c) => c.copy(tol = x))
opt[Double]("fracTest")
- .text(s"fraction of data to hold out for testing. If given option testInput, " +
+ .text(s"fraction of data to hold out for testing. If given option testInput, " +
s"this option is ignored. default: ${defaultParams.fracTest}")
.action((x, c) => c.copy(fracTest = x))
opt[String]("testInput")
- .text(s"input path to test dataset. If given, option fracTest is ignored." +
+ .text(s"input path to test dataset. If given, option fracTest is ignored." +
s" default: ${defaultParams.testInput}")
.action((x, c) => c.copy(testInput = x))
opt[String]("dataFormat")
@@ -105,13 +104,15 @@ object LinearRegressionExample {
}
def run(params: Params) {
- val conf = new SparkConf().setAppName(s"LinearRegressionExample with $params")
- val sc = new SparkContext(conf)
+ val spark = SparkSession
+ .builder
+ .appName(s"LinearRegressionExample with $params")
+ .getOrCreate()
println(s"LinearRegressionExample with parameters:\n$params")
// Load training and test data and cache it.
- val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
+ val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
params.dataFormat, params.testInput, "regression", params.fracTest)
val lir = new LinearRegression()
@@ -136,7 +137,7 @@ object LinearRegressionExample {
println("Test data results:")
DecisionTreeExample.evaluateRegressionModel(lirModel, test, "label")
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
index a380c90662..c2a87e1ddf 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
@@ -23,12 +23,11 @@ import scala.language.reflectiveCalls
import scopt.OptionParser
-import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.feature.StringIndexer
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, SparkSession}
/**
* An example runner for logistic regression with elastic-net (mixing L1/L2) regularization.
@@ -81,11 +80,11 @@ object LogisticRegressionExample {
s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
.action((x, c) => c.copy(tol = x))
opt[Double]("fracTest")
- .text(s"fraction of data to hold out for testing. If given option testInput, " +
+ .text(s"fraction of data to hold out for testing. If given option testInput, " +
s"this option is ignored. default: ${defaultParams.fracTest}")
.action((x, c) => c.copy(fracTest = x))
opt[String]("testInput")
- .text(s"input path to test dataset. If given, option fracTest is ignored." +
+ .text(s"input path to test dataset. If given, option fracTest is ignored." +
s" default: ${defaultParams.testInput}")
.action((x, c) => c.copy(testInput = x))
opt[String]("dataFormat")
@@ -112,16 +111,18 @@ object LogisticRegressionExample {
}
def run(params: Params) {
- val conf = new SparkConf().setAppName(s"LogisticRegressionExample with $params")
- val sc = new SparkContext(conf)
+ val spark = SparkSession
+ .builder
+ .appName(s"LogisticRegressionExample with $params")
+ .getOrCreate()
println(s"LogisticRegressionExample with parameters:\n$params")
// Load training and test data and cache it.
- val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
+ val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
params.dataFormat, params.testInput, "classification", params.fracTest)
- // Set up Pipeline
+ // Set up Pipeline.
val stages = new mutable.ArrayBuffer[PipelineStage]()
val labelIndexer = new StringIndexer()
@@ -141,7 +142,7 @@ object LogisticRegressionExample {
stages += lor
val pipeline = new Pipeline().setStages(stages.toArray)
- // Fit the Pipeline
+ // Fit the Pipeline.
val startTime = System.nanoTime()
val pipelineModel = pipeline.fit(training)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
@@ -156,7 +157,7 @@ object LogisticRegressionExample {
println("Test data results:")
DecisionTreeExample.evaluateClassificationModel(pipelineModel, test, "indexedLabel")
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala
index fcba813d5b..616263b8e9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala
@@ -27,7 +27,9 @@ object LogisticRegressionWithElasticNetExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession
- .builder.appName("LogisticRegressionWithElasticNetExample").getOrCreate()
+ .builder
+ .appName("LogisticRegressionWithElasticNetExample")
+ .getOrCreate()
// $example on$
// Load training data
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala
index 5fb3536060..c29d36210a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala
@@ -42,7 +42,9 @@ object ModelSelectionViaCrossValidationExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession
- .builder.appName("ModelSelectionViaCrossValidationExample").getOrCreate()
+ .builder
+ .appName("ModelSelectionViaCrossValidationExample")
+ .getOrCreate()
// $example on$
// Prepare training data from a list of (id, text, label) tuples.
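A sketch of what that preparation step looks like, assuming small inline (id, text, label) tuples of the kind this example trains on:

// Illustrative training data; SparkSession.createDataFrame accepts a Seq of case
// classes or tuples and infers the schema, and toDF names the columns.
val training = spark.createDataFrame(Seq(
  (0L, "a b c d e spark", 1.0),
  (1L, "b d", 0.0),
  (2L, "spark f g h", 1.0),
  (3L, "hadoop mapreduce", 0.0)
)).toDF("id", "text", "label")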
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala
index 6bc082982c..75fef2922a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala
@@ -36,7 +36,9 @@ object ModelSelectionViaTrainValidationSplitExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession
- .builder.appName("ModelSelectionViaTrainValidationSplitExample").getOrCreate()
+ .builder
+ .appName("ModelSelectionViaTrainValidationSplitExample")
+ .getOrCreate()
// $example on$
// Prepare training and test data.
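A sketch of that preparation, assuming the sample_linear_regression_data.txt file shipped with Spark (path and seed illustrative):

val data = spark.read.format("libsvm")
  .load("data/mllib/sample_linear_regression_data.txt")
// Hold out 10% of the data for final testing.
val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)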
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala
index ae0bd945d8..cccc4a6ea2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala
@@ -51,7 +51,7 @@ object RandomForestClassifierExample {
.setMaxCategories(4)
.fit(data)
- // Split the data into training and test sets (30% held out for testing)
+ // Split the data into training and test sets (30% held out for testing).
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
// Train a RandomForest model.
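A sketch of the estimator that comment introduces (parameter values illustrative; these are typical settings for this example):

import org.apache.spark.ml.classification.RandomForestClassifier

val rf = new RandomForestClassifier()
  .setLabelCol("indexedLabel")       // label column produced by the StringIndexer
  .setFeaturesCol("indexedFeatures") // feature column produced by the VectorIndexer
  .setNumTrees(10)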
@@ -66,11 +66,11 @@ object RandomForestClassifierExample {
.setOutputCol("predictedLabel")
.setLabels(labelIndexer.labels)
- // Chain indexers and forest in a Pipeline
+ // Chain indexers and forest in a Pipeline.
val pipeline = new Pipeline()
.setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))
- // Train model.  This also runs the indexers.
+ // Train model. This also runs the indexers.
val model = pipeline.fit(trainingData)
// Make predictions.
@@ -79,7 +79,7 @@ object RandomForestClassifierExample {
// Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(5)
- // Select (prediction, true label) and compute test error
+ // Select (prediction, true label) and compute test error.
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("indexedLabel")
.setPredictionCol("prediction")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
index 7a00d99dfe..2419dc49cd 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
@@ -23,13 +23,12 @@ import scala.language.reflectiveCalls
import scopt.OptionParser
-import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.feature.{StringIndexer, VectorIndexer}
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, SparkSession}
/**
@@ -37,7 +36,7 @@ import org.apache.spark.sql.DataFrame
* {{{
* ./bin/run-example ml.RandomForestExample [options]
* }}}
- * Decision Trees and ensembles can take a large amount of memory.  If the run-example command
+ * Decision Trees and ensembles can take a large amount of memory. If the run-example command
* above fails, try running via spark-submit and specifying the amount of memory as at least 1g.
* For local mode, run
* {{{
@@ -94,7 +93,7 @@ object RandomForestExample {
s" default: ${defaultParams.numTrees}")
.action((x, c) => c.copy(featureSubsetStrategy = x))
opt[Double]("fracTest")
- .text(s"fraction of data to hold out for testing. If given option testInput, " +
+ .text(s"fraction of data to hold out for testing. If given option testInput, " +
s"this option is ignored. default: ${defaultParams.fracTest}")
.action((x, c) => c.copy(fracTest = x))
opt[Boolean]("cacheNodeIds")
@@ -115,7 +114,7 @@ object RandomForestExample {
s"default: ${defaultParams.checkpointInterval}")
.action((x, c) => c.copy(checkpointInterval = x))
opt[String]("testInput")
- .text(s"input path to test dataset. If given, option fracTest is ignored." +
+ .text(s"input path to test dataset. If given, option fracTest is ignored." +
s" default: ${defaultParams.testInput}")
.action((x, c) => c.copy(testInput = x))
opt[String]("dataFormat")
@@ -142,18 +141,21 @@ object RandomForestExample {
}
def run(params: Params) {
- val conf = new SparkConf().setAppName(s"RandomForestExample with $params")
- val sc = new SparkContext(conf)
- params.checkpointDir.foreach(sc.setCheckpointDir)
+ val spark = SparkSession
+ .builder
+ .appName(s"RandomForestExample with $params")
+ .getOrCreate()
+
+ params.checkpointDir.foreach(spark.sparkContext.setCheckpointDir)
val algo = params.algo.toLowerCase
println(s"RandomForestExample with parameters:\n$params")
// Load training and test data and cache it.
- val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
+ val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
params.dataFormat, params.testInput, algo, params.fracTest)
- // Set up Pipeline
+ // Set up Pipeline.
val stages = new mutable.ArrayBuffer[PipelineStage]()
// (1) For classification, re-index classes.
val labelColName = if (algo == "classification") "indexedLabel" else "label"
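This hunk shows the one place the underlying SparkContext is still required: setting a checkpoint directory. A sketch of the pattern in isolation (app name and path illustrative):

import org.apache.spark.sql.SparkSession

val spark = SparkSession
  .builder
  .appName("CheckpointSketch")
  .getOrCreate()

// SparkSession does not mirror setCheckpointDir, so reach through to the context.
spark.sparkContext.setCheckpointDir("/tmp/rf-checkpoints")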
@@ -170,7 +172,7 @@ object RandomForestExample {
.setOutputCol("indexedFeatures")
.setMaxCategories(10)
stages += featuresIndexer
- // (3) Learn Random Forest
+ // (3) Learn Random Forest.
val dt = algo match {
case "classification" =>
new RandomForestClassifier()
@@ -201,13 +203,13 @@ object RandomForestExample {
stages += dt
val pipeline = new Pipeline().setStages(stages.toArray)
- // Fit the Pipeline
+ // Fit the Pipeline.
val startTime = System.nanoTime()
val pipelineModel = pipeline.fit(training)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")
- // Get the trained Random Forest from the fitted PipelineModel
+ // Get the trained Random Forest from the fitted PipelineModel.
algo match {
case "classification" =>
val rfModel = pipelineModel.stages.last.asInstanceOf[RandomForestClassificationModel]
@@ -226,7 +228,7 @@ object RandomForestExample {
case _ => throw new IllegalArgumentException(s"Algo ${params.algo} not supported.")
}
- // Evaluate model on training, test data
+ // Evaluate model on training, test data.
algo match {
case "classification" =>
println("Training data results:")
@@ -242,7 +244,7 @@ object RandomForestExample {
throw new IllegalArgumentException(s"Algo ${params.algo} not supported.")
}
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
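A sketch of the "get the trained Random Forest from the fitted PipelineModel" step referenced above, for the classification branch (the size threshold is illustrative):

import org.apache.spark.ml.classification.RandomForestClassificationModel

val rfModel = pipelineModel.stages.last.asInstanceOf[RandomForestClassificationModel]
if (rfModel.totalNumNodes < 30) {
  println(rfModel.toDebugString)  // full tree structure, readable for small forests
} else {
  println(rfModel)                // summary only
}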
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala
index 96dc2f05be..9a0a001c26 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala
@@ -45,7 +45,7 @@ object RandomForestRegressorExample {
.setMaxCategories(4)
.fit(data)
- // Split the data into training and test sets (30% held out for testing)
+ // Split the data into training and test sets (30% held out for testing).
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
// Train a RandomForest model.
@@ -53,11 +53,11 @@ object RandomForestRegressorExample {
.setLabelCol("label")
.setFeaturesCol("indexedFeatures")
- // Chain indexer and forest in a Pipeline
+ // Chain indexer and forest in a Pipeline.
val pipeline = new Pipeline()
.setStages(Array(featureIndexer, rf))
- // Train model.  This also runs the indexer.
+ // Train model. This also runs the indexer.
val model = pipeline.fit(trainingData)
// Make predictions.
@@ -66,7 +66,7 @@ object RandomForestRegressorExample {
// Select example rows to display.
predictions.select("prediction", "label", "features").show(5)
- // Select (prediction, true label) and compute test error
+ // Select (prediction, true label) and compute test error.
val evaluator = new RegressionEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
index 3547dd95bd..83bab5c557 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
@@ -41,7 +41,7 @@ object SimpleParamsExample {
import spark.implicits._
// Prepare training data.
- // We use LabeledPoint, which is a case class.  Spark SQL can convert RDDs of case classes
+ // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes
// into DataFrames, where it uses the case class metadata to infer the schema.
val training = spark.createDataFrame(Seq(
LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
@@ -49,7 +49,7 @@ object SimpleParamsExample {
LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))))
- // Create a LogisticRegression instance.  This instance is an Estimator.
+ // Create a LogisticRegression instance. This instance is an Estimator.
val lr = new LogisticRegression()
// Print out the parameters, documentation, and any default values.
println("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
@@ -58,7 +58,7 @@ object SimpleParamsExample {
lr.setMaxIter(10)
.setRegParam(0.01)
- // Learn a LogisticRegression model.  This uses the parameters stored in lr.
+ // Learn a LogisticRegression model. This uses the parameters stored in lr.
val model1 = lr.fit(training)
// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
// we can view the parameters it used during fit().
@@ -69,7 +69,7 @@ object SimpleParamsExample {
// We may alternatively specify parameters using a ParamMap,
// which supports several methods for specifying parameters.
val paramMap = ParamMap(lr.maxIter -> 20)
- paramMap.put(lr.maxIter, 30) // Specify 1 Param.  This overwrites the original maxIter.
+ paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter.
paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.45, 0.55)) // Specify multiple Params.
// One can also combine ParamMaps.
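A sketch of that combination step (paramMap2 and its contents are assumed here, built the same way as paramMap above):

// ParamMap supports ++ for merging; later maps win on conflicting Params.
val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability")  // change the output column name
val paramMapCombined = paramMap ++ paramMap2

// The combined map can then be passed straight to fit(), overriding the
// parameters stored in lr for this one training run.
val model2 = lr.fit(training, paramMapCombined)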