author     Dongjoon Hyun <dongjoon@apache.org>  2016-05-04 14:31:36 -0700
committer  Andrew Or <andrew@databricks.com>  2016-05-04 14:31:36 -0700
commit     cdce4e62a5674e2034e5d395578b1a60e3d8c435 (patch)
tree       c715f2555dad353683f82820962576f89b2db452 /examples
parent     cf2e9da612397233ae7bca0e9ce57309f16226b5 (diff)
[SPARK-15031][EXAMPLE] Use SparkSession in Scala/Python/Java example.
## What changes were proposed in this pull request?

This PR updates the Scala/Python/Java examples by replacing `SQLContext` with the newly added `SparkSession`.

- Use the **SparkSession Builder Pattern** in 154 files (Scala 55, Java 52, Python 47).
- Add `getConf` to the Python `SparkContext` class: `python/pyspark/context.py`
- Replace the **SQLContext Singleton Pattern** with the **SparkSession Singleton Pattern** in:
  - `SqlNetworkWordCount.scala`
  - `JavaSqlNetworkWordCount.java`
  - `sql_network_wordcount.py`

Now `SQLContext` is used only in the R examples and in the following two Python examples. These Python examples are left untouched in this PR because they already fail due to an unknown issue.

- `simple_params_example.py`
- `aft_survival_regression.py`

## How was this patch tested?

Manual.

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #12809 from dongjoon-hyun/SPARK-15031.
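For reference, below is a minimal, self-contained sketch of the before/after pattern this change applies throughout the examples. The class names, app name, and input path are illustrative, not taken verbatim from the patch; the singleton helper at the bottom only approximates the shape of the replacement used in the streaming examples.

```java
// Sketch of the SQLContext -> SparkSession migration (illustrative names/paths).
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SparkSessionMigrationSketch {
  public static void main(String[] args) {
    // Before (pattern being removed):
    //   SparkConf conf = new SparkConf().setAppName("ExampleApp");
    //   JavaSparkContext jsc = new JavaSparkContext(conf);
    //   SQLContext sqlContext = new SQLContext(jsc);
    //   Dataset<Row> df = sqlContext.read().format("libsvm").load(path);
    //   jsc.stop();

    // After: a single SparkSession created via the builder pattern.
    SparkSession spark = SparkSession.builder()
      .appName("ExampleApp")
      .getOrCreate();

    Dataset<Row> df = spark.read()
      .format("libsvm")
      .load("data/mllib/sample_libsvm_data.txt");
    df.show();

    spark.stop();
  }

  // Roughly the shape of the SparkSession singleton that replaces the old
  // SQLContext singleton in the streaming examples (details may differ).
  static class SparkSessionSingleton {
    private static SparkSession instance = null;

    static SparkSession getInstance(SparkConf sparkConf) {
      if (instance == null) {
        instance = SparkSession.builder().config(sparkConf).getOrCreate();
      }
      return instance;
    }
  }
}
```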
Diffstat (limited to 'examples')
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java12
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java15
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java15
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java18
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java18
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java15
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java19
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java15
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java13
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java13
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java15
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java15
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java16
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java11
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java14
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java18
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java14
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java14
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java13
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java13
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java13
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java12
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java12
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java16
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java14
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java13
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java18
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java14
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java13
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java18
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java14
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java18
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java16
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java17
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java29
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java18
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java14
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java14
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java19
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java14
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java15
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java13
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java18
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java18
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java18
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java18
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java14
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java12
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java19
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java19
-rw-r--r--examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java33
-rw-r--r--examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java19
-rw-r--r--examples/src/main/python/ml/als_example.py14
-rw-r--r--examples/src/main/python/ml/binarizer_example.py10
-rw-r--r--examples/src/main/python/ml/bisecting_k_means_example.py16
-rw-r--r--examples/src/main/python/ml/bucketizer_example.py10
-rw-r--r--examples/src/main/python/ml/chisq_selector_example.py10
-rw-r--r--examples/src/main/python/ml/count_vectorizer_example.py10
-rw-r--r--examples/src/main/python/ml/cross_validator.py49
-rw-r--r--examples/src/main/python/ml/dataframe_example.py14
-rw-r--r--examples/src/main/python/ml/dct_example.py10
-rw-r--r--examples/src/main/python/ml/decision_tree_classification_example.py9
-rw-r--r--examples/src/main/python/ml/decision_tree_regression_example.py9
-rw-r--r--examples/src/main/python/ml/elementwise_product_example.py10
-rw-r--r--examples/src/main/python/ml/estimator_transformer_param_example.py13
-rw-r--r--examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py9
-rw-r--r--examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py9
-rw-r--r--examples/src/main/python/ml/index_to_string_example.py10
-rw-r--r--examples/src/main/python/ml/kmeans_example.py16
-rw-r--r--examples/src/main/python/ml/linear_regression_with_elastic_net.py10
-rw-r--r--examples/src/main/python/ml/logistic_regression_with_elastic_net.py10
-rw-r--r--examples/src/main/python/ml/max_abs_scaler_example.py10
-rw-r--r--examples/src/main/python/ml/min_max_scaler_example.py10
-rw-r--r--examples/src/main/python/ml/multilayer_perceptron_classification.py12
-rw-r--r--examples/src/main/python/ml/n_gram_example.py10
-rw-r--r--examples/src/main/python/ml/naive_bayes_example.py11
-rw-r--r--examples/src/main/python/ml/normalizer_example.py10
-rw-r--r--examples/src/main/python/ml/onehot_encoder_example.py10
-rw-r--r--examples/src/main/python/ml/pca_example.py10
-rw-r--r--examples/src/main/python/ml/pipeline_example.py13
-rw-r--r--examples/src/main/python/ml/polynomial_expansion_example.py10
-rw-r--r--examples/src/main/python/ml/random_forest_classifier_example.py9
-rw-r--r--examples/src/main/python/ml/random_forest_regressor_example.py9
-rw-r--r--examples/src/main/python/ml/rformula_example.py10
-rw-r--r--examples/src/main/python/ml/simple_text_classification_pipeline.py32
-rw-r--r--examples/src/main/python/ml/sql_transformer.py10
-rw-r--r--examples/src/main/python/ml/standard_scaler_example.py10
-rw-r--r--examples/src/main/python/ml/stopwords_remover_example.py10
-rw-r--r--examples/src/main/python/ml/string_indexer_example.py10
-rw-r--r--examples/src/main/python/ml/tf_idf_example.py10
-rw-r--r--examples/src/main/python/ml/tokenizer_example.py10
-rw-r--r--examples/src/main/python/ml/train_validation_split.py10
-rw-r--r--examples/src/main/python/ml/vector_assembler_example.py10
-rw-r--r--examples/src/main/python/ml/vector_indexer_example.py10
-rw-r--r--examples/src/main/python/ml/vector_slicer_example.py10
-rw-r--r--examples/src/main/python/ml/word2vec_example.py10
-rw-r--r--examples/src/main/python/mllib/binary_classification_metrics_example.py6
-rw-r--r--examples/src/main/python/sql.py2
-rw-r--r--examples/src/main/python/streaming/sql_network_wordcount.py19
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala14
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala14
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala14
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala18
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala17
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala13
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala13
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala13
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala13
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala14
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala14
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala13
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala13
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala13
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala16
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala19
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala15
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala12
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala17
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala6
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala11
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala18
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala21
154 files changed, 847 insertions, 1232 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java
index 22b93a3a85..ecb7084e03 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java
@@ -21,23 +21,19 @@ package org.apache.spark.examples.ml;
import java.util.Arrays;
import java.util.List;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.regression.AFTSurvivalRegression;
import org.apache.spark.ml.regression.AFTSurvivalRegressionModel;
import org.apache.spark.mllib.linalg.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.*;
// $example off$
public class JavaAFTSurvivalRegressionExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaAFTSurvivalRegressionExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaAFTSurvivalRegressionExample").getOrCreate();
// $example on$
List<Row> data = Arrays.asList(
@@ -52,7 +48,7 @@ public class JavaAFTSurvivalRegressionExample {
new StructField("censor", DataTypes.DoubleType, false, Metadata.empty()),
new StructField("features", new VectorUDT(), false, Metadata.empty())
});
- Dataset<Row> training = jsql.createDataFrame(data, schema);
+ Dataset<Row> training = spark.createDataFrame(data, schema);
double[] quantileProbabilities = new double[]{0.3, 0.6};
AFTSurvivalRegression aft = new AFTSurvivalRegression()
.setQuantileProbabilities(quantileProbabilities)
@@ -66,6 +62,6 @@ public class JavaAFTSurvivalRegressionExample {
model.transform(training).show(false);
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java
index 088037d427..9a9a10489b 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java
@@ -17,11 +17,9 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.io.Serializable;
@@ -83,18 +81,17 @@ public class JavaALSExample {
// $example off$
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaALSExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaALSExample").getOrCreate();
// $example on$
- JavaRDD<Rating> ratingsRDD = jsc.textFile("data/mllib/als/sample_movielens_ratings.txt")
+ JavaRDD<Rating> ratingsRDD = spark
+ .read().text("data/mllib/als/sample_movielens_ratings.txt").javaRDD()
.map(new Function<String, Rating>() {
public Rating call(String str) {
return Rating.parseRating(str);
}
});
- Dataset<Row> ratings = sqlContext.createDataFrame(ratingsRDD, Rating.class);
+ Dataset<Row> ratings = spark.createDataFrame(ratingsRDD, Rating.class);
Dataset<Row>[] splits = ratings.randomSplit(new double[]{0.8, 0.2});
Dataset<Row> training = splits[0];
Dataset<Row> test = splits[1];
@@ -121,6 +118,6 @@ public class JavaALSExample {
Double rmse = evaluator.evaluate(predictions);
System.out.println("Root-mean-square error = " + rmse);
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
index 0a6e9c2a1f..88e4298a61 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
@@ -20,10 +20,11 @@ package org.apache.spark.examples.ml;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.Binarizer;
@@ -37,21 +38,19 @@ import org.apache.spark.sql.types.StructType;
public class JavaBinarizerExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaBinarizerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaBinarizerExample").getOrCreate();
// $example on$
- JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(0, 0.1),
RowFactory.create(1, 0.8),
RowFactory.create(2, 0.2)
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
});
- Dataset<Row> continuousDataFrame = jsql.createDataFrame(jrdd, schema);
+ Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);
Binarizer binarizer = new Binarizer()
.setInputCol("feature")
.setOutputCol("binarized_feature")
@@ -63,6 +62,6 @@ public class JavaBinarizerExample {
System.out.println(binarized_value);
}
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
index 1d1a518bbc..51aa35084e 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
@@ -18,12 +18,10 @@
package org.apache.spark.examples.ml;
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import org.apache.spark.ml.clustering.BisectingKMeans;
import org.apache.spark.ml.clustering.BisectingKMeansModel;
@@ -44,25 +42,23 @@ import org.apache.spark.sql.types.StructType;
public class JavaBisectingKMeansExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaBisectingKMeansExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaBisectingKMeansExample").getOrCreate();
// $example on$
- JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(Vectors.dense(0.1, 0.1, 0.1)),
RowFactory.create(Vectors.dense(0.3, 0.3, 0.25)),
RowFactory.create(Vectors.dense(0.1, 0.1, -0.1)),
RowFactory.create(Vectors.dense(20.3, 20.1, 19.9)),
RowFactory.create(Vectors.dense(20.2, 20.1, 19.7)),
RowFactory.create(Vectors.dense(18.9, 20.0, 19.7))
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("features", new VectorUDT(), false, Metadata.empty()),
});
- Dataset<Row> dataset = jsql.createDataFrame(data, schema);
+ Dataset<Row> dataset = spark.createDataFrame(data, schema);
BisectingKMeans bkm = new BisectingKMeans().setK(2);
BisectingKMeansModel model = bkm.fit(dataset);
@@ -76,6 +72,6 @@ public class JavaBisectingKMeansExample {
}
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
index 68ffa702ea..0c24f52cf5 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
@@ -17,14 +17,12 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.Bucketizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
@@ -37,23 +35,21 @@ import org.apache.spark.sql.types.StructType;
public class JavaBucketizerExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaBucketizerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaBucketizerExample").getOrCreate();
// $example on$
double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};
- JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(-0.5),
RowFactory.create(-0.3),
RowFactory.create(0.0),
RowFactory.create(0.2)
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
});
- Dataset<Row> dataFrame = jsql.createDataFrame(data, schema);
+ Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
Bucketizer bucketizer = new Bucketizer()
.setInputCol("features")
@@ -64,7 +60,7 @@ public class JavaBucketizerExample {
Dataset<Row> bucketedData = bucketizer.transform(dataFrame);
bucketedData.show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java
index b1bf1cfeb2..684cf9a714 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java
@@ -21,10 +21,11 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
import org.apache.spark.ml.feature.ChiSqSelector;
import org.apache.spark.mllib.linalg.VectorUDT;
@@ -39,23 +40,21 @@ import org.apache.spark.sql.types.StructType;
public class JavaChiSqSelectorExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaChiSqSelectorExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaChiSqSelectorExample").getOrCreate();
// $example on$
- JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
RowFactory.create(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
RowFactory.create(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("features", new VectorUDT(), false, Metadata.empty()),
new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty())
});
- Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema);
+ Dataset<Row> df = spark.createDataFrame(data, schema);
ChiSqSelector selector = new ChiSqSelector()
.setNumTopFeatures(1)
@@ -66,6 +65,6 @@ public class JavaChiSqSelectorExample {
Dataset<Row> result = selector.fit(df).transform(df);
result.show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
index ec3ac202be..0631f9d6d5 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
@@ -19,36 +19,31 @@ package org.apache.spark.examples.ml;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.feature.CountVectorizer;
import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.*;
// $example off$
public class JavaCountVectorizerExample {
public static void main(String[] args) {
-
- SparkConf conf = new SparkConf().setAppName("JavaCountVectorizerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaCountVectorizerExample").getOrCreate();
// $example on$
// Input data: Each row is a bag of words from a sentence or document.
- JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(Arrays.asList("a", "b", "c")),
RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
- ));
+ );
StructType schema = new StructType(new StructField [] {
new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
});
- Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema);
+ Dataset<Row> df = spark.createDataFrame(data, schema);
// fit a CountVectorizerModel from the corpus
CountVectorizerModel cvModel = new CountVectorizer()
@@ -66,6 +61,6 @@ public class JavaCountVectorizerExample {
cvModel.transform(df).show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
index 4b15fde9c3..ec57a24451 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
@@ -20,10 +20,11 @@ package org.apache.spark.examples.ml;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.DCT;
@@ -38,20 +39,18 @@ import org.apache.spark.sql.types.StructType;
public class JavaDCTExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaDCTExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaDCTExample").getOrCreate();
// $example on$
- JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("features", new VectorUDT(), false, Metadata.empty()),
});
- Dataset<Row> df = jsql.createDataFrame(data, schema);
+ Dataset<Row> df = spark.createDataFrame(data, schema);
DCT dct = new DCT()
.setInputCol("features")
.setOutputCol("featuresDCT")
@@ -59,7 +58,7 @@ public class JavaDCTExample {
Dataset<Row> dctDf = dct.transform(df);
dctDf.select("featuresDCT").show(3);
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
index 8214952f80..733bc4181c 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
@@ -17,8 +17,6 @@
// scalastyle:off println
package org.apache.spark.examples.ml;
// $example on$
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
@@ -28,18 +26,17 @@ import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
import org.apache.spark.ml.feature.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example off$
public class JavaDecisionTreeClassificationExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaDecisionTreeClassificationExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaDecisionTreeClassificationExample").getOrCreate();
// $example on$
// Load the data stored in LIBSVM format as a DataFrame.
- Dataset<Row> data = sqlContext
+ Dataset<Row> data = spark
.read()
.format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
@@ -100,6 +97,6 @@ public class JavaDecisionTreeClassificationExample {
System.out.println("Learned classification tree model:\n" + treeModel.toDebugString());
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
index a4f3e97bf3..bd6dc3edd3 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
@@ -17,8 +17,6 @@
// scalastyle:off println
package org.apache.spark.examples.ml;
// $example on$
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
@@ -29,17 +27,16 @@ import org.apache.spark.ml.regression.DecisionTreeRegressionModel;
import org.apache.spark.ml.regression.DecisionTreeRegressor;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example off$
public class JavaDecisionTreeRegressionExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaDecisionTreeRegressionExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaDecisionTreeRegressionExample").getOrCreate();
// $example on$
// Load the data stored in LIBSVM format as a DataFrame.
- Dataset<Row> data = sqlContext.read().format("libsvm")
+ Dataset<Row> data = spark.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
// Automatically identify categorical features, and index them.
@@ -85,6 +82,6 @@ public class JavaDecisionTreeRegressionExample {
System.out.println("Learned regression tree model:\n" + treeModel.toDebugString());
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
index 0ba94786d4..90023ac06b 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
@@ -21,9 +21,7 @@ import java.util.List;
import com.google.common.collect.Lists;
-import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.classification.Classifier;
import org.apache.spark.ml.classification.ClassificationModel;
import org.apache.spark.ml.param.IntParam;
@@ -35,7 +33,7 @@ import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
/**
@@ -51,9 +49,7 @@ import org.apache.spark.sql.SQLContext;
public class JavaDeveloperApiExample {
public static void main(String[] args) throws Exception {
- SparkConf conf = new SparkConf().setAppName("JavaDeveloperApiExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaDeveloperApiExample").getOrCreate();
// Prepare training data.
List<LabeledPoint> localTraining = Lists.newArrayList(
@@ -61,8 +57,7 @@ public class JavaDeveloperApiExample {
new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)));
- Dataset<Row> training = jsql.createDataFrame(
- jsc.parallelize(localTraining), LabeledPoint.class);
+ Dataset<Row> training = spark.createDataFrame(localTraining, LabeledPoint.class);
// Create a LogisticRegression instance. This instance is an Estimator.
MyJavaLogisticRegression lr = new MyJavaLogisticRegression();
@@ -80,7 +75,7 @@ public class JavaDeveloperApiExample {
new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)));
- Dataset<Row> test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class);
+ Dataset<Row> test = spark.createDataFrame(localTest, LabeledPoint.class);
// Make predictions on test documents. cvModel uses the best model found (lrModel).
Dataset<Row> results = model.transform(test);
@@ -93,7 +88,7 @@ public class JavaDeveloperApiExample {
" even though all coefficients are 0!");
}
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java
index 37de9cf359..a062a6fcd0 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java
@@ -20,7 +20,7 @@ package org.apache.spark.examples.ml;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.ArrayList;
@@ -41,16 +41,15 @@ import org.apache.spark.sql.types.StructType;
public class JavaElementwiseProductExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaElementwiseProductExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaElementwiseProductExample").getOrCreate();
// $example on$
// Create some vector data; also works for sparse vectors
- JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
- ));
+ );
List<StructField> fields = new ArrayList<>(2);
fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
@@ -58,7 +57,7 @@ public class JavaElementwiseProductExample {
StructType schema = DataTypes.createStructType(fields);
- Dataset<Row> dataFrame = sqlContext.createDataFrame(jrdd, schema);
+ Dataset<Row> dataFrame = spark.createDataFrame(data, schema);
Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
@@ -70,6 +69,6 @@ public class JavaElementwiseProductExample {
// Batch transform the vectors to create new column:
transformer.transform(dataFrame).show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java
index 604b193dd4..5ba8e6cf44 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java
@@ -21,8 +21,6 @@ package org.apache.spark.examples.ml;
import java.util.Arrays;
// $example off$
-import org.apache.spark.SparkConf;
-import org.apache.spark.SparkContext;
// $example on$
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.classification.LogisticRegressionModel;
@@ -32,23 +30,21 @@ import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
// $example off$
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
/**
* Java example for Estimator, Transformer, and Param.
*/
public class JavaEstimatorTransformerParamExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf()
- .setAppName("JavaEstimatorTransformerParamExample");
- SparkContext sc = new SparkContext(conf);
- SQLContext sqlContext = new SQLContext(sc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaEstimatorTransformerParamExample").getOrCreate();
// $example on$
// Prepare training data.
// We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans into
// DataFrames, where it uses the bean metadata to infer the schema.
- Dataset<Row> training = sqlContext.createDataFrame(
+ Dataset<Row> training = spark.createDataFrame(
Arrays.asList(
new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
@@ -89,7 +85,7 @@ public class JavaEstimatorTransformerParamExample {
System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());
// Prepare test documents.
- Dataset<Row> test = sqlContext.createDataFrame(Arrays.asList(
+ Dataset<Row> test = spark.createDataFrame(Arrays.asList(
new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))
@@ -107,6 +103,6 @@ public class JavaEstimatorTransformerParamExample {
}
// $example off$
- sc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java
index 553070dace..a7c89b9d19 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java
@@ -29,18 +29,17 @@ import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
import org.apache.spark.ml.feature.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example off$
public class JavaGradientBoostedTreeClassifierExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaGradientBoostedTreeClassifierExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaGradientBoostedTreeClassifierExample").getOrCreate();
// $example on$
// Load and parse the data file, converting it to a DataFrame.
- Dataset<Row> data = sqlContext.read().format("libsvm")
+ Dataset<Row> data = spark.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
// Index labels, adding metadata to the label column.
@@ -99,6 +98,6 @@ public class JavaGradientBoostedTreeClassifierExample {
System.out.println("Learned classification GBT model:\n" + gbtModel.toDebugString());
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java
index 83fd89e3bd..6d3f21fdaf 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java
@@ -17,8 +17,6 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
@@ -30,19 +28,17 @@ import org.apache.spark.ml.regression.GBTRegressionModel;
import org.apache.spark.ml.regression.GBTRegressor;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example off$
public class JavaGradientBoostedTreeRegressorExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaGradientBoostedTreeRegressorExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaGradientBoostedTreeRegressorExample").getOrCreate();
// $example on$
// Load and parse the data file, converting it to a DataFrame.
- Dataset<Row> data =
- sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -87,6 +83,6 @@ public class JavaGradientBoostedTreeRegressorExample {
System.out.println("Learned regression GBT model:\n" + gbtModel.toDebugString());
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java
index 9b8c22f3bd..ccd74f2920 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java
@@ -17,14 +17,12 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
import org.apache.spark.ml.feature.IndexToString;
import org.apache.spark.ml.feature.StringIndexer;
@@ -39,24 +37,22 @@ import org.apache.spark.sql.types.StructType;
public class JavaIndexToStringExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaIndexToStringExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaIndexToStringExample").getOrCreate();
// $example on$
- JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(0, "a"),
RowFactory.create(1, "b"),
RowFactory.create(2, "c"),
RowFactory.create(3, "a"),
RowFactory.create(4, "a"),
RowFactory.create(5, "c")
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("category", DataTypes.StringType, false, Metadata.empty())
});
- Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema);
+ Dataset<Row> df = spark.createDataFrame(data, schema);
StringIndexerModel indexer = new StringIndexer()
.setInputCol("category")
@@ -70,6 +66,6 @@ public class JavaIndexToStringExample {
Dataset<Row> converted = converter.transform(indexed);
converted.select("id", "originalCategory").show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java
index c5022f4c0b..e6d82a0513 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java
@@ -19,12 +19,10 @@ package org.apache.spark.examples.ml;
import java.util.regex.Pattern;
-import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.expressions.GenericRow;
// $example on$
import org.apache.spark.ml.clustering.KMeansModel;
@@ -72,16 +70,14 @@ public class JavaKMeansExample {
int k = Integer.parseInt(args[1]);
// Parses the arguments
- SparkConf conf = new SparkConf().setAppName("JavaKMeansExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaKMeansExample").getOrCreate();
// $example on$
// Loads data
- JavaRDD<Row> points = jsc.textFile(inputFile).map(new ParsePoint());
+ JavaRDD<Row> points = spark.read().text(inputFile).javaRDD().map(new ParsePoint());
StructField[] fields = {new StructField("features", new VectorUDT(), false, Metadata.empty())};
StructType schema = new StructType(fields);
- Dataset<Row> dataset = sqlContext.createDataFrame(points, schema);
+ Dataset<Row> dataset = spark.createDataFrame(points, schema);
// Trains a k-means model
KMeans kmeans = new KMeans()
@@ -96,6 +92,6 @@ public class JavaKMeansExample {
}
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
index 351bc40118..b8baca5920 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java
@@ -19,9 +19,7 @@ package org.apache.spark.examples.ml;
// $example on$
import java.util.regex.Pattern;
-import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.ml.clustering.LDA;
import org.apache.spark.ml.clustering.LDAModel;
@@ -30,7 +28,7 @@ import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.expressions.GenericRow;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
@@ -67,15 +65,13 @@ public class JavaLDAExample {
String inputFile = "data/mllib/sample_lda_data.txt";
// Parses the arguments
- SparkConf conf = new SparkConf().setAppName("JavaLDAExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaLDAExample").getOrCreate();
// Loads data
- JavaRDD<Row> points = jsc.textFile(inputFile).map(new ParseVector());
+ JavaRDD<Row> points = spark.read().text(inputFile).javaRDD().map(new ParseVector());
StructField[] fields = {new StructField("features", new VectorUDT(), false, Metadata.empty())};
StructType schema = new StructType(fields);
- Dataset<Row> dataset = sqlContext.createDataFrame(points, schema);
+ Dataset<Row> dataset = spark.createDataFrame(points, schema);
// Trains a LDA model
LDA lda = new LDA()
@@ -91,7 +87,7 @@ public class JavaLDAExample {
topics.show(false);
model.transform(dataset).show(false);
- jsc.stop();
+ spark.stop();
}
// $example off$
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java
index 08fce89359..b6ea1fed25 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java
@@ -17,8 +17,6 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import org.apache.spark.ml.regression.LinearRegression;
import org.apache.spark.ml.regression.LinearRegressionModel;
@@ -26,18 +24,17 @@ import org.apache.spark.ml.regression.LinearRegressionTrainingSummary;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example off$
public class JavaLinearRegressionWithElasticNetExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaLinearRegressionWithElasticNetExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaLinearRegressionWithElasticNetExample").getOrCreate();
// $example on$
// Load training data
- Dataset<Row> training = sqlContext.read().format("libsvm")
+ Dataset<Row> training = spark.read().format("libsvm")
.load("data/mllib/sample_linear_regression_data.txt");
LinearRegression lr = new LinearRegression()
@@ -61,6 +58,6 @@ public class JavaLinearRegressionWithElasticNetExample {
System.out.println("r2: " + trainingSummary.r2());
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java
index 73b028fb44..fd040aead4 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java
@@ -17,8 +17,6 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary;
import org.apache.spark.ml.classification.LogisticRegression;
@@ -26,18 +24,17 @@ import org.apache.spark.ml.classification.LogisticRegressionModel;
import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
// $example off$
public class JavaLogisticRegressionSummaryExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionSummaryExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaLogisticRegressionSummaryExample").getOrCreate();
// Load training data
- Dataset<Row> training = sqlContext.read().format("libsvm")
+ Dataset<Row> training = spark.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
LogisticRegression lr = new LogisticRegression()
@@ -80,6 +77,6 @@ public class JavaLogisticRegressionSummaryExample {
lrModel.setThreshold(bestThreshold);
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java
index 6911668522..f00c7a05cd 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java
@@ -17,25 +17,22 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.classification.LogisticRegressionModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example off$
public class JavaLogisticRegressionWithElasticNetExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithElasticNetExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaLogisticRegressionWithElasticNetExample").getOrCreate();
// $example on$
// Load training data
- Dataset<Row> training = sqlContext.read().format("libsvm")
+ Dataset<Row> training = spark.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
LogisticRegression lr = new LogisticRegression()
@@ -51,6 +48,6 @@ public class JavaLogisticRegressionWithElasticNetExample {
+ lrModel.coefficients() + " Intercept: " + lrModel.intercept());
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java
index a2a072b253..80cdd364b9 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java
@@ -17,25 +17,21 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import org.apache.spark.ml.feature.MaxAbsScaler;
import org.apache.spark.ml.feature.MaxAbsScalerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
// $example off$
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
public class JavaMaxAbsScalerExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaMaxAbsScalerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaMaxAbsScalerExample").getOrCreate();
// $example on$
- Dataset<Row> dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ Dataset<Row> dataFrame = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
MaxAbsScaler scaler = new MaxAbsScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures");
@@ -47,7 +43,7 @@ public class JavaMaxAbsScalerExample {
Dataset<Row> scaledData = scalerModel.transform(dataFrame);
scaledData.show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
index 4aee18eeab..022940fd1e 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
@@ -17,9 +17,7 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import org.apache.spark.ml.feature.MinMaxScaler;
@@ -30,12 +28,10 @@ import org.apache.spark.sql.Row;
public class JavaMinMaxScalerExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JaveMinMaxScalerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaMinMaxScalerExample").getOrCreate();
// $example on$
- Dataset<Row> dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ Dataset<Row> dataFrame = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
MinMaxScaler scaler = new MinMaxScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures");
@@ -47,6 +43,6 @@ public class JavaMinMaxScalerExample {
Dataset<Row> scaledData = scalerModel.transform(dataFrame);
scaledData.show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java
index c4122d1247..a4ec4f5815 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java
@@ -21,8 +21,6 @@ package org.apache.spark.examples.ml;
import java.util.Arrays;
// $example off$
-import org.apache.spark.SparkConf;
-import org.apache.spark.SparkContext;
// $example on$
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineStage;
@@ -37,21 +35,19 @@ import org.apache.spark.ml.tuning.ParamGridBuilder;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
// $example off$
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
/**
* Java example for Model Selection via Cross Validation.
*/
public class JavaModelSelectionViaCrossValidationExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf()
- .setAppName("JavaModelSelectionViaCrossValidationExample");
- SparkContext sc = new SparkContext(conf);
- SQLContext sqlContext = new SQLContext(sc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaModelSelectionViaCrossValidationExample").getOrCreate();
// $example on$
// Prepare training documents, which are labeled.
- Dataset<Row> training = sqlContext.createDataFrame(Arrays.asList(
+ Dataset<Row> training = spark.createDataFrame(Arrays.asList(
new JavaLabeledDocument(0L, "a b c d e spark", 1.0),
new JavaLabeledDocument(1L, "b d", 0.0),
new JavaLabeledDocument(2L,"spark f g h", 1.0),
@@ -102,7 +98,7 @@ public class JavaModelSelectionViaCrossValidationExample {
CrossValidatorModel cvModel = cv.fit(training);
// Prepare test documents, which are unlabeled.
- Dataset<Row> test = sqlContext.createDataFrame(Arrays.asList(
+ Dataset<Row> test = spark.createDataFrame(Arrays.asList(
new JavaDocument(4L, "spark i j k"),
new JavaDocument(5L, "l m n"),
new JavaDocument(6L, "mapreduce spark"),
@@ -117,6 +113,6 @@ public class JavaModelSelectionViaCrossValidationExample {
}
// $example off$
- sc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java
index 4994f8f9fa..63a0ad1cb8 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java
@@ -17,8 +17,6 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.SparkContext;
// $example on$
import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.param.ParamMap;
@@ -29,7 +27,7 @@ import org.apache.spark.ml.tuning.TrainValidationSplitModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
// $example off$
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
/**
* Java example demonstrating model selection using TrainValidationSplit.
@@ -44,13 +42,11 @@ import org.apache.spark.sql.SQLContext;
*/
public class JavaModelSelectionViaTrainValidationSplitExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf()
- .setAppName("JavaModelSelectionViaTrainValidationSplitExample");
- SparkContext sc = new SparkContext(conf);
- SQLContext jsql = new SQLContext(sc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaModelSelectionViaTrainValidationSplitExample").getOrCreate();
// $example on$
- Dataset<Row> data = jsql.read().format("libsvm")
+ Dataset<Row> data = spark.read().format("libsvm")
.load("data/mllib/sample_linear_regression_data.txt");
// Prepare training and test data.
@@ -87,6 +83,6 @@ public class JavaModelSelectionViaTrainValidationSplitExample {
.show();
// $example off$
- sc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
index 0ca528d8cd..d547a2a64b 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
@@ -18,11 +18,9 @@
package org.apache.spark.examples.ml;
// $example on$
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel;
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier;
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
@@ -34,14 +32,13 @@ import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
public class JavaMultilayerPerceptronClassifierExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaMultilayerPerceptronClassifierExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaMultilayerPerceptronClassifierExample").getOrCreate();
// $example on$
// Load training data
String path = "data/mllib/sample_multiclass_classification_data.txt";
- Dataset<Row> dataFrame = jsql.read().format("libsvm").load(path);
+ Dataset<Row> dataFrame = spark.read().format("libsvm").load(path);
// Split the data into train and test
Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
Dataset<Row> train = splits[0];
@@ -66,6 +63,6 @@ public class JavaMultilayerPerceptronClassifierExample {
System.out.println("Precision = " + evaluator.evaluate(predictionAndLabels));
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
index 608bd80285..325b7b5874 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
@@ -17,15 +17,13 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.NGram;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -37,16 +35,14 @@ import org.apache.spark.sql.types.StructType;
public class JavaNGramExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaNGramExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaNGramExample").getOrCreate();
// $example on$
- JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(0.0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
RowFactory.create(1.0, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
RowFactory.create(2.0, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
@@ -54,7 +50,7 @@ public class JavaNGramExample {
"words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
});
- Dataset<Row> wordDataFrame = sqlContext.createDataFrame(jrdd, schema);
+ Dataset<Row> wordDataFrame = spark.createDataFrame(data, schema);
NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams");
@@ -66,6 +62,6 @@ public class JavaNGramExample {
System.out.println();
}
// $example off$
- jsc.stop();
+ spark.stop();
}
}
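
Examples that previously built a JavaRDD<Row> via jsc.parallelize(...) now hand a plain java.util.List<Row> to createDataFrame, as the NGram hunk above shows. A minimal sketch of that call, assuming Spark 2.0's SparkSession API (class name, column names, and sample rows are illustrative):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class CreateDataFrameFromListSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession
      .builder().appName("CreateDataFrameFromListSketch").getOrCreate();

    // A local List<Row> goes straight to createDataFrame; no
    // JavaSparkContext.parallelize() step (and hence no JavaRDD) is needed.
    List<Row> data = Arrays.asList(
      RowFactory.create(0, "Hi I heard about Spark"),
      RowFactory.create(1, "I wish Java could use case classes"));

    StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(data, schema);
    df.show();

    spark.stop();
  }
}
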
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java
index 41d7ad75b9..1f24a23609 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java
@@ -17,16 +17,13 @@
package org.apache.spark.examples.ml;
-
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import org.apache.spark.ml.classification.NaiveBayes;
import org.apache.spark.ml.classification.NaiveBayesModel;
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example off$
/**
@@ -35,13 +32,12 @@ import org.apache.spark.sql.SQLContext;
public class JavaNaiveBayesExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaNaiveBayesExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaNaiveBayesExample").getOrCreate();
// $example on$
// Load training data
- Dataset<Row> dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ Dataset<Row> dataFrame =
+ spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
// Split the data into train and test
Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
Dataset<Row> train = splits[0];
@@ -59,6 +55,6 @@ public class JavaNaiveBayesExample {
System.out.println("Precision = " + evaluator.evaluate(predictionAndLabels));
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
index 31cd752136..4b3a718ea9 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
@@ -17,9 +17,7 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import org.apache.spark.ml.feature.Normalizer;
@@ -29,12 +27,11 @@ import org.apache.spark.sql.Row;
public class JavaNormalizerExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaNormalizerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaNormalizerExample").getOrCreate();
// $example on$
- Dataset<Row> dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ Dataset<Row> dataFrame =
+ spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
// Normalize each Vector using $L^1$ norm.
Normalizer normalizer = new Normalizer()
@@ -50,6 +47,6 @@ public class JavaNormalizerExample {
normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
lInfNormData.show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
index 882438ca28..d6e4d21ead 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
@@ -17,14 +17,12 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.OneHotEncoder;
import org.apache.spark.ml.feature.StringIndexer;
import org.apache.spark.ml.feature.StringIndexerModel;
@@ -39,26 +37,24 @@ import org.apache.spark.sql.types.StructType;
public class JavaOneHotEncoderExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaOneHotEncoderExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaOneHotEncoderExample").getOrCreate();
// $example on$
- JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(0, "a"),
RowFactory.create(1, "b"),
RowFactory.create(2, "c"),
RowFactory.create(3, "a"),
RowFactory.create(4, "a"),
RowFactory.create(5, "c")
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
new StructField("category", DataTypes.StringType, false, Metadata.empty())
});
- Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema);
+ Dataset<Row> df = spark.createDataFrame(data, schema);
StringIndexerModel indexer = new StringIndexer()
.setInputCol("category")
@@ -72,7 +68,7 @@ public class JavaOneHotEncoderExample {
Dataset<Row> encoded = encoder.transform(indexed);
encoded.select("id", "categoryVec").show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
index 1f13b48bf8..9cc983bd11 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
@@ -19,8 +19,6 @@ package org.apache.spark.examples.ml;
import org.apache.commons.cli.*;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.classification.OneVsRest;
@@ -31,7 +29,7 @@ import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructField;
// $example off$
@@ -60,9 +58,7 @@ public class JavaOneVsRestExample {
public static void main(String[] args) {
// parse the arguments
Params params = parse(args);
- SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaOneVsRestExample").getOrCreate();
// $example on$
// configure the base classifier
@@ -82,7 +78,7 @@ public class JavaOneVsRestExample {
OneVsRest ovr = new OneVsRest().setClassifier(classifier);
String input = params.input;
- Dataset<Row> inputData = jsql.read().format("libsvm").load(input);
+ Dataset<Row> inputData = spark.read().format("libsvm").load(input);
Dataset<Row> train;
Dataset<Row> test;
@@ -92,7 +88,7 @@ public class JavaOneVsRestExample {
train = inputData;
// compute the number of features in the training set.
int numFeatures = inputData.first().<Vector>getAs(1).size();
- test = jsql.read().format("libsvm").option("numFeatures",
+ test = spark.read().format("libsvm").option("numFeatures",
String.valueOf(numFeatures)).load(testInput);
} else {
double f = params.fracTest;
@@ -131,7 +127,7 @@ public class JavaOneVsRestExample {
System.out.println(results);
// $example off$
- jsc.stop();
+ spark.stop();
}
private static Params parse(String[] args) {
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
index a792fd7d47..6b1dcb68ba 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
@@ -17,14 +17,12 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.PCA;
import org.apache.spark.ml.feature.PCAModel;
import org.apache.spark.mllib.linalg.VectorUDT;
@@ -39,22 +37,20 @@ import org.apache.spark.sql.types.StructType;
public class JavaPCAExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaPCAExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaPCAExample").getOrCreate();
// $example on$
- JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})),
RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("features", new VectorUDT(), false, Metadata.empty()),
});
- Dataset<Row> df = jsql.createDataFrame(data, schema);
+ Dataset<Row> df = spark.createDataFrame(data, schema);
PCAModel pca = new PCA()
.setInputCol("features")
@@ -65,7 +61,7 @@ public class JavaPCAExample {
Dataset<Row> result = pca.transform(df).select("pcaFeatures");
result.show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java
index 305420f208..556a457326 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java
@@ -19,11 +19,7 @@ package org.apache.spark.examples.ml;
// $example on$
import java.util.Arrays;
-// $example off$
-import org.apache.spark.SparkConf;
-import org.apache.spark.SparkContext;
-// $example on$
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
@@ -33,20 +29,18 @@ import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
// $example off$
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
/**
* Java example for simple text document 'Pipeline'.
*/
public class JavaPipelineExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaPipelineExample");
- SparkContext sc = new SparkContext(conf);
- SQLContext sqlContext = new SQLContext(sc);
+ SparkSession spark = SparkSession.builder().appName("JavaPipelineExample").getOrCreate();
// $example on$
// Prepare training documents, which are labeled.
- Dataset<Row> training = sqlContext.createDataFrame(Arrays.asList(
+ Dataset<Row> training = spark.createDataFrame(Arrays.asList(
new JavaLabeledDocument(0L, "a b c d e spark", 1.0),
new JavaLabeledDocument(1L, "b d", 0.0),
new JavaLabeledDocument(2L, "spark f g h", 1.0),
@@ -71,7 +65,7 @@ public class JavaPipelineExample {
PipelineModel model = pipeline.fit(training);
// Prepare test documents, which are unlabeled.
- Dataset<Row> test = sqlContext.createDataFrame(Arrays.asList(
+ Dataset<Row> test = spark.createDataFrame(Arrays.asList(
new JavaDocument(4L, "spark i j k"),
new JavaDocument(5L, "l m n"),
new JavaDocument(6L, "mapreduce spark"),
@@ -86,6 +80,6 @@ public class JavaPipelineExample {
}
// $example off$
- sc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
index 48fc3c8acb..e328454c70 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
@@ -17,15 +17,12 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
import java.util.List;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.PolynomialExpansion;
import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.Vectors;
@@ -39,9 +36,7 @@ import org.apache.spark.sql.types.StructType;
public class JavaPolynomialExpansionExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansionExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaPolynomialExpansionExample").getOrCreate();
// $example on$
PolynomialExpansion polyExpansion = new PolynomialExpansion()
@@ -49,17 +44,17 @@ public class JavaPolynomialExpansionExample {
.setOutputCol("polyFeatures")
.setDegree(3);
- JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(Vectors.dense(-2.0, 2.3)),
RowFactory.create(Vectors.dense(0.0, 0.0)),
RowFactory.create(Vectors.dense(0.6, -1.1))
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("features", new VectorUDT(), false, Metadata.empty()),
});
- Dataset<Row> df = jsql.createDataFrame(data, schema);
+ Dataset<Row> df = spark.createDataFrame(data, schema);
Dataset<Row> polyDF = polyExpansion.transform(df);
List<Row> rows = polyDF.select("polyFeatures").takeAsList(3);
@@ -67,6 +62,6 @@ public class JavaPolynomialExpansionExample {
System.out.println(r.get(0));
}
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java
index 7b226fede9..94e3fafcab 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java
@@ -17,13 +17,11 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.QuantileDiscretizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
@@ -36,19 +34,16 @@ import org.apache.spark.sql.types.StructType;
public class JavaQuantileDiscretizerExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaQuantileDiscretizerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaQuantileDiscretizerExample").getOrCreate();
// $example on$
- JavaRDD<Row> jrdd = jsc.parallelize(
- Arrays.asList(
- RowFactory.create(0, 18.0),
- RowFactory.create(1, 19.0),
- RowFactory.create(2, 8.0),
- RowFactory.create(3, 5.0),
- RowFactory.create(4, 2.2)
- )
+ List<Row> data = Arrays.asList(
+ RowFactory.create(0, 18.0),
+ RowFactory.create(1, 19.0),
+ RowFactory.create(2, 8.0),
+ RowFactory.create(3, 5.0),
+ RowFactory.create(4, 2.2)
);
StructType schema = new StructType(new StructField[]{
@@ -56,7 +51,7 @@ public class JavaQuantileDiscretizerExample {
new StructField("hour", DataTypes.DoubleType, false, Metadata.empty())
});
- Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema);
+ Dataset<Row> df = spark.createDataFrame(data, schema);
QuantileDiscretizer discretizer = new QuantileDiscretizer()
.setInputCol("hour")
@@ -66,6 +61,6 @@ public class JavaQuantileDiscretizerExample {
Dataset<Row> result = discretizer.fit(df).transform(df);
result.show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
index 8c453bf80d..8282ce01d3 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
@@ -17,14 +17,12 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.RFormula;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
@@ -37,9 +35,7 @@ import static org.apache.spark.sql.types.DataTypes.*;
public class JavaRFormulaExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaRFormulaExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaRFormulaExample").getOrCreate();
// $example on$
StructType schema = createStructType(new StructField[]{
@@ -49,13 +45,13 @@ public class JavaRFormulaExample {
createStructField("clicked", DoubleType, false)
});
- JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(7, "US", 18, 1.0),
RowFactory.create(8, "CA", 12, 0.0),
RowFactory.create(9, "NZ", 15, 0.0)
- ));
+ );
- Dataset<Row> dataset = sqlContext.createDataFrame(rdd, schema);
+ Dataset<Row> dataset = spark.createDataFrame(data, schema);
RFormula formula = new RFormula()
.setFormula("clicked ~ country + hour")
.setFeaturesCol("features")
@@ -63,7 +59,7 @@ public class JavaRFormulaExample {
Dataset<Row> output = formula.fit(dataset).transform(dataset);
output.select("features", "label").show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java
index 05c2bc9622..21e783a968 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java
@@ -17,8 +17,6 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
@@ -29,19 +27,17 @@ import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
import org.apache.spark.ml.feature.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example off$
public class JavaRandomForestClassifierExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaRandomForestClassifierExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaRandomForestClassifierExample").getOrCreate();
// $example on$
// Load and parse the data file, converting it to a DataFrame.
- Dataset<Row> data =
- sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
@@ -98,6 +94,6 @@ public class JavaRandomForestClassifierExample {
System.out.println("Learned classification forest model:\n" + rfModel.toDebugString());
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java
index d366967083..ece184a878 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java
@@ -17,8 +17,6 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
@@ -30,19 +28,17 @@ import org.apache.spark.ml.regression.RandomForestRegressionModel;
import org.apache.spark.ml.regression.RandomForestRegressor;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example off$
public class JavaRandomForestRegressorExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaRandomForestRegressorExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaRandomForestRegressorExample").getOrCreate();
// $example on$
// Load and parse the data file, converting it to a DataFrame.
- Dataset<Row> data =
- sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -87,6 +83,6 @@ public class JavaRandomForestRegressorExample {
System.out.println("Learned regression forest model:\n" + rfModel.toDebugString());
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
index 7e3ca99d7c..492718bbdb 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
@@ -19,36 +19,31 @@ package org.apache.spark.examples.ml;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.feature.SQLTransformer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.*;
// $example off$
public class JavaSQLTransformerExample {
public static void main(String[] args) {
-
- SparkConf conf = new SparkConf().setAppName("JavaSQLTransformerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaSQLTransformerExample").getOrCreate();
// $example on$
- JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(0, 1.0, 3.0),
RowFactory.create(2, 2.0, 5.0)
- ));
+ );
StructType schema = new StructType(new StructField [] {
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()),
new StructField("v2", DataTypes.DoubleType, false, Metadata.empty())
});
- Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema);
+ Dataset<Row> df = spark.createDataFrame(data, schema);
SQLTransformer sqlTrans = new SQLTransformer().setStatement(
"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__");
@@ -56,6 +51,6 @@ public class JavaSQLTransformerExample {
sqlTrans.transform(df).show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
index cb911ef5ef..f906843640 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
@@ -21,8 +21,6 @@ import java.util.List;
import com.google.common.collect.Lists;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.classification.LogisticRegressionModel;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.classification.LogisticRegression;
@@ -30,7 +28,7 @@ import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
/**
* A simple example demonstrating ways to specify parameters for Estimators and Transformers.
@@ -42,9 +40,7 @@ import org.apache.spark.sql.SQLContext;
public class JavaSimpleParamsExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaSimpleParamsExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaSimpleParamsExample").getOrCreate();
// Prepare training data.
// We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
@@ -55,7 +51,7 @@ public class JavaSimpleParamsExample {
new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)));
Dataset<Row> training =
- jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class);
+ spark.createDataFrame(localTraining, LabeledPoint.class);
// Create a LogisticRegression instance. This instance is an Estimator.
LogisticRegression lr = new LogisticRegression();
@@ -96,7 +92,7 @@ public class JavaSimpleParamsExample {
new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)));
- Dataset<Row> test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class);
+ Dataset<Row> test = spark.createDataFrame(localTest, LabeledPoint.class);
// Make predictions on test documents using the Transformer.transform() method.
// LogisticRegressionModel.transform will only use the 'features' column.
@@ -109,6 +105,6 @@ public class JavaSimpleParamsExample {
+ ", prediction=" + r.get(3));
}
- jsc.stop();
+ spark.stop();
}
}
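
JavaSimpleParamsExample (and the text-classification pipeline that follows) uses the bean-class overload instead of a Row schema: a java.util.List of JavaBean-style objects plus the bean class. A minimal sketch, assuming Spark 2.0's SparkSession API and reusing mllib's LabeledPoint as the bean, as the example above does (class name and sample points are illustrative):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CreateDataFrameFromBeansSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession
      .builder().appName("CreateDataFrameFromBeansSketch").getOrCreate();

    // The list of beans is passed directly; the old jsc.parallelize(...) wrapper is gone.
    List<LabeledPoint> localTraining = Arrays.asList(
      new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
      new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)));

    Dataset<Row> training = spark.createDataFrame(localTraining, LabeledPoint.class);
    training.show();

    spark.stop();
  }
}
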
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
index a18a60f448..9516ce1f4f 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
@@ -21,8 +21,6 @@ import java.util.List;
import com.google.common.collect.Lists;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
@@ -31,7 +29,7 @@ import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
/**
* A simple text classification pipeline that recognizes "spark" from input text. It uses the Java
@@ -44,9 +42,8 @@ import org.apache.spark.sql.SQLContext;
public class JavaSimpleTextClassificationPipeline {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaSimpleTextClassificationPipeline");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession
+ .builder().appName("JavaSimpleTextClassificationPipeline").getOrCreate();
// Prepare training documents, which are labeled.
List<LabeledDocument> localTraining = Lists.newArrayList(
@@ -55,7 +52,7 @@ public class JavaSimpleTextClassificationPipeline {
new LabeledDocument(2L, "spark f g h", 1.0),
new LabeledDocument(3L, "hadoop mapreduce", 0.0));
Dataset<Row> training =
- jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class);
+ spark.createDataFrame(localTraining, LabeledDocument.class);
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
Tokenizer tokenizer = new Tokenizer()
@@ -80,7 +77,7 @@ public class JavaSimpleTextClassificationPipeline {
new Document(5L, "l m n"),
new Document(6L, "spark hadoop spark"),
new Document(7L, "apache hadoop"));
- Dataset<Row> test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class);
+ Dataset<Row> test = spark.createDataFrame(localTest, Document.class);
// Make predictions on test documents.
Dataset<Row> predictions = model.transform(test);
@@ -89,6 +86,6 @@ public class JavaSimpleTextClassificationPipeline {
+ ", prediction=" + r.get(3));
}
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java
index e2dd759c0a..10f82f2233 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java
@@ -17,9 +17,7 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import org.apache.spark.ml.feature.StandardScaler;
@@ -30,12 +28,11 @@ import org.apache.spark.sql.Row;
public class JavaStandardScalerExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaStandardScalerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaStandardScalerExample").getOrCreate();
// $example on$
- Dataset<Row> dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ Dataset<Row> dataFrame =
+ spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
StandardScaler scaler = new StandardScaler()
.setInputCol("features")
@@ -50,6 +47,6 @@ public class JavaStandardScalerExample {
Dataset<Row> scaledData = scalerModel.transform(dataFrame);
scaledData.show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
index 0ff3782cb3..23ed071c9f 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
@@ -17,14 +17,12 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.StopWordsRemover;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
@@ -38,28 +36,26 @@ import org.apache.spark.sql.types.StructType;
public class JavaStopWordsRemoverExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaStopWordsRemoverExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaStopWordsRemoverExample").getOrCreate();
// $example on$
StopWordsRemover remover = new StopWordsRemover()
.setInputCol("raw")
.setOutputCol("filtered");
- JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField(
"raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
});
- Dataset<Row> dataset = jsql.createDataFrame(rdd, schema);
+ Dataset<Row> dataset = spark.createDataFrame(data, schema);
remover.transform(dataset).show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
index ceacbb4fb3..d4c2cf96a7 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
@@ -17,14 +17,12 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.StringIndexer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
@@ -37,30 +35,28 @@ import static org.apache.spark.sql.types.DataTypes.*;
public class JavaStringIndexerExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaStringIndexerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaStringIndexerExample").getOrCreate();
// $example on$
- JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(0, "a"),
RowFactory.create(1, "b"),
RowFactory.create(2, "c"),
RowFactory.create(3, "a"),
RowFactory.create(4, "a"),
RowFactory.create(5, "c")
- ));
+ );
StructType schema = new StructType(new StructField[]{
createStructField("id", IntegerType, false),
createStructField("category", StringType, false)
});
- Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema);
+ Dataset<Row> df = spark.createDataFrame(data, schema);
StringIndexer indexer = new StringIndexer()
.setInputCol("category")
.setOutputCol("categoryIndex");
Dataset<Row> indexed = indexer.fit(df).transform(df);
indexed.show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
index 107c835f2e..a816991777 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
@@ -19,10 +19,8 @@ package org.apache.spark.examples.ml;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.IDF;
import org.apache.spark.ml.feature.IDFModel;
@@ -31,7 +29,7 @@ import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
@@ -40,21 +38,19 @@ import org.apache.spark.sql.types.StructType;
public class JavaTfIdfExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaTfIdfExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaTfIdfExample").getOrCreate();
// $example on$
- JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(0, "Hi I heard about Spark"),
RowFactory.create(0, "I wish Java could use case classes"),
RowFactory.create(1, "Logistic regression models are neat")
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
});
- Dataset<Row> sentenceData = sqlContext.createDataFrame(jrdd, schema);
+ Dataset<Row> sentenceData = spark.createDataFrame(data, schema);
Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
Dataset<Row> wordsData = tokenizer.transform(sentenceData);
int numFeatures = 20;
@@ -76,6 +72,6 @@ public class JavaTfIdfExample {
}
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
index 9225fe2262..a65735a5e5 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
@@ -17,14 +17,12 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
@@ -38,23 +36,21 @@ import org.apache.spark.sql.types.StructType;
public class JavaTokenizerExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaTokenizerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaTokenizerExample").getOrCreate();
// $example on$
- JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(0, "Hi I heard about Spark"),
RowFactory.create(1, "I wish Java could use case classes"),
RowFactory.create(2, "Logistic,regression,models,are,neat")
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
});
- Dataset<Row> sentenceDataFrame = sqlContext.createDataFrame(jrdd, schema);
+ Dataset<Row> sentenceDataFrame = spark.createDataFrame(data, schema);
Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
@@ -70,6 +66,6 @@ public class JavaTokenizerExample {
.setOutputCol("words")
.setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
index 953ad455b1..9569bc2412 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
@@ -17,14 +17,11 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.Vectors;
@@ -38,9 +35,7 @@ import static org.apache.spark.sql.types.DataTypes.*;
public class JavaVectorAssemblerExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaVectorAssemblerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaVectorAssemblerExample").getOrCreate();
// $example on$
StructType schema = createStructType(new StructField[]{
@@ -51,8 +46,7 @@ public class JavaVectorAssemblerExample {
createStructField("clicked", DoubleType, false)
});
Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
- JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row));
- Dataset<Row> dataset = sqlContext.createDataFrame(rdd, schema);
+ Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);
VectorAssembler assembler = new VectorAssembler()
.setInputCols(new String[]{"hour", "mobile", "userFeatures"})
@@ -61,7 +55,7 @@ public class JavaVectorAssemblerExample {
Dataset<Row> output = assembler.transform(dataset);
System.out.println(output.select("features", "clicked").first());
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java
index b3b5953ee7..217d5a06d1 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java
@@ -17,9 +17,7 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Map;
@@ -32,12 +30,10 @@ import org.apache.spark.sql.Row;
public class JavaVectorIndexerExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaVectorIndexerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaVectorIndexerExample").getOrCreate();
// $example on$
- Dataset<Row> data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
VectorIndexer indexer = new VectorIndexer()
.setInputCol("features")
@@ -57,6 +53,6 @@ public class JavaVectorIndexerExample {
Dataset<Row> indexedData = indexerModel.transform(data);
indexedData.show();
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
index 2ae57c3577..4f1ea824a3 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
@@ -17,14 +17,13 @@
package org.apache.spark.examples.ml;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
// $example on$
+import java.util.List;
+
import com.google.common.collect.Lists;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.attribute.Attribute;
import org.apache.spark.ml.attribute.AttributeGroup;
import org.apache.spark.ml.attribute.NumericAttribute;
@@ -38,9 +37,7 @@ import org.apache.spark.sql.types.*;
public class JavaVectorSlicerExample {
public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("JavaVectorSlicerExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext jsql = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaVectorSlicerExample").getOrCreate();
// $example on$
Attribute[] attrs = new Attribute[]{
@@ -50,13 +47,13 @@ public class JavaVectorSlicerExample {
};
AttributeGroup group = new AttributeGroup("userFeatures", attrs);
- JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
+ List<Row> data = Lists.newArrayList(
RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
- ));
+ );
Dataset<Row> dataset =
- jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField()));
+ spark.createDataFrame(data, (new StructType()).add(group.toStructField()));
VectorSlicer vectorSlicer = new VectorSlicer()
.setInputCol("userFeatures").setOutputCol("features");
@@ -68,7 +65,7 @@ public class JavaVectorSlicerExample {
System.out.println(output.select("userFeatures", "features").first());
// $example off$
- jsc.stop();
+ spark.stop();
}
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
index c5bb1eaaa3..d9b1a79b52 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
@@ -19,37 +19,32 @@ package org.apache.spark.examples.ml;
// $example on$
import java.util.Arrays;
+import java.util.List;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.feature.Word2Vec;
import org.apache.spark.ml.feature.Word2VecModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.*;
// $example off$
public class JavaWord2VecExample {
public static void main(String[] args) {
-
- SparkConf conf = new SparkConf().setAppName("JavaWord2VecExample");
- JavaSparkContext jsc = new JavaSparkContext(conf);
- SQLContext sqlContext = new SQLContext(jsc);
+ SparkSession spark = SparkSession.builder().appName("JavaWord2VecExample").getOrCreate();
// $example on$
// Input data: Each row is a bag of words from a sentence or document.
- JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ List<Row> data = Arrays.asList(
RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
- ));
+ );
StructType schema = new StructType(new StructField[]{
new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
});
- Dataset<Row> documentDF = sqlContext.createDataFrame(jrdd, schema);
+ Dataset<Row> documentDF = spark.createDataFrame(data, schema);
// Learn a mapping from words to Vectors.
Word2Vec word2Vec = new Word2Vec()
@@ -64,6 +59,6 @@ public class JavaWord2VecExample {
}
// $example off$
- jsc.stop();
+ spark.stop();
}
}
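
The Word2Vec hunk above also shows the second recurring Java change: a local list plus an explicit schema now goes straight into createDataFrame, so the parallelize()/JavaRDD step disappears. A hedged Python sketch of the same shape (the session name is illustrative; the sentences mirror the ones in the hunk):

    from pyspark.sql import SparkSession
    from pyspark.sql.types import ArrayType, StringType, StructField, StructType

    spark = SparkSession.builder.appName("SchemaSketch").getOrCreate()

    data = [("Hi I heard about Spark".split(" "),),
            ("I wish Java could use case classes".split(" "),)]
    schema = StructType([StructField("text", ArrayType(StringType()), False)])

    # A local list plus an explicit schema is handed directly to the session;
    # no RDD needs to be created first.
    documentDF = spark.createDataFrame(data, schema)
    documentDF.show(truncate=False)
    spark.stop()
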
diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
index 354a5306ed..ec2142e756 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
@@ -21,14 +21,12 @@ import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
-import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
public class JavaSparkSQL {
public static class Person implements Serializable {
@@ -53,13 +51,12 @@ public class JavaSparkSQL {
}
public static void main(String[] args) throws Exception {
- SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL");
- JavaSparkContext ctx = new JavaSparkContext(sparkConf);
- SQLContext sqlContext = new SQLContext(ctx);
+ SparkSession spark = SparkSession.builder().appName("JavaSparkSQL").getOrCreate();
System.out.println("=== Data source: RDD ===");
// Load a text file and convert each line to a Java Bean.
- JavaRDD<Person> people = ctx.textFile("examples/src/main/resources/people.txt").map(
+ String file = "examples/src/main/resources/people.txt";
+ JavaRDD<Person> people = spark.read().text(file).javaRDD().map(
new Function<String, Person>() {
@Override
public Person call(String line) {
@@ -74,12 +71,11 @@ public class JavaSparkSQL {
});
// Apply a schema to an RDD of Java Beans and register it as a table.
- Dataset<Row> schemaPeople = sqlContext.createDataFrame(people, Person.class);
+ Dataset<Row> schemaPeople = spark.createDataFrame(people, Person.class);
schemaPeople.registerTempTable("people");
// SQL can be run over RDDs that have been registered as tables.
- Dataset<Row> teenagers =
- sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
+ Dataset<Row> teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
// The results of SQL queries are DataFrames and support all the normal RDD operations.
// The columns of a row in the result can be accessed by ordinal.
@@ -100,12 +96,12 @@ public class JavaSparkSQL {
// Read in the parquet file created above.
// Parquet files are self-describing so the schema is preserved.
// The result of loading a parquet file is also a DataFrame.
- Dataset<Row> parquetFile = sqlContext.read().parquet("people.parquet");
+ Dataset<Row> parquetFile = spark.read().parquet("people.parquet");
    // Parquet files can also be registered as tables and then used in SQL statements.
parquetFile.registerTempTable("parquetFile");
Dataset<Row> teenagers2 =
- sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
+ spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() {
@Override
public String call(Row row) {
@@ -121,7 +117,7 @@ public class JavaSparkSQL {
// The path can be either a single text file or a directory storing text files.
String path = "examples/src/main/resources/people.json";
// Create a DataFrame from the file(s) pointed by path
- Dataset<Row> peopleFromJsonFile = sqlContext.read().json(path);
+ Dataset<Row> peopleFromJsonFile = spark.read().json(path);
    // Because the schema of a JSON dataset is automatically inferred, to write queries,
    // it is better to take a look at what the schema is.
@@ -135,8 +131,7 @@ public class JavaSparkSQL {
peopleFromJsonFile.registerTempTable("people");
// SQL statements can be run by using the sql methods provided by sqlContext.
- Dataset<Row> teenagers3 =
- sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
+ Dataset<Row> teenagers3 = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
    // The results of SQL queries are DataFrames and support all the normal RDD operations.
// The columns of a row in the result can be accessed by ordinal.
@@ -152,8 +147,8 @@ public class JavaSparkSQL {
    // an RDD[String] storing one JSON object per string.
List<String> jsonData = Arrays.asList(
"{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
- JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
- Dataset<Row> peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
+ JavaRDD<String> anotherPeopleRDD = spark.createDataFrame(jsonData, String.class).toJSON().javaRDD();
+ Dataset<Row> peopleFromJsonRDD = spark.read().json(anotherPeopleRDD);
// Take a look at the schema of this new DataFrame.
peopleFromJsonRDD.printSchema();
@@ -166,7 +161,7 @@ public class JavaSparkSQL {
peopleFromJsonRDD.registerTempTable("people2");
- Dataset<Row> peopleWithCity = sqlContext.sql("SELECT name, address.city FROM people2");
+ Dataset<Row> peopleWithCity = spark.sql("SELECT name, address.city FROM people2");
List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() {
@Override
public String call(Row row) {
@@ -177,6 +172,6 @@ public class JavaSparkSQL {
System.out.println(name);
}
- ctx.stop();
+ spark.stop();
}
}
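
In JavaSparkSQL everything now flows through the session: files are read with spark.read() and queries run with spark.sql(), while the registered temp tables stay as before. A small Python sketch of that flow, assuming a Spark version in which registerTempTable is still available (it is the call the hunks use; later releases spell it createOrReplaceTempView), with illustrative rows in place of people.txt:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("SparkSQLSketch").getOrCreate()

    # DataFrames register as tables and are queried through the session itself.
    people = spark.createDataFrame([("Michael", 29), ("Andy", 30), ("Justin", 19)],
                                   ["name", "age"])
    people.registerTempTable("people")

    teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
    for name in teenagers.rdd.map(lambda row: "Name: " + row.name).collect():
        print(name)

    spark.stop()
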
diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java
index 7aa8862761..44f1e800fe 100644
--- a/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java
+++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java
@@ -22,14 +22,13 @@ import java.util.Iterator;
import java.util.regex.Pattern;
import org.apache.spark.SparkConf;
-import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction2;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
import org.apache.spark.api.java.StorageLevels;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.Time;
@@ -82,7 +81,7 @@ public final class JavaSqlNetworkWordCount {
words.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() {
@Override
public void call(JavaRDD<String> rdd, Time time) {
- SQLContext sqlContext = JavaSQLContextSingleton.getInstance(rdd.context());
+ SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
// Convert JavaRDD[String] to JavaRDD[bean class] to DataFrame
JavaRDD<JavaRecord> rowRDD = rdd.map(new Function<String, JavaRecord>() {
@@ -93,14 +92,14 @@ public final class JavaSqlNetworkWordCount {
return record;
}
});
- Dataset<Row> wordsDataFrame = sqlContext.createDataFrame(rowRDD, JavaRecord.class);
+ Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, JavaRecord.class);
// Register as table
wordsDataFrame.registerTempTable("words");
// Do word count on table using SQL and print it
Dataset<Row> wordCountsDataFrame =
- sqlContext.sql("select word, count(*) as total from words group by word");
+ spark.sql("select word, count(*) as total from words group by word");
System.out.println("========= " + time + "=========");
wordCountsDataFrame.show();
}
@@ -111,12 +110,12 @@ public final class JavaSqlNetworkWordCount {
}
}
-/** Lazily instantiated singleton instance of SQLContext */
-class JavaSQLContextSingleton {
- private static transient SQLContext instance = null;
- public static SQLContext getInstance(SparkContext sparkContext) {
+/** Lazily instantiated singleton instance of SparkSession */
+class JavaSparkSessionSingleton {
+ private static transient SparkSession instance = null;
+ public static SparkSession getInstance(SparkConf sparkConf) {
if (instance == null) {
- instance = new SQLContext(sparkContext);
+ instance = SparkSession.builder().config(sparkConf).getOrCreate();
}
return instance;
}
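
The singleton above lazily builds one SparkSession from the SparkConf handed in by the streaming code, creating it only on first use. A rough Python sketch of the same idea (the helper name and the globals() cache are assumptions for illustration, not the code used in sql_network_wordcount.py):

    from pyspark.sql import SparkSession

    def get_spark_session_instance(spark_conf):
        """Lazily build (or reuse) one SparkSession per process, mirroring
        JavaSparkSessionSingleton.getInstance(SparkConf) above."""
        if "_spark_session_singleton" not in globals():
            globals()["_spark_session_singleton"] = SparkSession \
                .builder \
                .config(conf=spark_conf) \
                .getOrCreate()
        return globals()["_spark_session_singleton"]

    # Inside a foreachRDD callback this would be called as, e.g.:
    #   spark = get_spark_session_instance(rdd.context.getConf())
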
diff --git a/examples/src/main/python/ml/als_example.py b/examples/src/main/python/ml/als_example.py
index 0c9ac583b2..e36444f185 100644
--- a/examples/src/main/python/ml/als_example.py
+++ b/examples/src/main/python/ml/als_example.py
@@ -21,8 +21,7 @@ import sys
if sys.version >= '3':
long = int
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.evaluation import RegressionEvaluator
@@ -31,15 +30,14 @@ from pyspark.sql import Row
# $example off$
if __name__ == "__main__":
- sc = SparkContext(appName="ALSExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("ALSExample").getOrCreate()
# $example on$
- lines = sc.textFile("data/mllib/als/sample_movielens_ratings.txt")
- parts = lines.map(lambda l: l.split("::"))
+ lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
+ parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
rating=float(p[2]), timestamp=long(p[3])))
- ratings = sqlContext.createDataFrame(ratingsRDD)
+ ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])
# Build the recommendation model using ALS on the training data
@@ -56,4 +54,4 @@ if __name__ == "__main__":
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))
# $example off$
- sc.stop()
+ spark.stop()
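
The Python examples that used sc.textFile now read through the session and then drop back to an RDD, so each element is a Row and the parsing lambdas switch from l.split(...) to row.value.split(...). A self-contained sketch of that shape (the temp file and its two ratings lines are stand-ins for the MovieLens data, not part of the diff):

    import os
    import tempfile

    from pyspark.sql import Row, SparkSession

    spark = SparkSession.builder.appName("ReadTextSketch").getOrCreate()

    # Stand-in for data/mllib/als/sample_movielens_ratings.txt.
    path = os.path.join(tempfile.mkdtemp(), "ratings.txt")
    with open(path, "w") as f:
        f.write("0::2::3.0::1424380312\n1::3::1.0::1424380312\n")

    # spark.read.text() yields a DataFrame with a single 'value' column; .rdd
    # exposes it as an RDD[Row], which is why the lambda splits row.value.
    lines = spark.read.text(path).rdd
    parts = lines.map(lambda row: row.value.split("::"))
    ratings = spark.createDataFrame(parts.map(
        lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                      rating=float(p[2]), timestamp=int(p[3]))))
    ratings.show()
    spark.stop()
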
diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py
index 317cfa638a..072187e645 100644
--- a/examples/src/main/python/ml/binarizer_example.py
+++ b/examples/src/main/python/ml/binarizer_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Binarizer
# $example off$
if __name__ == "__main__":
- sc = SparkContext(appName="BinarizerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("BinarizerExample").getOrCreate()
# $example on$
- continuousDataFrame = sqlContext.createDataFrame([
+ continuousDataFrame = spark.createDataFrame([
(0, 0.1),
(1, 0.8),
(2, 0.2)
@@ -40,4 +38,4 @@ if __name__ == "__main__":
print(binarized_feature)
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py
index e6f6bfd7e8..836a89cde0 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -17,28 +17,26 @@
from __future__ import print_function
-from pyspark import SparkContext
# $example on$
from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel
from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors
from pyspark.mllib.linalg import Vectors
from pyspark.sql.types import Row
# $example off$
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
"""
A simple example demonstrating a bisecting k-means clustering.
"""
if __name__ == "__main__":
-
- sc = SparkContext(appName="PythonBisectingKMeansExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("PythonBisectingKMeansExample").getOrCreate()
# $example on$
- data = sc.textFile("data/mllib/kmeans_data.txt")
- parsed = data.map(lambda l: Row(features=Vectors.dense([float(x) for x in l.split(' ')])))
- training = sqlContext.createDataFrame(parsed)
+ data = spark.read.text("data/mllib/kmeans_data.txt").rdd
+ parsed = data\
+ .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')])))
+ training = spark.createDataFrame(parsed)
kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")
@@ -54,4 +52,4 @@ if __name__ == "__main__":
print(center)
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py
index 4304255f35..288ec62bdf 100644
--- a/examples/src/main/python/ml/bucketizer_example.py
+++ b/examples/src/main/python/ml/bucketizer_example.py
@@ -17,21 +17,19 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Bucketizer
# $example off$
if __name__ == "__main__":
- sc = SparkContext(appName="BucketizerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("BucketizerExample").getOrCreate()
# $example on$
splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
- dataFrame = sqlContext.createDataFrame(data, ["features"])
+ dataFrame = spark.createDataFrame(data, ["features"])
bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
@@ -40,4 +38,4 @@ if __name__ == "__main__":
bucketedData.show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/chisq_selector_example.py b/examples/src/main/python/ml/chisq_selector_example.py
index 997a504735..8f58fc28de 100644
--- a/examples/src/main/python/ml/chisq_selector_example.py
+++ b/examples/src/main/python/ml/chisq_selector_example.py
@@ -17,19 +17,17 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ChiSqSelector
from pyspark.mllib.linalg import Vectors
# $example off$
if __name__ == "__main__":
- sc = SparkContext(appName="ChiSqSelectorExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("ChiSqSelectorExample").getOrCreate()
# $example on$
- df = sqlContext.createDataFrame([
+ df = spark.createDataFrame([
(7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,),
(8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,),
(9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,)], ["id", "features", "clicked"])
@@ -41,4 +39,4 @@ if __name__ == "__main__":
result.show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/count_vectorizer_example.py b/examples/src/main/python/ml/count_vectorizer_example.py
index e839f645f7..9dbf9959d1 100644
--- a/examples/src/main/python/ml/count_vectorizer_example.py
+++ b/examples/src/main/python/ml/count_vectorizer_example.py
@@ -17,19 +17,17 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import CountVectorizer
# $example off$
if __name__ == "__main__":
- sc = SparkContext(appName="CountVectorizerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("CountVectorizerExample").getOrCreate()
# $example on$
    # Input data: Each row is a bag of words with an ID.
- df = sqlContext.createDataFrame([
+ df = spark.createDataFrame([
(0, "a b c".split(" ")),
(1, "a b b c a".split(" "))
], ["id", "words"])
@@ -41,4 +39,4 @@ if __name__ == "__main__":
result.show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/cross_validator.py b/examples/src/main/python/ml/cross_validator.py
index 5f0ef20218..a61d0f63d2 100644
--- a/examples/src/main/python/ml/cross_validator.py
+++ b/examples/src/main/python/ml/cross_validator.py
@@ -17,15 +17,14 @@
from __future__ import print_function
-from pyspark import SparkContext
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
-from pyspark.sql import Row, SQLContext
# $example off$
+from pyspark.sql import Row, SparkSession
"""
A simple example demonstrating model selection using CrossValidator.
@@ -36,25 +35,23 @@ Run with:
"""
if __name__ == "__main__":
- sc = SparkContext(appName="CrossValidatorExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("CrossValidatorExample").getOrCreate()
# $example on$
# Prepare training documents, which are labeled.
- LabeledDocument = Row("id", "text", "label")
- training = sc.parallelize([(0, "a b c d e spark", 1.0),
- (1, "b d", 0.0),
- (2, "spark f g h", 1.0),
- (3, "hadoop mapreduce", 0.0),
- (4, "b spark who", 1.0),
- (5, "g d a y", 0.0),
- (6, "spark fly", 1.0),
- (7, "was mapreduce", 0.0),
- (8, "e spark program", 1.0),
- (9, "a e c l", 0.0),
- (10, "spark compile", 1.0),
- (11, "hadoop software", 0.0)
- ]) \
- .map(lambda x: LabeledDocument(*x)).toDF()
+ training = spark.createDataFrame([
+ (0, "a b c d e spark", 1.0),
+ (1, "b d", 0.0),
+ (2, "spark f g h", 1.0),
+ (3, "hadoop mapreduce", 0.0),
+ (4, "b spark who", 1.0),
+ (5, "g d a y", 0.0),
+ (6, "spark fly", 1.0),
+ (7, "was mapreduce", 0.0),
+ (8, "e spark program", 1.0),
+ (9, "a e c l", 0.0),
+ (10, "spark compile", 1.0),
+ (11, "hadoop software", 0.0)
+ ], ["id", "text", "label"])
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
@@ -82,12 +79,12 @@ if __name__ == "__main__":
cvModel = crossval.fit(training)
# Prepare test documents, which are unlabeled.
- Document = Row("id", "text")
- test = sc.parallelize([(4L, "spark i j k"),
- (5L, "l m n"),
- (6L, "mapreduce spark"),
- (7L, "apache hadoop")]) \
- .map(lambda x: Document(*x)).toDF()
+ test = spark.createDataFrame([
+ (4L, "spark i j k"),
+ (5L, "l m n"),
+ (6L, "mapreduce spark"),
+ (7L, "apache hadoop")
+ ], ["id", "text"])
# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
@@ -96,4 +93,4 @@ if __name__ == "__main__":
print(row)
# $example off$
- sc.stop()
+ spark.stop()
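
cross_validator.py shows the other recurring Python rewrite: instead of parallelizing tuples, wrapping them in a Row class, and calling toDF(), the examples now pass the local tuples and column names directly to createDataFrame. A minimal sketch using two of the rows from the hunk above (the app name is illustrative):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("LocalDataSketch").getOrCreate()

    # Old shape: sc.parallelize(tuples).map(lambda x: LabeledDocument(*x)).toDF()
    # New shape: hand the local tuples and the column names to the session directly.
    training = spark.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
    ], ["id", "text", "label"])
    training.show()
    spark.stop()
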
diff --git a/examples/src/main/python/ml/dataframe_example.py b/examples/src/main/python/ml/dataframe_example.py
index d2644ca335..b3e671038e 100644
--- a/examples/src/main/python/ml/dataframe_example.py
+++ b/examples/src/main/python/ml/dataframe_example.py
@@ -26,16 +26,14 @@ import sys
import tempfile
import shutil
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
from pyspark.mllib.stat import Statistics
if __name__ == "__main__":
if len(sys.argv) > 2:
print("Usage: dataframe_example.py <libsvm file>", file=sys.stderr)
exit(-1)
- sc = SparkContext(appName="DataFrameExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("DataFrameExample").getOrCreate()
if len(sys.argv) == 2:
input = sys.argv[1]
else:
@@ -43,7 +41,7 @@ if __name__ == "__main__":
# Load input data
print("Loading LIBSVM file with UDT from " + input + ".")
- df = sqlContext.read.format("libsvm").load(input).cache()
+ df = spark.read.format("libsvm").load(input).cache()
print("Schema from LIBSVM:")
df.printSchema()
print("Loaded training data as a DataFrame with " +
@@ -54,7 +52,7 @@ if __name__ == "__main__":
labelSummary.show()
# Convert features column to an RDD of vectors.
- features = df.select("features").map(lambda r: r.features)
+ features = df.select("features").rdd.map(lambda r: r.features)
summary = Statistics.colStats(features)
print("Selected features column with average values:\n" +
str(summary.mean()))
@@ -67,9 +65,9 @@ if __name__ == "__main__":
# Load the records back.
print("Loading Parquet file with UDT from " + tempdir)
- newDF = sqlContext.read.parquet(tempdir)
+ newDF = spark.read.parquet(tempdir)
print("Schema from Parquet:")
newDF.printSchema()
shutil.rmtree(tempdir)
- sc.stop()
+ spark.stop()
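
dataframe_example.py also picks up a behavioural difference: DataFrames built through SparkSession are not directly mappable in PySpark 2.x, so the features column is bridged through .rdd before the map. A short sketch of that adjustment (the two toy vectors are illustrative):

    from pyspark.mllib.linalg import Vectors
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("RddBridgeSketch").getOrCreate()

    df = spark.createDataFrame([(Vectors.dense([1.0, 2.0]),),
                                (Vectors.dense([3.0, 4.0]),)], ["features"])

    # A DataFrame no longer exposes map() directly; drop to the underlying RDD
    # first, exactly as the hunk does with .rdd.map(...).
    features = df.select("features").rdd.map(lambda r: r.features)
    print(features.collect())

    spark.stop()
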
diff --git a/examples/src/main/python/ml/dct_example.py b/examples/src/main/python/ml/dct_example.py
index 264d47f404..1bf8fc6d14 100644
--- a/examples/src/main/python/ml/dct_example.py
+++ b/examples/src/main/python/ml/dct_example.py
@@ -17,19 +17,17 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import DCT
from pyspark.mllib.linalg import Vectors
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="DCTExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("DCTExample").getOrCreate()
# $example on$
- df = sqlContext.createDataFrame([
+ df = spark.createDataFrame([
(Vectors.dense([0.0, 1.0, -2.0, 3.0]),),
(Vectors.dense([-1.0, 2.0, 4.0, -7.0]),),
(Vectors.dense([14.0, -2.0, -5.0, 1.0]),)], ["features"])
@@ -42,4 +40,4 @@ if __name__ == "__main__":
print(dcts)
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/decision_tree_classification_example.py b/examples/src/main/python/ml/decision_tree_classification_example.py
index 86bdc65392..d2318e2436 100644
--- a/examples/src/main/python/ml/decision_tree_classification_example.py
+++ b/examples/src/main/python/ml/decision_tree_classification_example.py
@@ -21,20 +21,19 @@ Decision Tree Classification Example.
from __future__ import print_function
# $example on$
-from pyspark import SparkContext, SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="decision_tree_classification_example")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("decision_tree_classification_example").getOrCreate()
# $example on$
# Load the data stored in LIBSVM format as a DataFrame.
- data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
@@ -72,3 +71,5 @@ if __name__ == "__main__":
# summary only
print(treeModel)
# $example off$
+
+ spark.stop()
diff --git a/examples/src/main/python/ml/decision_tree_regression_example.py b/examples/src/main/python/ml/decision_tree_regression_example.py
index 8e20d5d857..9e8cb382a9 100644
--- a/examples/src/main/python/ml/decision_tree_regression_example.py
+++ b/examples/src/main/python/ml/decision_tree_regression_example.py
@@ -20,21 +20,20 @@ Decision Tree Regression Example.
"""
from __future__ import print_function
-from pyspark import SparkContext, SQLContext
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="decision_tree_classification_example")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("decision_tree_classification_example").getOrCreate()
# $example on$
# Load the data stored in LIBSVM format as a DataFrame.
- data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
@@ -69,3 +68,5 @@ if __name__ == "__main__":
# summary only
print(treeModel)
# $example off$
+
+ spark.stop()
diff --git a/examples/src/main/python/ml/elementwise_product_example.py b/examples/src/main/python/ml/elementwise_product_example.py
index c85cb0d895..6fa641b772 100644
--- a/examples/src/main/python/ml/elementwise_product_example.py
+++ b/examples/src/main/python/ml/elementwise_product_example.py
@@ -17,23 +17,21 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import ElementwiseProduct
from pyspark.mllib.linalg import Vectors
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="ElementwiseProductExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("ElementwiseProductExample").getOrCreate()
# $example on$
data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
- df = sqlContext.createDataFrame(data, ["vector"])
+ df = spark.createDataFrame(data, ["vector"])
transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
inputCol="vector", outputCol="transformedVector")
transformer.transform(df).show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/estimator_transformer_param_example.py b/examples/src/main/python/ml/estimator_transformer_param_example.py
index 9a8993dac4..4993b5a984 100644
--- a/examples/src/main/python/ml/estimator_transformer_param_example.py
+++ b/examples/src/main/python/ml/estimator_transformer_param_example.py
@@ -18,20 +18,19 @@
"""
Estimator Transformer Param Example.
"""
-from pyspark import SparkContext, SQLContext
+
# $example on$
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
-
- sc = SparkContext(appName="EstimatorTransformerParamExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("EstimatorTransformerParamExample").getOrCreate()
# $example on$
# Prepare training data from a list of (label, features) tuples.
- training = sqlContext.createDataFrame([
+ training = spark.createDataFrame([
(1.0, Vectors.dense([0.0, 1.1, 0.1])),
(0.0, Vectors.dense([2.0, 1.0, -1.0])),
(0.0, Vectors.dense([2.0, 1.3, 1.0])),
@@ -69,7 +68,7 @@ if __name__ == "__main__":
print model2.extractParamMap()
# Prepare test data
- test = sqlContext.createDataFrame([
+ test = spark.createDataFrame([
(1.0, Vectors.dense([-1.0, 1.5, 1.3])),
(0.0, Vectors.dense([3.0, 2.0, -0.1])),
(1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"])
@@ -84,4 +83,4 @@ if __name__ == "__main__":
print row
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py
index f7e842f4b3..b09ad41da3 100644
--- a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py
+++ b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py
@@ -20,21 +20,20 @@ Gradient Boosted Tree Classifier Example.
"""
from __future__ import print_function
-from pyspark import SparkContext, SQLContext
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="gradient_boosted_tree_classifier_example")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("gradient_boosted_tree_classifier_example").getOrCreate()
# $example on$
# Load and parse the data file, converting it to a DataFrame.
- data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
@@ -72,4 +71,4 @@ if __name__ == "__main__":
print(gbtModel) # summary only
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py
index f8b4de651c..caa7cfc4e1 100644
--- a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py
+++ b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py
@@ -20,21 +20,20 @@ Gradient Boosted Tree Regressor Example.
"""
from __future__ import print_function
-from pyspark import SparkContext, SQLContext
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="gradient_boosted_tree_regressor_example")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("gradient_boosted_tree_regressor_example").getOrCreate()
# $example on$
# Load and parse the data file, converting it to a DataFrame.
- data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -69,4 +68,4 @@ if __name__ == "__main__":
print(gbtModel) # summary only
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/index_to_string_example.py b/examples/src/main/python/ml/index_to_string_example.py
index fb0ba2950b..dd04b2c4b0 100644
--- a/examples/src/main/python/ml/index_to_string_example.py
+++ b/examples/src/main/python/ml/index_to_string_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
# $example on$
from pyspark.ml.feature import IndexToString, StringIndexer
# $example off$
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="IndexToStringExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("IndexToStringExample").getOrCreate()
# $example on$
- df = sqlContext.createDataFrame(
+ df = spark.createDataFrame(
[(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
["id", "category"])
@@ -42,4 +40,4 @@ if __name__ == "__main__":
converted.select("id", "originalCategory").show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py
index fa57a4d3ad..7d9d80e645 100644
--- a/examples/src/main/python/ml/kmeans_example.py
+++ b/examples/src/main/python/ml/kmeans_example.py
@@ -20,10 +20,9 @@ from __future__ import print_function
import sys
import numpy as np
-from pyspark import SparkContext
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.mllib.linalg import VectorUDT, _convert_to_vector
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
from pyspark.sql.types import Row, StructField, StructType
"""
@@ -35,8 +34,8 @@ This example requires NumPy (http://www.numpy.org/).
"""
-def parseVector(line):
- array = np.array([float(x) for x in line.split(' ')])
+def parseVector(row):
+ array = np.array([float(x) for x in row.value.split(' ')])
return _convert_to_vector(array)
@@ -50,14 +49,13 @@ if __name__ == "__main__":
path = sys.argv[1]
k = sys.argv[2]
- sc = SparkContext(appName="PythonKMeansExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("PythonKMeansExample").getOrCreate()
- lines = sc.textFile(path)
+ lines = spark.read.text(path).rdd
data = lines.map(parseVector)
row_rdd = data.map(lambda x: Row(x))
schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)])
- df = sqlContext.createDataFrame(row_rdd, schema)
+ df = spark.createDataFrame(row_rdd, schema)
kmeans = KMeans().setK(2).setSeed(1).setFeaturesCol(FEATURES_COL)
model = kmeans.fit(df)
@@ -67,4 +65,4 @@ if __name__ == "__main__":
for center in centers:
print(center)
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/linear_regression_with_elastic_net.py b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
index a4cd40cf26..99b7f7fe99 100644
--- a/examples/src/main/python/ml/linear_regression_with_elastic_net.py
+++ b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
@@ -17,19 +17,17 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.regression import LinearRegression
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="LinearRegressionWithElasticNet")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("LinearRegressionWithElasticNet").getOrCreate()
# $example on$
# Load training data
- training = sqlContext.read.format("libsvm")\
+ training = spark.read.format("libsvm")\
.load("data/mllib/sample_linear_regression_data.txt")
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
@@ -42,4 +40,4 @@ if __name__ == "__main__":
print("Intercept: " + str(lrModel.intercept))
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/logistic_regression_with_elastic_net.py b/examples/src/main/python/ml/logistic_regression_with_elastic_net.py
index b0b1d27e13..0d7112e723 100644
--- a/examples/src/main/python/ml/logistic_regression_with_elastic_net.py
+++ b/examples/src/main/python/ml/logistic_regression_with_elastic_net.py
@@ -17,19 +17,17 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.classification import LogisticRegression
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="LogisticRegressionWithElasticNet")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("LogisticRegressionWithElasticNet").getOrCreate()
# $example on$
# Load training data
- training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
@@ -41,4 +39,4 @@ if __name__ == "__main__":
print("Intercept: " + str(lrModel.intercept))
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/max_abs_scaler_example.py b/examples/src/main/python/ml/max_abs_scaler_example.py
index d9b69eef1c..1cb95a98f0 100644
--- a/examples/src/main/python/ml/max_abs_scaler_example.py
+++ b/examples/src/main/python/ml/max_abs_scaler_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import MaxAbsScaler
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="MaxAbsScalerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("MaxAbsScalerExample").getOrCreate()
# $example on$
- dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
@@ -40,4 +38,4 @@ if __name__ == "__main__":
scaledData.show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/min_max_scaler_example.py b/examples/src/main/python/ml/min_max_scaler_example.py
index 2f8e4ade46..8d91a59e2b 100644
--- a/examples/src/main/python/ml/min_max_scaler_example.py
+++ b/examples/src/main/python/ml/min_max_scaler_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import MinMaxScaler
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="MinMaxScalerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("MinMaxScalerExample").getOrCreate()
# $example on$
- dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
@@ -40,4 +38,4 @@ if __name__ == "__main__":
scaledData.show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py
index f84588f547..8bededc14d 100644
--- a/examples/src/main/python/ml/multilayer_perceptron_classification.py
+++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py
@@ -17,21 +17,19 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
-
- sc = SparkContext(appName="multilayer_perceptron_classification_example")
- sqlContext = SQLContext(sc)
+ spark = SparkSession\
+ .builder.appName("multilayer_perceptron_classification_example").getOrCreate()
# $example on$
# Load training data
- data = sqlContext.read.format("libsvm")\
+ data = spark.read.format("libsvm")\
.load("data/mllib/sample_multiclass_classification_data.txt")
# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
@@ -52,4 +50,4 @@ if __name__ == "__main__":
print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py
index f2d85f53e7..b7fecf0d68 100644
--- a/examples/src/main/python/ml/n_gram_example.py
+++ b/examples/src/main/python/ml/n_gram_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import NGram
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="NGramExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("NGramExample").getOrCreate()
# $example on$
- wordDataFrame = sqlContext.createDataFrame([
+ wordDataFrame = spark.createDataFrame([
(0, ["Hi", "I", "heard", "about", "Spark"]),
(1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
(2, ["Logistic", "regression", "models", "are", "neat"])
@@ -39,4 +37,4 @@ if __name__ == "__main__":
print(ngrams_label)
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py
index db8fbea9bf..e37035542c 100644
--- a/examples/src/main/python/ml/naive_bayes_example.py
+++ b/examples/src/main/python/ml/naive_bayes_example.py
@@ -17,21 +17,18 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
-
- sc = SparkContext(appName="naive_bayes_example")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("naive_bayes_example").getOrCreate()
# $example on$
# Load training data
- data = sqlContext.read.format("libsvm") \
+ data = spark.read.format("libsvm") \
.load("data/mllib/sample_libsvm_data.txt")
# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
@@ -50,4 +47,4 @@ if __name__ == "__main__":
print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py
index d490221474..ae25537619 100644
--- a/examples/src/main/python/ml/normalizer_example.py
+++ b/examples/src/main/python/ml/normalizer_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Normalizer
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="NormalizerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("NormalizerExample").getOrCreate()
# $example on$
- dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
@@ -40,4 +38,4 @@ if __name__ == "__main__":
lInfNormData.show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py
index 0f94c26638..9acc363dc9 100644
--- a/examples/src/main/python/ml/onehot_encoder_example.py
+++ b/examples/src/main/python/ml/onehot_encoder_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import OneHotEncoder, StringIndexer
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="OneHotEncoderExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("OneHotEncoderExample").getOrCreate()
# $example on$
- df = sqlContext.createDataFrame([
+ df = spark.createDataFrame([
(0, "a"),
(1, "b"),
(2, "c"),
@@ -45,4 +43,4 @@ if __name__ == "__main__":
encoded.select("id", "categoryVec").show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py
index a17181f1b8..adab151734 100644
--- a/examples/src/main/python/ml/pca_example.py
+++ b/examples/src/main/python/ml/pca_example.py
@@ -17,26 +17,24 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import PCA
from pyspark.mllib.linalg import Vectors
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="PCAExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("PCAExample").getOrCreate()
# $example on$
data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
(Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
(Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
- df = sqlContext.createDataFrame(data, ["features"])
+ df = spark.createDataFrame(data, ["features"])
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)
result = model.transform(df).select("pcaFeatures")
result.show(truncate=False)
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py
index 3288568f0c..ed9765d961 100644
--- a/examples/src/main/python/ml/pipeline_example.py
+++ b/examples/src/main/python/ml/pipeline_example.py
@@ -18,21 +18,20 @@
"""
Pipeline Example.
"""
-from pyspark import SparkContext, SQLContext
+
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
-
- sc = SparkContext(appName="PipelineExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("PipelineExample").getOrCreate()
# $example on$
# Prepare training documents from a list of (id, text, label) tuples.
- training = sqlContext.createDataFrame([
+ training = spark.createDataFrame([
(0L, "a b c d e spark", 1.0),
(1L, "b d", 0.0),
(2L, "spark f g h", 1.0),
@@ -48,7 +47,7 @@ if __name__ == "__main__":
model = pipeline.fit(training)
# Prepare test documents, which are unlabeled (id, text) tuples.
- test = sqlContext.createDataFrame([
+ test = spark.createDataFrame([
(4L, "spark i j k"),
(5L, "l m n"),
(6L, "mapreduce spark"),
@@ -61,4 +60,4 @@ if __name__ == "__main__":
print(row)
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
index 89f5cbe8f2..328b559320 100644
--- a/examples/src/main/python/ml/polynomial_expansion_example.py
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -17,19 +17,17 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.mllib.linalg import Vectors
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="PolynomialExpansionExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("PolynomialExpansionExample").getOrCreate()
# $example on$
- df = sqlContext\
+ df = spark\
.createDataFrame([(Vectors.dense([-2.0, 2.3]),),
(Vectors.dense([0.0, 0.0]),),
(Vectors.dense([0.6, -1.1]),)],
@@ -40,4 +38,4 @@ if __name__ == "__main__":
print(expanded)
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/random_forest_classifier_example.py b/examples/src/main/python/ml/random_forest_classifier_example.py
index c3570438c5..b0a93e050c 100644
--- a/examples/src/main/python/ml/random_forest_classifier_example.py
+++ b/examples/src/main/python/ml/random_forest_classifier_example.py
@@ -20,21 +20,20 @@ Random Forest Classifier Example.
"""
from __future__ import print_function
-from pyspark import SparkContext, SQLContext
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="random_forest_classifier_example")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("random_forest_classifier_example").getOrCreate()
# $example on$
# Load and parse the data file, converting it to a DataFrame.
- data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
@@ -72,4 +71,4 @@ if __name__ == "__main__":
print(rfModel) # summary only
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/random_forest_regressor_example.py b/examples/src/main/python/ml/random_forest_regressor_example.py
index b77014f379..4bb84f0de8 100644
--- a/examples/src/main/python/ml/random_forest_regressor_example.py
+++ b/examples/src/main/python/ml/random_forest_regressor_example.py
@@ -20,21 +20,20 @@ Random Forest Regressor Example.
"""
from __future__ import print_function
-from pyspark import SparkContext, SQLContext
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="random_forest_regressor_example")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("random_forest_regressor_example").getOrCreate()
# $example on$
# Load and parse the data file, converting it to a DataFrame.
- data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -69,4 +68,4 @@ if __name__ == "__main__":
print(rfModel) # summary only
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/rformula_example.py b/examples/src/main/python/ml/rformula_example.py
index b544a14700..45cc116ac2 100644
--- a/examples/src/main/python/ml/rformula_example.py
+++ b/examples/src/main/python/ml/rformula_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import RFormula
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="RFormulaExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("RFormulaExample").getOrCreate()
# $example on$
- dataset = sqlContext.createDataFrame(
+ dataset = spark.createDataFrame(
[(7, "US", 18, 1.0),
(8, "CA", 12, 0.0),
(9, "NZ", 15, 0.0)],
@@ -41,4 +39,4 @@ if __name__ == "__main__":
output.select("features", "label").show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py
index b4f06bf888..3600c12211 100644
--- a/examples/src/main/python/ml/simple_text_classification_pipeline.py
+++ b/examples/src/main/python/ml/simple_text_classification_pipeline.py
@@ -17,11 +17,10 @@
from __future__ import print_function
-from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
-from pyspark.sql import Row, SQLContext
+from pyspark.sql import Row, SparkSession
"""
@@ -34,16 +33,15 @@ pipeline in Python. Run with:
if __name__ == "__main__":
- sc = SparkContext(appName="SimpleTextClassificationPipeline")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("SimpleTextClassificationPipeline").getOrCreate()
# Prepare training documents, which are labeled.
- LabeledDocument = Row("id", "text", "label")
- training = sc.parallelize([(0, "a b c d e spark", 1.0),
- (1, "b d", 0.0),
- (2, "spark f g h", 1.0),
- (3, "hadoop mapreduce", 0.0)]) \
- .map(lambda x: LabeledDocument(*x)).toDF()
+ training = spark.createDataFrame([
+ (0, "a b c d e spark", 1.0),
+ (1, "b d", 0.0),
+ (2, "spark f g h", 1.0),
+ (3, "hadoop mapreduce", 0.0)
+ ], ["id", "text", "label"])
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
@@ -55,12 +53,12 @@ if __name__ == "__main__":
model = pipeline.fit(training)
# Prepare test documents, which are unlabeled.
- Document = Row("id", "text")
- test = sc.parallelize([(4, "spark i j k"),
- (5, "l m n"),
- (6, "spark hadoop spark"),
- (7, "apache hadoop")]) \
- .map(lambda x: Document(*x)).toDF()
+ test = spark.createDataFrame([
+ (4, "spark i j k"),
+ (5, "l m n"),
+ (6, "spark hadoop spark"),
+ (7, "apache hadoop")
+ ], ["id", "text"])
# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
@@ -68,4 +66,4 @@ if __name__ == "__main__":
for row in selected.collect():
print(row)
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/sql_transformer.py b/examples/src/main/python/ml/sql_transformer.py
index 9575d728d8..26045db4be 100644
--- a/examples/src/main/python/ml/sql_transformer.py
+++ b/examples/src/main/python/ml/sql_transformer.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
# $example on$
from pyspark.ml.feature import SQLTransformer
# $example off$
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="SQLTransformerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("SQLTransformerExample").getOrCreate()
# $example on$
- df = sqlContext.createDataFrame([
+ df = spark.createDataFrame([
(0, 1.0, 3.0),
(2, 2.0, 5.0)
], ["id", "v1", "v2"])
@@ -37,4 +35,4 @@ if __name__ == "__main__":
sqlTrans.transform(df).show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/standard_scaler_example.py b/examples/src/main/python/ml/standard_scaler_example.py
index ae7aa85005..c50804f6bf 100644
--- a/examples/src/main/python/ml/standard_scaler_example.py
+++ b/examples/src/main/python/ml/standard_scaler_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import StandardScaler
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="StandardScalerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("StandardScalerExample").getOrCreate()
# $example on$
- dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
withStd=True, withMean=False)
@@ -40,4 +38,4 @@ if __name__ == "__main__":
scaledData.show()
# $example off$
- sc.stop()
+ spark.stop()
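As the hunk above shows, data loading changes only at the entry point: spark.read returns the same DataFrameReader that sqlContext.read did, so format/option/load chains carry over verbatim. A hedged sketch, assuming the sample_libsvm_data.txt path from the Spark source tree:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("LibSVMReaderSketch").getOrCreate()

    # Same DataFrameReader API as before, reached through the session
    # instead of a SQLContext.
    df = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
    df.printSchema()

    spark.stop()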
diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py
index 01f94af8ca..57362673df 100644
--- a/examples/src/main/python/ml/stopwords_remover_example.py
+++ b/examples/src/main/python/ml/stopwords_remover_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import StopWordsRemover
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="StopWordsRemoverExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("StopWordsRemoverExample").getOrCreate()
# $example on$
- sentenceData = sqlContext.createDataFrame([
+ sentenceData = spark.createDataFrame([
(0, ["I", "saw", "the", "red", "baloon"]),
(1, ["Mary", "had", "a", "little", "lamb"])
], ["label", "raw"])
@@ -37,4 +35,4 @@ if __name__ == "__main__":
remover.transform(sentenceData).show(truncate=False)
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/string_indexer_example.py b/examples/src/main/python/ml/string_indexer_example.py
index 58a8cb5d56..aacd4f999b 100644
--- a/examples/src/main/python/ml/string_indexer_example.py
+++ b/examples/src/main/python/ml/string_indexer_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import StringIndexer
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="StringIndexerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("StringIndexerExample").getOrCreate()
# $example on$
- df = sqlContext.createDataFrame(
+ df = spark.createDataFrame(
[(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
["id", "category"])
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
@@ -36,4 +34,4 @@ if __name__ == "__main__":
indexed.show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
index 141324d458..25df8166ef 100644
--- a/examples/src/main/python/ml/tf_idf_example.py
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
# $example on$
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
# $example off$
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="TfIdfExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("TfIdfExample").getOrCreate()
# $example on$
- sentenceData = sqlContext.createDataFrame([
+ sentenceData = spark.createDataFrame([
(0, "Hi I heard about Spark"),
(0, "I wish Java could use case classes"),
(1, "Logistic regression models are neat")
@@ -46,4 +44,4 @@ if __name__ == "__main__":
print(features_label)
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py
index ce9b225be5..5be4b4cfe3 100644
--- a/examples/src/main/python/ml/tokenizer_example.py
+++ b/examples/src/main/python/ml/tokenizer_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="TokenizerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("TokenizerExample").getOrCreate()
# $example on$
- sentenceDataFrame = sqlContext.createDataFrame([
+ sentenceDataFrame = spark.createDataFrame([
(0, "Hi I heard about Spark"),
(1, "I wish Java could use case classes"),
(2, "Logistic,regression,models,are,neat")
@@ -41,4 +39,4 @@ if __name__ == "__main__":
# alternatively, pattern="\\w+", gaps(False)
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/train_validation_split.py b/examples/src/main/python/ml/train_validation_split.py
index 161a200c61..2e43a0f8ae 100644
--- a/examples/src/main/python/ml/train_validation_split.py
+++ b/examples/src/main/python/ml/train_validation_split.py
@@ -15,13 +15,12 @@
# limitations under the License.
#
-from pyspark import SparkContext
# $example on$
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
-from pyspark.sql import SQLContext
# $example off$
+from pyspark.sql import SparkSession
"""
This example demonstrates applying TrainValidationSplit to split data
@@ -32,11 +31,10 @@ Run with:
"""
if __name__ == "__main__":
- sc = SparkContext(appName="TrainValidationSplit")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("TrainValidationSplit").getOrCreate()
# $example on$
# Prepare training and test data.
- data = sqlContext.read.format("libsvm")\
+ data = spark.read.format("libsvm")\
.load("data/mllib/sample_linear_regression_data.txt")
train, test = data.randomSplit([0.7, 0.3])
lr = LinearRegression(maxIter=10, regParam=0.1)
@@ -65,4 +63,4 @@ if __name__ == "__main__":
for row in prediction.take(5):
print(row)
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py
index 04f64839f1..019a9ea6f7 100644
--- a/examples/src/main/python/ml/vector_assembler_example.py
+++ b/examples/src/main/python/ml/vector_assembler_example.py
@@ -17,19 +17,17 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="VectorAssemblerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("VectorAssemblerExample").getOrCreate()
# $example on$
- dataset = sqlContext.createDataFrame(
+ dataset = spark.createDataFrame(
[(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
["id", "hour", "mobile", "userFeatures", "clicked"])
assembler = VectorAssembler(
@@ -39,4 +37,4 @@ if __name__ == "__main__":
print(output.select("features", "clicked").first())
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py
index 146f41c1dd..3cf5b8ebf1 100644
--- a/examples/src/main/python/ml/vector_indexer_example.py
+++ b/examples/src/main/python/ml/vector_indexer_example.py
@@ -17,18 +17,16 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import VectorIndexer
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="VectorIndexerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("VectorIndexerExample").getOrCreate()
# $example on$
- data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
indexerModel = indexer.fit(data)
@@ -37,4 +35,4 @@ if __name__ == "__main__":
indexedData.show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/vector_slicer_example.py b/examples/src/main/python/ml/vector_slicer_example.py
index 31a753073c..0531bcdb06 100644
--- a/examples/src/main/python/ml/vector_slicer_example.py
+++ b/examples/src/main/python/ml/vector_slicer_example.py
@@ -17,20 +17,18 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import VectorSlicer
from pyspark.mllib.linalg import Vectors
from pyspark.sql.types import Row
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="VectorSlicerExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("VectorSlicerExample").getOrCreate()
# $example on$
- df = sqlContext.createDataFrame([
+ df = spark.createDataFrame([
Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3}),),
Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]),)])
@@ -41,4 +39,4 @@ if __name__ == "__main__":
output.select("userFeatures", "features").show()
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py
index 53c77feb10..6766a7b6aa 100644
--- a/examples/src/main/python/ml/word2vec_example.py
+++ b/examples/src/main/python/ml/word2vec_example.py
@@ -17,19 +17,17 @@
from __future__ import print_function
-from pyspark import SparkContext
-from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Word2Vec
# $example off$
+from pyspark.sql import SparkSession
if __name__ == "__main__":
- sc = SparkContext(appName="Word2VecExample")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder.appName("Word2VecExample").getOrCreate()
# $example on$
# Input data: Each row is a bag of words from a sentence or document.
- documentDF = sqlContext.createDataFrame([
+ documentDF = spark.createDataFrame([
("Hi I heard about Spark".split(" "), ),
("I wish Java could use case classes".split(" "), ),
("Logistic regression models are neat".split(" "), )
@@ -42,4 +40,4 @@ if __name__ == "__main__":
print(feature)
# $example off$
- sc.stop()
+ spark.stop()
diff --git a/examples/src/main/python/mllib/binary_classification_metrics_example.py b/examples/src/main/python/mllib/binary_classification_metrics_example.py
index 4e7ea289b2..8f0fc9d45d 100644
--- a/examples/src/main/python/mllib/binary_classification_metrics_example.py
+++ b/examples/src/main/python/mllib/binary_classification_metrics_example.py
@@ -18,7 +18,7 @@
Binary Classification Metrics Example.
"""
from __future__ import print_function
-from pyspark import SparkContext, SQLContext
+from pyspark import SparkContext
# $example on$
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics
@@ -27,7 +27,7 @@ from pyspark.mllib.util import MLUtils
if __name__ == "__main__":
sc = SparkContext(appName="BinaryClassificationMetricsExample")
- sqlContext = SQLContext(sc)
+
# $example on$
# Several of the methods available in scala are currently missing from pyspark
# Load training data in LIBSVM format
@@ -52,3 +52,5 @@ if __name__ == "__main__":
# Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)
# $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/sql.py b/examples/src/main/python/sql.py
index ea6a22dbfe..59a46cb283 100644
--- a/examples/src/main/python/sql.py
+++ b/examples/src/main/python/sql.py
@@ -63,7 +63,7 @@ if __name__ == "__main__":
# |-- age: long (nullable = true)
# |-- name: string (nullable = true)
- # Register this DataFrame as a table.
+ # Register this DataFrame as a temporary table.
people.registerTempTable("people")
# SQL statements can be run by using the sql methods provided by sqlContext
diff --git a/examples/src/main/python/streaming/sql_network_wordcount.py b/examples/src/main/python/streaming/sql_network_wordcount.py
index 1ba5e9fb78..588cbfee14 100644
--- a/examples/src/main/python/streaming/sql_network_wordcount.py
+++ b/examples/src/main/python/streaming/sql_network_wordcount.py
@@ -33,13 +33,14 @@ import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
-from pyspark.sql import SQLContext, Row
+from pyspark.sql import Row, SparkSession
-def getSqlContextInstance(sparkContext):
- if ('sqlContextSingletonInstance' not in globals()):
- globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
- return globals()['sqlContextSingletonInstance']
+def getSparkSessionInstance(sparkConf):
+ if ('sparkSessionSingletonInstance' not in globals()):
+ globals()['sparkSessionSingletonInstance'] =\
+ SparkSession.builder.config(conf=sparkConf).getOrCreate()
+ return globals()['sparkSessionSingletonInstance']
if __name__ == "__main__":
@@ -60,19 +61,19 @@ if __name__ == "__main__":
print("========= %s =========" % str(time))
try:
- # Get the singleton instance of SQLContext
- sqlContext = getSqlContextInstance(rdd.context)
+ # Get the singleton instance of SparkSession
+ spark = getSparkSessionInstance(rdd.context.getConf())
# Convert RDD[String] to RDD[Row] to DataFrame
rowRdd = rdd.map(lambda w: Row(word=w))
- wordsDataFrame = sqlContext.createDataFrame(rowRdd)
+ wordsDataFrame = spark.createDataFrame(rowRdd)
# Register as table
wordsDataFrame.registerTempTable("words")
# Do word count on table using SQL and print it
wordCountsDataFrame = \
- sqlContext.sql("select word, count(*) as total from words group by word")
+ spark.sql("select word, count(*) as total from words group by word")
wordCountsDataFrame.show()
except:
pass
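The streaming example swaps the per-process SQLContext singleton for a SparkSession singleton keyed off the driver's SparkConf. A standalone sketch of that helper outside the streaming context; the assert and the locally built SparkConf are illustrative additions, not part of the patch:

    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    def getSparkSessionInstance(sparkConf):
        # Cache one session per process so repeated calls (e.g. once per
        # micro-batch) reuse it instead of building a new context.
        if 'sparkSessionSingletonInstance' not in globals():
            globals()['sparkSessionSingletonInstance'] = \
                SparkSession.builder.config(conf=sparkConf).getOrCreate()
        return globals()['sparkSessionSingletonInstance']

    if __name__ == "__main__":
        conf = SparkConf().setAppName("SingletonSketch")
        first = getSparkSessionInstance(conf)
        second = getSparkSessionInstance(conf)
        assert first is second  # the cached session is returned both times
        first.stop()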
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
index 21f58ddf3c..3795af8309 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala
@@ -18,12 +18,11 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.regression.AFTSurvivalRegression
import org.apache.spark.mllib.linalg.Vectors
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
/**
* An example for AFTSurvivalRegression.
@@ -31,12 +30,10 @@ import org.apache.spark.sql.SQLContext
object AFTSurvivalRegressionExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("AFTSurvivalRegressionExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("AFTSurvivalRegressionExample").getOrCreate()
// $example on$
- val training = sqlContext.createDataFrame(Seq(
+ val training = spark.createDataFrame(Seq(
(1.218, 1.0, Vectors.dense(1.560, -0.605)),
(2.949, 0.0, Vectors.dense(0.346, 2.158)),
(3.627, 0.0, Vectors.dense(1.380, 0.231)),
@@ -56,7 +53,7 @@ object AFTSurvivalRegressionExample {
model.transform(training).show(false)
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala
index a79e15c767..41750ca779 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala
@@ -18,12 +18,11 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
// $example on$
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType
@@ -43,13 +42,11 @@ object ALSExample {
// $example off$
def main(args: Array[String]) {
- val conf = new SparkConf().setAppName("ALSExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
+ val spark = SparkSession.builder.appName("ALSExample").getOrCreate()
+ import spark.implicits._
// $example on$
- val ratings = sc.textFile("data/mllib/als/sample_movielens_ratings.txt")
+ val ratings = spark.read.text("data/mllib/als/sample_movielens_ratings.txt")
.map(Rating.parseRating)
.toDF()
val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))
@@ -75,7 +72,8 @@ object ALSExample {
val rmse = evaluator.evaluate(predictions)
println(s"Root-mean-square error = $rmse")
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
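On the Scala side, ALSExample now reads the ratings file through spark.read.text and maps the resulting rows where it previously used sc.textFile. A rough PySpark analogue of that pattern, offered only as an illustration; it assumes the usual userId::movieId::rating::timestamp layout of sample_movielens_ratings.txt:

    from pyspark.sql import Row, SparkSession

    spark = SparkSession.builder.appName("ReadTextSketch").getOrCreate()

    def parse_rating(line):
        # Assumed field layout: userId::movieId::rating::timestamp
        user, movie, rating, ts = line.split("::")
        return Row(userId=int(user), movieId=int(movie),
                   rating=float(rating), timestamp=int(ts))

    # spark.read.text yields a DataFrame with a single "value" column; .rdd
    # recovers the line-by-line view that sc.textFile used to provide.
    lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt")
    ratings = lines.rdd.map(lambda row: parse_rating(row.value)).toDF()
    ratings.show(5)

    spark.stop()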
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
index 2ed8101c13..93c153f923 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
@@ -18,20 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.Binarizer
// $example off$
-import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.sql.{DataFrame, SparkSession}
object BinarizerExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("BinarizerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("BinarizerExample").getOrCreate()
// $example on$
val data = Array((0, 0.1), (1, 0.8), (2, 0.2))
- val dataFrame: DataFrame = sqlContext.createDataFrame(data).toDF("label", "feature")
+ val dataFrame: DataFrame = spark.createDataFrame(data).toDF("label", "feature")
val binarizer: Binarizer = new Binarizer()
.setInputCol("feature")
@@ -42,7 +39,8 @@ object BinarizerExample {
val binarizedFeatures = binarizedDataFrame.select("binarized_feature")
binarizedFeatures.collect().foreach(println)
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
index 6f6236a2b0..779ad33dbd 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
@@ -18,23 +18,20 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.Bucketizer
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object BucketizerExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("BucketizerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("BucketizerExample").getOrCreate()
// $example on$
val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
val data = Array(-0.5, -0.3, 0.0, 0.2)
- val dataFrame = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+ val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
val bucketizer = new Bucketizer()
.setInputCol("features")
@@ -45,7 +42,7 @@ object BucketizerExample {
val bucketedData = bucketizer.transform(dataFrame)
bucketedData.show()
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala
index 2be61537e6..84ca1f0b56 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala
@@ -18,20 +18,16 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.ChiSqSelector
import org.apache.spark.mllib.linalg.Vectors
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object ChiSqSelectorExample {
def main(args: Array[String]) {
- val conf = new SparkConf().setAppName("ChiSqSelectorExample")
- val sc = new SparkContext(conf)
-
- val sqlContext = SQLContext.getOrCreate(sc)
- import sqlContext.implicits._
+ val spark = SparkSession.builder.appName("ChiSqSelectorExample").getOrCreate()
+ import spark.implicits._
// $example on$
val data = Seq(
@@ -40,7 +36,7 @@ object ChiSqSelectorExample {
(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
)
- val df = sc.parallelize(data).toDF("id", "features", "clicked")
+ val df = spark.createDataset(data).toDF("id", "features", "clicked")
val selector = new ChiSqSelector()
.setNumTopFeatures(1)
@@ -51,7 +47,7 @@ object ChiSqSelectorExample {
val result = selector.fit(df).transform(df)
result.show()
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
index 7d07fc7dd1..9ab43a48bf 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
@@ -18,20 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object CountVectorizerExample {
def main(args: Array[String]) {
- val conf = new SparkConf().setAppName("CounterVectorizerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("CounterVectorizerExample").getOrCreate()
// $example on$
- val df = sqlContext.createDataFrame(Seq(
+ val df = spark.createDataFrame(Seq(
(0, Array("a", "b", "c")),
(1, Array("a", "b", "b", "c", "a"))
)).toDF("id", "words")
@@ -51,6 +48,8 @@ object CountVectorizerExample {
cvModel.transform(df).select("features").show()
// $example off$
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
index dc26b55a76..b415333c71 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
@@ -18,18 +18,15 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.DCT
import org.apache.spark.mllib.linalg.Vectors
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object DCTExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("DCTExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("DCTExample").getOrCreate()
// $example on$
val data = Seq(
@@ -37,7 +34,7 @@ object DCTExample {
Vectors.dense(-1.0, 2.0, 4.0, -7.0),
Vectors.dense(14.0, -2.0, -5.0, 1.0))
- val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+ val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
val dct = new DCT()
.setInputCol("features")
@@ -47,7 +44,8 @@ object DCTExample {
val dctDf = dct.transform(df)
dctDf.select("featuresDCT").show(3)
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
index 7e608a2812..2f892f8d72 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
@@ -23,11 +23,10 @@ import java.io.File
import com.google.common.io.Files
import scopt.OptionParser
-import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
-import org.apache.spark.sql.{DataFrame, Row, SQLContext}
+import org.apache.spark.sql.{DataFrame, Row, SparkSession}
/**
* An example of how to use [[org.apache.spark.sql.DataFrame]] for ML. Run with
@@ -62,14 +61,11 @@ object DataFrameExample {
}
def run(params: Params) {
-
- val conf = new SparkConf().setAppName(s"DataFrameExample with $params")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName(s"DataFrameExample with $params").getOrCreate()
// Load input data
println(s"Loading LIBSVM file with UDT from ${params.input}.")
- val df: DataFrame = sqlContext.read.format("libsvm").load(params.input).cache()
+ val df: DataFrame = spark.read.format("libsvm").load(params.input).cache()
println("Schema from LIBSVM:")
df.printSchema()
println(s"Loaded training data as a DataFrame with ${df.count()} records.")
@@ -94,11 +90,11 @@ object DataFrameExample {
// Load the records back.
println(s"Loading Parquet file with UDT from $outputDir.")
- val newDF = sqlContext.read.parquet(outputDir)
+ val newDF = spark.read.parquet(outputDir)
println(s"Schema from Parquet:")
newDF.printSchema()
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
index 224d8da5f0..a0a2e1fb33 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
@@ -18,7 +18,6 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
@@ -26,16 +25,14 @@ import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object DecisionTreeClassificationExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("DecisionTreeClassificationExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("DecisionTreeClassificationExample").getOrCreate()
// $example on$
// Load the data stored in LIBSVM format as a DataFrame.
- val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
@@ -88,6 +85,8 @@ object DecisionTreeClassificationExample {
val treeModel = model.stages(2).asInstanceOf[DecisionTreeClassificationModel]
println("Learned classification tree model:\n" + treeModel.toDebugString)
// $example off$
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
index d2560cc00b..cea1d801aa 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
@@ -33,7 +33,7 @@ import org.apache.spark.ml.util.MetadataUtils
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.sql.{DataFrame, SparkSession}
/**
* An example runner for decision trees. Run with
@@ -134,18 +134,18 @@ object DecisionTreeExample {
/** Load a dataset from the given path, using the given format */
private[ml] def loadData(
- sqlContext: SQLContext,
+ spark: SparkSession,
path: String,
format: String,
expectedNumFeatures: Option[Int] = None): DataFrame = {
- import sqlContext.implicits._
+ import spark.implicits._
format match {
- case "dense" => MLUtils.loadLabeledPoints(sqlContext.sparkContext, path).toDF()
+ case "dense" => MLUtils.loadLabeledPoints(spark.sparkContext, path).toDF()
case "libsvm" => expectedNumFeatures match {
- case Some(numFeatures) => sqlContext.read.option("numFeatures", numFeatures.toString)
+ case Some(numFeatures) => spark.read.option("numFeatures", numFeatures.toString)
.format("libsvm").load(path)
- case None => sqlContext.read.format("libsvm").load(path)
+ case None => spark.read.format("libsvm").load(path)
}
case _ => throw new IllegalArgumentException(s"Bad data format: $format")
}
@@ -167,17 +167,17 @@ object DecisionTreeExample {
testInput: String,
algo: String,
fracTest: Double): (DataFrame, DataFrame) = {
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.getOrCreate()
// Load training data
- val origExamples: DataFrame = loadData(sqlContext, input, dataFormat)
+ val origExamples: DataFrame = loadData(spark, input, dataFormat)
// Load or create test set
val dataframes: Array[DataFrame] = if (testInput != "") {
// Load testInput.
val numFeatures = origExamples.first().getAs[Vector](1).size
val origTestExamples: DataFrame =
- loadData(sqlContext, testInput, dataFormat, Some(numFeatures))
+ loadData(spark, testInput, dataFormat, Some(numFeatures))
Array(origExamples, origTestExamples)
} else {
// Split input into training, test.
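DecisionTreeExample's loadData helper now hangs off the SparkSession and still threads an optional feature count through the libsvm reader so that training and test sets agree on vector size. The same option is reachable from Python; a hedged sketch in which 780 is a placeholder value and the path assumes the Spark source tree's sample data:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("NumFeaturesSketch").getOrCreate()

    # Pinning numFeatures keeps vectors the same length even if a file omits
    # the highest-indexed features.
    df = (spark.read
          .option("numFeatures", "780")
          .format("libsvm")
          .load("data/mllib/sample_libsvm_data.txt"))
    print(df.first().features.size)

    spark.stop()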
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
index ad32e5635a..26b52d0489 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
@@ -18,7 +18,6 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
@@ -26,17 +25,15 @@ import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.regression.DecisionTreeRegressor
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object DecisionTreeRegressionExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("DecisionTreeRegressionExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("DecisionTreeRegressionExample").getOrCreate()
// $example on$
// Load the data stored in LIBSVM format as a DataFrame.
- val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Automatically identify categorical features, and index them.
// Here, we treat features with > 4 distinct values as continuous.
@@ -78,6 +75,8 @@ object DecisionTreeRegressionExample {
val treeModel = model.stages(1).asInstanceOf[DecisionTreeRegressionModel]
println("Learned regression tree model:\n" + treeModel.toDebugString)
// $example off$
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
index 8d127f9b35..2aa1ab1ec8 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
@@ -18,13 +18,12 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.classification.{ClassificationModel, Classifier, ClassifierParams}
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext}
+import org.apache.spark.sql.{Dataset, Row, SparkSession}
/**
* A simple example demonstrating how to write your own learning algorithm using Estimator,
@@ -38,13 +37,11 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext}
object DeveloperApiExample {
def main(args: Array[String]) {
- val conf = new SparkConf().setAppName("DeveloperApiExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
+ val spark = SparkSession.builder.appName("DeveloperApiExample").getOrCreate()
+ import spark.implicits._
// Prepare training data.
- val training = sc.parallelize(Seq(
+ val training = spark.createDataFrame(Seq(
LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
@@ -62,13 +59,13 @@ object DeveloperApiExample {
val model = lr.fit(training.toDF())
// Prepare test data.
- val test = sc.parallelize(Seq(
+ val test = spark.createDataFrame(Seq(
LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))))
// Make predictions on test data.
- val sumPredictions: Double = model.transform(test.toDF())
+ val sumPredictions: Double = model.transform(test)
.select("features", "label", "prediction")
.collect()
.map { case Row(features: Vector, label: Double, prediction: Double) =>
@@ -77,7 +74,7 @@ object DeveloperApiExample {
assert(sumPredictions == 0.0,
"MyLogisticRegression predicted something other than 0, even though all coefficients are 0!")
- sc.stop()
+ spark.stop()
}
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala
index 629d322c43..f289c28df9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala
@@ -18,22 +18,19 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.mllib.linalg.Vectors
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object ElementwiseProductExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("ElementwiseProductExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("ElementwiseProductExample").getOrCreate()
// $example on$
// Create some vector data; also works for sparse vectors
- val dataFrame = sqlContext.createDataFrame(Seq(
+ val dataFrame = spark.createDataFrame(Seq(
("a", Vectors.dense(1.0, 2.0, 3.0)),
("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector")
@@ -46,7 +43,8 @@ object ElementwiseProductExample {
// Batch transform the vectors to create new column:
transformer.transform(dataFrame).show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala
index 65e3c365ab..91076ccbc1 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala
@@ -18,25 +18,22 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.sql.Row
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object EstimatorTransformerParamExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("EstimatorTransformerParamExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("EstimatorTransformerParamExample").getOrCreate()
// $example on$
// Prepare training data from a list of (label, features) tuples.
- val training = sqlContext.createDataFrame(Seq(
+ val training = spark.createDataFrame(Seq(
(1.0, Vectors.dense(0.0, 1.1, 0.1)),
(0.0, Vectors.dense(2.0, 1.0, -1.0)),
(0.0, Vectors.dense(2.0, 1.3, 1.0)),
@@ -76,7 +73,7 @@ object EstimatorTransformerParamExample {
println("Model 2 was fit using parameters: " + model2.parent.extractParamMap)
// Prepare test data.
- val test = sqlContext.createDataFrame(Seq(
+ val test = spark.createDataFrame(Seq(
(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
(0.0, Vectors.dense(3.0, 2.0, -0.1)),
(1.0, Vectors.dense(0.0, 2.2, -1.5))
@@ -94,7 +91,7 @@ object EstimatorTransformerParamExample {
}
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala
index cd62a80382..412c54db7d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala
@@ -18,24 +18,21 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object GradientBoostedTreeClassifierExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("GradientBoostedTreeClassifierExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("GradientBoostedTreeClassifierExample").getOrCreate()
// $example on$
// Load and parse the data file, converting it to a DataFrame.
- val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
@@ -91,7 +88,7 @@ object GradientBoostedTreeClassifierExample {
println("Learned classification GBT model:\n" + gbtModel.toDebugString)
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala
index b8cf9629bb..fd43553cc6 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala
@@ -18,24 +18,21 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object GradientBoostedTreeRegressorExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("GradientBoostedTreeRegressorExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("GradientBoostedTreeRegressorExample").getOrCreate()
// $example on$
// Load and parse the data file, converting it to a DataFrame.
- val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -79,7 +76,7 @@ object GradientBoostedTreeRegressorExample {
println("Learned regression GBT model:\n" + gbtModel.toDebugString)
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala
index 4cea09ba12..d873618726 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala
@@ -18,21 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object IndexToStringExample {
def main(args: Array[String]) {
- val conf = new SparkConf().setAppName("IndexToStringExample")
- val sc = new SparkContext(conf)
-
- val sqlContext = SQLContext.getOrCreate(sc)
+ val spark = SparkSession.builder.appName("IndexToStringExample").getOrCreate()
// $example on$
- val df = sqlContext.createDataFrame(Seq(
+ val df = spark.createDataFrame(Seq(
(0, "a"),
(1, "b"),
(2, "c"),
@@ -54,7 +50,8 @@ object IndexToStringExample {
val converted = converter.transform(indexed)
converted.select("id", "originalCategory").show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
index 7af011571f..d2573fad35 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
@@ -19,11 +19,10 @@ package org.apache.spark.examples.ml
// scalastyle:off println
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.sql.{DataFrame, SparkSession}
// $example off$
/**
@@ -37,13 +36,11 @@ object KMeansExample {
def main(args: Array[String]): Unit = {
// Creates a Spark context and a SQL context
- val conf = new SparkConf().setAppName(s"${this.getClass.getSimpleName}")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName(s"${this.getClass.getSimpleName}").getOrCreate()
// $example on$
    // Creates a DataFrame
- val dataset: DataFrame = sqlContext.createDataFrame(Seq(
+ val dataset: DataFrame = spark.createDataFrame(Seq(
(1, Vectors.dense(0.0, 0.0, 0.0)),
(2, Vectors.dense(0.1, 0.1, 0.1)),
(3, Vectors.dense(0.2, 0.2, 0.2)),
@@ -64,7 +61,7 @@ object KMeansExample {
model.clusterCenters.foreach(println)
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala
index f9ddac7709..c23adee1a3 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala
@@ -18,11 +18,10 @@
package org.apache.spark.examples.ml
// scalastyle:off println
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.mllib.linalg.{Vectors, VectorUDT}
-import org.apache.spark.sql.{Row, SQLContext}
+import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StructField, StructType}
// $example off$
@@ -41,16 +40,14 @@ object LDAExample {
val input = "data/mllib/sample_lda_data.txt"
// Creates a Spark context and a SQL context
- val conf = new SparkConf().setAppName(s"${this.getClass.getSimpleName}")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName(s"${this.getClass.getSimpleName}").getOrCreate()
// $example on$
// Loads data
- val rowRDD = sc.textFile(input).filter(_.nonEmpty)
+ val rowRDD = spark.read.text(input).rdd.filter(_.nonEmpty)
.map(_.split(" ").map(_.toDouble)).map(Vectors.dense).map(Row(_))
val schema = StructType(Array(StructField(FEATURES_COL, new VectorUDT, false)))
- val dataset = sqlContext.createDataFrame(rowRDD, schema)
+ val dataset = spark.createDataFrame(rowRDD, schema)
    // Trains an LDA model
val lda = new LDA()
@@ -71,7 +68,7 @@ object LDAExample {
transformed.show(false)
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala
index f68aef7082..cb6e2492f5 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala
@@ -18,22 +18,19 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.regression.LinearRegression
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object LinearRegressionWithElasticNetExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("LinearRegressionWithElasticNetExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("LinearRegressionWithElasticNetExample").getOrCreate()
// $example on$
// Load training data
- val training = sqlContext.read.format("libsvm")
+ val training = spark.read.format("libsvm")
.load("data/mllib/sample_linear_regression_data.txt")
val lr = new LinearRegression()
@@ -56,7 +53,7 @@ object LinearRegressionWithElasticNetExample {
println(s"r2: ${trainingSummary.r2}")
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala
index 89c5edf1ac..50670d7b38 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala
@@ -18,23 +18,20 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.max
object LogisticRegressionSummaryExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("LogisticRegressionSummaryExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
+ val spark = SparkSession.builder.appName("LogisticRegressionSummaryExample").getOrCreate()
+ import spark.implicits._
// Load training data
- val training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
val lr = new LogisticRegression()
.setMaxIter(10)
@@ -71,7 +68,7 @@ object LogisticRegressionSummaryExample {
lrModel.setThreshold(bestThreshold)
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala
index 6e27571f1d..fcba813d5b 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala
@@ -18,22 +18,20 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.classification.LogisticRegression
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object LogisticRegressionWithElasticNetExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("LogisticRegressionWithElasticNetExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession
+ .builder.appName("LogisticRegressionWithElasticNetExample").getOrCreate()
// $example on$
// Load training data
- val training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
val lr = new LogisticRegression()
.setMaxIter(10)
@@ -47,7 +45,7 @@ object LogisticRegressionWithElasticNetExample {
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala
index aafb5efd69..896d8fadbe 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala
@@ -15,23 +15,19 @@
* limitations under the License.
*/
-// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.MaxAbsScaler
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object MaxAbsScalerExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("MaxAbsScalerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("MaxAbsScalerExample").getOrCreate()
// $example on$
- val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
val scaler = new MaxAbsScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures")
@@ -43,7 +39,7 @@ object MaxAbsScalerExample {
val scaledData = scalerModel.transform(dataFrame)
scaledData.show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
-// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
index 9a03f69f5a..bcdca0fa04 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
@@ -18,20 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.MinMaxScaler
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object MinMaxScalerExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("MinMaxScalerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("MinMaxScalerExample").getOrCreate()
// $example on$
- val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
val scaler = new MinMaxScaler()
.setInputCol("features")
@@ -44,7 +41,8 @@ object MinMaxScalerExample {
val scaledData = scalerModel.transform(dataFrame)
scaledData.show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala
index d1441b5497..5fb3536060 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala
@@ -18,7 +18,6 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
@@ -28,7 +27,7 @@ import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.Row
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
/**
* A simple example demonstrating model selection using CrossValidator.
@@ -42,13 +41,12 @@ import org.apache.spark.sql.SQLContext
object ModelSelectionViaCrossValidationExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("ModelSelectionViaCrossValidationExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession
+ .builder.appName("ModelSelectionViaCrossValidationExample").getOrCreate()
// $example on$
// Prepare training data from a list of (id, text, label) tuples.
- val training = sqlContext.createDataFrame(Seq(
+ val training = spark.createDataFrame(Seq(
(0L, "a b c d e spark", 1.0),
(1L, "b d", 0.0),
(2L, "spark f g h", 1.0),
@@ -98,7 +96,7 @@ object ModelSelectionViaCrossValidationExample {
val cvModel = cv.fit(training)
// Prepare test documents, which are unlabeled (id, text) tuples.
- val test = sqlContext.createDataFrame(Seq(
+ val test = spark.createDataFrame(Seq(
(4L, "spark i j k"),
(5L, "l m n"),
(6L, "mapreduce spark"),
@@ -114,7 +112,7 @@ object ModelSelectionViaCrossValidationExample {
}
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
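For examples that build their data inline, the replacement is sqlContext.createDataFrame(Seq(...)) becoming spark.createDataFrame(Seq(...)). A small sketch of that pattern; the column names passed to toDF are assumed for illustration, since the hunk cuts off before them:

    import org.apache.spark.sql.SparkSession

    object LocalTuplesSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder.appName("LocalTuplesSketch").getOrCreate()

        // Local tuples go straight through the session; no SQLContext is needed.
        val training = spark.createDataFrame(Seq(
          (0L, "a b c d e spark", 1.0),
          (1L, "b d", 0.0),
          (2L, "spark f g h", 1.0),
          (3L, "hadoop mapreduce", 0.0)
        )).toDF("id", "text", "label") // column names are assumed, not shown in the hunk

        training.show()
        spark.stop()
      }
    }
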
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala
index fcad17a817..6bc082982c 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala
@@ -17,13 +17,12 @@
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
/**
* A simple example demonstrating model selection using TrainValidationSplit.
@@ -36,13 +35,12 @@ import org.apache.spark.sql.SQLContext
object ModelSelectionViaTrainValidationSplitExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("ModelSelectionViaTrainValidationSplitExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession
+ .builder.appName("ModelSelectionViaTrainValidationSplitExample").getOrCreate()
// $example on$
// Prepare training and test data.
- val data = sqlContext.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")
+ val data = spark.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")
val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)
val lr = new LinearRegression()
@@ -75,6 +73,6 @@ object ModelSelectionViaTrainValidationSplitExample {
.show()
// $example off$
- sc.stop()
+ spark.stop()
}
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
index d7d1e82f6f..a11fe1b4b2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
@@ -18,12 +18,11 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
/**
* An example for Multilayer Perceptron Classification.
@@ -31,13 +30,11 @@ import org.apache.spark.sql.SQLContext
object MultilayerPerceptronClassifierExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("MultilayerPerceptronClassifierExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("MultilayerPerceptronClassifierExample").getOrCreate()
// $example on$
// Load the data stored in LIBSVM format as a DataFrame.
- val data = sqlContext.read.format("libsvm")
+ val data = spark.read.format("libsvm")
.load("data/mllib/sample_multiclass_classification_data.txt")
// Split the data into train and test
val splits = data.randomSplit(Array(0.6, 0.4), seed = 1234L)
@@ -63,7 +60,7 @@ object MultilayerPerceptronClassifierExample {
println("Precision:" + evaluator.evaluate(predictionAndLabels))
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
index 77b913aaa3..1b71a39890 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
@@ -18,20 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.NGram
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object NGramExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("NGramExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("NGramExample").getOrCreate()
// $example on$
- val wordDataFrame = sqlContext.createDataFrame(Seq(
+ val wordDataFrame = spark.createDataFrame(Seq(
(0, Array("Hi", "I", "heard", "about", "Spark")),
(1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
(2, Array("Logistic", "regression", "models", "are", "neat"))
@@ -41,7 +38,8 @@ object NGramExample {
val ngramDataFrame = ngram.transform(wordDataFrame)
ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println)
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
index 5ea1270c97..8d54555cd3 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
@@ -18,21 +18,18 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
-import org.apache.spark.ml.classification.{NaiveBayes}
+import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object NaiveBayesExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("NaiveBayesExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("NaiveBayesExample").getOrCreate()
// $example on$
// Load the data stored in LIBSVM format as a DataFrame.
- val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Split the data into training and test sets (30% held out for testing)
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
@@ -53,6 +50,8 @@ object NaiveBayesExample {
val precision = evaluator.evaluate(predictions)
println("Precision:" + precision)
// $example off$
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
index 6b33c16c74..4622d69ef9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
@@ -18,20 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.Normalizer
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object NormalizerExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("NormalizerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("NormalizerExample").getOrCreate()
// $example on$
- val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Normalize each Vector using $L^1$ norm.
val normalizer = new Normalizer()
@@ -46,7 +43,8 @@ object NormalizerExample {
val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
lInfNormData.show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
index cb9fe65a85..338436100c 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
@@ -18,20 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object OneHotEncoderExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("OneHotEncoderExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("OneHotEncoderExample").getOrCreate()
// $example on$
- val df = sqlContext.createDataFrame(Seq(
+ val df = spark.createDataFrame(Seq(
(0, "a"),
(1, "b"),
(2, "c"),
@@ -52,7 +49,8 @@ object OneHotEncoderExample {
val encoded = encoder.transform(indexed)
encoded.select("id", "categoryVec").show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
index 0b5d31c0ff..e2351c682d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
@@ -22,7 +22,6 @@ import java.util.concurrent.TimeUnit.{NANOSECONDS => NANO}
import scopt.OptionParser
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
@@ -31,7 +30,7 @@ import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.DataFrame
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
/**
* An example runner for Multiclass to Binary Reduction with One Vs Rest.
@@ -110,18 +109,16 @@ object OneVsRestExample {
}
private def run(params: Params) {
- val conf = new SparkConf().setAppName(s"OneVsRestExample with $params")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName(s"OneVsRestExample with $params").getOrCreate()
// $example on$
- val inputData = sqlContext.read.format("libsvm").load(params.input)
+ val inputData = spark.read.format("libsvm").load(params.input)
// compute the train/test split: if testInput is not provided use part of input.
val data = params.testInput match {
case Some(t) =>
// compute the number of features in the training set.
val numFeatures = inputData.first().getAs[Vector](1).size
- val testData = sqlContext.read.option("numFeatures", numFeatures.toString)
+ val testData = spark.read.option("numFeatures", numFeatures.toString)
.format("libsvm").load(t)
Array[DataFrame](inputData, testData)
case None =>
@@ -175,7 +172,7 @@ object OneVsRestExample {
println(fprs.map {case (label, fpr) => label + "\t" + fpr}.mkString("\n"))
// $example off$
- sc.stop()
+ spark.stop()
}
private def time[R](block: => R): (Long, R) = {
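OneVsRestExample also shows how reader options carry over: the libsvm source's numFeatures option is now passed on spark.read instead of sqlContext.read, so the test set is read with the training set's dimensionality. A sketch under the assumption that libsvm-formatted files sit at the placeholder paths:

    import org.apache.spark.mllib.linalg.Vector
    import org.apache.spark.sql.SparkSession

    object LibsvmNumFeaturesSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder.appName("LibsvmNumFeaturesSketch").getOrCreate()

        val trainPath = "data/mllib/sample_libsvm_data.txt" // placeholder paths
        val testPath = "data/mllib/sample_libsvm_data.txt"

        val inputData = spark.read.format("libsvm").load(trainPath)

        // Pin the test set to the training set's feature count via the
        // libsvm source's numFeatures option, as the patched example does.
        val numFeatures = inputData.first().getAs[Vector](1).size
        val testData = spark.read.option("numFeatures", numFeatures.toString)
          .format("libsvm").load(testPath)

        testData.show(5)
        spark.stop()
      }
    }
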
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
index 535652ec6c..14394d5624 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
@@ -18,18 +18,15 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object PCAExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("PCAExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("PCAExample").getOrCreate()
// $example on$
val data = Array(
@@ -37,7 +34,7 @@ object PCAExample {
Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
)
- val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+ val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
val pca = new PCA()
.setInputCol("features")
.setOutputCol("pcaFeatures")
@@ -47,7 +44,8 @@ object PCAExample {
val result = pcaDF.select("pcaFeatures")
result.show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala
index 6c29063626..61b34aebd9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala
@@ -18,7 +18,6 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
@@ -26,18 +25,16 @@ import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.Row
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object PipelineExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("PipelineExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("PipelineExample").getOrCreate()
// $example on$
// Prepare training documents from a list of (id, text, label) tuples.
- val training = sqlContext.createDataFrame(Seq(
+ val training = spark.createDataFrame(Seq(
(0L, "a b c d e spark", 1.0),
(1L, "b d", 0.0),
(2L, "spark f g h", 1.0),
@@ -71,7 +68,7 @@ object PipelineExample {
val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")
// Prepare test documents, which are unlabeled (id, text) tuples.
- val test = sqlContext.createDataFrame(Seq(
+ val test = spark.createDataFrame(Seq(
(4L, "spark i j k"),
(5L, "l m n"),
(6L, "mapreduce spark"),
@@ -87,7 +84,7 @@ object PipelineExample {
}
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
index 3014008ea0..4d8c672a55 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
@@ -18,18 +18,15 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.PolynomialExpansion
import org.apache.spark.mllib.linalg.Vectors
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object PolynomialExpansionExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("PolynomialExpansionExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("PolynomialExpansionExample").getOrCreate()
// $example on$
val data = Array(
@@ -37,7 +34,7 @@ object PolynomialExpansionExample {
Vectors.dense(0.0, 0.0),
Vectors.dense(0.6, -1.1)
)
- val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+ val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
val polynomialExpansion = new PolynomialExpansion()
.setInputCol("features")
.setOutputCol("polyFeatures")
@@ -45,7 +42,8 @@ object PolynomialExpansionExample {
val polyDF = polynomialExpansion.transform(df)
polyDF.select("polyFeatures").take(3).foreach(println)
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala
index e64e673a48..0839c609f1 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala
@@ -15,25 +15,21 @@
* limitations under the License.
*/
-// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.QuantileDiscretizer
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object QuantileDiscretizerExample {
def main(args: Array[String]) {
- val conf = new SparkConf().setAppName("QuantileDiscretizerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
+ val spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate()
+ import spark.implicits._
// $example on$
val data = Array((0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2))
- val df = sc.parallelize(data).toDF("id", "hour")
+ val df = spark.createDataFrame(data).toDF("id", "hour")
val discretizer = new QuantileDiscretizer()
.setInputCol("hour")
@@ -43,7 +39,7 @@ object QuantileDiscretizerExample {
val result = discretizer.fit(df).transform(df)
result.show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
-// scalastyle:on println
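QuantileDiscretizerExample drops sc.parallelize entirely: the local array goes through spark.createDataFrame, and import sqlContext.implicits._ becomes import spark.implicits._. A sketch of the resulting shape; the discretizer settings are illustrative, since the hunk elides them:

    import org.apache.spark.ml.feature.QuantileDiscretizer
    import org.apache.spark.sql.SparkSession

    object QuantileDiscretizerSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder.appName("QuantileDiscretizerSketch").getOrCreate()
        // Session-scoped implicits replace sqlContext.implicits._; they supply the
        // toDF/encoder machinery used on Seqs and RDDs elsewhere in these examples.
        import spark.implicits._

        // Local data no longer needs sc.parallelize before becoming a DataFrame.
        val data = Array((0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2))
        val df = spark.createDataFrame(data).toDF("id", "hour")

        val discretizer = new QuantileDiscretizer() // settings are illustrative
          .setInputCol("hour")
          .setOutputCol("result")
          .setNumBuckets(3)

        discretizer.fit(df).transform(df).show()
        spark.stop()
      }
    }
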
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
index bec831d51c..699b621db9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
@@ -18,20 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.RFormula
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object RFormulaExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("RFormulaExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("RFormulaExample").getOrCreate()
// $example on$
- val dataset = sqlContext.createDataFrame(Seq(
+ val dataset = spark.createDataFrame(Seq(
(7, "US", 18, 1.0),
(8, "CA", 12, 0.0),
(9, "NZ", 15, 0.0)
@@ -43,7 +40,8 @@ object RFormulaExample {
val output = formula.fit(dataset).transform(dataset)
output.select("features", "label").show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala
index 6c9b52cf25..4192a9c737 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala
@@ -18,24 +18,21 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object RandomForestClassifierExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("RandomForestClassifierExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("RandomForestClassifierExample").getOrCreate()
// $example on$
// Load and parse the data file, converting it to a DataFrame.
- val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
@@ -91,7 +88,7 @@ object RandomForestClassifierExample {
println("Learned classification forest model:\n" + rfModel.toDebugString)
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala
index 4d2db017f3..5632f0419a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala
@@ -18,24 +18,21 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object RandomForestRegressorExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("RandomForestRegressorExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("RandomForestRegressorExample").getOrCreate()
// $example on$
// Load and parse the data file, converting it to a DataFrame.
- val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -78,7 +75,7 @@ object RandomForestRegressorExample {
println("Learned regression forest model:\n" + rfModel.toDebugString)
// $example off$
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala
index 202925acad..f03b29ba32 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala
@@ -18,20 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.SQLTransformer
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object SQLTransformerExample {
def main(args: Array[String]) {
- val conf = new SparkConf().setAppName("SQLTransformerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("SQLTransformerExample").getOrCreate()
// $example on$
- val df = sqlContext.createDataFrame(
+ val df = spark.createDataFrame(
Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")
val sqlTrans = new SQLTransformer().setStatement(
@@ -39,6 +36,8 @@ object SQLTransformerExample {
sqlTrans.transform(df).show()
// $example off$
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
index f4d1fe5785..dff7719507 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
@@ -18,12 +18,11 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.sql.{Row, SQLContext}
+import org.apache.spark.sql.{Row, SparkSession}
/**
* A simple example demonstrating ways to specify parameters for Estimators and Transformers.
@@ -35,15 +34,13 @@ import org.apache.spark.sql.{Row, SQLContext}
object SimpleParamsExample {
def main(args: Array[String]) {
- val conf = new SparkConf().setAppName("SimpleParamsExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
+ val spark = SparkSession.builder.appName("SimpleParamsExample").getOrCreate()
+ import spark.implicits._
// Prepare training data.
// We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes
// into DataFrames, where it uses the case class metadata to infer the schema.
- val training = sc.parallelize(Seq(
+ val training = spark.createDataFrame(Seq(
LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
@@ -59,7 +56,7 @@ object SimpleParamsExample {
.setRegParam(0.01)
// Learn a LogisticRegression model. This uses the parameters stored in lr.
- val model1 = lr.fit(training.toDF())
+ val model1 = lr.fit(training)
// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
// we can view the parameters it used during fit().
// This prints the parameter (name: value) pairs, where names are unique IDs for this
@@ -82,7 +79,7 @@ object SimpleParamsExample {
println("Model 2 was fit using parameters: " + model2.parent.extractParamMap())
// Prepare test data.
- val test = sc.parallelize(Seq(
+ val test = spark.createDataFrame(Seq(
LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))))
@@ -91,14 +88,14 @@ object SimpleParamsExample {
// LogisticRegressionModel.transform will only use the 'features' column.
// Note that model2.transform() outputs a 'myProbability' column instead of the usual
// 'probability' column since we renamed the lr.probabilityCol parameter previously.
- model2.transform(test.toDF())
+ model2.transform(test)
.select("features", "label", "myProbability", "prediction")
.collect()
.foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
println(s"($features, $label) -> prob=$prob, prediction=$prediction")
}
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
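SimpleParamsExample trades sc.parallelize(Seq(LabeledPoint(...))) for spark.createDataFrame(Seq(LabeledPoint(...))), which also removes the training.toDF() / test.toDF() hops at the fit and transform calls. A compact sketch; the iteration and regularization settings are illustrative:

    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.sql.SparkSession

    object CaseClassDataFrameSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder.appName("CaseClassDataFrameSketch").getOrCreate()

        // A Seq of case-class instances (LabeledPoint) becomes a DataFrame with
        // "label" and "features" columns inferred from the case class fields,
        // so the estimator can be fit without an extra toDF() call.
        val training = spark.createDataFrame(Seq(
          LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
          LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
          LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
          LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))))

        val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
        val model = lr.fit(training)
        println("Fit with parameters: " + model.parent.extractParamMap())

        spark.stop()
      }
    }
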
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
index 960280137c..05199007f0 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
@@ -20,12 +20,11 @@ package org.apache.spark.examples.ml
import scala.beans.BeanInfo
-import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.mllib.linalg.Vector
-import org.apache.spark.sql.{Row, SQLContext}
+import org.apache.spark.sql.{Row, SparkSession}
@BeanInfo
case class LabeledDocument(id: Long, text: String, label: Double)
@@ -43,13 +42,11 @@ case class Document(id: Long, text: String)
object SimpleTextClassificationPipeline {
def main(args: Array[String]) {
- val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
+ val spark = SparkSession.builder.appName("SimpleTextClassificationPipeline").getOrCreate()
+ import spark.implicits._
// Prepare training documents, which are labeled.
- val training = sc.parallelize(Seq(
+ val training = spark.createDataFrame(Seq(
LabeledDocument(0L, "a b c d e spark", 1.0),
LabeledDocument(1L, "b d", 0.0),
LabeledDocument(2L, "spark f g h", 1.0),
@@ -73,7 +70,7 @@ object SimpleTextClassificationPipeline {
val model = pipeline.fit(training.toDF())
// Prepare test documents, which are unlabeled.
- val test = sc.parallelize(Seq(
+ val test = spark.createDataFrame(Seq(
Document(4L, "spark i j k"),
Document(5L, "l m n"),
Document(6L, "spark hadoop spark"),
@@ -87,7 +84,7 @@ object SimpleTextClassificationPipeline {
println(s"($id, $text) --> prob=$prob, prediction=$prediction")
}
- sc.stop()
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala
index e3439677e7..55f777c6e2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala
@@ -18,20 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.StandardScaler
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object StandardScalerExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("StandardScalerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("StandardScalerExample").getOrCreate()
// $example on$
- val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
val scaler = new StandardScaler()
.setInputCol("features")
@@ -46,7 +43,8 @@ object StandardScalerExample {
val scaledData = scalerModel.transform(dataFrame)
scaledData.show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
index 8199be12c1..85e79c8cb3 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
@@ -18,31 +18,29 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object StopWordsRemoverExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("StopWordsRemoverExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("StopWordsRemoverExample").getOrCreate()
// $example on$
val remover = new StopWordsRemover()
.setInputCol("raw")
.setOutputCol("filtered")
- val dataSet = sqlContext.createDataFrame(Seq(
+ val dataSet = spark.createDataFrame(Seq(
(0, Seq("I", "saw", "the", "red", "baloon")),
(1, Seq("Mary", "had", "a", "little", "lamb"))
)).toDF("id", "raw")
remover.transform(dataSet).show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
index 3f0e870c8d..e01a768da9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
@@ -18,20 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.StringIndexer
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object StringIndexerExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("StringIndexerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("StringIndexerExample").getOrCreate()
// $example on$
- val df = sqlContext.createDataFrame(
+ val df = spark.createDataFrame(
Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
).toDF("id", "category")
@@ -42,7 +39,8 @@ object StringIndexerExample {
val indexed = indexer.fit(df).transform(df)
indexed.show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
index 396f073e6b..910ef62a26 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
@@ -18,21 +18,18 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object TfIdfExample {
def main(args: Array[String]) {
- val conf = new SparkConf().setAppName("TfIdfExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("TfIdfExample").getOrCreate()
// $example on$
- val sentenceData = sqlContext.createDataFrame(Seq(
+ val sentenceData = spark.createDataFrame(Seq(
(0, "Hi I heard about Spark"),
(0, "I wish Java could use case classes"),
(1, "Logistic regression models are neat")
@@ -50,6 +47,8 @@ object TfIdfExample {
val rescaledData = idfModel.transform(featurizedData)
rescaledData.select("features", "label").take(3).foreach(println)
// $example off$
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
index c667728d63..4f0c47b3c8 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
@@ -18,20 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object TokenizerExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("TokenizerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("TokenizerExample").getOrCreate()
// $example on$
- val sentenceDataFrame = sqlContext.createDataFrame(Seq(
+ val sentenceDataFrame = spark.createDataFrame(Seq(
(0, "Hi I heard about Spark"),
(1, "I wish Java could use case classes"),
(2, "Logistic,regression,models,are,neat")
@@ -48,7 +45,8 @@ object TokenizerExample {
val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("words", "label").take(3).foreach(println)
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
index 768a8c0690..56b7263b19 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
@@ -18,21 +18,18 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.mllib.linalg.Vectors
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object VectorAssemblerExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("VectorAssemblerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("VectorAssemblerExample").getOrCreate()
// $example on$
- val dataset = sqlContext.createDataFrame(
+ val dataset = spark.createDataFrame(
Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
).toDF("id", "hour", "mobile", "userFeatures", "clicked")
@@ -43,7 +40,8 @@ object VectorAssemblerExample {
val output = assembler.transform(dataset)
println(output.select("features", "clicked").first())
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala
index 3bef37ba36..214ad91634 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala
@@ -18,20 +18,17 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.VectorIndexer
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object VectorIndexerExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("VectorIndexerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("VectorIndexerExample").getOrCreate()
// $example on$
- val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
val indexer = new VectorIndexer()
.setInputCol("features")
@@ -48,7 +45,8 @@ object VectorIndexerExample {
val indexedData = indexerModel.transform(data)
indexedData.show()
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
index 01377d80e7..716bf023a8 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
@@ -18,31 +18,29 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
+import java.util.Arrays
+
import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object VectorSlicerExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("VectorSlicerExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("VectorSlicerExample").getOrCreate()
// $example on$
- val data = Array(Row(Vectors.dense(-2.0, 2.3, 0.0)))
+ val data = Arrays.asList(Row(Vectors.dense(-2.0, 2.3, 0.0)))
val defaultAttr = NumericAttribute.defaultAttr
val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])
- val dataRDD = sc.parallelize(data)
- val dataset = sqlContext.createDataFrame(dataRDD, StructType(Array(attrGroup.toStructField())))
+ val dataset = spark.createDataFrame(data, StructType(Array(attrGroup.toStructField())))
val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")
@@ -52,7 +50,8 @@ object VectorSlicerExample {
val output = slicer.transform(dataset)
println(output.select("userFeatures", "features").first())
// $example off$
- sc.stop()
+
+ spark.stop()
}
}
// scalastyle:on println
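VectorSlicerExample shows the row-plus-schema path: the old sc.parallelize(rows) feeding sqlContext.createDataFrame(rdd, schema) is replaced by handing a java.util.List[Row] and a StructType straight to the session. A sketch limited to the DataFrame construction, reusing the attribute-group schema from the hunk:

    import java.util.Arrays

    import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.sql.{Row, SparkSession}
    import org.apache.spark.sql.types.StructType

    object RowsWithSchemaSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder.appName("RowsWithSchemaSketch").getOrCreate()

        // A java.util.List[Row] plus an explicit schema replaces the old
        // sc.parallelize(rows) + createDataFrame(rdd, schema) pair.
        val data = Arrays.asList(Row(Vectors.dense(-2.0, 2.3, 0.0)))
        val defaultAttr = NumericAttribute.defaultAttr
        val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
        val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])

        val dataset = spark.createDataFrame(data, StructType(Array(attrGroup.toStructField())))
        dataset.printSchema()

        spark.stop()
      }
    }
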
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
index e77aa59ba3..292b6d9f77 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
@@ -18,21 +18,18 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.Word2Vec
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object Word2VecExample {
def main(args: Array[String]) {
- val conf = new SparkConf().setAppName("Word2Vec example")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("Word2Vec example").getOrCreate()
// $example on$
// Input data: Each row is a bag of words from a sentence or document.
- val documentDF = sqlContext.createDataFrame(Seq(
+ val documentDF = spark.createDataFrame(Seq(
"Hi I heard about Spark".split(" "),
"I wish Java could use case classes".split(" "),
"Logistic regression models are neat".split(" ")
@@ -48,6 +45,8 @@ object Word2VecExample {
val result = model.transform(documentDF)
result.select("result").take(3).foreach(println)
// $example off$
+
+ spark.stop()
}
}
// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala
index e89d555884..c2bf1548b5 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala
@@ -27,7 +27,7 @@ import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, Regex
import org.apache.spark.mllib.clustering.{DistributedLDAModel, EMLDAOptimizer, LDA, OnlineLDAOptimizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Row, SQLContext}
+import org.apache.spark.sql.{Row, SparkSession}
/**
* An example Latent Dirichlet Allocation (LDA) app. Run with
@@ -189,8 +189,8 @@ object LDAExample {
vocabSize: Int,
stopwordFile: String): (RDD[(Long, Vector)], Array[String], Long) = {
- val sqlContext = SQLContext.getOrCreate(sc)
- import sqlContext.implicits._
+ val spark = SparkSession.builder.getOrCreate()
+ import spark.implicits._
// Get dataset of document texts
// One document per line in each text file. If the input consists of many small files,
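LDAExample highlights the reuse case: inside a helper that already runs under a SparkContext, SQLContext.getOrCreate(sc) becomes SparkSession.builder.getOrCreate(), which binds to the active context without threading sc through the call chain. A hypothetical helper sketch (firstLines and the "text" column name are made up for illustration):

    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.SparkSession

    object GetOrCreateSketch {
      // Hypothetical helper: given an RDD produced by an already-running
      // SparkContext, obtain the session without passing sc around.
      def firstLines(lines: RDD[String], n: Int): Array[String] = {
        // getOrCreate() attaches to the active SparkContext, mirroring the
        // old SQLContext.getOrCreate(sc) singleton behaviour.
        val spark = SparkSession.builder.getOrCreate()
        import spark.implicits._

        lines.toDF("text").take(n).map(_.getString(0))
      }
    }
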
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala
index fdb01b86dd..cd4f0bb0de 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala
@@ -18,22 +18,19 @@
// scalastyle:off println
package org.apache.spark.examples.mllib
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.evaluation.{RankingMetrics, RegressionMetrics}
import org.apache.spark.mllib.recommendation.{ALS, Rating}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object RankingMetricsExample {
def main(args: Array[String]) {
- val conf = new SparkConf().setAppName("RankingMetricsExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
+ val spark = SparkSession.builder.appName("RankingMetricsExample").getOrCreate()
+ import spark.implicits._
// $example on$
// Read in the ratings data
- val ratings = sc.textFile("data/mllib/sample_movielens_data.txt").map { line =>
+ val ratings = spark.read.text("data/mllib/sample_movielens_data.txt").rdd.map { line =>
val fields = line.split("::")
Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble - 2.5)
}.cache()
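RankingMetricsExample replaces sc.textFile with the session's text reader followed by .rdd, keeping the rest of the RDD-based mllib code unchanged. A sketch of the same idea; it uses spark.read.textFile (the Dataset[String] reader in the released 2.0 API) rather than the read.text call shown in the hunk, which is an assumption about the reader's name at this snapshot:

    import org.apache.spark.mllib.recommendation.Rating
    import org.apache.spark.sql.SparkSession

    object TextToRatingsSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder.appName("TextToRatingsSketch").getOrCreate()

        // The session's reader replaces sc.textFile; .rdd drops back to the
        // RDD API that the mllib ALS/Rating code still expects.
        val ratings = spark.read.textFile("data/mllib/sample_movielens_data.txt").rdd.map { line =>
          val fields = line.split("::")
          Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble - 2.5)
        }.cache()

        println(s"Parsed ${ratings.count()} ratings")
        spark.stop()
      }
    }
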
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala
index add634c957..22c47a694d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala
@@ -18,22 +18,22 @@
package org.apache.spark.examples.mllib
-import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.evaluation.RegressionMetrics
-import org.apache.spark.mllib.regression.LinearRegressionWithSGD
-import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
// $example off$
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
object RegressionMetricsExample {
def main(args: Array[String]): Unit = {
- val conf = new SparkConf().setAppName("RegressionMetricsExample")
- val sc = new SparkContext(conf)
- val sqlContext = new SQLContext(sc)
+ val spark = SparkSession.builder.appName("RegressionMetricsExample").getOrCreate()
// $example on$
// Load the data
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_linear_regression_data.txt").cache()
+ val data = spark
+ .read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")
+ .rdd.map(row => LabeledPoint(row.getDouble(0), row.get(1).asInstanceOf[Vector]))
+ .cache()
// Build the model
val numIterations = 100
@@ -61,6 +61,8 @@ object RegressionMetricsExample {
// Explained variance
println(s"Explained variance = ${metrics.explainedVariance}")
// $example off$
+
+ spark.stop()
}
}
// scalastyle:on println
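RegressionMetricsExample drops MLUtils.loadLibSVMFile in favour of the libsvm DataFrame source, then maps each Row back to a LabeledPoint for the RDD-based LinearRegressionWithSGD. A sketch of just that conversion:

    import org.apache.spark.mllib.linalg.Vector
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.sql.SparkSession

    object LibsvmToLabeledPointSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder.appName("LibsvmToLabeledPointSketch").getOrCreate()

        // Column 0 is "label" and column 1 is the "features" vector in the
        // DataFrame produced by the libsvm source.
        val data = spark
          .read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")
          .rdd.map(row => LabeledPoint(row.getDouble(0), row.get(1).asInstanceOf[Vector]))
          .cache()

        println(s"Loaded ${data.count()} labeled points")
        spark.stop()
      }
    }
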
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala
index 918e124065..2f0fe704f7 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala
@@ -19,9 +19,8 @@
package org.apache.spark.examples.streaming
import org.apache.spark.SparkConf
-import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
@@ -60,9 +59,9 @@ object SqlNetworkWordCount {
// Convert RDDs of the words DStream to DataFrame and run SQL query
words.foreachRDD { (rdd: RDD[String], time: Time) =>
- // Get the singleton instance of SQLContext
- val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
- import sqlContext.implicits._
+ // Get the singleton instance of SparkSession
+ val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
+ import spark.implicits._
// Convert RDD[String] to RDD[case class] to DataFrame
val wordsDataFrame = rdd.map(w => Record(w)).toDF()
@@ -72,7 +71,7 @@ object SqlNetworkWordCount {
// Do word count on table using SQL and print it
val wordCountsDataFrame =
- sqlContext.sql("select word, count(*) as total from words group by word")
+ spark.sql("select word, count(*) as total from words group by word")
println(s"========= $time =========")
wordCountsDataFrame.show()
}
@@ -87,14 +86,14 @@ object SqlNetworkWordCount {
case class Record(word: String)
-/** Lazily instantiated singleton instance of SQLContext */
-object SQLContextSingleton {
+/** Lazily instantiated singleton instance of SparkSession */
+object SparkSessionSingleton {
- @transient private var instance: SQLContext = _
+ @transient private var instance: SparkSession = _
- def getInstance(sparkContext: SparkContext): SQLContext = {
+ def getInstance(sparkConf: SparkConf): SparkSession = {
if (instance == null) {
- instance = new SQLContext(sparkContext)
+ instance = SparkSession.builder.config(sparkConf).getOrCreate()
}
instance
}
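The streaming example replaces the SQLContext singleton with a SparkSession singleton keyed off the SparkConf taken from rdd.sparkContext.getConf; getOrCreate() then reuses the same session across micro-batches. A sketch of the singleton plus a hypothetical helper (countWords is made up) showing how one batch RDD would use it; registerTempTable mirrors the table registration in the surrounding example, which this hunk elides:

    import org.apache.spark.SparkConf
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.SparkSession

    case class Record(word: String)

    object SparkSessionSingletonSketch {
      @transient private var instance: SparkSession = _

      def getInstance(sparkConf: SparkConf): SparkSession = {
        if (instance == null) {
          // getOrCreate() binds to the SparkContext already driving the
          // stream, so every micro-batch reuses the same session.
          instance = SparkSession.builder.config(sparkConf).getOrCreate()
        }
        instance
      }

      // Hypothetical helper: what one foreachRDD batch would do with the singleton.
      def countWords(rdd: RDD[String]): Unit = {
        val spark = getInstance(rdd.sparkContext.getConf)
        import spark.implicits._

        rdd.map(w => Record(w)).toDF().registerTempTable("words")
        spark.sql("select word, count(*) as total from words group by word").show()
      }
    }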