diff options
author | Dongjoon Hyun <dongjoon@apache.org> | 2016-05-04 14:31:36 -0700 |
---|---|---|
committer | Andrew Or <andrew@databricks.com> | 2016-05-04 14:31:36 -0700 |
commit | cdce4e62a5674e2034e5d395578b1a60e3d8c435 (patch) | |
tree | c715f2555dad353683f82820962576f89b2db452 /examples/src | |
parent | cf2e9da612397233ae7bca0e9ce57309f16226b5 (diff) | |
download | spark-cdce4e62a5674e2034e5d395578b1a60e3d8c435.tar.gz spark-cdce4e62a5674e2034e5d395578b1a60e3d8c435.tar.bz2 spark-cdce4e62a5674e2034e5d395578b1a60e3d8c435.zip |
[SPARK-15031][EXAMPLE] Use SparkSession in Scala/Python/Java example.
## What changes were proposed in this pull request?
This PR aims to update Scala/Python/Java examples by replacing `SQLContext` with newly added `SparkSession`.
- Use **SparkSession Builder Pattern** in 154(Scala 55, Java 52, Python 47) files.
- Add `getConf` in Python SparkContext class: `python/pyspark/context.py`
- Replace **SQLContext Singleton Pattern** with **SparkSession Singleton Pattern**:
- `SqlNetworkWordCount.scala`
- `JavaSqlNetworkWordCount.java`
- `sql_network_wordcount.py`
Now, `SQLContexts` are used only in R examples and the following two Python examples. The python examples are untouched in this PR since it already fails some unknown issue.
- `simple_params_example.py`
- `aft_survival_regression.py`
## How was this patch tested?
Manual.
Author: Dongjoon Hyun <dongjoon@apache.org>
Closes #12809 from dongjoon-hyun/SPARK-15031.
Diffstat (limited to 'examples/src')
154 files changed, 847 insertions, 1232 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java index 22b93a3a85..ecb7084e03 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java @@ -21,23 +21,19 @@ package org.apache.spark.examples.ml; import java.util.Arrays; import java.util.List; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.regression.AFTSurvivalRegression; import org.apache.spark.ml.regression.AFTSurvivalRegressionModel; import org.apache.spark.mllib.linalg.*; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.*; // $example off$ public class JavaAFTSurvivalRegressionExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaAFTSurvivalRegressionExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaAFTSurvivalRegressionExample").getOrCreate(); // $example on$ List<Row> data = Arrays.asList( @@ -52,7 +48,7 @@ public class JavaAFTSurvivalRegressionExample { new StructField("censor", DataTypes.DoubleType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()) }); - Dataset<Row> training = jsql.createDataFrame(data, schema); + Dataset<Row> training = spark.createDataFrame(data, schema); double[] quantileProbabilities = new double[]{0.3, 0.6}; AFTSurvivalRegression aft = new AFTSurvivalRegression() .setQuantileProbabilities(quantileProbabilities) @@ -66,6 +62,6 @@ public class JavaAFTSurvivalRegressionExample { model.transform(training).show(false); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java index 088037d427..9a9a10489b 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java @@ -17,11 +17,9 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.io.Serializable; @@ -83,18 +81,17 @@ public class JavaALSExample { // $example off$ public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaALSExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaALSExample").getOrCreate(); // $example on$ - JavaRDD<Rating> ratingsRDD = jsc.textFile("data/mllib/als/sample_movielens_ratings.txt") + JavaRDD<Rating> ratingsRDD = spark + .read().text("data/mllib/als/sample_movielens_ratings.txt").javaRDD() .map(new Function<String, Rating>() { public Rating call(String str) { return Rating.parseRating(str); } }); - Dataset<Row> ratings = sqlContext.createDataFrame(ratingsRDD, Rating.class); + Dataset<Row> ratings = spark.createDataFrame(ratingsRDD, Rating.class); Dataset<Row>[] splits = ratings.randomSplit(new double[]{0.8, 0.2}); Dataset<Row> training = splits[0]; Dataset<Row> test = splits[1]; @@ -121,6 +118,6 @@ public class JavaALSExample { Double rmse = evaluator.evaluate(predictions); System.out.println("Root-mean-square error = " + rmse); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java index 0a6e9c2a1f..88e4298a61 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java @@ -20,10 +20,11 @@ package org.apache.spark.examples.ml; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.Binarizer; @@ -37,21 +38,19 @@ import org.apache.spark.sql.types.StructType; public class JavaBinarizerExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaBinarizerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaBinarizerExample").getOrCreate(); // $example on$ - JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(0, 0.1), RowFactory.create(1, 0.8), RowFactory.create(2, 0.2) - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("feature", DataTypes.DoubleType, false, Metadata.empty()) }); - Dataset<Row> continuousDataFrame = jsql.createDataFrame(jrdd, schema); + Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema); Binarizer binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") @@ -63,6 +62,6 @@ public class JavaBinarizerExample { System.out.println(binarized_value); } // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java index 1d1a518bbc..51aa35084e 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java @@ -18,12 +18,10 @@ package org.apache.spark.examples.ml; import java.util.Arrays; +import java.util.List; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import org.apache.spark.ml.clustering.BisectingKMeans; import org.apache.spark.ml.clustering.BisectingKMeansModel; @@ -44,25 +42,23 @@ import org.apache.spark.sql.types.StructType; public class JavaBisectingKMeansExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaBisectingKMeansExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaBisectingKMeansExample").getOrCreate(); // $example on$ - JavaRDD<Row> data = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(Vectors.dense(0.1, 0.1, 0.1)), RowFactory.create(Vectors.dense(0.3, 0.3, 0.25)), RowFactory.create(Vectors.dense(0.1, 0.1, -0.1)), RowFactory.create(Vectors.dense(20.3, 20.1, 19.9)), RowFactory.create(Vectors.dense(20.2, 20.1, 19.7)), RowFactory.create(Vectors.dense(18.9, 20.0, 19.7)) - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("features", new VectorUDT(), false, Metadata.empty()), }); - Dataset<Row> dataset = jsql.createDataFrame(data, schema); + Dataset<Row> dataset = spark.createDataFrame(data, schema); BisectingKMeans bkm = new BisectingKMeans().setK(2); BisectingKMeansModel model = bkm.fit(dataset); @@ -76,6 +72,6 @@ public class JavaBisectingKMeansExample { } // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java index 68ffa702ea..0c24f52cf5 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java @@ -17,14 +17,12 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.Bucketizer; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -37,23 +35,21 @@ import org.apache.spark.sql.types.StructType; public class JavaBucketizerExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaBucketizerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaBucketizerExample").getOrCreate(); // $example on$ double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY}; - JavaRDD<Row> data = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(-0.5), RowFactory.create(-0.3), RowFactory.create(0.0), RowFactory.create(0.2) - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("features", DataTypes.DoubleType, false, Metadata.empty()) }); - Dataset<Row> dataFrame = jsql.createDataFrame(data, schema); + Dataset<Row> dataFrame = spark.createDataFrame(data, schema); Bucketizer bucketizer = new Bucketizer() .setInputCol("features") @@ -64,7 +60,7 @@ public class JavaBucketizerExample { Dataset<Row> bucketedData = bucketizer.transform(dataFrame); bucketedData.show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java index b1bf1cfeb2..684cf9a714 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java @@ -21,10 +21,11 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; import org.apache.spark.ml.feature.ChiSqSelector; import org.apache.spark.mllib.linalg.VectorUDT; @@ -39,23 +40,21 @@ import org.apache.spark.sql.types.StructType; public class JavaChiSqSelectorExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaChiSqSelectorExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaChiSqSelectorExample").getOrCreate(); // $example on$ - JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0), RowFactory.create(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0), RowFactory.create(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0) - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()), new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty()) }); - Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema); + Dataset<Row> df = spark.createDataFrame(data, schema); ChiSqSelector selector = new ChiSqSelector() .setNumTopFeatures(1) @@ -66,6 +65,6 @@ public class JavaChiSqSelectorExample { Dataset<Row> result = selector.fit(df).transform(df); result.show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java index ec3ac202be..0631f9d6d5 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java @@ -19,36 +19,31 @@ package org.apache.spark.examples.ml; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.feature.CountVectorizer; import org.apache.spark.ml.feature.CountVectorizerModel; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.*; // $example off$ public class JavaCountVectorizerExample { public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaCountVectorizerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaCountVectorizerExample").getOrCreate(); // $example on$ // Input data: Each row is a bag of words from a sentence or document. - JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(Arrays.asList("a", "b", "c")), RowFactory.create(Arrays.asList("a", "b", "b", "c", "a")) - )); + ); StructType schema = new StructType(new StructField [] { new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); - Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema); + Dataset<Row> df = spark.createDataFrame(data, schema); // fit a CountVectorizerModel from the corpus CountVectorizerModel cvModel = new CountVectorizer() @@ -66,6 +61,6 @@ public class JavaCountVectorizerExample { cvModel.transform(df).show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java index 4b15fde9c3..ec57a24451 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java @@ -20,10 +20,11 @@ package org.apache.spark.examples.ml; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.DCT; @@ -38,20 +39,18 @@ import org.apache.spark.sql.types.StructType; public class JavaDCTExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaDCTExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaDCTExample").getOrCreate(); // $example on$ - JavaRDD<Row> data = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)), RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)), RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0)) - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("features", new VectorUDT(), false, Metadata.empty()), }); - Dataset<Row> df = jsql.createDataFrame(data, schema); + Dataset<Row> df = spark.createDataFrame(data, schema); DCT dct = new DCT() .setInputCol("features") .setOutputCol("featuresDCT") @@ -59,7 +58,7 @@ public class JavaDCTExample { Dataset<Row> dctDf = dct.transform(df); dctDf.select("featuresDCT").show(3); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java index 8214952f80..733bc4181c 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java @@ -17,8 +17,6 @@ // scalastyle:off println package org.apache.spark.examples.ml; // $example on$ -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineModel; import org.apache.spark.ml.PipelineStage; @@ -28,18 +26,17 @@ import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; import org.apache.spark.ml.feature.*; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example off$ public class JavaDecisionTreeClassificationExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaDecisionTreeClassificationExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaDecisionTreeClassificationExample").getOrCreate(); // $example on$ // Load the data stored in LIBSVM format as a DataFrame. - Dataset<Row> data = sqlContext + Dataset<Row> data = spark .read() .format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); @@ -100,6 +97,6 @@ public class JavaDecisionTreeClassificationExample { System.out.println("Learned classification tree model:\n" + treeModel.toDebugString()); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java index a4f3e97bf3..bd6dc3edd3 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java @@ -17,8 +17,6 @@ // scalastyle:off println package org.apache.spark.examples.ml; // $example on$ -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineModel; import org.apache.spark.ml.PipelineStage; @@ -29,17 +27,16 @@ import org.apache.spark.ml.regression.DecisionTreeRegressionModel; import org.apache.spark.ml.regression.DecisionTreeRegressor; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example off$ public class JavaDecisionTreeRegressionExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaDecisionTreeRegressionExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaDecisionTreeRegressionExample").getOrCreate(); // $example on$ // Load the data stored in LIBSVM format as a DataFrame. - Dataset<Row> data = sqlContext.read().format("libsvm") + Dataset<Row> data = spark.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. @@ -85,6 +82,6 @@ public class JavaDecisionTreeRegressionExample { System.out.println("Learned regression tree model:\n" + treeModel.toDebugString()); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index 0ba94786d4..90023ac06b 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -21,9 +21,7 @@ import java.util.List; import com.google.common.collect.Lists; -import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.classification.Classifier; import org.apache.spark.ml.classification.ClassificationModel; import org.apache.spark.ml.param.IntParam; @@ -35,7 +33,7 @@ import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; /** @@ -51,9 +49,7 @@ import org.apache.spark.sql.SQLContext; public class JavaDeveloperApiExample { public static void main(String[] args) throws Exception { - SparkConf conf = new SparkConf().setAppName("JavaDeveloperApiExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaDeveloperApiExample").getOrCreate(); // Prepare training data. List<LabeledPoint> localTraining = Lists.newArrayList( @@ -61,8 +57,7 @@ public class JavaDeveloperApiExample { new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); - Dataset<Row> training = jsql.createDataFrame( - jsc.parallelize(localTraining), LabeledPoint.class); + Dataset<Row> training = spark.createDataFrame(localTraining, LabeledPoint.class); // Create a LogisticRegression instance. This instance is an Estimator. MyJavaLogisticRegression lr = new MyJavaLogisticRegression(); @@ -80,7 +75,7 @@ public class JavaDeveloperApiExample { new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); - Dataset<Row> test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); + Dataset<Row> test = spark.createDataFrame(localTest, LabeledPoint.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). Dataset<Row> results = model.transform(test); @@ -93,7 +88,7 @@ public class JavaDeveloperApiExample { " even though all coefficients are 0!"); } - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java index 37de9cf359..a062a6fcd0 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java @@ -20,7 +20,7 @@ package org.apache.spark.examples.ml; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.ArrayList; @@ -41,16 +41,15 @@ import org.apache.spark.sql.types.StructType; public class JavaElementwiseProductExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaElementwiseProductExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaElementwiseProductExample").getOrCreate(); // $example on$ // Create some vector data; also works for sparse vectors - JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)), RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0)) - )); + ); List<StructField> fields = new ArrayList<>(2); fields.add(DataTypes.createStructField("id", DataTypes.StringType, false)); @@ -58,7 +57,7 @@ public class JavaElementwiseProductExample { StructType schema = DataTypes.createStructType(fields); - Dataset<Row> dataFrame = sqlContext.createDataFrame(jrdd, schema); + Dataset<Row> dataFrame = spark.createDataFrame(data, schema); Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0); @@ -70,6 +69,6 @@ public class JavaElementwiseProductExample { // Batch transform the vectors to create new column: transformer.transform(dataFrame).show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java index 604b193dd4..5ba8e6cf44 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java @@ -21,8 +21,6 @@ package org.apache.spark.examples.ml; import java.util.Arrays; // $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; // $example on$ import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.LogisticRegressionModel; @@ -32,23 +30,21 @@ import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; // $example off$ -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; /** * Java example for Estimator, Transformer, and Param. */ public class JavaEstimatorTransformerParamExample { public static void main(String[] args) { - SparkConf conf = new SparkConf() - .setAppName("JavaEstimatorTransformerParamExample"); - SparkContext sc = new SparkContext(conf); - SQLContext sqlContext = new SQLContext(sc); + SparkSession spark = SparkSession + .builder().appName("JavaEstimatorTransformerParamExample").getOrCreate(); // $example on$ // Prepare training data. // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans into // DataFrames, where it uses the bean metadata to infer the schema. - Dataset<Row> training = sqlContext.createDataFrame( + Dataset<Row> training = spark.createDataFrame( Arrays.asList( new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), @@ -89,7 +85,7 @@ public class JavaEstimatorTransformerParamExample { System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap()); // Prepare test documents. - Dataset<Row> test = sqlContext.createDataFrame(Arrays.asList( + Dataset<Row> test = spark.createDataFrame(Arrays.asList( new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)) @@ -107,6 +103,6 @@ public class JavaEstimatorTransformerParamExample { } // $example off$ - sc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java index 553070dace..a7c89b9d19 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java @@ -29,18 +29,17 @@ import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; import org.apache.spark.ml.feature.*; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example off$ public class JavaGradientBoostedTreeClassifierExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaGradientBoostedTreeClassifierExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaGradientBoostedTreeClassifierExample").getOrCreate(); // $example on$ // Load and parse the data file, converting it to a DataFrame. - Dataset<Row> data = sqlContext.read().format("libsvm") + Dataset<Row> data = spark.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. @@ -99,6 +98,6 @@ public class JavaGradientBoostedTreeClassifierExample { System.out.println("Learned classification GBT model:\n" + gbtModel.toDebugString()); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java index 83fd89e3bd..6d3f21fdaf 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java @@ -17,8 +17,6 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; // $example on$ import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineModel; @@ -30,19 +28,17 @@ import org.apache.spark.ml.regression.GBTRegressionModel; import org.apache.spark.ml.regression.GBTRegressor; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example off$ public class JavaGradientBoostedTreeRegressorExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaGradientBoostedTreeRegressorExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaGradientBoostedTreeRegressorExample").getOrCreate(); // $example on$ // Load and parse the data file, converting it to a DataFrame. - Dataset<Row> data = - sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -87,6 +83,6 @@ public class JavaGradientBoostedTreeRegressorExample { System.out.println("Learned regression GBT model:\n" + gbtModel.toDebugString()); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java index 9b8c22f3bd..ccd74f2920 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java @@ -17,14 +17,12 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; import org.apache.spark.ml.feature.IndexToString; import org.apache.spark.ml.feature.StringIndexer; @@ -39,24 +37,22 @@ import org.apache.spark.sql.types.StructType; public class JavaIndexToStringExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaIndexToStringExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaIndexToStringExample").getOrCreate(); // $example on$ - JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(0, "a"), RowFactory.create(1, "b"), RowFactory.create(2, "c"), RowFactory.create(3, "a"), RowFactory.create(4, "a"), RowFactory.create(5, "c") - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("category", DataTypes.StringType, false, Metadata.empty()) }); - Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema); + Dataset<Row> df = spark.createDataFrame(data, schema); StringIndexerModel indexer = new StringIndexer() .setInputCol("category") @@ -70,6 +66,6 @@ public class JavaIndexToStringExample { Dataset<Row> converted = converter.transform(indexed); converted.select("id", "originalCategory").show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java index c5022f4c0b..e6d82a0513 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java @@ -19,12 +19,10 @@ package org.apache.spark.examples.ml; import java.util.regex.Pattern; -import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.expressions.GenericRow; // $example on$ import org.apache.spark.ml.clustering.KMeansModel; @@ -72,16 +70,14 @@ public class JavaKMeansExample { int k = Integer.parseInt(args[1]); // Parses the arguments - SparkConf conf = new SparkConf().setAppName("JavaKMeansExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaKMeansExample").getOrCreate(); // $example on$ // Loads data - JavaRDD<Row> points = jsc.textFile(inputFile).map(new ParsePoint()); + JavaRDD<Row> points = spark.read().text(inputFile).javaRDD().map(new ParsePoint()); StructField[] fields = {new StructField("features", new VectorUDT(), false, Metadata.empty())}; StructType schema = new StructType(fields); - Dataset<Row> dataset = sqlContext.createDataFrame(points, schema); + Dataset<Row> dataset = spark.createDataFrame(points, schema); // Trains a k-means model KMeans kmeans = new KMeans() @@ -96,6 +92,6 @@ public class JavaKMeansExample { } // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java index 351bc40118..b8baca5920 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java @@ -19,9 +19,7 @@ package org.apache.spark.examples.ml; // $example on$ import java.util.regex.Pattern; -import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.ml.clustering.LDA; import org.apache.spark.ml.clustering.LDAModel; @@ -30,7 +28,7 @@ import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.expressions.GenericRow; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; @@ -67,15 +65,13 @@ public class JavaLDAExample { String inputFile = "data/mllib/sample_lda_data.txt"; // Parses the arguments - SparkConf conf = new SparkConf().setAppName("JavaLDAExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaLDAExample").getOrCreate(); // Loads data - JavaRDD<Row> points = jsc.textFile(inputFile).map(new ParseVector()); + JavaRDD<Row> points = spark.read().text(inputFile).javaRDD().map(new ParseVector()); StructField[] fields = {new StructField("features", new VectorUDT(), false, Metadata.empty())}; StructType schema = new StructType(fields); - Dataset<Row> dataset = sqlContext.createDataFrame(points, schema); + Dataset<Row> dataset = spark.createDataFrame(points, schema); // Trains a LDA model LDA lda = new LDA() @@ -91,7 +87,7 @@ public class JavaLDAExample { topics.show(false); model.transform(dataset).show(false); - jsc.stop(); + spark.stop(); } // $example off$ } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java index 08fce89359..b6ea1fed25 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java @@ -17,8 +17,6 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; // $example on$ import org.apache.spark.ml.regression.LinearRegression; import org.apache.spark.ml.regression.LinearRegressionModel; @@ -26,18 +24,17 @@ import org.apache.spark.ml.regression.LinearRegressionTrainingSummary; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example off$ public class JavaLinearRegressionWithElasticNetExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaLinearRegressionWithElasticNetExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaLinearRegressionWithElasticNetExample").getOrCreate(); // $example on$ // Load training data - Dataset<Row> training = sqlContext.read().format("libsvm") + Dataset<Row> training = spark.read().format("libsvm") .load("data/mllib/sample_linear_regression_data.txt"); LinearRegression lr = new LinearRegression() @@ -61,6 +58,6 @@ public class JavaLinearRegressionWithElasticNetExample { System.out.println("r2: " + trainingSummary.r2()); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java index 73b028fb44..fd040aead4 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java @@ -17,8 +17,6 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; // $example on$ import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary; import org.apache.spark.ml.classification.LogisticRegression; @@ -26,18 +24,17 @@ import org.apache.spark.ml.classification.LogisticRegressionModel; import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.functions; // $example off$ public class JavaLogisticRegressionSummaryExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionSummaryExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaLogisticRegressionSummaryExample").getOrCreate(); // Load training data - Dataset<Row> training = sqlContext.read().format("libsvm") + Dataset<Row> training = spark.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); LogisticRegression lr = new LogisticRegression() @@ -80,6 +77,6 @@ public class JavaLogisticRegressionSummaryExample { lrModel.setThreshold(bestThreshold); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java index 6911668522..f00c7a05cd 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java @@ -17,25 +17,22 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; // $example on$ import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.LogisticRegressionModel; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example off$ public class JavaLogisticRegressionWithElasticNetExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithElasticNetExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaLogisticRegressionWithElasticNetExample").getOrCreate(); // $example on$ // Load training data - Dataset<Row> training = sqlContext.read().format("libsvm") + Dataset<Row> training = spark.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); LogisticRegression lr = new LogisticRegression() @@ -51,6 +48,6 @@ public class JavaLogisticRegressionWithElasticNetExample { + lrModel.coefficients() + " Intercept: " + lrModel.intercept()); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java index a2a072b253..80cdd364b9 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java @@ -17,25 +17,21 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; // $example on$ import org.apache.spark.ml.feature.MaxAbsScaler; import org.apache.spark.ml.feature.MaxAbsScalerModel; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; // $example off$ -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; public class JavaMaxAbsScalerExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaMaxAbsScalerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaMaxAbsScalerExample").getOrCreate(); // $example on$ - Dataset<Row> dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset<Row> dataFrame = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); MaxAbsScaler scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaledFeatures"); @@ -47,7 +43,7 @@ public class JavaMaxAbsScalerExample { Dataset<Row> scaledData = scalerModel.transform(dataFrame); scaledData.show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java index 4aee18eeab..022940fd1e 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java @@ -17,9 +17,7 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import org.apache.spark.ml.feature.MinMaxScaler; @@ -30,12 +28,10 @@ import org.apache.spark.sql.Row; public class JavaMinMaxScalerExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JaveMinMaxScalerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaMinMaxScalerExample").getOrCreate(); // $example on$ - Dataset<Row> dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset<Row> dataFrame = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); MinMaxScaler scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("scaledFeatures"); @@ -47,6 +43,6 @@ public class JavaMinMaxScalerExample { Dataset<Row> scaledData = scalerModel.transform(dataFrame); scaledData.show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java index c4122d1247..a4ec4f5815 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java @@ -21,8 +21,6 @@ package org.apache.spark.examples.ml; import java.util.Arrays; // $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; // $example on$ import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineStage; @@ -37,21 +35,19 @@ import org.apache.spark.ml.tuning.ParamGridBuilder; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; // $example off$ -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; /** * Java example for Model Selection via Cross Validation. */ public class JavaModelSelectionViaCrossValidationExample { public static void main(String[] args) { - SparkConf conf = new SparkConf() - .setAppName("JavaModelSelectionViaCrossValidationExample"); - SparkContext sc = new SparkContext(conf); - SQLContext sqlContext = new SQLContext(sc); + SparkSession spark = SparkSession + .builder().appName("JavaModelSelectionViaCrossValidationExample").getOrCreate(); // $example on$ // Prepare training documents, which are labeled. - Dataset<Row> training = sqlContext.createDataFrame(Arrays.asList( + Dataset<Row> training = spark.createDataFrame(Arrays.asList( new JavaLabeledDocument(0L, "a b c d e spark", 1.0), new JavaLabeledDocument(1L, "b d", 0.0), new JavaLabeledDocument(2L,"spark f g h", 1.0), @@ -102,7 +98,7 @@ public class JavaModelSelectionViaCrossValidationExample { CrossValidatorModel cvModel = cv.fit(training); // Prepare test documents, which are unlabeled. - Dataset<Row> test = sqlContext.createDataFrame(Arrays.asList( + Dataset<Row> test = spark.createDataFrame(Arrays.asList( new JavaDocument(4L, "spark i j k"), new JavaDocument(5L, "l m n"), new JavaDocument(6L, "mapreduce spark"), @@ -117,6 +113,6 @@ public class JavaModelSelectionViaCrossValidationExample { } // $example off$ - sc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java index 4994f8f9fa..63a0ad1cb8 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java @@ -17,8 +17,6 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; // $example on$ import org.apache.spark.ml.evaluation.RegressionEvaluator; import org.apache.spark.ml.param.ParamMap; @@ -29,7 +27,7 @@ import org.apache.spark.ml.tuning.TrainValidationSplitModel; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; // $example off$ -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; /** * Java example demonstrating model selection using TrainValidationSplit. @@ -44,13 +42,11 @@ import org.apache.spark.sql.SQLContext; */ public class JavaModelSelectionViaTrainValidationSplitExample { public static void main(String[] args) { - SparkConf conf = new SparkConf() - .setAppName("JavaModelSelectionViaTrainValidationSplitExample"); - SparkContext sc = new SparkContext(conf); - SQLContext jsql = new SQLContext(sc); + SparkSession spark = SparkSession + .builder().appName("JavaModelSelectionViaTrainValidationSplitExample").getOrCreate(); // $example on$ - Dataset<Row> data = jsql.read().format("libsvm") + Dataset<Row> data = spark.read().format("libsvm") .load("data/mllib/sample_linear_regression_data.txt"); // Prepare training and test data. @@ -87,6 +83,6 @@ public class JavaModelSelectionViaTrainValidationSplitExample { .show(); // $example off$ - sc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java index 0ca528d8cd..d547a2a64b 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java @@ -18,11 +18,9 @@ package org.apache.spark.examples.ml; // $example on$ -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel; import org.apache.spark.ml.classification.MultilayerPerceptronClassifier; import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; @@ -34,14 +32,13 @@ import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; public class JavaMultilayerPerceptronClassifierExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaMultilayerPerceptronClassifierExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaMultilayerPerceptronClassifierExample").getOrCreate(); // $example on$ // Load training data String path = "data/mllib/sample_multiclass_classification_data.txt"; - Dataset<Row> dataFrame = jsql.read().format("libsvm").load(path); + Dataset<Row> dataFrame = spark.read().format("libsvm").load(path); // Split the data into train and test Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L); Dataset<Row> train = splits[0]; @@ -66,6 +63,6 @@ public class JavaMultilayerPerceptronClassifierExample { System.out.println("Precision = " + evaluator.evaluate(predictionAndLabels)); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java index 608bd80285..325b7b5874 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java @@ -17,15 +17,13 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.NGram; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -37,16 +35,14 @@ import org.apache.spark.sql.types.StructType; public class JavaNGramExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaNGramExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaNGramExample").getOrCreate(); // $example on$ - JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(0.0, Arrays.asList("Hi", "I", "heard", "about", "Spark")), RowFactory.create(1.0, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")), RowFactory.create(2.0, Arrays.asList("Logistic", "regression", "models", "are", "neat")) - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), @@ -54,7 +50,7 @@ public class JavaNGramExample { "words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) }); - Dataset<Row> wordDataFrame = sqlContext.createDataFrame(jrdd, schema); + Dataset<Row> wordDataFrame = spark.createDataFrame(data, schema); NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams"); @@ -66,6 +62,6 @@ public class JavaNGramExample { System.out.println(); } // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java index 41d7ad75b9..1f24a23609 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java @@ -17,16 +17,13 @@ package org.apache.spark.examples.ml; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; // $example on$ import org.apache.spark.ml.classification.NaiveBayes; import org.apache.spark.ml.classification.NaiveBayesModel; import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example off$ /** @@ -35,13 +32,12 @@ import org.apache.spark.sql.SQLContext; public class JavaNaiveBayesExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaNaiveBayesExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaNaiveBayesExample").getOrCreate(); // $example on$ // Load training data - Dataset<Row> dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset<Row> dataFrame = + spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Split the data into train and test Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L); Dataset<Row> train = splits[0]; @@ -59,6 +55,6 @@ public class JavaNaiveBayesExample { System.out.println("Precision = " + evaluator.evaluate(predictionAndLabels)); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java index 31cd752136..4b3a718ea9 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java @@ -17,9 +17,7 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import org.apache.spark.ml.feature.Normalizer; @@ -29,12 +27,11 @@ import org.apache.spark.sql.Row; public class JavaNormalizerExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaNormalizerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaNormalizerExample").getOrCreate(); // $example on$ - Dataset<Row> dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset<Row> dataFrame = + spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Normalize each Vector using $L^1$ norm. Normalizer normalizer = new Normalizer() @@ -50,6 +47,6 @@ public class JavaNormalizerExample { normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY)); lInfNormData.show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java index 882438ca28..d6e4d21ead 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java @@ -17,14 +17,12 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.OneHotEncoder; import org.apache.spark.ml.feature.StringIndexer; import org.apache.spark.ml.feature.StringIndexerModel; @@ -39,26 +37,24 @@ import org.apache.spark.sql.types.StructType; public class JavaOneHotEncoderExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaOneHotEncoderExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaOneHotEncoderExample").getOrCreate(); // $example on$ - JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(0, "a"), RowFactory.create(1, "b"), RowFactory.create(2, "c"), RowFactory.create(3, "a"), RowFactory.create(4, "a"), RowFactory.create(5, "c") - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("id", DataTypes.DoubleType, false, Metadata.empty()), new StructField("category", DataTypes.StringType, false, Metadata.empty()) }); - Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema); + Dataset<Row> df = spark.createDataFrame(data, schema); StringIndexerModel indexer = new StringIndexer() .setInputCol("category") @@ -72,7 +68,7 @@ public class JavaOneHotEncoderExample { Dataset<Row> encoded = encoder.transform(indexed); encoded.select("id", "categoryVec").show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java index 1f13b48bf8..9cc983bd11 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java @@ -19,8 +19,6 @@ package org.apache.spark.examples.ml; import org.apache.commons.cli.*; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; // $example on$ import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.OneVsRest; @@ -31,7 +29,7 @@ import org.apache.spark.mllib.linalg.Matrix; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.StructField; // $example off$ @@ -60,9 +58,7 @@ public class JavaOneVsRestExample { public static void main(String[] args) { // parse the arguments Params params = parse(args); - SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaOneVsRestExample").getOrCreate(); // $example on$ // configure the base classifier @@ -82,7 +78,7 @@ public class JavaOneVsRestExample { OneVsRest ovr = new OneVsRest().setClassifier(classifier); String input = params.input; - Dataset<Row> inputData = jsql.read().format("libsvm").load(input); + Dataset<Row> inputData = spark.read().format("libsvm").load(input); Dataset<Row> train; Dataset<Row> test; @@ -92,7 +88,7 @@ public class JavaOneVsRestExample { train = inputData; // compute the number of features in the training set. int numFeatures = inputData.first().<Vector>getAs(1).size(); - test = jsql.read().format("libsvm").option("numFeatures", + test = spark.read().format("libsvm").option("numFeatures", String.valueOf(numFeatures)).load(testInput); } else { double f = params.fracTest; @@ -131,7 +127,7 @@ public class JavaOneVsRestExample { System.out.println(results); // $example off$ - jsc.stop(); + spark.stop(); } private static Params parse(String[] args) { diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java index a792fd7d47..6b1dcb68ba 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java @@ -17,14 +17,12 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.PCA; import org.apache.spark.ml.feature.PCAModel; import org.apache.spark.mllib.linalg.VectorUDT; @@ -39,22 +37,20 @@ import org.apache.spark.sql.types.StructType; public class JavaPCAExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaPCAExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaPCAExample").getOrCreate(); // $example on$ - JavaRDD<Row> data = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})), RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)), RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("features", new VectorUDT(), false, Metadata.empty()), }); - Dataset<Row> df = jsql.createDataFrame(data, schema); + Dataset<Row> df = spark.createDataFrame(data, schema); PCAModel pca = new PCA() .setInputCol("features") @@ -65,7 +61,7 @@ public class JavaPCAExample { Dataset<Row> result = pca.transform(df).select("pcaFeatures"); result.show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java index 305420f208..556a457326 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java @@ -19,11 +19,7 @@ package org.apache.spark.examples.ml; // $example on$ import java.util.Arrays; -// $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; -// $example on$ import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineModel; import org.apache.spark.ml.PipelineStage; @@ -33,20 +29,18 @@ import org.apache.spark.ml.feature.Tokenizer; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; // $example off$ -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; /** * Java example for simple text document 'Pipeline'. */ public class JavaPipelineExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaPipelineExample"); - SparkContext sc = new SparkContext(conf); - SQLContext sqlContext = new SQLContext(sc); + SparkSession spark = SparkSession.builder().appName("JavaPipelineExample").getOrCreate(); // $example on$ // Prepare training documents, which are labeled. - Dataset<Row> training = sqlContext.createDataFrame(Arrays.asList( + Dataset<Row> training = spark.createDataFrame(Arrays.asList( new JavaLabeledDocument(0L, "a b c d e spark", 1.0), new JavaLabeledDocument(1L, "b d", 0.0), new JavaLabeledDocument(2L, "spark f g h", 1.0), @@ -71,7 +65,7 @@ public class JavaPipelineExample { PipelineModel model = pipeline.fit(training); // Prepare test documents, which are unlabeled. - Dataset<Row> test = sqlContext.createDataFrame(Arrays.asList( + Dataset<Row> test = spark.createDataFrame(Arrays.asList( new JavaDocument(4L, "spark i j k"), new JavaDocument(5L, "l m n"), new JavaDocument(6L, "mapreduce spark"), @@ -86,6 +80,6 @@ public class JavaPipelineExample { } // $example off$ - sc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java index 48fc3c8acb..e328454c70 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java @@ -17,15 +17,12 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.PolynomialExpansion; import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; @@ -39,9 +36,7 @@ import org.apache.spark.sql.types.StructType; public class JavaPolynomialExpansionExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansionExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaPolynomialExpansionExample").getOrCreate(); // $example on$ PolynomialExpansion polyExpansion = new PolynomialExpansion() @@ -49,17 +44,17 @@ public class JavaPolynomialExpansionExample { .setOutputCol("polyFeatures") .setDegree(3); - JavaRDD<Row> data = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(Vectors.dense(-2.0, 2.3)), RowFactory.create(Vectors.dense(0.0, 0.0)), RowFactory.create(Vectors.dense(0.6, -1.1)) - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("features", new VectorUDT(), false, Metadata.empty()), }); - Dataset<Row> df = jsql.createDataFrame(data, schema); + Dataset<Row> df = spark.createDataFrame(data, schema); Dataset<Row> polyDF = polyExpansion.transform(df); List<Row> rows = polyDF.select("polyFeatures").takeAsList(3); @@ -67,6 +62,6 @@ public class JavaPolynomialExpansionExample { System.out.println(r.get(0)); } // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java index 7b226fede9..94e3fafcab 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java @@ -17,13 +17,11 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.QuantileDiscretizer; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -36,19 +34,16 @@ import org.apache.spark.sql.types.StructType; public class JavaQuantileDiscretizerExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaQuantileDiscretizerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaQuantileDiscretizerExample").getOrCreate(); // $example on$ - JavaRDD<Row> jrdd = jsc.parallelize( - Arrays.asList( - RowFactory.create(0, 18.0), - RowFactory.create(1, 19.0), - RowFactory.create(2, 8.0), - RowFactory.create(3, 5.0), - RowFactory.create(4, 2.2) - ) + List<Row> data = Arrays.asList( + RowFactory.create(0, 18.0), + RowFactory.create(1, 19.0), + RowFactory.create(2, 8.0), + RowFactory.create(3, 5.0), + RowFactory.create(4, 2.2) ); StructType schema = new StructType(new StructField[]{ @@ -56,7 +51,7 @@ public class JavaQuantileDiscretizerExample { new StructField("hour", DataTypes.DoubleType, false, Metadata.empty()) }); - Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema); + Dataset<Row> df = spark.createDataFrame(data, schema); QuantileDiscretizer discretizer = new QuantileDiscretizer() .setInputCol("hour") @@ -66,6 +61,6 @@ public class JavaQuantileDiscretizerExample { Dataset<Row> result = discretizer.fit(df).transform(df); result.show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java index 8c453bf80d..8282ce01d3 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java @@ -17,14 +17,12 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.RFormula; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -37,9 +35,7 @@ import static org.apache.spark.sql.types.DataTypes.*; public class JavaRFormulaExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaRFormulaExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaRFormulaExample").getOrCreate(); // $example on$ StructType schema = createStructType(new StructField[]{ @@ -49,13 +45,13 @@ public class JavaRFormulaExample { createStructField("clicked", DoubleType, false) }); - JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(7, "US", 18, 1.0), RowFactory.create(8, "CA", 12, 0.0), RowFactory.create(9, "NZ", 15, 0.0) - )); + ); - Dataset<Row> dataset = sqlContext.createDataFrame(rdd, schema); + Dataset<Row> dataset = spark.createDataFrame(data, schema); RFormula formula = new RFormula() .setFormula("clicked ~ country + hour") .setFeaturesCol("features") @@ -63,7 +59,7 @@ public class JavaRFormulaExample { Dataset<Row> output = formula.fit(dataset).transform(dataset); output.select("features", "label").show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java index 05c2bc9622..21e783a968 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java @@ -17,8 +17,6 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; // $example on$ import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineModel; @@ -29,19 +27,17 @@ import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; import org.apache.spark.ml.feature.*; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example off$ public class JavaRandomForestClassifierExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaRandomForestClassifierExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaRandomForestClassifierExample").getOrCreate(); // $example on$ // Load and parse the data file, converting it to a DataFrame. - Dataset<Row> data = - sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -98,6 +94,6 @@ public class JavaRandomForestClassifierExample { System.out.println("Learned classification forest model:\n" + rfModel.toDebugString()); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java index d366967083..ece184a878 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java @@ -17,8 +17,6 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; // $example on$ import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineModel; @@ -30,19 +28,17 @@ import org.apache.spark.ml.regression.RandomForestRegressionModel; import org.apache.spark.ml.regression.RandomForestRegressor; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example off$ public class JavaRandomForestRegressorExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaRandomForestRegressorExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaRandomForestRegressorExample").getOrCreate(); // $example on$ // Load and parse the data file, converting it to a DataFrame. - Dataset<Row> data = - sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -87,6 +83,6 @@ public class JavaRandomForestRegressorExample { System.out.println("Learned regression forest model:\n" + rfModel.toDebugString()); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java index 7e3ca99d7c..492718bbdb 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java @@ -19,36 +19,31 @@ package org.apache.spark.examples.ml; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.feature.SQLTransformer; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.*; // $example off$ public class JavaSQLTransformerExample { public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaSQLTransformerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaSQLTransformerExample").getOrCreate(); // $example on$ - JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(0, 1.0, 3.0), RowFactory.create(2, 2.0, 5.0) - )); + ); StructType schema = new StructType(new StructField [] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()), new StructField("v2", DataTypes.DoubleType, false, Metadata.empty()) }); - Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema); + Dataset<Row> df = spark.createDataFrame(data, schema); SQLTransformer sqlTrans = new SQLTransformer().setStatement( "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"); @@ -56,6 +51,6 @@ public class JavaSQLTransformerExample { sqlTrans.transform(df).show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java index cb911ef5ef..f906843640 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java @@ -21,8 +21,6 @@ import java.util.List; import com.google.common.collect.Lists; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.classification.LogisticRegressionModel; import org.apache.spark.ml.param.ParamMap; import org.apache.spark.ml.classification.LogisticRegression; @@ -30,7 +28,7 @@ import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; /** * A simple example demonstrating ways to specify parameters for Estimators and Transformers. @@ -42,9 +40,7 @@ import org.apache.spark.sql.SQLContext; public class JavaSimpleParamsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaSimpleParamsExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaSimpleParamsExample").getOrCreate(); // Prepare training data. // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans @@ -55,7 +51,7 @@ public class JavaSimpleParamsExample { new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); Dataset<Row> training = - jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class); + spark.createDataFrame(localTraining, LabeledPoint.class); // Create a LogisticRegression instance. This instance is an Estimator. LogisticRegression lr = new LogisticRegression(); @@ -96,7 +92,7 @@ public class JavaSimpleParamsExample { new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); - Dataset<Row> test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); + Dataset<Row> test = spark.createDataFrame(localTest, LabeledPoint.class); // Make predictions on test documents using the Transformer.transform() method. // LogisticRegressionModel.transform will only use the 'features' column. @@ -109,6 +105,6 @@ public class JavaSimpleParamsExample { + ", prediction=" + r.get(3)); } - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java index a18a60f448..9516ce1f4f 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java @@ -21,8 +21,6 @@ import java.util.List; import com.google.common.collect.Lists; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineModel; import org.apache.spark.ml.PipelineStage; @@ -31,7 +29,7 @@ import org.apache.spark.ml.feature.HashingTF; import org.apache.spark.ml.feature.Tokenizer; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; /** * A simple text classification pipeline that recognizes "spark" from input text. It uses the Java @@ -44,9 +42,8 @@ import org.apache.spark.sql.SQLContext; public class JavaSimpleTextClassificationPipeline { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaSimpleTextClassificationPipeline"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession + .builder().appName("JavaSimpleTextClassificationPipeline").getOrCreate(); // Prepare training documents, which are labeled. List<LabeledDocument> localTraining = Lists.newArrayList( @@ -55,7 +52,7 @@ public class JavaSimpleTextClassificationPipeline { new LabeledDocument(2L, "spark f g h", 1.0), new LabeledDocument(3L, "hadoop mapreduce", 0.0)); Dataset<Row> training = - jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); + spark.createDataFrame(localTraining, LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -80,7 +77,7 @@ public class JavaSimpleTextClassificationPipeline { new Document(5L, "l m n"), new Document(6L, "spark hadoop spark"), new Document(7L, "apache hadoop")); - Dataset<Row> test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); + Dataset<Row> test = spark.createDataFrame(localTest, Document.class); // Make predictions on test documents. Dataset<Row> predictions = model.transform(test); @@ -89,6 +86,6 @@ public class JavaSimpleTextClassificationPipeline { + ", prediction=" + r.get(3)); } - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java index e2dd759c0a..10f82f2233 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java @@ -17,9 +17,7 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import org.apache.spark.ml.feature.StandardScaler; @@ -30,12 +28,11 @@ import org.apache.spark.sql.Row; public class JavaStandardScalerExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaStandardScalerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaStandardScalerExample").getOrCreate(); // $example on$ - Dataset<Row> dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset<Row> dataFrame = + spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); StandardScaler scaler = new StandardScaler() .setInputCol("features") @@ -50,6 +47,6 @@ public class JavaStandardScalerExample { Dataset<Row> scaledData = scalerModel.transform(dataFrame); scaledData.show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java index 0ff3782cb3..23ed071c9f 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java @@ -17,14 +17,12 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.StopWordsRemover; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -38,28 +36,26 @@ import org.apache.spark.sql.types.StructType; public class JavaStopWordsRemoverExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaStopWordsRemoverExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaStopWordsRemoverExample").getOrCreate(); // $example on$ StopWordsRemover remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered"); - JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")) - )); + ); StructType schema = new StructType(new StructField[]{ new StructField( "raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) }); - Dataset<Row> dataset = jsql.createDataFrame(rdd, schema); + Dataset<Row> dataset = spark.createDataFrame(data, schema); remover.transform(dataset).show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java index ceacbb4fb3..d4c2cf96a7 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java @@ -17,14 +17,12 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.StringIndexer; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -37,30 +35,28 @@ import static org.apache.spark.sql.types.DataTypes.*; public class JavaStringIndexerExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaStringIndexerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaStringIndexerExample").getOrCreate(); // $example on$ - JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(0, "a"), RowFactory.create(1, "b"), RowFactory.create(2, "c"), RowFactory.create(3, "a"), RowFactory.create(4, "a"), RowFactory.create(5, "c") - )); + ); StructType schema = new StructType(new StructField[]{ createStructField("id", IntegerType, false), createStructField("category", StringType, false) }); - Dataset<Row> df = sqlContext.createDataFrame(jrdd, schema); + Dataset<Row> df = spark.createDataFrame(data, schema); StringIndexer indexer = new StringIndexer() .setInputCol("category") .setOutputCol("categoryIndex"); Dataset<Row> indexed = indexer.fit(df).transform(df); indexed.show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java index 107c835f2e..a816991777 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java @@ -19,10 +19,8 @@ package org.apache.spark.examples.ml; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.feature.HashingTF; import org.apache.spark.ml.feature.IDF; import org.apache.spark.ml.feature.IDFModel; @@ -31,7 +29,7 @@ import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; @@ -40,21 +38,19 @@ import org.apache.spark.sql.types.StructType; public class JavaTfIdfExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaTfIdfExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaTfIdfExample").getOrCreate(); // $example on$ - JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(0, "Hi I heard about Spark"), RowFactory.create(0, "I wish Java could use case classes"), RowFactory.create(1, "Logistic regression models are neat") - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) }); - Dataset<Row> sentenceData = sqlContext.createDataFrame(jrdd, schema); + Dataset<Row> sentenceData = spark.createDataFrame(data, schema); Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); Dataset<Row> wordsData = tokenizer.transform(sentenceData); int numFeatures = 20; @@ -76,6 +72,6 @@ public class JavaTfIdfExample { } // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java index 9225fe2262..a65735a5e5 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java @@ -17,14 +17,12 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.RegexTokenizer; import org.apache.spark.ml.feature.Tokenizer; import org.apache.spark.sql.Dataset; @@ -38,23 +36,21 @@ import org.apache.spark.sql.types.StructType; public class JavaTokenizerExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaTokenizerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaTokenizerExample").getOrCreate(); // $example on$ - JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(0, "Hi I heard about Spark"), RowFactory.create(1, "I wish Java could use case classes"), RowFactory.create(2, "Logistic,regression,models,are,neat") - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("label", DataTypes.IntegerType, false, Metadata.empty()), new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) }); - Dataset<Row> sentenceDataFrame = sqlContext.createDataFrame(jrdd, schema); + Dataset<Row> sentenceDataFrame = spark.createDataFrame(data, schema); Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); @@ -70,6 +66,6 @@ public class JavaTokenizerExample { .setOutputCol("words") .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java index 953ad455b1..9569bc2412 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java @@ -17,14 +17,11 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Arrays; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.VectorAssembler; import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; @@ -38,9 +35,7 @@ import static org.apache.spark.sql.types.DataTypes.*; public class JavaVectorAssemblerExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaVectorAssemblerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaVectorAssemblerExample").getOrCreate(); // $example on$ StructType schema = createStructType(new StructField[]{ @@ -51,8 +46,7 @@ public class JavaVectorAssemblerExample { createStructField("clicked", DoubleType, false) }); Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0); - JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row)); - Dataset<Row> dataset = sqlContext.createDataFrame(rdd, schema); + Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema); VectorAssembler assembler = new VectorAssembler() .setInputCols(new String[]{"hour", "mobile", "userFeatures"}) @@ -61,7 +55,7 @@ public class JavaVectorAssemblerExample { Dataset<Row> output = assembler.transform(dataset); System.out.println(output.select("features", "clicked").first()); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java index b3b5953ee7..217d5a06d1 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java @@ -17,9 +17,7 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ import java.util.Map; @@ -32,12 +30,10 @@ import org.apache.spark.sql.Row; public class JavaVectorIndexerExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaVectorIndexerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaVectorIndexerExample").getOrCreate(); // $example on$ - Dataset<Row> data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); VectorIndexer indexer = new VectorIndexer() .setInputCol("features") @@ -57,6 +53,6 @@ public class JavaVectorIndexerExample { Dataset<Row> indexedData = indexerModel.transform(data); indexedData.show(); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java index 2ae57c3577..4f1ea824a3 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java @@ -17,14 +17,13 @@ package org.apache.spark.examples.ml; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; // $example on$ +import java.util.List; + import com.google.common.collect.Lists; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.attribute.Attribute; import org.apache.spark.ml.attribute.AttributeGroup; import org.apache.spark.ml.attribute.NumericAttribute; @@ -38,9 +37,7 @@ import org.apache.spark.sql.types.*; public class JavaVectorSlicerExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaVectorSlicerExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaVectorSlicerExample").getOrCreate(); // $example on$ Attribute[] attrs = new Attribute[]{ @@ -50,13 +47,13 @@ public class JavaVectorSlicerExample { }; AttributeGroup group = new AttributeGroup("userFeatures", attrs); - JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList( + List<Row> data = Lists.newArrayList( RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})), RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)) - )); + ); Dataset<Row> dataset = - jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField())); + spark.createDataFrame(data, (new StructType()).add(group.toStructField())); VectorSlicer vectorSlicer = new VectorSlicer() .setInputCol("userFeatures").setOutputCol("features"); @@ -68,7 +65,7 @@ public class JavaVectorSlicerExample { System.out.println(output.select("userFeatures", "features").first()); // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java index c5bb1eaaa3..d9b1a79b52 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java @@ -19,37 +19,32 @@ package org.apache.spark.examples.ml; // $example on$ import java.util.Arrays; +import java.util.List; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.feature.Word2Vec; import org.apache.spark.ml.feature.Word2VecModel; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.*; // $example off$ public class JavaWord2VecExample { public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaWord2VecExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkSession spark = SparkSession.builder().appName("JavaWord2VecExample").getOrCreate(); // $example on$ // Input data: Each row is a bag of words from a sentence or document. - JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( + List<Row> data = Arrays.asList( RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))), RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))), RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" "))) - )); + ); StructType schema = new StructType(new StructField[]{ new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); - Dataset<Row> documentDF = sqlContext.createDataFrame(jrdd, schema); + Dataset<Row> documentDF = spark.createDataFrame(data, schema); // Learn a mapping from words to Vectors. Word2Vec word2Vec = new Word2Vec() @@ -64,6 +59,6 @@ public class JavaWord2VecExample { } // $example off$ - jsc.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java index 354a5306ed..ec2142e756 100644 --- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java +++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java @@ -21,14 +21,12 @@ import java.io.Serializable; import java.util.Arrays; import java.util.List; -import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; public class JavaSparkSQL { public static class Person implements Serializable { @@ -53,13 +51,12 @@ public class JavaSparkSQL { } public static void main(String[] args) throws Exception { - SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL"); - JavaSparkContext ctx = new JavaSparkContext(sparkConf); - SQLContext sqlContext = new SQLContext(ctx); + SparkSession spark = SparkSession.builder().appName("JavaSparkSQL").getOrCreate(); System.out.println("=== Data source: RDD ==="); // Load a text file and convert each line to a Java Bean. - JavaRDD<Person> people = ctx.textFile("examples/src/main/resources/people.txt").map( + String file = "examples/src/main/resources/people.txt"; + JavaRDD<Person> people = spark.read().text(file).javaRDD().map( new Function<String, Person>() { @Override public Person call(String line) { @@ -74,12 +71,11 @@ public class JavaSparkSQL { }); // Apply a schema to an RDD of Java Beans and register it as a table. - Dataset<Row> schemaPeople = sqlContext.createDataFrame(people, Person.class); + Dataset<Row> schemaPeople = spark.createDataFrame(people, Person.class); schemaPeople.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. - Dataset<Row> teenagers = - sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); + Dataset<Row> teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); // The results of SQL queries are DataFrames and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. @@ -100,12 +96,12 @@ public class JavaSparkSQL { // Read in the parquet file created above. // Parquet files are self-describing so the schema is preserved. // The result of loading a parquet file is also a DataFrame. - Dataset<Row> parquetFile = sqlContext.read().parquet("people.parquet"); + Dataset<Row> parquetFile = spark.read().parquet("people.parquet"); //Parquet files can also be registered as tables and then used in SQL statements. parquetFile.registerTempTable("parquetFile"); Dataset<Row> teenagers2 = - sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"); + spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"); teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() { @Override public String call(Row row) { @@ -121,7 +117,7 @@ public class JavaSparkSQL { // The path can be either a single text file or a directory storing text files. String path = "examples/src/main/resources/people.json"; // Create a DataFrame from the file(s) pointed by path - Dataset<Row> peopleFromJsonFile = sqlContext.read().json(path); + Dataset<Row> peopleFromJsonFile = spark.read().json(path); // Because the schema of a JSON dataset is automatically inferred, to write queries, // it is better to take a look at what is the schema. @@ -135,8 +131,7 @@ public class JavaSparkSQL { peopleFromJsonFile.registerTempTable("people"); // SQL statements can be run by using the sql methods provided by sqlContext. - Dataset<Row> teenagers3 = - sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); + Dataset<Row> teenagers3 = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); // The results of SQL queries are DataFrame and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. @@ -152,8 +147,8 @@ public class JavaSparkSQL { // a RDD[String] storing one JSON object per string. List<String> jsonData = Arrays.asList( "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}"); - JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData); - Dataset<Row> peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd()); + JavaRDD<String> anotherPeopleRDD = spark.createDataFrame(jsonData, String.class).toJSON().javaRDD(); + Dataset<Row> peopleFromJsonRDD = spark.read().json(anotherPeopleRDD); // Take a look at the schema of this new DataFrame. peopleFromJsonRDD.printSchema(); @@ -166,7 +161,7 @@ public class JavaSparkSQL { peopleFromJsonRDD.registerTempTable("people2"); - Dataset<Row> peopleWithCity = sqlContext.sql("SELECT name, address.city FROM people2"); + Dataset<Row> peopleWithCity = spark.sql("SELECT name, address.city FROM people2"); List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() { @Override public String call(Row row) { @@ -177,6 +172,6 @@ public class JavaSparkSQL { System.out.println(name); } - ctx.stop(); + spark.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java index 7aa8862761..44f1e800fe 100644 --- a/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java +++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java @@ -22,14 +22,13 @@ import java.util.Iterator; import java.util.regex.Pattern; import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.VoidFunction2; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.api.java.StorageLevels; import org.apache.spark.streaming.Durations; import org.apache.spark.streaming.Time; @@ -82,7 +81,7 @@ public final class JavaSqlNetworkWordCount { words.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() { @Override public void call(JavaRDD<String> rdd, Time time) { - SQLContext sqlContext = JavaSQLContextSingleton.getInstance(rdd.context()); + SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf()); // Convert JavaRDD[String] to JavaRDD[bean class] to DataFrame JavaRDD<JavaRecord> rowRDD = rdd.map(new Function<String, JavaRecord>() { @@ -93,14 +92,14 @@ public final class JavaSqlNetworkWordCount { return record; } }); - Dataset<Row> wordsDataFrame = sqlContext.createDataFrame(rowRDD, JavaRecord.class); + Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, JavaRecord.class); // Register as table wordsDataFrame.registerTempTable("words"); // Do word count on table using SQL and print it Dataset<Row> wordCountsDataFrame = - sqlContext.sql("select word, count(*) as total from words group by word"); + spark.sql("select word, count(*) as total from words group by word"); System.out.println("========= " + time + "========="); wordCountsDataFrame.show(); } @@ -111,12 +110,12 @@ public final class JavaSqlNetworkWordCount { } } -/** Lazily instantiated singleton instance of SQLContext */ -class JavaSQLContextSingleton { - private static transient SQLContext instance = null; - public static SQLContext getInstance(SparkContext sparkContext) { +/** Lazily instantiated singleton instance of SparkSession */ +class JavaSparkSessionSingleton { + private static transient SparkSession instance = null; + public static SparkSession getInstance(SparkConf sparkConf) { if (instance == null) { - instance = new SQLContext(sparkContext); + instance = SparkSession.builder().config(sparkConf).getOrCreate(); } return instance; } diff --git a/examples/src/main/python/ml/als_example.py b/examples/src/main/python/ml/als_example.py index 0c9ac583b2..e36444f185 100644 --- a/examples/src/main/python/ml/als_example.py +++ b/examples/src/main/python/ml/als_example.py @@ -21,8 +21,7 @@ import sys if sys.version >= '3': long = int -from pyspark import SparkContext -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession # $example on$ from pyspark.ml.evaluation import RegressionEvaluator @@ -31,15 +30,14 @@ from pyspark.sql import Row # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="ALSExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("ALSExample").getOrCreate() # $example on$ - lines = sc.textFile("data/mllib/als/sample_movielens_ratings.txt") - parts = lines.map(lambda l: l.split("::")) + lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd + parts = lines.map(lambda row: row.value.split("::")) ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2]), timestamp=long(p[3]))) - ratings = sqlContext.createDataFrame(ratingsRDD) + ratings = spark.createDataFrame(ratingsRDD) (training, test) = ratings.randomSplit([0.8, 0.2]) # Build the recommendation model using ALS on the training data @@ -56,4 +54,4 @@ if __name__ == "__main__": rmse = evaluator.evaluate(predictions) print("Root-mean-square error = " + str(rmse)) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py index 317cfa638a..072187e645 100644 --- a/examples/src/main/python/ml/binarizer_example.py +++ b/examples/src/main/python/ml/binarizer_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import Binarizer # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="BinarizerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("BinarizerExample").getOrCreate() # $example on$ - continuousDataFrame = sqlContext.createDataFrame([ + continuousDataFrame = spark.createDataFrame([ (0, 0.1), (1, 0.8), (2, 0.2) @@ -40,4 +38,4 @@ if __name__ == "__main__": print(binarized_feature) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index e6f6bfd7e8..836a89cde0 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -17,28 +17,26 @@ from __future__ import print_function -from pyspark import SparkContext # $example on$ from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors from pyspark.mllib.linalg import Vectors from pyspark.sql.types import Row # $example off$ -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession """ A simple example demonstrating a bisecting k-means clustering. """ if __name__ == "__main__": - - sc = SparkContext(appName="PythonBisectingKMeansExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("PythonBisectingKMeansExample").getOrCreate() # $example on$ - data = sc.textFile("data/mllib/kmeans_data.txt") - parsed = data.map(lambda l: Row(features=Vectors.dense([float(x) for x in l.split(' ')]))) - training = sqlContext.createDataFrame(parsed) + data = spark.read.text("data/mllib/kmeans_data.txt").rdd + parsed = data\ + .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')]))) + training = spark.createDataFrame(parsed) kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features") @@ -54,4 +52,4 @@ if __name__ == "__main__": print(center) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py index 4304255f35..288ec62bdf 100644 --- a/examples/src/main/python/ml/bucketizer_example.py +++ b/examples/src/main/python/ml/bucketizer_example.py @@ -17,21 +17,19 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import Bucketizer # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="BucketizerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("BucketizerExample").getOrCreate() # $example on$ splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")] data = [(-0.5,), (-0.3,), (0.0,), (0.2,)] - dataFrame = sqlContext.createDataFrame(data, ["features"]) + dataFrame = spark.createDataFrame(data, ["features"]) bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures") @@ -40,4 +38,4 @@ if __name__ == "__main__": bucketedData.show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/chisq_selector_example.py b/examples/src/main/python/ml/chisq_selector_example.py index 997a504735..8f58fc28de 100644 --- a/examples/src/main/python/ml/chisq_selector_example.py +++ b/examples/src/main/python/ml/chisq_selector_example.py @@ -17,19 +17,17 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import ChiSqSelector from pyspark.mllib.linalg import Vectors # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="ChiSqSelectorExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("ChiSqSelectorExample").getOrCreate() # $example on$ - df = sqlContext.createDataFrame([ + df = spark.createDataFrame([ (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,), (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,), (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,)], ["id", "features", "clicked"]) @@ -41,4 +39,4 @@ if __name__ == "__main__": result.show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/count_vectorizer_example.py b/examples/src/main/python/ml/count_vectorizer_example.py index e839f645f7..9dbf9959d1 100644 --- a/examples/src/main/python/ml/count_vectorizer_example.py +++ b/examples/src/main/python/ml/count_vectorizer_example.py @@ -17,19 +17,17 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import CountVectorizer # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="CountVectorizerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("CountVectorizerExample").getOrCreate() # $example on$ # Input data: Each row is a bag of words with a ID. - df = sqlContext.createDataFrame([ + df = spark.createDataFrame([ (0, "a b c".split(" ")), (1, "a b b c a".split(" ")) ], ["id", "words"]) @@ -41,4 +39,4 @@ if __name__ == "__main__": result.show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/cross_validator.py b/examples/src/main/python/ml/cross_validator.py index 5f0ef20218..a61d0f63d2 100644 --- a/examples/src/main/python/ml/cross_validator.py +++ b/examples/src/main/python/ml/cross_validator.py @@ -17,15 +17,14 @@ from __future__ import print_function -from pyspark import SparkContext # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression from pyspark.ml.evaluation import BinaryClassificationEvaluator from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.ml.tuning import CrossValidator, ParamGridBuilder -from pyspark.sql import Row, SQLContext # $example off$ +from pyspark.sql import Row, SparkSession """ A simple example demonstrating model selection using CrossValidator. @@ -36,25 +35,23 @@ Run with: """ if __name__ == "__main__": - sc = SparkContext(appName="CrossValidatorExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("CrossValidatorExample").getOrCreate() # $example on$ # Prepare training documents, which are labeled. - LabeledDocument = Row("id", "text", "label") - training = sc.parallelize([(0, "a b c d e spark", 1.0), - (1, "b d", 0.0), - (2, "spark f g h", 1.0), - (3, "hadoop mapreduce", 0.0), - (4, "b spark who", 1.0), - (5, "g d a y", 0.0), - (6, "spark fly", 1.0), - (7, "was mapreduce", 0.0), - (8, "e spark program", 1.0), - (9, "a e c l", 0.0), - (10, "spark compile", 1.0), - (11, "hadoop software", 0.0) - ]) \ - .map(lambda x: LabeledDocument(*x)).toDF() + training = spark.createDataFrame([ + (0, "a b c d e spark", 1.0), + (1, "b d", 0.0), + (2, "spark f g h", 1.0), + (3, "hadoop mapreduce", 0.0), + (4, "b spark who", 1.0), + (5, "g d a y", 0.0), + (6, "spark fly", 1.0), + (7, "was mapreduce", 0.0), + (8, "e spark program", 1.0), + (9, "a e c l", 0.0), + (10, "spark compile", 1.0), + (11, "hadoop software", 0.0) + ], ["id", "text", "label"]) # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") @@ -82,12 +79,12 @@ if __name__ == "__main__": cvModel = crossval.fit(training) # Prepare test documents, which are unlabeled. - Document = Row("id", "text") - test = sc.parallelize([(4L, "spark i j k"), - (5L, "l m n"), - (6L, "mapreduce spark"), - (7L, "apache hadoop")]) \ - .map(lambda x: Document(*x)).toDF() + test = spark.createDataFrame([ + (4L, "spark i j k"), + (5L, "l m n"), + (6L, "mapreduce spark"), + (7L, "apache hadoop") + ], ["id", "text"]) # Make predictions on test documents. cvModel uses the best model found (lrModel). prediction = cvModel.transform(test) @@ -96,4 +93,4 @@ if __name__ == "__main__": print(row) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/dataframe_example.py b/examples/src/main/python/ml/dataframe_example.py index d2644ca335..b3e671038e 100644 --- a/examples/src/main/python/ml/dataframe_example.py +++ b/examples/src/main/python/ml/dataframe_example.py @@ -26,16 +26,14 @@ import sys import tempfile import shutil -from pyspark import SparkContext -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession from pyspark.mllib.stat import Statistics if __name__ == "__main__": if len(sys.argv) > 2: print("Usage: dataframe_example.py <libsvm file>", file=sys.stderr) exit(-1) - sc = SparkContext(appName="DataFrameExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("DataFrameExample").getOrCreate() if len(sys.argv) == 2: input = sys.argv[1] else: @@ -43,7 +41,7 @@ if __name__ == "__main__": # Load input data print("Loading LIBSVM file with UDT from " + input + ".") - df = sqlContext.read.format("libsvm").load(input).cache() + df = spark.read.format("libsvm").load(input).cache() print("Schema from LIBSVM:") df.printSchema() print("Loaded training data as a DataFrame with " + @@ -54,7 +52,7 @@ if __name__ == "__main__": labelSummary.show() # Convert features column to an RDD of vectors. - features = df.select("features").map(lambda r: r.features) + features = df.select("features").rdd.map(lambda r: r.features) summary = Statistics.colStats(features) print("Selected features column with average values:\n" + str(summary.mean())) @@ -67,9 +65,9 @@ if __name__ == "__main__": # Load the records back. print("Loading Parquet file with UDT from " + tempdir) - newDF = sqlContext.read.parquet(tempdir) + newDF = spark.read.parquet(tempdir) print("Schema from Parquet:") newDF.printSchema() shutil.rmtree(tempdir) - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/dct_example.py b/examples/src/main/python/ml/dct_example.py index 264d47f404..1bf8fc6d14 100644 --- a/examples/src/main/python/ml/dct_example.py +++ b/examples/src/main/python/ml/dct_example.py @@ -17,19 +17,17 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import DCT from pyspark.mllib.linalg import Vectors # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="DCTExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("DCTExample").getOrCreate() # $example on$ - df = sqlContext.createDataFrame([ + df = spark.createDataFrame([ (Vectors.dense([0.0, 1.0, -2.0, 3.0]),), (Vectors.dense([-1.0, 2.0, 4.0, -7.0]),), (Vectors.dense([14.0, -2.0, -5.0, 1.0]),)], ["features"]) @@ -42,4 +40,4 @@ if __name__ == "__main__": print(dcts) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/decision_tree_classification_example.py b/examples/src/main/python/ml/decision_tree_classification_example.py index 86bdc65392..d2318e2436 100644 --- a/examples/src/main/python/ml/decision_tree_classification_example.py +++ b/examples/src/main/python/ml/decision_tree_classification_example.py @@ -21,20 +21,19 @@ Decision Tree Classification Example. from __future__ import print_function # $example on$ -from pyspark import SparkContext, SQLContext from pyspark.ml import Pipeline from pyspark.ml.classification import DecisionTreeClassifier from pyspark.ml.feature import StringIndexer, VectorIndexer from pyspark.ml.evaluation import MulticlassClassificationEvaluator # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="decision_tree_classification_example") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("decision_tree_classification_example").getOrCreate() # $example on$ # Load the data stored in LIBSVM format as a DataFrame. - data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Index labels, adding metadata to the label column. # Fit on whole dataset to include all labels in index. @@ -72,3 +71,5 @@ if __name__ == "__main__": # summary only print(treeModel) # $example off$ + + spark.stop() diff --git a/examples/src/main/python/ml/decision_tree_regression_example.py b/examples/src/main/python/ml/decision_tree_regression_example.py index 8e20d5d857..9e8cb382a9 100644 --- a/examples/src/main/python/ml/decision_tree_regression_example.py +++ b/examples/src/main/python/ml/decision_tree_regression_example.py @@ -20,21 +20,20 @@ Decision Tree Regression Example. """ from __future__ import print_function -from pyspark import SparkContext, SQLContext # $example on$ from pyspark.ml import Pipeline from pyspark.ml.regression import DecisionTreeRegressor from pyspark.ml.feature import VectorIndexer from pyspark.ml.evaluation import RegressionEvaluator # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="decision_tree_classification_example") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("decision_tree_classification_example").getOrCreate() # $example on$ # Load the data stored in LIBSVM format as a DataFrame. - data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Automatically identify categorical features, and index them. # We specify maxCategories so features with > 4 distinct values are treated as continuous. @@ -69,3 +68,5 @@ if __name__ == "__main__": # summary only print(treeModel) # $example off$ + + spark.stop() diff --git a/examples/src/main/python/ml/elementwise_product_example.py b/examples/src/main/python/ml/elementwise_product_example.py index c85cb0d895..6fa641b772 100644 --- a/examples/src/main/python/ml/elementwise_product_example.py +++ b/examples/src/main/python/ml/elementwise_product_example.py @@ -17,23 +17,21 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import ElementwiseProduct from pyspark.mllib.linalg import Vectors # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="ElementwiseProductExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("ElementwiseProductExample").getOrCreate() # $example on$ data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)] - df = sqlContext.createDataFrame(data, ["vector"]) + df = spark.createDataFrame(data, ["vector"]) transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]), inputCol="vector", outputCol="transformedVector") transformer.transform(df).show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/estimator_transformer_param_example.py b/examples/src/main/python/ml/estimator_transformer_param_example.py index 9a8993dac4..4993b5a984 100644 --- a/examples/src/main/python/ml/estimator_transformer_param_example.py +++ b/examples/src/main/python/ml/estimator_transformer_param_example.py @@ -18,20 +18,19 @@ """ Estimator Transformer Param Example. """ -from pyspark import SparkContext, SQLContext + # $example on$ from pyspark.mllib.linalg import Vectors from pyspark.ml.classification import LogisticRegression # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - - sc = SparkContext(appName="EstimatorTransformerParamExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("EstimatorTransformerParamExample").getOrCreate() # $example on$ # Prepare training data from a list of (label, features) tuples. - training = sqlContext.createDataFrame([ + training = spark.createDataFrame([ (1.0, Vectors.dense([0.0, 1.1, 0.1])), (0.0, Vectors.dense([2.0, 1.0, -1.0])), (0.0, Vectors.dense([2.0, 1.3, 1.0])), @@ -69,7 +68,7 @@ if __name__ == "__main__": print model2.extractParamMap() # Prepare test data - test = sqlContext.createDataFrame([ + test = spark.createDataFrame([ (1.0, Vectors.dense([-1.0, 1.5, 1.3])), (0.0, Vectors.dense([3.0, 2.0, -0.1])), (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"]) @@ -84,4 +83,4 @@ if __name__ == "__main__": print row # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py index f7e842f4b3..b09ad41da3 100644 --- a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py +++ b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py @@ -20,21 +20,20 @@ Gradient Boosted Tree Classifier Example. """ from __future__ import print_function -from pyspark import SparkContext, SQLContext # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import GBTClassifier from pyspark.ml.feature import StringIndexer, VectorIndexer from pyspark.ml.evaluation import MulticlassClassificationEvaluator # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="gradient_boosted_tree_classifier_example") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("gradient_boosted_tree_classifier_example").getOrCreate() # $example on$ # Load and parse the data file, converting it to a DataFrame. - data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Index labels, adding metadata to the label column. # Fit on whole dataset to include all labels in index. @@ -72,4 +71,4 @@ if __name__ == "__main__": print(gbtModel) # summary only # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py index f8b4de651c..caa7cfc4e1 100644 --- a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py +++ b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py @@ -20,21 +20,20 @@ Gradient Boosted Tree Regressor Example. """ from __future__ import print_function -from pyspark import SparkContext, SQLContext # $example on$ from pyspark.ml import Pipeline from pyspark.ml.regression import GBTRegressor from pyspark.ml.feature import VectorIndexer from pyspark.ml.evaluation import RegressionEvaluator # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="gradient_boosted_tree_regressor_example") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("gradient_boosted_tree_regressor_example").getOrCreate() # $example on$ # Load and parse the data file, converting it to a DataFrame. - data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Automatically identify categorical features, and index them. # Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -69,4 +68,4 @@ if __name__ == "__main__": print(gbtModel) # summary only # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/index_to_string_example.py b/examples/src/main/python/ml/index_to_string_example.py index fb0ba2950b..dd04b2c4b0 100644 --- a/examples/src/main/python/ml/index_to_string_example.py +++ b/examples/src/main/python/ml/index_to_string_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext # $example on$ from pyspark.ml.feature import IndexToString, StringIndexer # $example off$ -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="IndexToStringExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("IndexToStringExample").getOrCreate() # $example on$ - df = sqlContext.createDataFrame( + df = spark.createDataFrame( [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], ["id", "category"]) @@ -42,4 +40,4 @@ if __name__ == "__main__": converted.select("id", "originalCategory").show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py index fa57a4d3ad..7d9d80e645 100644 --- a/examples/src/main/python/ml/kmeans_example.py +++ b/examples/src/main/python/ml/kmeans_example.py @@ -20,10 +20,9 @@ from __future__ import print_function import sys import numpy as np -from pyspark import SparkContext from pyspark.ml.clustering import KMeans, KMeansModel from pyspark.mllib.linalg import VectorUDT, _convert_to_vector -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession from pyspark.sql.types import Row, StructField, StructType """ @@ -35,8 +34,8 @@ This example requires NumPy (http://www.numpy.org/). """ -def parseVector(line): - array = np.array([float(x) for x in line.split(' ')]) +def parseVector(row): + array = np.array([float(x) for x in row.value.split(' ')]) return _convert_to_vector(array) @@ -50,14 +49,13 @@ if __name__ == "__main__": path = sys.argv[1] k = sys.argv[2] - sc = SparkContext(appName="PythonKMeansExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("PythonKMeansExample").getOrCreate() - lines = sc.textFile(path) + lines = spark.read.text(path).rdd data = lines.map(parseVector) row_rdd = data.map(lambda x: Row(x)) schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)]) - df = sqlContext.createDataFrame(row_rdd, schema) + df = spark.createDataFrame(row_rdd, schema) kmeans = KMeans().setK(2).setSeed(1).setFeaturesCol(FEATURES_COL) model = kmeans.fit(df) @@ -67,4 +65,4 @@ if __name__ == "__main__": for center in centers: print(center) - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/linear_regression_with_elastic_net.py b/examples/src/main/python/ml/linear_regression_with_elastic_net.py index a4cd40cf26..99b7f7fe99 100644 --- a/examples/src/main/python/ml/linear_regression_with_elastic_net.py +++ b/examples/src/main/python/ml/linear_regression_with_elastic_net.py @@ -17,19 +17,17 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.regression import LinearRegression # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="LinearRegressionWithElasticNet") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("LinearRegressionWithElasticNet").getOrCreate() # $example on$ # Load training data - training = sqlContext.read.format("libsvm")\ + training = spark.read.format("libsvm")\ .load("data/mllib/sample_linear_regression_data.txt") lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) @@ -42,4 +40,4 @@ if __name__ == "__main__": print("Intercept: " + str(lrModel.intercept)) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/logistic_regression_with_elastic_net.py b/examples/src/main/python/ml/logistic_regression_with_elastic_net.py index b0b1d27e13..0d7112e723 100644 --- a/examples/src/main/python/ml/logistic_regression_with_elastic_net.py +++ b/examples/src/main/python/ml/logistic_regression_with_elastic_net.py @@ -17,19 +17,17 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.classification import LogisticRegression # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="LogisticRegressionWithElasticNet") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("LogisticRegressionWithElasticNet").getOrCreate() # $example on$ # Load training data - training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) @@ -41,4 +39,4 @@ if __name__ == "__main__": print("Intercept: " + str(lrModel.intercept)) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/max_abs_scaler_example.py b/examples/src/main/python/ml/max_abs_scaler_example.py index d9b69eef1c..1cb95a98f0 100644 --- a/examples/src/main/python/ml/max_abs_scaler_example.py +++ b/examples/src/main/python/ml/max_abs_scaler_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import MaxAbsScaler # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="MaxAbsScalerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("MaxAbsScalerExample").getOrCreate() # $example on$ - dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures") @@ -40,4 +38,4 @@ if __name__ == "__main__": scaledData.show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/min_max_scaler_example.py b/examples/src/main/python/ml/min_max_scaler_example.py index 2f8e4ade46..8d91a59e2b 100644 --- a/examples/src/main/python/ml/min_max_scaler_example.py +++ b/examples/src/main/python/ml/min_max_scaler_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import MinMaxScaler # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="MinMaxScalerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("MinMaxScalerExample").getOrCreate() # $example on$ - dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures") @@ -40,4 +38,4 @@ if __name__ == "__main__": scaledData.show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py index f84588f547..8bededc14d 100644 --- a/examples/src/main/python/ml/multilayer_perceptron_classification.py +++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py @@ -17,21 +17,19 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.classification import MultilayerPerceptronClassifier from pyspark.ml.evaluation import MulticlassClassificationEvaluator # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - - sc = SparkContext(appName="multilayer_perceptron_classification_example") - sqlContext = SQLContext(sc) + spark = SparkSession\ + .builder.appName("multilayer_perceptron_classification_example").getOrCreate() # $example on$ # Load training data - data = sqlContext.read.format("libsvm")\ + data = spark.read.format("libsvm")\ .load("data/mllib/sample_multiclass_classification_data.txt") # Split the data into train and test splits = data.randomSplit([0.6, 0.4], 1234) @@ -52,4 +50,4 @@ if __name__ == "__main__": print("Precision:" + str(evaluator.evaluate(predictionAndLabels))) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py index f2d85f53e7..b7fecf0d68 100644 --- a/examples/src/main/python/ml/n_gram_example.py +++ b/examples/src/main/python/ml/n_gram_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import NGram # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="NGramExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("NGramExample").getOrCreate() # $example on$ - wordDataFrame = sqlContext.createDataFrame([ + wordDataFrame = spark.createDataFrame([ (0, ["Hi", "I", "heard", "about", "Spark"]), (1, ["I", "wish", "Java", "could", "use", "case", "classes"]), (2, ["Logistic", "regression", "models", "are", "neat"]) @@ -39,4 +37,4 @@ if __name__ == "__main__": print(ngrams_label) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py index db8fbea9bf..e37035542c 100644 --- a/examples/src/main/python/ml/naive_bayes_example.py +++ b/examples/src/main/python/ml/naive_bayes_example.py @@ -17,21 +17,18 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.classification import NaiveBayes from pyspark.ml.evaluation import MulticlassClassificationEvaluator # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - - sc = SparkContext(appName="naive_bayes_example") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("naive_bayes_example").getOrCreate() # $example on$ # Load training data - data = sqlContext.read.format("libsvm") \ + data = spark.read.format("libsvm") \ .load("data/mllib/sample_libsvm_data.txt") # Split the data into train and test splits = data.randomSplit([0.6, 0.4], 1234) @@ -50,4 +47,4 @@ if __name__ == "__main__": print("Precision:" + str(evaluator.evaluate(predictionAndLabels))) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py index d490221474..ae25537619 100644 --- a/examples/src/main/python/ml/normalizer_example.py +++ b/examples/src/main/python/ml/normalizer_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import Normalizer # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="NormalizerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("NormalizerExample").getOrCreate() # $example on$ - dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Normalize each Vector using $L^1$ norm. normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0) @@ -40,4 +38,4 @@ if __name__ == "__main__": lInfNormData.show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py index 0f94c26638..9acc363dc9 100644 --- a/examples/src/main/python/ml/onehot_encoder_example.py +++ b/examples/src/main/python/ml/onehot_encoder_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import OneHotEncoder, StringIndexer # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="OneHotEncoderExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("OneHotEncoderExample").getOrCreate() # $example on$ - df = sqlContext.createDataFrame([ + df = spark.createDataFrame([ (0, "a"), (1, "b"), (2, "c"), @@ -45,4 +43,4 @@ if __name__ == "__main__": encoded.select("id", "categoryVec").show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py index a17181f1b8..adab151734 100644 --- a/examples/src/main/python/ml/pca_example.py +++ b/examples/src/main/python/ml/pca_example.py @@ -17,26 +17,24 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import PCA from pyspark.mllib.linalg import Vectors # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="PCAExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("PCAExample").getOrCreate() # $example on$ data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),), (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),), (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)] - df = sqlContext.createDataFrame(data, ["features"]) + df = spark.createDataFrame(data, ["features"]) pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures") model = pca.fit(df) result = model.transform(df).select("pcaFeatures") result.show(truncate=False) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py index 3288568f0c..ed9765d961 100644 --- a/examples/src/main/python/ml/pipeline_example.py +++ b/examples/src/main/python/ml/pipeline_example.py @@ -18,21 +18,20 @@ """ Pipeline Example. """ -from pyspark import SparkContext, SQLContext + # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression from pyspark.ml.feature import HashingTF, Tokenizer # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - - sc = SparkContext(appName="PipelineExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("PipelineExample").getOrCreate() # $example on$ # Prepare training documents from a list of (id, text, label) tuples. - training = sqlContext.createDataFrame([ + training = spark.createDataFrame([ (0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), @@ -48,7 +47,7 @@ if __name__ == "__main__": model = pipeline.fit(training) # Prepare test documents, which are unlabeled (id, text) tuples. - test = sqlContext.createDataFrame([ + test = spark.createDataFrame([ (4L, "spark i j k"), (5L, "l m n"), (6L, "mapreduce spark"), @@ -61,4 +60,4 @@ if __name__ == "__main__": print(row) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py index 89f5cbe8f2..328b559320 100644 --- a/examples/src/main/python/ml/polynomial_expansion_example.py +++ b/examples/src/main/python/ml/polynomial_expansion_example.py @@ -17,19 +17,17 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import PolynomialExpansion from pyspark.mllib.linalg import Vectors # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="PolynomialExpansionExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("PolynomialExpansionExample").getOrCreate() # $example on$ - df = sqlContext\ + df = spark\ .createDataFrame([(Vectors.dense([-2.0, 2.3]),), (Vectors.dense([0.0, 0.0]),), (Vectors.dense([0.6, -1.1]),)], @@ -40,4 +38,4 @@ if __name__ == "__main__": print(expanded) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/random_forest_classifier_example.py b/examples/src/main/python/ml/random_forest_classifier_example.py index c3570438c5..b0a93e050c 100644 --- a/examples/src/main/python/ml/random_forest_classifier_example.py +++ b/examples/src/main/python/ml/random_forest_classifier_example.py @@ -20,21 +20,20 @@ Random Forest Classifier Example. """ from __future__ import print_function -from pyspark import SparkContext, SQLContext # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import RandomForestClassifier from pyspark.ml.feature import StringIndexer, VectorIndexer from pyspark.ml.evaluation import MulticlassClassificationEvaluator # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="random_forest_classifier_example") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("random_forest_classifier_example").getOrCreate() # $example on$ # Load and parse the data file, converting it to a DataFrame. - data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Index labels, adding metadata to the label column. # Fit on whole dataset to include all labels in index. @@ -72,4 +71,4 @@ if __name__ == "__main__": print(rfModel) # summary only # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/random_forest_regressor_example.py b/examples/src/main/python/ml/random_forest_regressor_example.py index b77014f379..4bb84f0de8 100644 --- a/examples/src/main/python/ml/random_forest_regressor_example.py +++ b/examples/src/main/python/ml/random_forest_regressor_example.py @@ -20,21 +20,20 @@ Random Forest Regressor Example. """ from __future__ import print_function -from pyspark import SparkContext, SQLContext # $example on$ from pyspark.ml import Pipeline from pyspark.ml.regression import RandomForestRegressor from pyspark.ml.feature import VectorIndexer from pyspark.ml.evaluation import RegressionEvaluator # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="random_forest_regressor_example") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("random_forest_regressor_example").getOrCreate() # $example on$ # Load and parse the data file, converting it to a DataFrame. - data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Automatically identify categorical features, and index them. # Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -69,4 +68,4 @@ if __name__ == "__main__": print(rfModel) # summary only # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/rformula_example.py b/examples/src/main/python/ml/rformula_example.py index b544a14700..45cc116ac2 100644 --- a/examples/src/main/python/ml/rformula_example.py +++ b/examples/src/main/python/ml/rformula_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import RFormula # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="RFormulaExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("RFormulaExample").getOrCreate() # $example on$ - dataset = sqlContext.createDataFrame( + dataset = spark.createDataFrame( [(7, "US", 18, 1.0), (8, "CA", 12, 0.0), (9, "NZ", 15, 0.0)], @@ -41,4 +39,4 @@ if __name__ == "__main__": output.select("features", "label").show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py index b4f06bf888..3600c12211 100644 --- a/examples/src/main/python/ml/simple_text_classification_pipeline.py +++ b/examples/src/main/python/ml/simple_text_classification_pipeline.py @@ -17,11 +17,10 @@ from __future__ import print_function -from pyspark import SparkContext from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression from pyspark.ml.feature import HashingTF, Tokenizer -from pyspark.sql import Row, SQLContext +from pyspark.sql import Row, SparkSession """ @@ -34,16 +33,15 @@ pipeline in Python. Run with: if __name__ == "__main__": - sc = SparkContext(appName="SimpleTextClassificationPipeline") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("SimpleTextClassificationPipeline").getOrCreate() # Prepare training documents, which are labeled. - LabeledDocument = Row("id", "text", "label") - training = sc.parallelize([(0, "a b c d e spark", 1.0), - (1, "b d", 0.0), - (2, "spark f g h", 1.0), - (3, "hadoop mapreduce", 0.0)]) \ - .map(lambda x: LabeledDocument(*x)).toDF() + training = spark.createDataFrame([ + (0, "a b c d e spark", 1.0), + (1, "b d", 0.0), + (2, "spark f g h", 1.0), + (3, "hadoop mapreduce", 0.0) + ], ["id", "text", "label"]) # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") @@ -55,12 +53,12 @@ if __name__ == "__main__": model = pipeline.fit(training) # Prepare test documents, which are unlabeled. - Document = Row("id", "text") - test = sc.parallelize([(4, "spark i j k"), - (5, "l m n"), - (6, "spark hadoop spark"), - (7, "apache hadoop")]) \ - .map(lambda x: Document(*x)).toDF() + test = spark.createDataFrame([ + (4, "spark i j k"), + (5, "l m n"), + (6, "spark hadoop spark"), + (7, "apache hadoop") + ], ["id", "text"]) # Make predictions on test documents and print columns of interest. prediction = model.transform(test) @@ -68,4 +66,4 @@ if __name__ == "__main__": for row in selected.collect(): print(row) - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/sql_transformer.py b/examples/src/main/python/ml/sql_transformer.py index 9575d728d8..26045db4be 100644 --- a/examples/src/main/python/ml/sql_transformer.py +++ b/examples/src/main/python/ml/sql_transformer.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext # $example on$ from pyspark.ml.feature import SQLTransformer # $example off$ -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="SQLTransformerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("SQLTransformerExample").getOrCreate() # $example on$ - df = sqlContext.createDataFrame([ + df = spark.createDataFrame([ (0, 1.0, 3.0), (2, 2.0, 5.0) ], ["id", "v1", "v2"]) @@ -37,4 +35,4 @@ if __name__ == "__main__": sqlTrans.transform(df).show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/standard_scaler_example.py b/examples/src/main/python/ml/standard_scaler_example.py index ae7aa85005..c50804f6bf 100644 --- a/examples/src/main/python/ml/standard_scaler_example.py +++ b/examples/src/main/python/ml/standard_scaler_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import StandardScaler # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="StandardScalerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("StandardScalerExample").getOrCreate() # $example on$ - dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False) @@ -40,4 +38,4 @@ if __name__ == "__main__": scaledData.show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py index 01f94af8ca..57362673df 100644 --- a/examples/src/main/python/ml/stopwords_remover_example.py +++ b/examples/src/main/python/ml/stopwords_remover_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import StopWordsRemover # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="StopWordsRemoverExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("StopWordsRemoverExample").getOrCreate() # $example on$ - sentenceData = sqlContext.createDataFrame([ + sentenceData = spark.createDataFrame([ (0, ["I", "saw", "the", "red", "baloon"]), (1, ["Mary", "had", "a", "little", "lamb"]) ], ["label", "raw"]) @@ -37,4 +35,4 @@ if __name__ == "__main__": remover.transform(sentenceData).show(truncate=False) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/string_indexer_example.py b/examples/src/main/python/ml/string_indexer_example.py index 58a8cb5d56..aacd4f999b 100644 --- a/examples/src/main/python/ml/string_indexer_example.py +++ b/examples/src/main/python/ml/string_indexer_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import StringIndexer # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="StringIndexerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("StringIndexerExample").getOrCreate() # $example on$ - df = sqlContext.createDataFrame( + df = spark.createDataFrame( [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], ["id", "category"]) indexer = StringIndexer(inputCol="category", outputCol="categoryIndex") @@ -36,4 +34,4 @@ if __name__ == "__main__": indexed.show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py index 141324d458..25df8166ef 100644 --- a/examples/src/main/python/ml/tf_idf_example.py +++ b/examples/src/main/python/ml/tf_idf_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext # $example on$ from pyspark.ml.feature import HashingTF, IDF, Tokenizer # $example off$ -from pyspark.sql import SQLContext +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="TfIdfExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("TfIdfExample").getOrCreate() # $example on$ - sentenceData = sqlContext.createDataFrame([ + sentenceData = spark.createDataFrame([ (0, "Hi I heard about Spark"), (0, "I wish Java could use case classes"), (1, "Logistic regression models are neat") @@ -46,4 +44,4 @@ if __name__ == "__main__": print(features_label) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py index ce9b225be5..5be4b4cfe3 100644 --- a/examples/src/main/python/ml/tokenizer_example.py +++ b/examples/src/main/python/ml/tokenizer_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import Tokenizer, RegexTokenizer # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="TokenizerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("TokenizerExample").getOrCreate() # $example on$ - sentenceDataFrame = sqlContext.createDataFrame([ + sentenceDataFrame = spark.createDataFrame([ (0, "Hi I heard about Spark"), (1, "I wish Java could use case classes"), (2, "Logistic,regression,models,are,neat") @@ -41,4 +39,4 @@ if __name__ == "__main__": # alternatively, pattern="\\w+", gaps(False) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/train_validation_split.py b/examples/src/main/python/ml/train_validation_split.py index 161a200c61..2e43a0f8ae 100644 --- a/examples/src/main/python/ml/train_validation_split.py +++ b/examples/src/main/python/ml/train_validation_split.py @@ -15,13 +15,12 @@ # limitations under the License. # -from pyspark import SparkContext # $example on$ from pyspark.ml.evaluation import RegressionEvaluator from pyspark.ml.regression import LinearRegression from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit -from pyspark.sql import SQLContext # $example off$ +from pyspark.sql import SparkSession """ This example demonstrates applying TrainValidationSplit to split data @@ -32,11 +31,10 @@ Run with: """ if __name__ == "__main__": - sc = SparkContext(appName="TrainValidationSplit") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("TrainValidationSplit").getOrCreate() # $example on$ # Prepare training and test data. - data = sqlContext.read.format("libsvm")\ + data = spark.read.format("libsvm")\ .load("data/mllib/sample_linear_regression_data.txt") train, test = data.randomSplit([0.7, 0.3]) lr = LinearRegression(maxIter=10, regParam=0.1) @@ -65,4 +63,4 @@ if __name__ == "__main__": for row in prediction.take(5): print(row) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py index 04f64839f1..019a9ea6f7 100644 --- a/examples/src/main/python/ml/vector_assembler_example.py +++ b/examples/src/main/python/ml/vector_assembler_example.py @@ -17,19 +17,17 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.mllib.linalg import Vectors from pyspark.ml.feature import VectorAssembler # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="VectorAssemblerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("VectorAssemblerExample").getOrCreate() # $example on$ - dataset = sqlContext.createDataFrame( + dataset = spark.createDataFrame( [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)], ["id", "hour", "mobile", "userFeatures", "clicked"]) assembler = VectorAssembler( @@ -39,4 +37,4 @@ if __name__ == "__main__": print(output.select("features", "clicked").first()) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py index 146f41c1dd..3cf5b8ebf1 100644 --- a/examples/src/main/python/ml/vector_indexer_example.py +++ b/examples/src/main/python/ml/vector_indexer_example.py @@ -17,18 +17,16 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import VectorIndexer # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="VectorIndexerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("VectorIndexerExample").getOrCreate() # $example on$ - data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10) indexerModel = indexer.fit(data) @@ -37,4 +35,4 @@ if __name__ == "__main__": indexedData.show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/vector_slicer_example.py b/examples/src/main/python/ml/vector_slicer_example.py index 31a753073c..0531bcdb06 100644 --- a/examples/src/main/python/ml/vector_slicer_example.py +++ b/examples/src/main/python/ml/vector_slicer_example.py @@ -17,20 +17,18 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import VectorSlicer from pyspark.mllib.linalg import Vectors from pyspark.sql.types import Row # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="VectorSlicerExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("VectorSlicerExample").getOrCreate() # $example on$ - df = sqlContext.createDataFrame([ + df = spark.createDataFrame([ Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3}),), Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]),)]) @@ -41,4 +39,4 @@ if __name__ == "__main__": output.select("userFeatures", "features").show() # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py index 53c77feb10..6766a7b6aa 100644 --- a/examples/src/main/python/ml/word2vec_example.py +++ b/examples/src/main/python/ml/word2vec_example.py @@ -17,19 +17,17 @@ from __future__ import print_function -from pyspark import SparkContext -from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import Word2Vec # $example off$ +from pyspark.sql import SparkSession if __name__ == "__main__": - sc = SparkContext(appName="Word2VecExample") - sqlContext = SQLContext(sc) + spark = SparkSession.builder.appName("Word2VecExample").getOrCreate() # $example on$ # Input data: Each row is a bag of words from a sentence or document. - documentDF = sqlContext.createDataFrame([ + documentDF = spark.createDataFrame([ ("Hi I heard about Spark".split(" "), ), ("I wish Java could use case classes".split(" "), ), ("Logistic regression models are neat".split(" "), ) @@ -42,4 +40,4 @@ if __name__ == "__main__": print(feature) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/python/mllib/binary_classification_metrics_example.py b/examples/src/main/python/mllib/binary_classification_metrics_example.py index 4e7ea289b2..8f0fc9d45d 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics_example.py +++ b/examples/src/main/python/mllib/binary_classification_metrics_example.py @@ -18,7 +18,7 @@ Binary Classification Metrics Example. """ from __future__ import print_function -from pyspark import SparkContext, SQLContext +from pyspark import SparkContext # $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.evaluation import BinaryClassificationMetrics @@ -27,7 +27,7 @@ from pyspark.mllib.util import MLUtils if __name__ == "__main__": sc = SparkContext(appName="BinaryClassificationMetricsExample") - sqlContext = SQLContext(sc) + # $example on$ # Several of the methods available in scala are currently missing from pyspark # Load training data in LIBSVM format @@ -52,3 +52,5 @@ if __name__ == "__main__": # Area under ROC curve print("Area under ROC = %s" % metrics.areaUnderROC) # $example off$ + + sc.stop() diff --git a/examples/src/main/python/sql.py b/examples/src/main/python/sql.py index ea6a22dbfe..59a46cb283 100644 --- a/examples/src/main/python/sql.py +++ b/examples/src/main/python/sql.py @@ -63,7 +63,7 @@ if __name__ == "__main__": # |-- age: long (nullable = true) # |-- name: string (nullable = true) - # Register this DataFrame as a table. + # Register this DataFrame as a temporary table. people.registerTempTable("people") # SQL statements can be run by using the sql methods provided by sqlContext diff --git a/examples/src/main/python/streaming/sql_network_wordcount.py b/examples/src/main/python/streaming/sql_network_wordcount.py index 1ba5e9fb78..588cbfee14 100644 --- a/examples/src/main/python/streaming/sql_network_wordcount.py +++ b/examples/src/main/python/streaming/sql_network_wordcount.py @@ -33,13 +33,14 @@ import sys from pyspark import SparkContext from pyspark.streaming import StreamingContext -from pyspark.sql import SQLContext, Row +from pyspark.sql import Row, SparkSession -def getSqlContextInstance(sparkContext): - if ('sqlContextSingletonInstance' not in globals()): - globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext) - return globals()['sqlContextSingletonInstance'] +def getSparkSessionInstance(sparkConf): + if ('sparkSessionSingletonInstance' not in globals()): + globals()['sparkSessionSingletonInstance'] =\ + SparkSession.builder.config(conf=sparkConf).getOrCreate() + return globals()['sparkSessionSingletonInstance'] if __name__ == "__main__": @@ -60,19 +61,19 @@ if __name__ == "__main__": print("========= %s =========" % str(time)) try: - # Get the singleton instance of SQLContext - sqlContext = getSqlContextInstance(rdd.context) + # Get the singleton instance of SparkSession + spark = getSparkSessionInstance(rdd.context.getConf()) # Convert RDD[String] to RDD[Row] to DataFrame rowRdd = rdd.map(lambda w: Row(word=w)) - wordsDataFrame = sqlContext.createDataFrame(rowRdd) + wordsDataFrame = spark.createDataFrame(rowRdd) # Register as table wordsDataFrame.registerTempTable("words") # Do word count on table using SQL and print it wordCountsDataFrame = \ - sqlContext.sql("select word, count(*) as total from words group by word") + spark.sql("select word, count(*) as total from words group by word") wordCountsDataFrame.show() except: pass diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala index 21f58ddf3c..3795af8309 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala @@ -18,12 +18,11 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.regression.AFTSurvivalRegression import org.apache.spark.mllib.linalg.Vectors // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession /** * An example for AFTSurvivalRegression. @@ -31,12 +30,10 @@ import org.apache.spark.sql.SQLContext object AFTSurvivalRegressionExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("AFTSurvivalRegressionExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("AFTSurvivalRegressionExample").getOrCreate() // $example on$ - val training = sqlContext.createDataFrame(Seq( + val training = spark.createDataFrame(Seq( (1.218, 1.0, Vectors.dense(1.560, -0.605)), (2.949, 0.0, Vectors.dense(0.346, 2.158)), (3.627, 0.0, Vectors.dense(1.380, 0.231)), @@ -56,7 +53,7 @@ object AFTSurvivalRegressionExample { model.transform(training).show(false) // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala index a79e15c767..41750ca779 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala @@ -18,12 +18,11 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.recommendation.ALS // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession // $example on$ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @@ -43,13 +42,11 @@ object ALSExample { // $example off$ def main(args: Array[String]) { - val conf = new SparkConf().setAppName("ALSExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - import sqlContext.implicits._ + val spark = SparkSession.builder.appName("ALSExample").getOrCreate() + import spark.implicits._ // $example on$ - val ratings = sc.textFile("data/mllib/als/sample_movielens_ratings.txt") + val ratings = spark.read.text("data/mllib/als/sample_movielens_ratings.txt") .map(Rating.parseRating) .toDF() val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2)) @@ -75,7 +72,8 @@ object ALSExample { val rmse = evaluator.evaluate(predictions) println(s"Root-mean-square error = $rmse") // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala index 2ed8101c13..93c153f923 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.Binarizer // $example off$ -import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.sql.{DataFrame, SparkSession} object BinarizerExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("BinarizerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("BinarizerExample").getOrCreate() // $example on$ val data = Array((0, 0.1), (1, 0.8), (2, 0.2)) - val dataFrame: DataFrame = sqlContext.createDataFrame(data).toDF("label", "feature") + val dataFrame: DataFrame = spark.createDataFrame(data).toDF("label", "feature") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") @@ -42,7 +39,8 @@ object BinarizerExample { val binarizedFeatures = binarizedDataFrame.select("binarized_feature") binarizedFeatures.collect().foreach(println) // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala index 6f6236a2b0..779ad33dbd 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala @@ -18,23 +18,20 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.Bucketizer // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object BucketizerExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("BucketizerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("BucketizerExample").getOrCreate() // $example on$ val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity) val data = Array(-0.5, -0.3, 0.0, 0.2) - val dataFrame = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features") + val dataFrame = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") val bucketizer = new Bucketizer() .setInputCol("features") @@ -45,7 +42,7 @@ object BucketizerExample { val bucketedData = bucketizer.transform(dataFrame) bucketedData.show() // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala index 2be61537e6..84ca1f0b56 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala @@ -18,20 +18,16 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.ChiSqSelector import org.apache.spark.mllib.linalg.Vectors // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object ChiSqSelectorExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("ChiSqSelectorExample") - val sc = new SparkContext(conf) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ + val spark = SparkSession.builder.appName("ChiSqSelectorExample").getOrCreate() + import spark.implicits._ // $example on$ val data = Seq( @@ -40,7 +36,7 @@ object ChiSqSelectorExample { (9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0) ) - val df = sc.parallelize(data).toDF("id", "features", "clicked") + val df = spark.createDataset(data).toDF("id", "features", "clicked") val selector = new ChiSqSelector() .setNumTopFeatures(1) @@ -51,7 +47,7 @@ object ChiSqSelectorExample { val result = selector.fit(df).transform(df) result.show() // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala index 7d07fc7dd1..9ab43a48bf 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object CountVectorizerExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("CounterVectorizerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("CounterVectorizerExample").getOrCreate() // $example on$ - val df = sqlContext.createDataFrame(Seq( + val df = spark.createDataFrame(Seq( (0, Array("a", "b", "c")), (1, Array("a", "b", "b", "c", "a")) )).toDF("id", "words") @@ -51,6 +48,8 @@ object CountVectorizerExample { cvModel.transform(df).select("features").show() // $example off$ + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala index dc26b55a76..b415333c71 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala @@ -18,18 +18,15 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.DCT import org.apache.spark.mllib.linalg.Vectors // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object DCTExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("DCTExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("DCTExample").getOrCreate() // $example on$ val data = Seq( @@ -37,7 +34,7 @@ object DCTExample { Vectors.dense(-1.0, 2.0, 4.0, -7.0), Vectors.dense(14.0, -2.0, -5.0, 1.0)) - val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features") + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") val dct = new DCT() .setInputCol("features") @@ -47,7 +44,8 @@ object DCTExample { val dctDf = dct.transform(df) dctDf.select("featuresDCT").show(3) // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala index 7e608a2812..2f892f8d72 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala @@ -23,11 +23,10 @@ import java.io.File import com.google.common.io.Files import scopt.OptionParser -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer -import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} /** * An example of how to use [[org.apache.spark.sql.DataFrame]] for ML. Run with @@ -62,14 +61,11 @@ object DataFrameExample { } def run(params: Params) { - - val conf = new SparkConf().setAppName(s"DataFrameExample with $params") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName(s"DataFrameExample with $params").getOrCreate() // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") - val df: DataFrame = sqlContext.read.format("libsvm").load(params.input).cache() + val df: DataFrame = spark.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") @@ -94,11 +90,11 @@ object DataFrameExample { // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") - val newDF = sqlContext.read.parquet(outputDir) + val newDF = spark.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala index 224d8da5f0..a0a2e1fb33 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala @@ -18,7 +18,6 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.DecisionTreeClassificationModel @@ -26,16 +25,14 @@ import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object DecisionTreeClassificationExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("DecisionTreeClassificationExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("DecisionTreeClassificationExample").getOrCreate() // $example on$ // Load the data stored in LIBSVM format as a DataFrame. - val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -88,6 +85,8 @@ object DecisionTreeClassificationExample { val treeModel = model.stages(2).asInstanceOf[DecisionTreeClassificationModel] println("Learned classification tree model:\n" + treeModel.toDebugString) // $example off$ + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala index d2560cc00b..cea1d801aa 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala @@ -33,7 +33,7 @@ import org.apache.spark.ml.util.MetadataUtils import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.MLUtils -import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.sql.{DataFrame, SparkSession} /** * An example runner for decision trees. Run with @@ -134,18 +134,18 @@ object DecisionTreeExample { /** Load a dataset from the given path, using the given format */ private[ml] def loadData( - sqlContext: SQLContext, + spark: SparkSession, path: String, format: String, expectedNumFeatures: Option[Int] = None): DataFrame = { - import sqlContext.implicits._ + import spark.implicits._ format match { - case "dense" => MLUtils.loadLabeledPoints(sqlContext.sparkContext, path).toDF() + case "dense" => MLUtils.loadLabeledPoints(spark.sparkContext, path).toDF() case "libsvm" => expectedNumFeatures match { - case Some(numFeatures) => sqlContext.read.option("numFeatures", numFeatures.toString) + case Some(numFeatures) => spark.read.option("numFeatures", numFeatures.toString) .format("libsvm").load(path) - case None => sqlContext.read.format("libsvm").load(path) + case None => spark.read.format("libsvm").load(path) } case _ => throw new IllegalArgumentException(s"Bad data format: $format") } @@ -167,17 +167,17 @@ object DecisionTreeExample { testInput: String, algo: String, fracTest: Double): (DataFrame, DataFrame) = { - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.getOrCreate() // Load training data - val origExamples: DataFrame = loadData(sqlContext, input, dataFormat) + val origExamples: DataFrame = loadData(spark, input, dataFormat) // Load or create test set val dataframes: Array[DataFrame] = if (testInput != "") { // Load testInput. val numFeatures = origExamples.first().getAs[Vector](1).size val origTestExamples: DataFrame = - loadData(sqlContext, testInput, dataFormat, Some(numFeatures)) + loadData(spark, testInput, dataFormat, Some(numFeatures)) Array(origExamples, origTestExamples) } else { // Split input into training, test. diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala index ad32e5635a..26b52d0489 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala @@ -18,7 +18,6 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.RegressionEvaluator @@ -26,17 +25,15 @@ import org.apache.spark.ml.feature.VectorIndexer import org.apache.spark.ml.regression.DecisionTreeRegressionModel import org.apache.spark.ml.regression.DecisionTreeRegressor // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object DecisionTreeRegressionExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("DecisionTreeRegressionExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("DecisionTreeRegressionExample").getOrCreate() // $example on$ // Load the data stored in LIBSVM format as a DataFrame. - val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Automatically identify categorical features, and index them. // Here, we treat features with > 4 distinct values as continuous. @@ -78,6 +75,8 @@ object DecisionTreeRegressionExample { val treeModel = model.stages(1).asInstanceOf[DecisionTreeRegressionModel] println("Learned regression tree model:\n" + treeModel.toDebugString) // $example off$ + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index 8d127f9b35..2aa1ab1ec8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -18,13 +18,12 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.classification.{ClassificationModel, Classifier, ClassifierParams} import org.apache.spark.ml.param.{IntParam, ParamMap} import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext} +import org.apache.spark.sql.{Dataset, Row, SparkSession} /** * A simple example demonstrating how to write your own learning algorithm using Estimator, @@ -38,13 +37,11 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext} object DeveloperApiExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("DeveloperApiExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - import sqlContext.implicits._ + val spark = SparkSession.builder.appName("DeveloperApiExample").getOrCreate() + import spark.implicits._ // Prepare training data. - val training = sc.parallelize(Seq( + val training = spark.createDataFrame(Seq( LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), @@ -62,13 +59,13 @@ object DeveloperApiExample { val model = lr.fit(training.toDF()) // Prepare test data. - val test = sc.parallelize(Seq( + val test = spark.createDataFrame(Seq( LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)))) // Make predictions on test data. - val sumPredictions: Double = model.transform(test.toDF()) + val sumPredictions: Double = model.transform(test) .select("features", "label", "prediction") .collect() .map { case Row(features: Vector, label: Double, prediction: Double) => @@ -77,7 +74,7 @@ object DeveloperApiExample { assert(sumPredictions == 0.0, "MyLogisticRegression predicted something other than 0, even though all coefficients are 0!") - sc.stop() + spark.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala index 629d322c43..f289c28df9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala @@ -18,22 +18,19 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.ElementwiseProduct import org.apache.spark.mllib.linalg.Vectors // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object ElementwiseProductExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("ElementwiseProductExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("ElementwiseProductExample").getOrCreate() // $example on$ // Create some vector data; also works for sparse vectors - val dataFrame = sqlContext.createDataFrame(Seq( + val dataFrame = spark.createDataFrame(Seq( ("a", Vectors.dense(1.0, 2.0, 3.0)), ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector") @@ -46,7 +43,8 @@ object ElementwiseProductExample { // Batch transform the vectors to create new column: transformer.transform(dataFrame).show() // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala index 65e3c365ab..91076ccbc1 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala @@ -18,25 +18,22 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.param.ParamMap import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.sql.Row // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object EstimatorTransformerParamExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("EstimatorTransformerParamExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("EstimatorTransformerParamExample").getOrCreate() // $example on$ // Prepare training data from a list of (label, features) tuples. - val training = sqlContext.createDataFrame(Seq( + val training = spark.createDataFrame(Seq( (1.0, Vectors.dense(0.0, 1.1, 0.1)), (0.0, Vectors.dense(2.0, 1.0, -1.0)), (0.0, Vectors.dense(2.0, 1.3, 1.0)), @@ -76,7 +73,7 @@ object EstimatorTransformerParamExample { println("Model 2 was fit using parameters: " + model2.parent.extractParamMap) // Prepare test data. - val test = sqlContext.createDataFrame(Seq( + val test = spark.createDataFrame(Seq( (1.0, Vectors.dense(-1.0, 1.5, 1.3)), (0.0, Vectors.dense(3.0, 2.0, -0.1)), (1.0, Vectors.dense(0.0, 2.2, -1.5)) @@ -94,7 +91,7 @@ object EstimatorTransformerParamExample { } // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala index cd62a80382..412c54db7d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala @@ -18,24 +18,21 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier} import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object GradientBoostedTreeClassifierExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("GradientBoostedTreeClassifierExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("GradientBoostedTreeClassifierExample").getOrCreate() // $example on$ // Load and parse the data file, converting it to a DataFrame. - val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -91,7 +88,7 @@ object GradientBoostedTreeClassifierExample { println("Learned classification GBT model:\n" + gbtModel.toDebugString) // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala index b8cf9629bb..fd43553cc6 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala @@ -18,24 +18,21 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorIndexer import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object GradientBoostedTreeRegressorExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("GradientBoostedTreeRegressorExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("GradientBoostedTreeRegressorExample").getOrCreate() // $example on$ // Load and parse the data file, converting it to a DataFrame. - val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -79,7 +76,7 @@ object GradientBoostedTreeRegressorExample { println("Learned regression GBT model:\n" + gbtModel.toDebugString) // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala index 4cea09ba12..d873618726 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala @@ -18,21 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.{IndexToString, StringIndexer} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object IndexToStringExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("IndexToStringExample") - val sc = new SparkContext(conf) - - val sqlContext = SQLContext.getOrCreate(sc) + val spark = SparkSession.builder.appName("IndexToStringExample").getOrCreate() // $example on$ - val df = sqlContext.createDataFrame(Seq( + val df = spark.createDataFrame(Seq( (0, "a"), (1, "b"), (2, "c"), @@ -54,7 +50,8 @@ object IndexToStringExample { val converted = converter.transform(indexed) converted.select("id", "originalCategory").show() // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala index 7af011571f..d2573fad35 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala @@ -19,11 +19,10 @@ package org.apache.spark.examples.ml // scalastyle:off println -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.sql.{DataFrame, SparkSession} // $example off$ /** @@ -37,13 +36,11 @@ object KMeansExample { def main(args: Array[String]): Unit = { // Creates a Spark context and a SQL context - val conf = new SparkConf().setAppName(s"${this.getClass.getSimpleName}") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName(s"${this.getClass.getSimpleName}").getOrCreate() // $example on$ // Crates a DataFrame - val dataset: DataFrame = sqlContext.createDataFrame(Seq( + val dataset: DataFrame = spark.createDataFrame(Seq( (1, Vectors.dense(0.0, 0.0, 0.0)), (2, Vectors.dense(0.1, 0.1, 0.1)), (3, Vectors.dense(0.2, 0.2, 0.2)), @@ -64,7 +61,7 @@ object KMeansExample { model.clusterCenters.foreach(println) // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala index f9ddac7709..c23adee1a3 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala @@ -18,11 +18,10 @@ package org.apache.spark.examples.ml // scalastyle:off println -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.clustering.LDA import org.apache.spark.mllib.linalg.{Vectors, VectorUDT} -import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.types.{StructField, StructType} // $example off$ @@ -41,16 +40,14 @@ object LDAExample { val input = "data/mllib/sample_lda_data.txt" // Creates a Spark context and a SQL context - val conf = new SparkConf().setAppName(s"${this.getClass.getSimpleName}") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName(s"${this.getClass.getSimpleName}").getOrCreate() // $example on$ // Loads data - val rowRDD = sc.textFile(input).filter(_.nonEmpty) + val rowRDD = spark.read.text(input).rdd.filter(_.nonEmpty) .map(_.split(" ").map(_.toDouble)).map(Vectors.dense).map(Row(_)) val schema = StructType(Array(StructField(FEATURES_COL, new VectorUDT, false))) - val dataset = sqlContext.createDataFrame(rowRDD, schema) + val dataset = spark.createDataFrame(rowRDD, schema) // Trains a LDA model val lda = new LDA() @@ -71,7 +68,7 @@ object LDAExample { transformed.show(false) // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala index f68aef7082..cb6e2492f5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala @@ -18,22 +18,19 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.regression.LinearRegression // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object LinearRegressionWithElasticNetExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("LinearRegressionWithElasticNetExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("LinearRegressionWithElasticNetExample").getOrCreate() // $example on$ // Load training data - val training = sqlContext.read.format("libsvm") + val training = spark.read.format("libsvm") .load("data/mllib/sample_linear_regression_data.txt") val lr = new LinearRegression() @@ -56,7 +53,7 @@ object LinearRegressionWithElasticNetExample { println(s"r2: ${trainingSummary.r2}") // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala index 89c5edf1ac..50670d7b38 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala @@ -18,23 +18,20 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.max object LogisticRegressionSummaryExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("LogisticRegressionSummaryExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - import sqlContext.implicits._ + val spark = SparkSession.builder.appName("LogisticRegressionSummaryExample").getOrCreate() + import spark.implicits._ // Load training data - val training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") val lr = new LogisticRegression() .setMaxIter(10) @@ -71,7 +68,7 @@ object LogisticRegressionSummaryExample { lrModel.setThreshold(bestThreshold) // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala index 6e27571f1d..fcba813d5b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala @@ -18,22 +18,20 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.classification.LogisticRegression // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object LogisticRegressionWithElasticNetExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("LogisticRegressionWithElasticNetExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession + .builder.appName("LogisticRegressionWithElasticNetExample").getOrCreate() // $example on$ // Load training data - val training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") val lr = new LogisticRegression() .setMaxIter(10) @@ -47,7 +45,7 @@ object LogisticRegressionWithElasticNetExample { println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}") // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala index aafb5efd69..896d8fadbe 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala @@ -15,23 +15,19 @@ * limitations under the License. */ -// scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.MaxAbsScaler // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object MaxAbsScalerExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("MaxAbsScalerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("MaxAbsScalerExample").getOrCreate() // $example on$ - val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaledFeatures") @@ -43,7 +39,7 @@ object MaxAbsScalerExample { val scaledData = scalerModel.transform(dataFrame) scaledData.show() // $example off$ - sc.stop() + + spark.stop() } } -// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala index 9a03f69f5a..bcdca0fa04 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.MinMaxScaler // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object MinMaxScalerExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("MinMaxScalerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("MinMaxScalerExample").getOrCreate() // $example on$ - val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") val scaler = new MinMaxScaler() .setInputCol("features") @@ -44,7 +41,8 @@ object MinMaxScalerExample { val scaledData = scalerModel.transform(dataFrame) scaledData.show() // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala index d1441b5497..5fb3536060 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala @@ -18,7 +18,6 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression @@ -28,7 +27,7 @@ import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.Row // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession /** * A simple example demonstrating model selection using CrossValidator. @@ -42,13 +41,12 @@ import org.apache.spark.sql.SQLContext object ModelSelectionViaCrossValidationExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("ModelSelectionViaCrossValidationExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession + .builder.appName("ModelSelectionViaCrossValidationExample").getOrCreate() // $example on$ // Prepare training data from a list of (id, text, label) tuples. - val training = sqlContext.createDataFrame(Seq( + val training = spark.createDataFrame(Seq( (0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), @@ -98,7 +96,7 @@ object ModelSelectionViaCrossValidationExample { val cvModel = cv.fit(training) // Prepare test documents, which are unlabeled (id, text) tuples. - val test = sqlContext.createDataFrame(Seq( + val test = spark.createDataFrame(Seq( (4L, "spark i j k"), (5L, "l m n"), (6L, "mapreduce spark"), @@ -114,7 +112,7 @@ object ModelSelectionViaCrossValidationExample { } // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala index fcad17a817..6bc082982c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala @@ -17,13 +17,12 @@ package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession /** * A simple example demonstrating model selection using TrainValidationSplit. @@ -36,13 +35,12 @@ import org.apache.spark.sql.SQLContext object ModelSelectionViaTrainValidationSplitExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("ModelSelectionViaTrainValidationSplitExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession + .builder.appName("ModelSelectionViaTrainValidationSplitExample").getOrCreate() // $example on$ // Prepare training and test data. - val data = sqlContext.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt") + val data = spark.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt") val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345) val lr = new LinearRegression() @@ -75,6 +73,6 @@ object ModelSelectionViaTrainValidationSplitExample { .show() // $example off$ - sc.stop() + spark.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala index d7d1e82f6f..a11fe1b4b2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala @@ -18,12 +18,11 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.classification.MultilayerPerceptronClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession /** * An example for Multilayer Perceptron Classification. @@ -31,13 +30,11 @@ import org.apache.spark.sql.SQLContext object MultilayerPerceptronClassifierExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("MultilayerPerceptronClassifierExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("MultilayerPerceptronClassifierExample").getOrCreate() // $example on$ // Load the data stored in LIBSVM format as a DataFrame. - val data = sqlContext.read.format("libsvm") + val data = spark.read.format("libsvm") .load("data/mllib/sample_multiclass_classification_data.txt") // Split the data into train and test val splits = data.randomSplit(Array(0.6, 0.4), seed = 1234L) @@ -63,7 +60,7 @@ object MultilayerPerceptronClassifierExample { println("Precision:" + evaluator.evaluate(predictionAndLabels)) // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala index 77b913aaa3..1b71a39890 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.NGram // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object NGramExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("NGramExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("NGramExample").getOrCreate() // $example on$ - val wordDataFrame = sqlContext.createDataFrame(Seq( + val wordDataFrame = spark.createDataFrame(Seq( (0, Array("Hi", "I", "heard", "about", "Spark")), (1, Array("I", "wish", "Java", "could", "use", "case", "classes")), (2, Array("Logistic", "regression", "models", "are", "neat")) @@ -41,7 +38,8 @@ object NGramExample { val ngramDataFrame = ngram.transform(wordDataFrame) ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println) // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala index 5ea1270c97..8d54555cd3 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala @@ -18,21 +18,18 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ -import org.apache.spark.ml.classification.{NaiveBayes} +import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object NaiveBayesExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("NaiveBayesExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("NaiveBayesExample").getOrCreate() // $example on$ // Load the data stored in LIBSVM format as a DataFrame. - val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Split the data into training and test sets (30% held out for testing) val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) @@ -53,6 +50,8 @@ object NaiveBayesExample { val precision = evaluator.evaluate(predictions) println("Precision:" + precision) // $example off$ + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala index 6b33c16c74..4622d69ef9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.Normalizer // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object NormalizerExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("NormalizerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("NormalizerExample").getOrCreate() // $example on$ - val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Normalize each Vector using $L^1$ norm. val normalizer = new Normalizer() @@ -46,7 +43,8 @@ object NormalizerExample { val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity) lInfNormData.show() // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala index cb9fe65a85..338436100c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object OneHotEncoderExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("OneHotEncoderExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("OneHotEncoderExample").getOrCreate() // $example on$ - val df = sqlContext.createDataFrame(Seq( + val df = spark.createDataFrame(Seq( (0, "a"), (1, "b"), (2, "c"), @@ -52,7 +49,8 @@ object OneHotEncoderExample { val encoded = encoder.transform(indexed) encoded.select("id", "categoryVec").show() // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala index 0b5d31c0ff..e2351c682d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala @@ -22,7 +22,6 @@ import java.util.concurrent.TimeUnit.{NANOSECONDS => NANO} import scopt.OptionParser -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest} @@ -31,7 +30,7 @@ import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.DataFrame // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession /** * An example runner for Multiclass to Binary Reduction with One Vs Rest. @@ -110,18 +109,16 @@ object OneVsRestExample { } private def run(params: Params) { - val conf = new SparkConf().setAppName(s"OneVsRestExample with $params") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName(s"OneVsRestExample with $params").getOrCreate() // $example on$ - val inputData = sqlContext.read.format("libsvm").load(params.input) + val inputData = spark.read.format("libsvm").load(params.input) // compute the train/test split: if testInput is not provided use part of input. val data = params.testInput match { case Some(t) => // compute the number of features in the training set. val numFeatures = inputData.first().getAs[Vector](1).size - val testData = sqlContext.read.option("numFeatures", numFeatures.toString) + val testData = spark.read.option("numFeatures", numFeatures.toString) .format("libsvm").load(t) Array[DataFrame](inputData, testData) case None => @@ -175,7 +172,7 @@ object OneVsRestExample { println(fprs.map {case (label, fpr) => label + "\t" + fpr}.mkString("\n")) // $example off$ - sc.stop() + spark.stop() } private def time[R](block: => R): (Long, R) = { diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala index 535652ec6c..14394d5624 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala @@ -18,18 +18,15 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.PCA import org.apache.spark.mllib.linalg.Vectors // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object PCAExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("PCAExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("PCAExample").getOrCreate() // $example on$ val data = Array( @@ -37,7 +34,7 @@ object PCAExample { Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) - val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features") + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") val pca = new PCA() .setInputCol("features") .setOutputCol("pcaFeatures") @@ -47,7 +44,8 @@ object PCAExample { val result = pcaDF.select("pcaFeatures") result.show() // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala index 6c29063626..61b34aebd9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala @@ -18,7 +18,6 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.LogisticRegression @@ -26,18 +25,16 @@ import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.Row // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object PipelineExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("PipelineExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("PipelineExample").getOrCreate() // $example on$ // Prepare training documents from a list of (id, text, label) tuples. - val training = sqlContext.createDataFrame(Seq( + val training = spark.createDataFrame(Seq( (0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), @@ -71,7 +68,7 @@ object PipelineExample { val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model") // Prepare test documents, which are unlabeled (id, text) tuples. - val test = sqlContext.createDataFrame(Seq( + val test = spark.createDataFrame(Seq( (4L, "spark i j k"), (5L, "l m n"), (6L, "mapreduce spark"), @@ -87,7 +84,7 @@ object PipelineExample { } // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala index 3014008ea0..4d8c672a55 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala @@ -18,18 +18,15 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.PolynomialExpansion import org.apache.spark.mllib.linalg.Vectors // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object PolynomialExpansionExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("PolynomialExpansionExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("PolynomialExpansionExample").getOrCreate() // $example on$ val data = Array( @@ -37,7 +34,7 @@ object PolynomialExpansionExample { Vectors.dense(0.0, 0.0), Vectors.dense(0.6, -1.1) ) - val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features") + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") val polynomialExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") @@ -45,7 +42,8 @@ object PolynomialExpansionExample { val polyDF = polynomialExpansion.transform(df) polyDF.select("polyFeatures").take(3).foreach(println) // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala index e64e673a48..0839c609f1 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala @@ -15,25 +15,21 @@ * limitations under the License. */ -// scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.QuantileDiscretizer // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object QuantileDiscretizerExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("QuantileDiscretizerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - import sqlContext.implicits._ + val spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate() + import spark.implicits._ // $example on$ val data = Array((0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)) - val df = sc.parallelize(data).toDF("id", "hour") + val df = spark.createDataFrame(data).toDF("id", "hour") val discretizer = new QuantileDiscretizer() .setInputCol("hour") @@ -43,7 +39,7 @@ object QuantileDiscretizerExample { val result = discretizer.fit(df).transform(df) result.show() // $example off$ - sc.stop() + + spark.stop() } } -// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala index bec831d51c..699b621db9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.RFormula // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object RFormulaExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("RFormulaExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("RFormulaExample").getOrCreate() // $example on$ - val dataset = sqlContext.createDataFrame(Seq( + val dataset = spark.createDataFrame(Seq( (7, "US", 18, 1.0), (8, "CA", 12, 0.0), (9, "NZ", 15, 0.0) @@ -43,7 +40,8 @@ object RFormulaExample { val output = formula.fit(dataset).transform(dataset) output.select("features", "label").show() // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala index 6c9b52cf25..4192a9c737 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala @@ -18,24 +18,21 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object RandomForestClassifierExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("RandomForestClassifierExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("RandomForestClassifierExample").getOrCreate() // $example on$ // Load and parse the data file, converting it to a DataFrame. - val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -91,7 +88,7 @@ object RandomForestClassifierExample { println("Learned classification forest model:\n" + rfModel.toDebugString) // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala index 4d2db017f3..5632f0419a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala @@ -18,24 +18,21 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorIndexer import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object RandomForestRegressorExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("RandomForestRegressorExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("RandomForestRegressorExample").getOrCreate() // $example on$ // Load and parse the data file, converting it to a DataFrame. - val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -78,7 +75,7 @@ object RandomForestRegressorExample { println("Learned regression forest model:\n" + rfModel.toDebugString) // $example off$ - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala index 202925acad..f03b29ba32 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.SQLTransformer // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object SQLTransformerExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("SQLTransformerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("SQLTransformerExample").getOrCreate() // $example on$ - val df = sqlContext.createDataFrame( + val df = spark.createDataFrame( Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2") val sqlTrans = new SQLTransformer().setStatement( @@ -39,6 +36,8 @@ object SQLTransformerExample { sqlTrans.transform(df).show() // $example off$ + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala index f4d1fe5785..dff7719507 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala @@ -18,12 +18,11 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.param.ParamMap import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.{Row, SparkSession} /** * A simple example demonstrating ways to specify parameters for Estimators and Transformers. @@ -35,15 +34,13 @@ import org.apache.spark.sql.{Row, SQLContext} object SimpleParamsExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("SimpleParamsExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - import sqlContext.implicits._ + val spark = SparkSession.builder.appName("SimpleParamsExample").getOrCreate() + import spark.implicits._ // Prepare training data. // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes // into DataFrames, where it uses the case class metadata to infer the schema. - val training = sc.parallelize(Seq( + val training = spark.createDataFrame(Seq( LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), @@ -59,7 +56,7 @@ object SimpleParamsExample { .setRegParam(0.01) // Learn a LogisticRegression model. This uses the parameters stored in lr. - val model1 = lr.fit(training.toDF()) + val model1 = lr.fit(training) // Since model1 is a Model (i.e., a Transformer produced by an Estimator), // we can view the parameters it used during fit(). // This prints the parameter (name: value) pairs, where names are unique IDs for this @@ -82,7 +79,7 @@ object SimpleParamsExample { println("Model 2 was fit using parameters: " + model2.parent.extractParamMap()) // Prepare test data. - val test = sc.parallelize(Seq( + val test = spark.createDataFrame(Seq( LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)))) @@ -91,14 +88,14 @@ object SimpleParamsExample { // LogisticRegressionModel.transform will only use the 'features' column. // Note that model2.transform() outputs a 'myProbability' column instead of the usual // 'probability' column since we renamed the lr.probabilityCol parameter previously. - model2.transform(test.toDF()) + model2.transform(test) .select("features", "label", "myProbability", "prediction") .collect() .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => println(s"($features, $label) -> prob=$prob, prediction=$prediction") } - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala index 960280137c..05199007f0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala @@ -20,12 +20,11 @@ package org.apache.spark.examples.ml import scala.beans.BeanInfo -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.{Row, SparkSession} @BeanInfo case class LabeledDocument(id: Long, text: String, label: Double) @@ -43,13 +42,11 @@ case class Document(id: Long, text: String) object SimpleTextClassificationPipeline { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - import sqlContext.implicits._ + val spark = SparkSession.builder.appName("SimpleTextClassificationPipeline").getOrCreate() + import spark.implicits._ // Prepare training documents, which are labeled. - val training = sc.parallelize(Seq( + val training = spark.createDataFrame(Seq( LabeledDocument(0L, "a b c d e spark", 1.0), LabeledDocument(1L, "b d", 0.0), LabeledDocument(2L, "spark f g h", 1.0), @@ -73,7 +70,7 @@ object SimpleTextClassificationPipeline { val model = pipeline.fit(training.toDF()) // Prepare test documents, which are unlabeled. - val test = sc.parallelize(Seq( + val test = spark.createDataFrame(Seq( Document(4L, "spark i j k"), Document(5L, "l m n"), Document(6L, "spark hadoop spark"), @@ -87,7 +84,7 @@ object SimpleTextClassificationPipeline { println(s"($id, $text) --> prob=$prob, prediction=$prediction") } - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala index e3439677e7..55f777c6e2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.StandardScaler // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object StandardScalerExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("StandardScalerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("StandardScalerExample").getOrCreate() // $example on$ - val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") val scaler = new StandardScaler() .setInputCol("features") @@ -46,7 +43,8 @@ object StandardScalerExample { val scaledData = scalerModel.transform(dataFrame) scaledData.show() // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala index 8199be12c1..85e79c8cb3 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala @@ -18,31 +18,29 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.StopWordsRemover // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object StopWordsRemoverExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("StopWordsRemoverExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("StopWordsRemoverExample").getOrCreate() // $example on$ val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") - val dataSet = sqlContext.createDataFrame(Seq( + val dataSet = spark.createDataFrame(Seq( (0, Seq("I", "saw", "the", "red", "baloon")), (1, Seq("Mary", "had", "a", "little", "lamb")) )).toDF("id", "raw") remover.transform(dataSet).show() // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala index 3f0e870c8d..e01a768da9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.StringIndexer // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object StringIndexerExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("StringIndexerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("StringIndexerExample").getOrCreate() // $example on$ - val df = sqlContext.createDataFrame( + val df = spark.createDataFrame( Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")) ).toDF("id", "category") @@ -42,7 +39,8 @@ object StringIndexerExample { val indexed = indexer.fit(df).transform(df) indexed.show() // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala index 396f073e6b..910ef62a26 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala @@ -18,21 +18,18 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object TfIdfExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("TfIdfExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("TfIdfExample").getOrCreate() // $example on$ - val sentenceData = sqlContext.createDataFrame(Seq( + val sentenceData = spark.createDataFrame(Seq( (0, "Hi I heard about Spark"), (0, "I wish Java could use case classes"), (1, "Logistic regression models are neat") @@ -50,6 +47,8 @@ object TfIdfExample { val rescaledData = idfModel.transform(featurizedData) rescaledData.select("features", "label").take(3).foreach(println) // $example off$ + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala index c667728d63..4f0c47b3c8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object TokenizerExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("TokenizerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("TokenizerExample").getOrCreate() // $example on$ - val sentenceDataFrame = sqlContext.createDataFrame(Seq( + val sentenceDataFrame = spark.createDataFrame(Seq( (0, "Hi I heard about Spark"), (1, "I wish Java could use case classes"), (2, "Logistic,regression,models,are,neat") @@ -48,7 +45,8 @@ object TokenizerExample { val regexTokenized = regexTokenizer.transform(sentenceDataFrame) regexTokenized.select("words", "label").take(3).foreach(println) // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala index 768a8c0690..56b7263b19 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala @@ -18,21 +18,18 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.mllib.linalg.Vectors // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object VectorAssemblerExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("VectorAssemblerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("VectorAssemblerExample").getOrCreate() // $example on$ - val dataset = sqlContext.createDataFrame( + val dataset = spark.createDataFrame( Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0)) ).toDF("id", "hour", "mobile", "userFeatures", "clicked") @@ -43,7 +40,8 @@ object VectorAssemblerExample { val output = assembler.transform(dataset) println(output.select("features", "clicked").first()) // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala index 3bef37ba36..214ad91634 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala @@ -18,20 +18,17 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.VectorIndexer // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object VectorIndexerExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("VectorIndexerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("VectorIndexerExample").getOrCreate() // $example on$ - val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") + val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") val indexer = new VectorIndexer() .setInputCol("features") @@ -48,7 +45,8 @@ object VectorIndexerExample { val indexedData = indexerModel.transform(data) indexedData.show() // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala index 01377d80e7..716bf023a8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala @@ -18,31 +18,29 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ +import java.util.Arrays + import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.feature.VectorSlicer import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object VectorSlicerExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("VectorSlicerExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("VectorSlicerExample").getOrCreate() // $example on$ - val data = Array(Row(Vectors.dense(-2.0, 2.3, 0.0))) + val data = Arrays.asList(Row(Vectors.dense(-2.0, 2.3, 0.0))) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName) val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]]) - val dataRDD = sc.parallelize(data) - val dataset = sqlContext.createDataFrame(dataRDD, StructType(Array(attrGroup.toStructField()))) + val dataset = spark.createDataFrame(data, StructType(Array(attrGroup.toStructField()))) val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features") @@ -52,7 +50,8 @@ object VectorSlicerExample { val output = slicer.transform(dataset) println(output.select("userFeatures", "features").first()) // $example off$ - sc.stop() + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala index e77aa59ba3..292b6d9f77 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala @@ -18,21 +18,18 @@ // scalastyle:off println package org.apache.spark.examples.ml -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.ml.feature.Word2Vec // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object Word2VecExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("Word2Vec example") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("Word2Vec example").getOrCreate() // $example on$ // Input data: Each row is a bag of words from a sentence or document. - val documentDF = sqlContext.createDataFrame(Seq( + val documentDF = spark.createDataFrame(Seq( "Hi I heard about Spark".split(" "), "I wish Java could use case classes".split(" "), "Logistic regression models are neat".split(" ") @@ -48,6 +45,8 @@ object Word2VecExample { val result = model.transform(documentDF) result.select("result").take(3).foreach(println) // $example off$ + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala index e89d555884..c2bf1548b5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala @@ -27,7 +27,7 @@ import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, Regex import org.apache.spark.mllib.clustering.{DistributedLDAModel, EMLDAOptimizer, LDA, OnlineLDAOptimizer} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.{Row, SparkSession} /** * An example Latent Dirichlet Allocation (LDA) app. Run with @@ -189,8 +189,8 @@ object LDAExample { vocabSize: Int, stopwordFile: String): (RDD[(Long, Vector)], Array[String], Long) = { - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ + val spark = SparkSession.builder.getOrCreate() + import spark.implicits._ // Get dataset of document texts // One document per line in each text file. If the input consists of many small files, diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala index fdb01b86dd..cd4f0bb0de 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala @@ -18,22 +18,19 @@ // scalastyle:off println package org.apache.spark.examples.mllib -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.evaluation.{RankingMetrics, RegressionMetrics} import org.apache.spark.mllib.recommendation.{ALS, Rating} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object RankingMetricsExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("RankingMetricsExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - import sqlContext.implicits._ + val spark = SparkSession.builder.appName("RankingMetricsExample").getOrCreate() + import spark.implicits._ // $example on$ // Read in the ratings data - val ratings = sc.textFile("data/mllib/sample_movielens_data.txt").map { line => + val ratings = spark.read.text("data/mllib/sample_movielens_data.txt").rdd.map { line => val fields = line.split("::") Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble - 2.5) }.cache() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala index add634c957..22c47a694d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala @@ -18,22 +18,22 @@ package org.apache.spark.examples.mllib -import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.evaluation.RegressionMetrics -import org.apache.spark.mllib.regression.LinearRegressionWithSGD -import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} // $example off$ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession object RegressionMetricsExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("RegressionMetricsExample") - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) + val spark = SparkSession.builder.appName("RegressionMetricsExample").getOrCreate() // $example on$ // Load the data - val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_linear_regression_data.txt").cache() + val data = spark + .read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt") + .rdd.map(row => LabeledPoint(row.getDouble(0), row.get(1).asInstanceOf[Vector])) + .cache() // Build the model val numIterations = 100 @@ -61,6 +61,8 @@ object RegressionMetricsExample { // Explained variance println(s"Explained variance = ${metrics.explainedVariance}") // $example off$ + + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala index 918e124065..2f0fe704f7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala @@ -19,9 +19,8 @@ package org.apache.spark.examples.streaming import org.apache.spark.SparkConf -import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext, Time} @@ -60,9 +59,9 @@ object SqlNetworkWordCount { // Convert RDDs of the words DStream to DataFrame and run SQL query words.foreachRDD { (rdd: RDD[String], time: Time) => - // Get the singleton instance of SQLContext - val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext) - import sqlContext.implicits._ + // Get the singleton instance of SparkSession + val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf) + import spark.implicits._ // Convert RDD[String] to RDD[case class] to DataFrame val wordsDataFrame = rdd.map(w => Record(w)).toDF() @@ -72,7 +71,7 @@ object SqlNetworkWordCount { // Do word count on table using SQL and print it val wordCountsDataFrame = - sqlContext.sql("select word, count(*) as total from words group by word") + spark.sql("select word, count(*) as total from words group by word") println(s"========= $time =========") wordCountsDataFrame.show() } @@ -87,14 +86,14 @@ object SqlNetworkWordCount { case class Record(word: String) -/** Lazily instantiated singleton instance of SQLContext */ -object SQLContextSingleton { +/** Lazily instantiated singleton instance of SparkSession */ +object SparkSessionSingleton { - @transient private var instance: SQLContext = _ + @transient private var instance: SparkSession = _ - def getInstance(sparkContext: SparkContext): SQLContext = { + def getInstance(sparkConf: SparkConf): SparkSession = { if (instance == null) { - instance = new SQLContext(sparkContext) + instance = SparkSession.builder.config(sparkConf).getOrCreate() } instance } |