From 1d542785b9949e7f92025e6754973a779cc37c52 Mon Sep 17 00:00:00 2001
From: Cheng Lian
Date: Thu, 10 Mar 2016 17:00:17 -0800
Subject: [SPARK-13244][SQL] Migrates DataFrame to Dataset

## What changes were proposed in this pull request?

This PR unifies DataFrame and Dataset by migrating existing DataFrame operations to Dataset and making `DataFrame` a type alias of `Dataset[Row]`.

Most Scala code changes are source compatible, but the Java API is broken, as Java knows nothing about Scala type aliases (mostly, `DataFrame` has to be replaced with `Dataset`).

There are several noticeable API changes related to methods returning arrays:

1. `collect`/`take`

   - Old APIs in class `DataFrame`:

     ```scala
     def collect(): Array[Row]
     def take(n: Int): Array[Row]
     ```

   - New APIs in class `Dataset[T]`:

     ```scala
     def collect(): Array[T]
     def take(n: Int): Array[T]

     def collectRows(): Array[Row]
     def takeRows(n: Int): Array[Row]
     ```

   Two specialized methods `collectRows` and `takeRows` are added because Java doesn't support returning generic arrays. For example, `Dataset.collect(): Array[T]` actually returns `Object` instead of an array from the Java side. Normally, Java users may fall back to `collectAsList` and `takeAsList`. The two new specialized versions are added to avoid performance regressions in ML-related code (but maybe I'm wrong and they are not necessary here).

1. `randomSplit`

   - Old APIs in class `DataFrame`:

     ```scala
     def randomSplit(weights: Array[Double], seed: Long): Array[DataFrame]
     def randomSplit(weights: Array[Double]): Array[DataFrame]
     ```

   - New APIs in class `Dataset[T]`:

     ```scala
     def randomSplit(weights: Array[Double], seed: Long): Array[Dataset[T]]
     def randomSplit(weights: Array[Double]): Array[Dataset[T]]
     ```

   The same problem as above, but it hasn't been addressed for the Java API yet. We can probably add `randomSplitAsList` to fix this one.

1. `groupBy`

   Some original `DataFrame.groupBy` methods have conflicting signatures with the original `Dataset.groupBy` methods. To distinguish the two, the typed `Dataset.groupBy` methods are renamed to `groupByKey`.

Other noticeable changes:

1. Dataset always does eager analysis now.

   We used to support disabling DataFrame eager analysis to help report the partially analyzed, malformed logical plan on analysis failure. However, Dataset encoders require eager analysis during Dataset construction. To preserve the error reporting feature, `AnalysisException` now takes an extra `Option[LogicalPlan]` argument to hold the partially analyzed plan, so that we can check the plan tree when reporting test failures. This plan is passed by `QueryExecution.assertAnalyzed`.

## How was this patch tested?

Existing tests do the work.

## TODO

- [ ] Fix all tests
- [ ] Re-enable MiMA check
- [ ] Update ScalaDoc (`since`, `group`, and example code)

Author: Cheng Lian
Author: Yin Huai
Author: Wenchen Fan
Author: Cheng Lian

Closes #11443 from liancheng/ds-to-df.
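Not part of this patch: a minimal Java sketch of the fallback paths described above, assuming the `data/mllib/sample_libsvm_data.txt` file used throughout the examples below. The class name is hypothetical, and the sketch binds to `Dataset<Row>` where the patched examples currently use the raw `Dataset` type.

```java
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class JavaCollectFallbackSketch {  // hypothetical class, for illustration only
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaCollectFallbackSketch");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    // After this patch, DataFrame is an alias of Dataset[Row], so the reader
    // hands Java code a Dataset.
    Dataset<Row> df =
      sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

    // collect()/take() now return Array[T], which erases to Object on the Java
    // side, so Java code falls back to the List-returning variants...
    List<Row> someRows = df.limit(5).collectAsList();
    List<Row> firstThree = df.takeAsList(3);

    // ...or to the Row[]-returning helpers added by this patch for ML code paths.
    Row[] someRowsArray = df.limit(5).collectRows();
    Row[] firstThreeArray = df.takeRows(3);

    System.out.println(someRows.size() + " " + firstThree.size() + " "
      + someRowsArray.length + " " + firstThreeArray.length);

    jsc.stop();
  }
}
```

The example diffs below follow the second pattern: calls to `collect()` and `take(n)` on selected columns are replaced by `collectRows()` and `takeRows(n)`.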
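Similarly, a hedged sketch of `randomSplit` from the Java side. The `randomSplitAsList` helper is only a proposal above and is not assumed to exist here; the class name, weights, and seed are illustrative.

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class JavaRandomSplitSketch {  // hypothetical class, for illustration only
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaRandomSplitSketch");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    Dataset<Row> data =
      sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

    // randomSplit now returns Array[Dataset[T]]; since Dataset is a class, Java
    // still receives an array (the patched examples bind it to a raw Dataset[]).
    Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}, 12345L);
    Dataset<Row> training = splits[0];
    Dataset<Row> test = splits[1];

    System.out.println(training.count() + " training rows, " + test.count() + " test rows");

    jsc.stop();
  }
}
```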
---
 .../ml/JavaAFTSurvivalRegressionExample.java | 3 ++-
 .../org/apache/spark/examples/ml/JavaALSExample.java | 14 ++++++++------
 .../spark/examples/ml/JavaBinarizerExample.java | 11 ++++++-----
 .../examples/ml/JavaBisectingKMeansExample.java | 4 ++--
 .../spark/examples/ml/JavaBucketizerExample.java | 6 +++---
 .../spark/examples/ml/JavaChiSqSelectorExample.java | 6 +++---
 .../examples/ml/JavaCountVectorizerExample.java | 4 ++--
 .../spark/examples/ml/JavaCrossValidatorExample.java | 10 ++++++----
 .../org/apache/spark/examples/ml/JavaDCTExample.java | 6 +++---
 .../ml/JavaDecisionTreeClassificationExample.java | 13 +++++++------
 .../ml/JavaDecisionTreeRegressionExample.java | 13 +++++++------
 .../spark/examples/ml/JavaDeveloperApiExample.java | 12 +++++++-----
 .../examples/ml/JavaElementwiseProductExample.java | 6 +++---
 .../ml/JavaEstimatorTransformerParamExample.java | 9 +++++----
 .../ml/JavaGradientBoostedTreeClassifierExample.java | 13 +++++++------
 .../ml/JavaGradientBoostedTreeRegressorExample.java | 14 ++++++++------
 .../spark/examples/ml/JavaIndexToStringExample.java | 8 ++++----
 .../apache/spark/examples/ml/JavaKMeansExample.java | 3 ++-
 .../org/apache/spark/examples/ml/JavaLDAExample.java | 5 +++--
 .../JavaLinearRegressionWithElasticNetExample.java | 5 +++--
 .../ml/JavaLogisticRegressionSummaryExample.java | 9 +++++----
 .../JavaLogisticRegressionWithElasticNetExample.java | 5 +++--
 .../spark/examples/ml/JavaMinMaxScalerExample.java | 9 +++++----
 .../JavaModelSelectionViaCrossValidationExample.java | 10 +++++-----
 ...ModelSelectionViaTrainValidationSplitExample.java | 11 ++++++-----
 .../JavaMultilayerPerceptronClassifierExample.java | 15 ++++++++-------
 .../apache/spark/examples/ml/JavaNGramExample.java | 10 +++++-----
 .../spark/examples/ml/JavaNormalizerExample.java | 11 ++++++-----
 .../spark/examples/ml/JavaOneHotEncoderExample.java | 8 ++++----
 .../spark/examples/ml/JavaOneVsRestExample.java | 12 +++++++-----
 .../org/apache/spark/examples/ml/JavaPCAExample.java | 6 +++---
 .../spark/examples/ml/JavaPipelineExample.java | 10 +++++-----
 .../examples/ml/JavaPolynomialExpansionExample.java | 10 +++++-----
 .../examples/ml/JavaQuantileDiscretizerExample.java | 6 +++---
 .../spark/examples/ml/JavaRFormulaExample.java | 6 +++---
 .../ml/JavaRandomForestClassifierExample.java | 14 ++++++++------
 .../ml/JavaRandomForestRegressorExample.java | 14 ++++++++------
 .../spark/examples/ml/JavaSQLTransformerExample.java | 3 ++-
 .../spark/examples/ml/JavaSimpleParamsExample.java | 11 ++++++-----
 .../ml/JavaSimpleTextClassificationPipeline.java | 11 ++++++-----
 .../spark/examples/ml/JavaStandardScalerExample.java | 9 +++++----
 .../examples/ml/JavaStopWordsRemoverExample.java | 4 ++--
 .../spark/examples/ml/JavaStringIndexerExample.java | 8 ++++----
 .../apache/spark/examples/ml/JavaTfIdfExample.java | 12 ++++++------
 .../spark/examples/ml/JavaTokenizerExample.java | 8 ++++----
 .../examples/ml/JavaTrainValidationSplitExample.java | 11 ++++++-----
 .../examples/ml/JavaVectorAssemblerExample.java | 6 +++---
 .../spark/examples/ml/JavaVectorIndexerExample.java | 9 +++++----
 .../spark/examples/ml/JavaVectorSlicerExample.java | 7 ++++---
 .../spark/examples/ml/JavaWord2VecExample.java | 8 ++++----
 .../org/apache/spark/examples/sql/JavaSparkSQL.java | 20 +++++++++++---------
 .../examples/streaming/JavaSqlNetworkWordCount.java | 7 ++++---
 52 files changed, 252 insertions(+), 213 deletions(-)
 (limited to 'examples/src')

diff --git
a/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java index 69a174562f..39053109da 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java @@ -27,6 +27,7 @@ import org.apache.spark.ml.regression.AFTSurvivalRegression; import org.apache.spark.ml.regression.AFTSurvivalRegressionModel; import org.apache.spark.mllib.linalg.*; import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SQLContext; @@ -52,7 +53,7 @@ public class JavaAFTSurvivalRegressionExample { new StructField("censor", DataTypes.DoubleType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()) }); - DataFrame training = jsql.createDataFrame(data, schema); + Dataset training = jsql.createDataFrame(data, schema); double[] quantileProbabilities = new double[]{0.3, 0.6}; AFTSurvivalRegression aft = new AFTSurvivalRegression() .setQuantileProbabilities(quantileProbabilities) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java index 90d2ac2b13..9754ba5268 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java @@ -19,6 +19,8 @@ package org.apache.spark.examples.ml; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; // $example on$ @@ -93,10 +95,10 @@ public class JavaALSExample { return Rating.parseRating(str); } }); - DataFrame ratings = sqlContext.createDataFrame(ratingsRDD, Rating.class); - DataFrame[] splits = ratings.randomSplit(new double[]{0.8, 0.2}); - DataFrame training = splits[0]; - DataFrame test = splits[1]; + Dataset ratings = sqlContext.createDataFrame(ratingsRDD, Rating.class); + Dataset[] splits = ratings.randomSplit(new double[]{0.8, 0.2}); + Dataset training = splits[0]; + Dataset test = splits[1]; // Build the recommendation model using ALS on the training data ALS als = new ALS() @@ -108,8 +110,8 @@ public class JavaALSExample { ALSModel model = als.fit(training); // Evaluate the model by computing the RMSE on the test data - DataFrame rawPredictions = model.transform(test); - DataFrame predictions = rawPredictions + Dataset rawPredictions = model.transform(test); + Dataset predictions = rawPredictions .withColumn("rating", rawPredictions.col("rating").cast(DataTypes.DoubleType)) .withColumn("prediction", rawPredictions.col("prediction").cast(DataTypes.DoubleType)); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java index 1eda1f694f..84eef1fb8a 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.ml; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; import 
org.apache.spark.sql.SQLContext; // $example on$ @@ -51,18 +52,18 @@ public class JavaBinarizerExample { new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("feature", DataTypes.DoubleType, false, Metadata.empty()) }); - DataFrame continuousDataFrame = jsql.createDataFrame(jrdd, schema); + Dataset continuousDataFrame = jsql.createDataFrame(jrdd, schema); Binarizer binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(0.5); - DataFrame binarizedDataFrame = binarizer.transform(continuousDataFrame); - DataFrame binarizedFeatures = binarizedDataFrame.select("binarized_feature"); - for (Row r : binarizedFeatures.collect()) { + Dataset binarizedDataFrame = binarizer.transform(continuousDataFrame); + Dataset binarizedFeatures = binarizedDataFrame.select("binarized_feature"); + for (Row r : binarizedFeatures.collectRows()) { Double binarized_value = r.getDouble(0); System.out.println(binarized_value); } // $example off$ jsc.stop(); } -} \ No newline at end of file +} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java index e124c1cf18..1d1a518bbc 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java @@ -30,7 +30,7 @@ import org.apache.spark.ml.clustering.BisectingKMeansModel; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; @@ -62,7 +62,7 @@ public class JavaBisectingKMeansExample { new StructField("features", new VectorUDT(), false, Metadata.empty()), }); - DataFrame dataset = jsql.createDataFrame(data, schema); + Dataset dataset = jsql.createDataFrame(data, schema); BisectingKMeans bkm = new BisectingKMeans().setK(2); BisectingKMeansModel model = bkm.fit(dataset); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java index 8ad369cc93..68ffa702ea 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java @@ -26,7 +26,7 @@ import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.Bucketizer; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.DataTypes; @@ -53,7 +53,7 @@ public class JavaBucketizerExample { StructType schema = new StructType(new StructField[]{ new StructField("features", DataTypes.DoubleType, false, Metadata.empty()) }); - DataFrame dataFrame = jsql.createDataFrame(data, schema); + Dataset dataFrame = jsql.createDataFrame(data, schema); Bucketizer bucketizer = new Bucketizer() .setInputCol("features") @@ -61,7 +61,7 @@ public class JavaBucketizerExample { .setSplits(splits); // Transform original data into its bucket index. 
- DataFrame bucketedData = bucketizer.transform(dataFrame); + Dataset bucketedData = bucketizer.transform(dataFrame); bucketedData.show(); // $example off$ jsc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java index ede05d6e20..b1bf1cfeb2 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.SQLContext; // $example on$ @@ -28,7 +29,6 @@ import java.util.Arrays; import org.apache.spark.ml.feature.ChiSqSelector; import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.DataTypes; @@ -55,7 +55,7 @@ public class JavaChiSqSelectorExample { new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty()) }); - DataFrame df = sqlContext.createDataFrame(jrdd, schema); + Dataset df = sqlContext.createDataFrame(jrdd, schema); ChiSqSelector selector = new ChiSqSelector() .setNumTopFeatures(1) @@ -63,7 +63,7 @@ public class JavaChiSqSelectorExample { .setLabelCol("clicked") .setOutputCol("selectedFeatures"); - DataFrame result = selector.fit(df).transform(df); + Dataset result = selector.fit(df).transform(df); result.show(); // $example off$ jsc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java index 872e5a07d1..ec3ac202be 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java @@ -25,7 +25,7 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.feature.CountVectorizer; import org.apache.spark.ml.feature.CountVectorizerModel; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SQLContext; @@ -48,7 +48,7 @@ public class JavaCountVectorizerExample { StructType schema = new StructType(new StructField [] { new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); - DataFrame df = sqlContext.createDataFrame(jrdd, schema); + Dataset df = sqlContext.createDataFrame(jrdd, schema); // fit a CountVectorizerModel from the corpus CountVectorizerModel cvModel = new CountVectorizer() diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java index 9bbc14ea40..fb6c47be39 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java @@ -34,6 +34,7 @@ import org.apache.spark.ml.tuning.CrossValidator; import org.apache.spark.ml.tuning.CrossValidatorModel; import 
org.apache.spark.ml.tuning.ParamGridBuilder; import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; @@ -71,7 +72,8 @@ public class JavaCrossValidatorExample { new LabeledDocument(9L, "a e c l", 0.0), new LabeledDocument(10L, "spark compile", 1.0), new LabeledDocument(11L, "hadoop software", 0.0)); - DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); + Dataset training = jsql.createDataFrame( + jsc.parallelize(localTraining), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -112,11 +114,11 @@ public class JavaCrossValidatorExample { new Document(5L, "l m n"), new Document(6L, "mapreduce spark"), new Document(7L, "apache hadoop")); - DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); + Dataset test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). - DataFrame predictions = cvModel.transform(test); - for (Row r: predictions.select("id", "text", "probability", "prediction").collect()) { + Dataset predictions = cvModel.transform(test); + for (Row r: predictions.select("id", "text", "probability", "prediction").collectRows()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java index 35c0d534a4..4b15fde9c3 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.ml; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.SQLContext; // $example on$ @@ -28,7 +29,6 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.DCT; import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.Metadata; @@ -51,12 +51,12 @@ public class JavaDCTExample { StructType schema = new StructType(new StructField[]{ new StructField("features", new VectorUDT(), false, Metadata.empty()), }); - DataFrame df = jsql.createDataFrame(data, schema); + Dataset df = jsql.createDataFrame(data, schema); DCT dct = new DCT() .setInputCol("features") .setOutputCol("featuresDCT") .setInverse(false); - DataFrame dctDf = dct.transform(df); + Dataset dctDf = dct.transform(df); dctDf.select("featuresDCT").show(3); // $example off$ jsc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java index b5347b7650..5bd61fe508 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java @@ -26,7 +26,8 @@ import org.apache.spark.ml.classification.DecisionTreeClassifier; import 
org.apache.spark.ml.classification.DecisionTreeClassificationModel; import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; import org.apache.spark.ml.feature.*; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; // $example off$ @@ -38,7 +39,7 @@ public class JavaDecisionTreeClassificationExample { // $example on$ // Load the data stored in LIBSVM format as a DataFrame. - DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -55,9 +56,9 @@ public class JavaDecisionTreeClassificationExample { .fit(data); // Split the data into training and test sets (30% held out for testing) - DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3}); - DataFrame trainingData = splits[0]; - DataFrame testData = splits[1]; + Dataset[] splits = data.randomSplit(new double[]{0.7, 0.3}); + Dataset trainingData = splits[0]; + Dataset testData = splits[1]; // Train a DecisionTree model. DecisionTreeClassifier dt = new DecisionTreeClassifier() @@ -78,7 +79,7 @@ public class JavaDecisionTreeClassificationExample { PipelineModel model = pipeline.fit(trainingData); // Make predictions. - DataFrame predictions = model.transform(testData); + Dataset predictions = model.transform(testData); // Select example rows to display. predictions.select("predictedLabel", "label", "features").show(5); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java index 9cb67be04a..a4f3e97bf3 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java @@ -27,7 +27,8 @@ import org.apache.spark.ml.feature.VectorIndexer; import org.apache.spark.ml.feature.VectorIndexerModel; import org.apache.spark.ml.regression.DecisionTreeRegressionModel; import org.apache.spark.ml.regression.DecisionTreeRegressor; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; // $example off$ @@ -38,7 +39,7 @@ public class JavaDecisionTreeRegressionExample { SQLContext sqlContext = new SQLContext(jsc); // $example on$ // Load the data stored in LIBSVM format as a DataFrame. - DataFrame data = sqlContext.read().format("libsvm") + Dataset data = sqlContext.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. @@ -50,9 +51,9 @@ public class JavaDecisionTreeRegressionExample { .fit(data); // Split the data into training and test sets (30% held out for testing) - DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3}); - DataFrame trainingData = splits[0]; - DataFrame testData = splits[1]; + Dataset[] splits = data.randomSplit(new double[]{0.7, 0.3}); + Dataset trainingData = splits[0]; + Dataset testData = splits[1]; // Train a DecisionTree model. DecisionTreeRegressor dt = new DecisionTreeRegressor() @@ -66,7 +67,7 @@ public class JavaDecisionTreeRegressionExample { PipelineModel model = pipeline.fit(trainingData); // Make predictions. 
- DataFrame predictions = model.transform(testData); + Dataset predictions = model.transform(testData); // Select example rows to display. predictions.select("label", "features").show(5); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index da2012ad51..e568bea607 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -34,6 +34,7 @@ import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; @@ -61,7 +62,8 @@ public class JavaDeveloperApiExample { new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); - DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class); + Dataset training = jsql.createDataFrame( + jsc.parallelize(localTraining), LabeledPoint.class); // Create a LogisticRegression instance. This instance is an Estimator. MyJavaLogisticRegression lr = new MyJavaLogisticRegression(); @@ -79,12 +81,12 @@ public class JavaDeveloperApiExample { new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); - DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); + Dataset test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). - DataFrame results = model.transform(test); + Dataset results = model.transform(test); double sumPredictions = 0; - for (Row r : results.select("features", "label", "prediction").collect()) { + for (Row r : results.select("features", "label", "prediction").collectRows()) { sumPredictions += r.getDouble(2); } if (sumPredictions != 0.0) { @@ -145,7 +147,7 @@ class MyJavaLogisticRegression // This method is used by fit(). // In Java, we have to make it public since Java does not understand Scala's protected modifier. - public MyJavaLogisticRegressionModel train(DataFrame dataset) { + public MyJavaLogisticRegressionModel train(Dataset dataset) { // Extract columns from data using helper method. 
JavaRDD oldDataset = extractLabeledPoints(dataset).toJavaRDD(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java index c1f00dde0e..37de9cf359 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.ml; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.SQLContext; // $example on$ @@ -31,7 +32,6 @@ import org.apache.spark.ml.feature.ElementwiseProduct; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.DataTypes; @@ -58,7 +58,7 @@ public class JavaElementwiseProductExample { StructType schema = DataTypes.createStructType(fields); - DataFrame dataFrame = sqlContext.createDataFrame(jrdd, schema); + Dataset dataFrame = sqlContext.createDataFrame(jrdd, schema); Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0); @@ -72,4 +72,4 @@ public class JavaElementwiseProductExample { // $example off$ jsc.stop(); } -} \ No newline at end of file +} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java index 44cf3507f3..8a02f60aa4 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java @@ -30,6 +30,7 @@ import org.apache.spark.ml.param.ParamMap; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; // $example off$ import org.apache.spark.sql.SQLContext; @@ -48,7 +49,7 @@ public class JavaEstimatorTransformerParamExample { // Prepare training data. // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans into // DataFrames, where it uses the bean metadata to infer the schema. - DataFrame training = sqlContext.createDataFrame( + Dataset training = sqlContext.createDataFrame( Arrays.asList( new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), @@ -89,7 +90,7 @@ public class JavaEstimatorTransformerParamExample { System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap()); // Prepare test documents. - DataFrame test = sqlContext.createDataFrame(Arrays.asList( + Dataset test = sqlContext.createDataFrame(Arrays.asList( new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)) @@ -99,8 +100,8 @@ public class JavaEstimatorTransformerParamExample { // LogisticRegression.transform will only use the 'features' column. // Note that model2.transform() outputs a 'myProbability' column instead of the usual // 'probability' column since we renamed the lr.probabilityCol parameter previously. 
- DataFrame results = model2.transform(test); - for (Row r : results.select("features", "label", "myProbability", "prediction").collect()) { + Dataset results = model2.transform(test); + for (Row r : results.select("features", "label", "myProbability", "prediction").collectRows()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java index 848fe6566c..c2cb955385 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java @@ -27,7 +27,8 @@ import org.apache.spark.ml.classification.GBTClassificationModel; import org.apache.spark.ml.classification.GBTClassifier; import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; import org.apache.spark.ml.feature.*; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; // $example off$ @@ -39,7 +40,7 @@ public class JavaGradientBoostedTreeClassifierExample { // $example on$ // Load and parse the data file, converting it to a DataFrame. - DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -56,9 +57,9 @@ public class JavaGradientBoostedTreeClassifierExample { .fit(data); // Split the data into training and test sets (30% held out for testing) - DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); - DataFrame trainingData = splits[0]; - DataFrame testData = splits[1]; + Dataset[] splits = data.randomSplit(new double[] {0.7, 0.3}); + Dataset trainingData = splits[0]; + Dataset testData = splits[1]; // Train a GBT model. GBTClassifier gbt = new GBTClassifier() @@ -80,7 +81,7 @@ public class JavaGradientBoostedTreeClassifierExample { PipelineModel model = pipeline.fit(trainingData); // Make predictions. - DataFrame predictions = model.transform(testData); + Dataset predictions = model.transform(testData); // Select example rows to display. 
predictions.select("predictedLabel", "label", "features").show(5); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java index 1f67b0842d..83fd89e3bd 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java @@ -28,7 +28,8 @@ import org.apache.spark.ml.feature.VectorIndexer; import org.apache.spark.ml.feature.VectorIndexerModel; import org.apache.spark.ml.regression.GBTRegressionModel; import org.apache.spark.ml.regression.GBTRegressor; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; // $example off$ @@ -40,7 +41,8 @@ public class JavaGradientBoostedTreeRegressorExample { // $example on$ // Load and parse the data file, converting it to a DataFrame. - DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset data = + sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -51,9 +53,9 @@ public class JavaGradientBoostedTreeRegressorExample { .fit(data); // Split the data into training and test sets (30% held out for testing) - DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); - DataFrame trainingData = splits[0]; - DataFrame testData = splits[1]; + Dataset[] splits = data.randomSplit(new double[] {0.7, 0.3}); + Dataset trainingData = splits[0]; + Dataset testData = splits[1]; // Train a GBT model. GBTRegressor gbt = new GBTRegressor() @@ -68,7 +70,7 @@ public class JavaGradientBoostedTreeRegressorExample { PipelineModel model = pipeline.fit(trainingData); // Make predictions. - DataFrame predictions = model.transform(testData); + Dataset predictions = model.transform(testData); // Select example rows to display. 
predictions.select("prediction", "label", "features").show(5); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java index 3ccd699326..9b8c22f3bd 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.SQLContext; // $example on$ @@ -28,7 +29,6 @@ import java.util.Arrays; import org.apache.spark.ml.feature.IndexToString; import org.apache.spark.ml.feature.StringIndexer; import org.apache.spark.ml.feature.StringIndexerModel; -import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.DataTypes; @@ -56,18 +56,18 @@ public class JavaIndexToStringExample { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("category", DataTypes.StringType, false, Metadata.empty()) }); - DataFrame df = sqlContext.createDataFrame(jrdd, schema); + Dataset df = sqlContext.createDataFrame(jrdd, schema); StringIndexerModel indexer = new StringIndexer() .setInputCol("category") .setOutputCol("categoryIndex") .fit(df); - DataFrame indexed = indexer.transform(df); + Dataset indexed = indexer.transform(df); IndexToString converter = new IndexToString() .setInputCol("categoryIndex") .setOutputCol("originalCategory"); - DataFrame converted = converter.transform(indexed); + Dataset converted = converter.transform(indexed); converted.select("id", "originalCategory").show(); // $example off$ jsc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java index 96481d882a..30ccf30885 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java @@ -23,6 +23,7 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.catalyst.expressions.GenericRow; // $example on$ @@ -81,7 +82,7 @@ public class JavaKMeansExample { JavaRDD points = jsc.textFile(inputFile).map(new ParsePoint()); StructField[] fields = {new StructField("features", new VectorUDT(), false, Metadata.empty())}; StructType schema = new StructType(fields); - DataFrame dataset = sqlContext.createDataFrame(points, schema); + Dataset dataset = sqlContext.createDataFrame(points, schema); // Trains a k-means model KMeans kmeans = new KMeans() diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java index 3a5d3237c8..c70d44c297 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java @@ -29,6 +29,7 @@ import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.VectorUDT; import 
org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.catalyst.expressions.GenericRow; @@ -75,7 +76,7 @@ public class JavaLDAExample { JavaRDD points = jsc.textFile(inputFile).map(new ParseVector()); StructField[] fields = {new StructField("features", new VectorUDT(), false, Metadata.empty())}; StructType schema = new StructType(fields); - DataFrame dataset = sqlContext.createDataFrame(points, schema); + Dataset dataset = sqlContext.createDataFrame(points, schema); // Trains a LDA model LDA lda = new LDA() @@ -87,7 +88,7 @@ public class JavaLDAExample { System.out.println(model.logPerplexity(dataset)); // Shows the result - DataFrame topics = model.describeTopics(3); + Dataset topics = model.describeTopics(3); topics.show(false); model.transform(dataset).show(false); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java index 4ad7676c8d..08fce89359 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java @@ -24,7 +24,8 @@ import org.apache.spark.ml.regression.LinearRegression; import org.apache.spark.ml.regression.LinearRegressionModel; import org.apache.spark.ml.regression.LinearRegressionTrainingSummary; import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; // $example off$ @@ -36,7 +37,7 @@ public class JavaLinearRegressionWithElasticNetExample { // $example on$ // Load training data - DataFrame training = sqlContext.read().format("libsvm") + Dataset training = sqlContext.read().format("libsvm") .load("data/mllib/sample_linear_regression_data.txt"); LinearRegression lr = new LinearRegression() diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java index 986f3b3b28..73b028fb44 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java @@ -24,7 +24,8 @@ import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary; import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.LogisticRegressionModel; import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.functions; // $example off$ @@ -36,7 +37,7 @@ public class JavaLogisticRegressionSummaryExample { SQLContext sqlContext = new SQLContext(jsc); // Load training data - DataFrame training = sqlContext.read().format("libsvm") + Dataset training = sqlContext.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); LogisticRegression lr = new LogisticRegression() @@ -65,14 +66,14 @@ public class JavaLogisticRegressionSummaryExample { (BinaryLogisticRegressionSummary) 
trainingSummary; // Obtain the receiver-operating characteristic as a dataframe and areaUnderROC. - DataFrame roc = binarySummary.roc(); + Dataset roc = binarySummary.roc(); roc.show(); roc.select("FPR").show(); System.out.println(binarySummary.areaUnderROC()); // Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with // this selected threshold. - DataFrame fMeasure = binarySummary.fMeasureByThreshold(); + Dataset fMeasure = binarySummary.fMeasureByThreshold(); double maxFMeasure = fMeasure.select(functions.max("F-Measure")).head().getDouble(0); double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)) .select("threshold").head().getDouble(0); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java index 1d28279d72..6911668522 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java @@ -22,7 +22,8 @@ import org.apache.spark.api.java.JavaSparkContext; // $example on$ import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.LogisticRegressionModel; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; // $example off$ @@ -34,7 +35,7 @@ public class JavaLogisticRegressionWithElasticNetExample { // $example on$ // Load training data - DataFrame training = sqlContext.read().format("libsvm") + Dataset training = sqlContext.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); LogisticRegression lr = new LogisticRegression() diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java index 2d50ba7faa..4aee18eeab 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java @@ -24,7 +24,8 @@ import org.apache.spark.sql.SQLContext; // $example on$ import org.apache.spark.ml.feature.MinMaxScaler; import org.apache.spark.ml.feature.MinMaxScalerModel; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; // $example off$ public class JavaMinMaxScalerExample { @@ -34,7 +35,7 @@ public class JavaMinMaxScalerExample { SQLContext jsql = new SQLContext(jsc); // $example on$ - DataFrame dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); MinMaxScaler scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("scaledFeatures"); @@ -43,9 +44,9 @@ public class JavaMinMaxScalerExample { MinMaxScalerModel scalerModel = scaler.fit(dataFrame); // rescale each feature to range [min, max]. 
- DataFrame scaledData = scalerModel.transform(dataFrame); + Dataset scaledData = scalerModel.transform(dataFrame); scaledData.show(); // $example off$ jsc.stop(); } -} \ No newline at end of file +} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java index 87ad119491..e394605db7 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java @@ -34,7 +34,7 @@ import org.apache.spark.ml.param.ParamMap; import org.apache.spark.ml.tuning.CrossValidator; import org.apache.spark.ml.tuning.CrossValidatorModel; import org.apache.spark.ml.tuning.ParamGridBuilder; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; // $example off$ import org.apache.spark.sql.SQLContext; @@ -51,7 +51,7 @@ public class JavaModelSelectionViaCrossValidationExample { // $example on$ // Prepare training documents, which are labeled. - DataFrame training = sqlContext.createDataFrame(Arrays.asList( + Dataset training = sqlContext.createDataFrame(Arrays.asList( new JavaLabeledDocument(0L, "a b c d e spark", 1.0), new JavaLabeledDocument(1L, "b d", 0.0), new JavaLabeledDocument(2L,"spark f g h", 1.0), @@ -102,7 +102,7 @@ public class JavaModelSelectionViaCrossValidationExample { CrossValidatorModel cvModel = cv.fit(training); // Prepare test documents, which are unlabeled. - DataFrame test = sqlContext.createDataFrame(Arrays.asList( + Dataset test = sqlContext.createDataFrame(Arrays.asList( new JavaDocument(4L, "spark i j k"), new JavaDocument(5L, "l m n"), new JavaDocument(6L, "mapreduce spark"), @@ -110,8 +110,8 @@ public class JavaModelSelectionViaCrossValidationExample { ), JavaDocument.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). 
- DataFrame predictions = cvModel.transform(test); - for (Row r : predictions.select("id", "text", "probability", "prediction").collect()) { + Dataset predictions = cvModel.transform(test); + for (Row r : predictions.select("id", "text", "probability", "prediction").collectRows()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java index 77adb02dfd..6ac4aea3c4 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java @@ -26,7 +26,8 @@ import org.apache.spark.ml.regression.LinearRegression; import org.apache.spark.ml.tuning.ParamGridBuilder; import org.apache.spark.ml.tuning.TrainValidationSplit; import org.apache.spark.ml.tuning.TrainValidationSplitModel; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; // $example off$ import org.apache.spark.sql.SQLContext; @@ -41,13 +42,13 @@ public class JavaModelSelectionViaTrainValidationSplitExample { SQLContext jsql = new SQLContext(sc); // $example on$ - DataFrame data = jsql.read().format("libsvm") + Dataset data = jsql.read().format("libsvm") .load("data/mllib/sample_linear_regression_data.txt"); // Prepare training and test data. - DataFrame[] splits = data.randomSplit(new double[] {0.9, 0.1}, 12345); - DataFrame training = splits[0]; - DataFrame test = splits[1]; + Dataset[] splits = data.randomSplit(new double[] {0.9, 0.1}, 12345); + Dataset training = splits[0]; + Dataset test = splits[1]; LinearRegression lr = new LinearRegression(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java index 84369f6681..0ca528d8cd 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java @@ -20,11 +20,12 @@ package org.apache.spark.examples.ml; // $example on$ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel; import org.apache.spark.ml.classification.MultilayerPerceptronClassifier; import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; -import org.apache.spark.sql.DataFrame; // $example off$ /** @@ -40,11 +41,11 @@ public class JavaMultilayerPerceptronClassifierExample { // $example on$ // Load training data String path = "data/mllib/sample_multiclass_classification_data.txt"; - DataFrame dataFrame = jsql.read().format("libsvm").load(path); + Dataset dataFrame = jsql.read().format("libsvm").load(path); // Split the data into train and test - DataFrame[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L); - DataFrame train = splits[0]; - DataFrame test = splits[1]; + Dataset[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L); + Dataset train = splits[0]; + 
Dataset test = splits[1]; // specify layers for the neural network: // input layer of size 4 (features), two intermediate of size 5 and 4 // and output of size 3 (classes) @@ -58,8 +59,8 @@ public class JavaMultilayerPerceptronClassifierExample { // train the model MultilayerPerceptronClassificationModel model = trainer.fit(train); // compute precision on the test set - DataFrame result = model.transform(test); - DataFrame predictionAndLabels = result.select("prediction", "label"); + Dataset result = model.transform(test); + Dataset predictionAndLabels = result.select("prediction", "label"); MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() .setMetricName("precision"); System.out.println("Precision = " + evaluator.evaluate(predictionAndLabels)); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java index 8fd75ed8b5..0305f737ca 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.ml; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.SQLContext; // $example on$ @@ -26,7 +27,6 @@ import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.NGram; -import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.DataTypes; @@ -54,13 +54,13 @@ public class JavaNGramExample { "words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) }); - DataFrame wordDataFrame = sqlContext.createDataFrame(jrdd, schema); + Dataset wordDataFrame = sqlContext.createDataFrame(jrdd, schema); NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams"); - DataFrame ngramDataFrame = ngramTransformer.transform(wordDataFrame); + Dataset ngramDataFrame = ngramTransformer.transform(wordDataFrame); - for (Row r : ngramDataFrame.select("ngrams", "label").take(3)) { + for (Row r : ngramDataFrame.select("ngrams", "label").takeRows(3)) { java.util.List ngrams = r.getList(0); for (String ngram : ngrams) System.out.print(ngram + " --- "); System.out.println(); @@ -68,4 +68,4 @@ public class JavaNGramExample { // $example off$ jsc.stop(); } -} \ No newline at end of file +} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java index ed3f6163c0..31cd752136 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java @@ -23,7 +23,8 @@ import org.apache.spark.sql.SQLContext; // $example on$ import org.apache.spark.ml.feature.Normalizer; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; // $example off$ public class JavaNormalizerExample { @@ -33,7 +34,7 @@ public class JavaNormalizerExample { SQLContext jsql = new SQLContext(jsc); // $example on$ - DataFrame dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Normalize each Vector using $L^1$ 
norm. Normalizer normalizer = new Normalizer() @@ -41,14 +42,14 @@ public class JavaNormalizerExample { .setOutputCol("normFeatures") .setP(1.0); - DataFrame l1NormData = normalizer.transform(dataFrame); + Dataset l1NormData = normalizer.transform(dataFrame); l1NormData.show(); // Normalize each Vector using $L^\infty$ norm. - DataFrame lInfNormData = + Dataset lInfNormData = normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY)); lInfNormData.show(); // $example off$ jsc.stop(); } -} \ No newline at end of file +} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java index bc50960708..882438ca28 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java @@ -28,7 +28,7 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.OneHotEncoder; import org.apache.spark.ml.feature.StringIndexer; import org.apache.spark.ml.feature.StringIndexerModel; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.DataTypes; @@ -58,18 +58,18 @@ public class JavaOneHotEncoderExample { new StructField("category", DataTypes.StringType, false, Metadata.empty()) }); - DataFrame df = sqlContext.createDataFrame(jrdd, schema); + Dataset df = sqlContext.createDataFrame(jrdd, schema); StringIndexerModel indexer = new StringIndexer() .setInputCol("category") .setOutputCol("categoryIndex") .fit(df); - DataFrame indexed = indexer.transform(df); + Dataset indexed = indexer.transform(df); OneHotEncoder encoder = new OneHotEncoder() .setInputCol("categoryIndex") .setOutputCol("categoryVec"); - DataFrame encoded = encoder.transform(indexed); + Dataset encoded = encoder.transform(indexed); encoded.select("id", "categoryVec").show(); // $example off$ jsc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java index 42374e77ac..8288f73c1b 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java @@ -30,6 +30,8 @@ import org.apache.spark.mllib.evaluation.MulticlassMetrics; import org.apache.spark.mllib.linalg.Matrix; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.types.StructField; // $example off$ @@ -81,9 +83,9 @@ public class JavaOneVsRestExample { OneVsRest ovr = new OneVsRest().setClassifier(classifier); String input = params.input; - DataFrame inputData = jsql.read().format("libsvm").load(input); - DataFrame train; - DataFrame test; + Dataset inputData = jsql.read().format("libsvm").load(input); + Dataset train; + Dataset test; // compute the train/ test split: if testInput is not provided use part of input String testInput = params.testInput; @@ -95,7 +97,7 @@ public class JavaOneVsRestExample { String.valueOf(numFeatures)).load(testInput); } else { double f = params.fracTest; - DataFrame[] tmp = inputData.randomSplit(new double[]{1 - f, f}, 12345); + Dataset[] tmp = inputData.randomSplit(new 
double[]{1 - f, f}, 12345); train = tmp[0]; test = tmp[1]; } @@ -104,7 +106,7 @@ public class JavaOneVsRestExample { OneVsRestModel ovrModel = ovr.fit(train.cache()); // score the model on test data - DataFrame predictions = ovrModel.transform(test.cache()) + Dataset predictions = ovrModel.transform(test.cache()) .select("prediction", "label"); // obtain metrics diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java index 8282fab084..a792fd7d47 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java @@ -29,7 +29,7 @@ import org.apache.spark.ml.feature.PCA; import org.apache.spark.ml.feature.PCAModel; import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.Metadata; @@ -54,7 +54,7 @@ public class JavaPCAExample { new StructField("features", new VectorUDT(), false, Metadata.empty()), }); - DataFrame df = jsql.createDataFrame(data, schema); + Dataset df = jsql.createDataFrame(data, schema); PCAModel pca = new PCA() .setInputCol("features") @@ -62,7 +62,7 @@ public class JavaPCAExample { .setK(3) .fit(df); - DataFrame result = pca.transform(df).select("pcaFeatures"); + Dataset result = pca.transform(df).select("pcaFeatures"); result.show(); // $example off$ jsc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java index 3407c25c83..6ae418d564 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java @@ -30,7 +30,7 @@ import org.apache.spark.ml.PipelineStage; import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.feature.HashingTF; import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; // $example off$ import org.apache.spark.sql.SQLContext; @@ -46,7 +46,7 @@ public class JavaPipelineExample { // $example on$ // Prepare training documents, which are labeled. - DataFrame training = sqlContext.createDataFrame(Arrays.asList( + Dataset training = sqlContext.createDataFrame(Arrays.asList( new JavaLabeledDocument(0L, "a b c d e spark", 1.0), new JavaLabeledDocument(1L, "b d", 0.0), new JavaLabeledDocument(2L, "spark f g h", 1.0), @@ -71,7 +71,7 @@ public class JavaPipelineExample { PipelineModel model = pipeline.fit(training); // Prepare test documents, which are unlabeled. - DataFrame test = sqlContext.createDataFrame(Arrays.asList( + Dataset test = sqlContext.createDataFrame(Arrays.asList( new JavaDocument(4L, "spark i j k"), new JavaDocument(5L, "l m n"), new JavaDocument(6L, "mapreduce spark"), @@ -79,8 +79,8 @@ public class JavaPipelineExample { ), JavaDocument.class); // Make predictions on test documents. 
- DataFrame predictions = model.transform(test); - for (Row r : predictions.select("id", "text", "probability", "prediction").collect()) { + Dataset predictions = model.transform(test); + for (Row r : predictions.select("id", "text", "probability", "prediction").collectRows()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java index 668f71e640..5a4064c604 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java @@ -28,7 +28,7 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.PolynomialExpansion; import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.Metadata; @@ -58,14 +58,14 @@ public class JavaPolynomialExpansionExample { new StructField("features", new VectorUDT(), false, Metadata.empty()), }); - DataFrame df = jsql.createDataFrame(data, schema); - DataFrame polyDF = polyExpansion.transform(df); + Dataset df = jsql.createDataFrame(data, schema); + Dataset polyDF = polyExpansion.transform(df); - Row[] row = polyDF.select("polyFeatures").take(3); + Row[] row = polyDF.select("polyFeatures").takeRows(3); for (Row r : row) { System.out.println(r.get(0)); } // $example off$ jsc.stop(); } -} \ No newline at end of file +} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java index 251ae79d9a..7b226fede9 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java @@ -25,7 +25,7 @@ import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.QuantileDiscretizer; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.DataTypes; @@ -56,14 +56,14 @@ public class JavaQuantileDiscretizerExample { new StructField("hour", DataTypes.DoubleType, false, Metadata.empty()) }); - DataFrame df = sqlContext.createDataFrame(jrdd, schema); + Dataset df = sqlContext.createDataFrame(jrdd, schema); QuantileDiscretizer discretizer = new QuantileDiscretizer() .setInputCol("hour") .setOutputCol("result") .setNumBuckets(3); - DataFrame result = discretizer.fit(df).transform(df); + Dataset result = discretizer.fit(df).transform(df); result.show(); // $example off$ jsc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java index 1e1062b541..8c453bf80d 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java @@ -26,7 +26,7 @@ import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import 
org.apache.spark.ml.feature.RFormula; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.StructField; @@ -55,12 +55,12 @@ public class JavaRFormulaExample { RowFactory.create(9, "NZ", 15, 0.0) )); - DataFrame dataset = sqlContext.createDataFrame(rdd, schema); + Dataset dataset = sqlContext.createDataFrame(rdd, schema); RFormula formula = new RFormula() .setFormula("clicked ~ country + hour") .setFeaturesCol("features") .setLabelCol("label"); - DataFrame output = formula.fit(dataset).transform(dataset); + Dataset output = formula.fit(dataset).transform(dataset); output.select("features", "label").show(); // $example off$ jsc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java index 5a62496660..05c2bc9622 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java @@ -27,7 +27,8 @@ import org.apache.spark.ml.classification.RandomForestClassificationModel; import org.apache.spark.ml.classification.RandomForestClassifier; import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; import org.apache.spark.ml.feature.*; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; // $example off$ @@ -39,7 +40,8 @@ public class JavaRandomForestClassifierExample { // $example on$ // Load and parse the data file, converting it to a DataFrame. - DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset data = + sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -56,9 +58,9 @@ public class JavaRandomForestClassifierExample { .fit(data); // Split the data into training and test sets (30% held out for testing) - DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); - DataFrame trainingData = splits[0]; - DataFrame testData = splits[1]; + Dataset[] splits = data.randomSplit(new double[] {0.7, 0.3}); + Dataset trainingData = splits[0]; + Dataset testData = splits[1]; // Train a RandomForest model. RandomForestClassifier rf = new RandomForestClassifier() @@ -79,7 +81,7 @@ public class JavaRandomForestClassifierExample { PipelineModel model = pipeline.fit(trainingData); // Make predictions. - DataFrame predictions = model.transform(testData); + Dataset predictions = model.transform(testData); // Select example rows to display. 
predictions.select("predictedLabel", "label", "features").show(5); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java index 05782a0724..d366967083 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java @@ -28,7 +28,8 @@ import org.apache.spark.ml.feature.VectorIndexer; import org.apache.spark.ml.feature.VectorIndexerModel; import org.apache.spark.ml.regression.RandomForestRegressionModel; import org.apache.spark.ml.regression.RandomForestRegressor; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; // $example off$ @@ -40,7 +41,8 @@ public class JavaRandomForestRegressorExample { // $example on$ // Load and parse the data file, converting it to a DataFrame. - DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset data = + sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. @@ -51,9 +53,9 @@ public class JavaRandomForestRegressorExample { .fit(data); // Split the data into training and test sets (30% held out for testing) - DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3}); - DataFrame trainingData = splits[0]; - DataFrame testData = splits[1]; + Dataset[] splits = data.randomSplit(new double[] {0.7, 0.3}); + Dataset trainingData = splits[0]; + Dataset testData = splits[1]; // Train a RandomForest model. RandomForestRegressor rf = new RandomForestRegressor() @@ -68,7 +70,7 @@ public class JavaRandomForestRegressorExample { PipelineModel model = pipeline.fit(trainingData); // Make predictions. - DataFrame predictions = model.transform(testData); + Dataset predictions = model.transform(testData); // Select example rows to display. 
predictions.select("prediction", "label", "features").show(5); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java index a9d64d5e3f..e413cbaf71 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java @@ -25,6 +25,7 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.feature.SQLTransformer; import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SQLContext; @@ -48,7 +49,7 @@ public class JavaSQLTransformerExample { new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()), new StructField("v2", DataTypes.DoubleType, false, Metadata.empty()) }); - DataFrame df = sqlContext.createDataFrame(jrdd, schema); + Dataset df = sqlContext.createDataFrame(jrdd, schema); SQLTransformer sqlTrans = new SQLTransformer().setStatement( "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java index ea83e8fef9..52bb4ec050 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java @@ -28,7 +28,7 @@ import org.apache.spark.ml.param.ParamMap; import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; @@ -54,7 +54,8 @@ public class JavaSimpleParamsExample { new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); - DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class); + Dataset training = + jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class); // Create a LogisticRegression instance. This instance is an Estimator. LogisticRegression lr = new LogisticRegression(); @@ -95,14 +96,14 @@ public class JavaSimpleParamsExample { new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); - DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); + Dataset test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); // Make predictions on test documents using the Transformer.transform() method. // LogisticRegressionModel.transform will only use the 'features' column. // Note that model2.transform() outputs a 'myProbability' column instead of the usual // 'probability' column since we renamed the lr.probabilityCol parameter previously. 
- DataFrame results = model2.transform(test); - for (Row r: results.select("features", "label", "myProbability", "prediction").collect()) { + Dataset results = model2.transform(test); + for (Row r: results.select("features", "label", "myProbability", "prediction").collectRows()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java index 54738813d0..9bd543c44f 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java @@ -29,7 +29,7 @@ import org.apache.spark.ml.PipelineStage; import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.feature.HashingTF; import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; @@ -54,7 +54,8 @@ public class JavaSimpleTextClassificationPipeline { new LabeledDocument(1L, "b d", 0.0), new LabeledDocument(2L, "spark f g h", 1.0), new LabeledDocument(3L, "hadoop mapreduce", 0.0)); - DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); + Dataset training = + jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -79,11 +80,11 @@ public class JavaSimpleTextClassificationPipeline { new Document(5L, "l m n"), new Document(6L, "spark hadoop spark"), new Document(7L, "apache hadoop")); - DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); + Dataset test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); // Make predictions on test documents. 
- DataFrame predictions = model.transform(test); - for (Row r: predictions.select("id", "text", "probability", "prediction").collect()) { + Dataset predictions = model.transform(test); + for (Row r: predictions.select("id", "text", "probability", "prediction").collectRows()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java index da4756643f..e2dd759c0a 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java @@ -24,7 +24,8 @@ import org.apache.spark.sql.SQLContext; // $example on$ import org.apache.spark.ml.feature.StandardScaler; import org.apache.spark.ml.feature.StandardScalerModel; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; // $example off$ public class JavaStandardScalerExample { @@ -34,7 +35,7 @@ public class JavaStandardScalerExample { SQLContext jsql = new SQLContext(jsc); // $example on$ - DataFrame dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); StandardScaler scaler = new StandardScaler() .setInputCol("features") @@ -46,9 +47,9 @@ public class JavaStandardScalerExample { StandardScalerModel scalerModel = scaler.fit(dataFrame); // Normalize each feature to have unit standard deviation. - DataFrame scaledData = scalerModel.transform(dataFrame); + Dataset scaledData = scalerModel.transform(dataFrame); scaledData.show(); // $example off$ jsc.stop(); } -} \ No newline at end of file +} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java index b6b201c6b6..0ff3782cb3 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java @@ -26,7 +26,7 @@ import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.StopWordsRemover; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.DataTypes; @@ -57,7 +57,7 @@ public class JavaStopWordsRemoverExample { "raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) }); - DataFrame dataset = jsql.createDataFrame(rdd, schema); + Dataset dataset = jsql.createDataFrame(rdd, schema); remover.transform(dataset).show(); // $example off$ jsc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java index 05d12c1e70..ceacbb4fb3 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java @@ -26,7 +26,7 @@ import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.StringIndexer; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import 
org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.StructField; @@ -54,13 +54,13 @@ public class JavaStringIndexerExample { createStructField("id", IntegerType, false), createStructField("category", StringType, false) }); - DataFrame df = sqlContext.createDataFrame(jrdd, schema); + Dataset df = sqlContext.createDataFrame(jrdd, schema); StringIndexer indexer = new StringIndexer() .setInputCol("category") .setOutputCol("categoryIndex"); - DataFrame indexed = indexer.fit(df).transform(df); + Dataset indexed = indexer.fit(df).transform(df); indexed.show(); // $example off$ jsc.stop(); } -} \ No newline at end of file +} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java index a41a5ec9bf..fd1ce424bf 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java @@ -28,7 +28,7 @@ import org.apache.spark.ml.feature.IDF; import org.apache.spark.ml.feature.IDFModel; import org.apache.spark.ml.feature.Tokenizer; import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SQLContext; @@ -54,19 +54,19 @@ public class JavaTfIdfExample { new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) }); - DataFrame sentenceData = sqlContext.createDataFrame(jrdd, schema); + Dataset sentenceData = sqlContext.createDataFrame(jrdd, schema); Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); - DataFrame wordsData = tokenizer.transform(sentenceData); + Dataset wordsData = tokenizer.transform(sentenceData); int numFeatures = 20; HashingTF hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("rawFeatures") .setNumFeatures(numFeatures); - DataFrame featurizedData = hashingTF.transform(wordsData); + Dataset featurizedData = hashingTF.transform(wordsData); IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features"); IDFModel idfModel = idf.fit(featurizedData); - DataFrame rescaledData = idfModel.transform(featurizedData); - for (Row r : rescaledData.select("features", "label").take(3)) { + Dataset rescaledData = idfModel.transform(featurizedData); + for (Row r : rescaledData.select("features", "label").takeRows(3)) { Vector features = r.getAs(0); Double label = r.getDouble(1); System.out.println(features); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java index 617dc3f66e..a2f8c436e3 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java @@ -27,7 +27,7 @@ import java.util.Arrays; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.RegexTokenizer; import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.DataTypes; @@ -54,12 +54,12 @@ public class JavaTokenizerExample { new StructField("sentence", DataTypes.StringType, 
false, Metadata.empty()) }); - DataFrame sentenceDataFrame = sqlContext.createDataFrame(jrdd, schema); + Dataset sentenceDataFrame = sqlContext.createDataFrame(jrdd, schema); Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); - DataFrame wordsDataFrame = tokenizer.transform(sentenceDataFrame); - for (Row r : wordsDataFrame.select("words", "label"). take(3)) { + Dataset wordsDataFrame = tokenizer.transform(sentenceDataFrame); + for (Row r : wordsDataFrame.select("words", "label").takeRows(3)) { java.util.List words = r.getList(0); for (String word : words) System.out.print(word + " "); System.out.println(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java index d433905fc8..09bbc39c01 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java @@ -23,7 +23,8 @@ import org.apache.spark.ml.evaluation.RegressionEvaluator; import org.apache.spark.ml.param.ParamMap; import org.apache.spark.ml.regression.LinearRegression; import org.apache.spark.ml.tuning.*; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; /** @@ -44,12 +45,12 @@ public class JavaTrainValidationSplitExample { JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); - DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Prepare training and test data. 
- DataFrame[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345); - DataFrame training = splits[0]; - DataFrame test = splits[1]; + Dataset[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345); + Dataset training = splits[0]; + Dataset test = splits[1]; LinearRegression lr = new LinearRegression(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java index 7e230b5897..953ad455b1 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java @@ -28,7 +28,7 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.VectorAssembler; import org.apache.spark.mllib.linalg.VectorUDT; import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.*; @@ -52,13 +52,13 @@ public class JavaVectorAssemblerExample { }); Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0); JavaRDD rdd = jsc.parallelize(Arrays.asList(row)); - DataFrame dataset = sqlContext.createDataFrame(rdd, schema); + Dataset dataset = sqlContext.createDataFrame(rdd, schema); VectorAssembler assembler = new VectorAssembler() .setInputCols(new String[]{"hour", "mobile", "userFeatures"}) .setOutputCol("features"); - DataFrame output = assembler.transform(dataset); + Dataset output = assembler.transform(dataset); System.out.println(output.select("features", "clicked").first()); // $example off$ jsc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java index 545758e31d..b3b5953ee7 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java @@ -26,7 +26,8 @@ import java.util.Map; import org.apache.spark.ml.feature.VectorIndexer; import org.apache.spark.ml.feature.VectorIndexerModel; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; // $example off$ public class JavaVectorIndexerExample { @@ -36,7 +37,7 @@ public class JavaVectorIndexerExample { SQLContext jsql = new SQLContext(jsc); // $example on$ - DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); + Dataset data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); VectorIndexer indexer = new VectorIndexer() .setInputCol("features") @@ -53,9 +54,9 @@ public class JavaVectorIndexerExample { System.out.println(); // Create new column "indexed" with categorical values transformed to indices - DataFrame indexedData = indexerModel.transform(data); + Dataset indexedData = indexerModel.transform(data); indexedData.show(); // $example off$ jsc.stop(); } -} \ No newline at end of file +} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java index 4d5cb04ff5..2ae57c3577 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java +++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java @@ -30,7 +30,7 @@ import org.apache.spark.ml.attribute.AttributeGroup; import org.apache.spark.ml.attribute.NumericAttribute; import org.apache.spark.ml.feature.VectorSlicer; import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.types.*; @@ -55,7 +55,8 @@ public class JavaVectorSlicerExample { RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)) )); - DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField())); + Dataset dataset = + jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField())); VectorSlicer vectorSlicer = new VectorSlicer() .setInputCol("userFeatures").setOutputCol("features"); @@ -63,7 +64,7 @@ public class JavaVectorSlicerExample { vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"}); // or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"}) - DataFrame output = vectorSlicer.transform(dataset); + Dataset output = vectorSlicer.transform(dataset); System.out.println(output.select("userFeatures", "features").first()); // $example off$ diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java index a4a05af7c6..2dce8c2168 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java @@ -25,7 +25,7 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.feature.Word2Vec; import org.apache.spark.ml.feature.Word2VecModel; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SQLContext; @@ -49,7 +49,7 @@ public class JavaWord2VecExample { StructType schema = new StructType(new StructField[]{ new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); - DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema); + Dataset documentDF = sqlContext.createDataFrame(jrdd, schema); // Learn a mapping from words to Vectors. 
Word2Vec word2Vec = new Word2Vec() @@ -58,8 +58,8 @@ public class JavaWord2VecExample { .setVectorSize(3) .setMinCount(0); Word2VecModel model = word2Vec.fit(documentDF); - DataFrame result = model.transform(documentDF); - for (Row r : result.select("result").take(3)) { + Dataset result = model.transform(documentDF); + for (Row r : result.select("result").takeRows(3)) { System.out.println(r); } // $example off$ diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java index afee279ec3..354a5306ed 100644 --- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java +++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java @@ -26,7 +26,7 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; -import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; @@ -74,11 +74,12 @@ public class JavaSparkSQL { }); // Apply a schema to an RDD of Java Beans and register it as a table. - DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class); + Dataset schemaPeople = sqlContext.createDataFrame(people, Person.class); schemaPeople.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. - DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); + Dataset teenagers = + sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); // The results of SQL queries are DataFrames and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. @@ -99,11 +100,11 @@ public class JavaSparkSQL { // Read in the parquet file created above. // Parquet files are self-describing so the schema is preserved. // The result of loading a parquet file is also a DataFrame. - DataFrame parquetFile = sqlContext.read().parquet("people.parquet"); + Dataset parquetFile = sqlContext.read().parquet("people.parquet"); //Parquet files can also be registered as tables and then used in SQL statements. parquetFile.registerTempTable("parquetFile"); - DataFrame teenagers2 = + Dataset teenagers2 = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"); teenagerNames = teenagers2.toJavaRDD().map(new Function() { @Override @@ -120,7 +121,7 @@ public class JavaSparkSQL { // The path can be either a single text file or a directory storing text files. String path = "examples/src/main/resources/people.json"; // Create a DataFrame from the file(s) pointed by path - DataFrame peopleFromJsonFile = sqlContext.read().json(path); + Dataset peopleFromJsonFile = sqlContext.read().json(path); // Because the schema of a JSON dataset is automatically inferred, to write queries, // it is better to take a look at what is the schema. @@ -134,7 +135,8 @@ public class JavaSparkSQL { peopleFromJsonFile.registerTempTable("people"); // SQL statements can be run by using the sql methods provided by sqlContext. - DataFrame teenagers3 = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); + Dataset teenagers3 = + sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); // The results of SQL queries are DataFrame and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. 
@@ -151,7 +153,7 @@ public class JavaSparkSQL { List jsonData = Arrays.asList( "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}"); JavaRDD anotherPeopleRDD = ctx.parallelize(jsonData); - DataFrame peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd()); + Dataset peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd()); // Take a look at the schema of this new DataFrame. peopleFromJsonRDD.printSchema(); @@ -164,7 +166,7 @@ public class JavaSparkSQL { peopleFromJsonRDD.registerTempTable("people2"); - DataFrame peopleWithCity = sqlContext.sql("SELECT name, address.city FROM people2"); + Dataset peopleWithCity = sqlContext.sql("SELECT name, address.city FROM people2"); List nameAndCity = peopleWithCity.toJavaRDD().map(new Function() { @Override public String call(Row row) { diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java index f0228f5e63..4b9d9efc85 100644 --- a/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java +++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java @@ -27,8 +27,9 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.VoidFunction2; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.DataFrame; import org.apache.spark.api.java.StorageLevels; import org.apache.spark.streaming.Durations; import org.apache.spark.streaming.Time; @@ -92,13 +93,13 @@ public final class JavaSqlNetworkWordCount { return record; } }); - DataFrame wordsDataFrame = sqlContext.createDataFrame(rowRDD, JavaRecord.class); + Dataset wordsDataFrame = sqlContext.createDataFrame(rowRDD, JavaRecord.class); // Register as table wordsDataFrame.registerTempTable("words"); // Do word count on table using SQL and print it - DataFrame wordCountsDataFrame = + Dataset wordCountsDataFrame = sqlContext.sql("select word, count(*) as total from words group by word"); System.out.println("========= " + time + "========="); wordCountsDataFrame.show(); -- cgit v1.2.3
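
Note for readers porting their own Java examples along the lines of the hunks above: the examples replace `take(n)`/`collect()` with the Row-typed `takeRows(n)`/`collectRows()`, which hand back a plain `Row[]` to Java callers (see the `JavaPolynomialExpansionExample` hunk). Below is a minimal, illustrative sketch of that pattern, not part of this patch; it assumes the same sample libsvm data path and raw `Dataset` usage as the examples above.

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class JavaDatasetMigrationSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaDatasetMigrationSketch");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    // As in the migrated examples, the reader now yields a Dataset rather than a DataFrame.
    // The raw Dataset type mirrors the examples in this patch.
    Dataset data =
      sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

    // takeRows(n) returns Row[] directly, so the familiar Row-based iteration still works.
    Row[] firstRows = data.select("label", "features").takeRows(3);
    for (Row r : firstRows) {
      System.out.println(r.getDouble(0) + " -> " + r.get(1));
    }

    jsc.stop();
  }
}
```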
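Similarly, the split-based examples above (random forest, train/validation split, one-vs-rest) now receive `Dataset[]` from `randomSplit()`. A short sketch of that usage, again hypothetical and not part of this patch, using the same data path and a fixed seed as in `JavaTrainValidationSplitExample`:

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SQLContext;

public class JavaRandomSplitSketch {
  public static void main(String[] args) {
    JavaSparkContext jsc =
      new JavaSparkContext(new SparkConf().setAppName("JavaRandomSplitSketch"));
    SQLContext sqlContext = new SQLContext(jsc);

    Dataset data =
      sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

    // 70% training, 30% test, with a fixed seed for reproducibility.
    // randomSplit now returns Dataset[] instead of DataFrame[].
    Dataset[] splits = data.randomSplit(new double[] {0.7, 0.3}, 12345);
    Dataset trainingData = splits[0];
    Dataset testData = splits[1];

    System.out.println("training rows: " + trainingData.count());
    System.out.println("test rows: " + testData.count());

    jsc.stop();
  }
}
```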
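The `JavaSparkSQL` and `JavaSqlNetworkWordCount` hunks also show that `SQLContext.sql()` and the `read()` builders now return `Dataset` on the Java side. A small end-to-end sketch of that flow is below; the JSON path and column names mirror the example above, and the sketch itself is illustrative rather than part of this patch.

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class JavaSqlToDatasetSketch {
  public static void main(String[] args) {
    JavaSparkContext jsc =
      new JavaSparkContext(new SparkConf().setAppName("JavaSqlToDatasetSketch"));
    SQLContext sqlContext = new SQLContext(jsc);

    // Load the sample JSON file used by JavaSparkSQL and register it as a temp table.
    Dataset people = sqlContext.read().json("examples/src/main/resources/people.json");
    people.registerTempTable("people");

    // sql() now hands back a Dataset; collectRows() materializes the result as Row[].
    Dataset teenagers =
      sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
    for (Row r : teenagers.collectRows()) {
      System.out.println("Name: " + r.getString(0));
    }

    jsc.stop();
  }
}
```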