author    Reynold Xin <rxin@databricks.com>  2015-01-16 21:09:06 -0800
committer Reynold Xin <rxin@databricks.com>  2015-01-16 21:09:06 -0800
commit    61b427d4b1c4934bd70ed4da844b64f0e9a377aa (patch)
tree      5068b31119fa7e2256422d4fdf18703ae64d7ab2 /examples
parent    ee1c1f3a04dfe80843432e349f01178e47f02443 (diff)
[SPARK-5193][SQL] Remove Spark SQL Java-specific API.
After the following patches, the main (Scala) API is now usable for Java users directly.

https://github.com/apache/spark/pull/4056
https://github.com/apache/spark/pull/4054
https://github.com/apache/spark/pull/4049
https://github.com/apache/spark/pull/4030
https://github.com/apache/spark/pull/3965
https://github.com/apache/spark/pull/3958

Author: Reynold Xin <rxin@databricks.com>

Closes #4065 from rxin/sql-java-api and squashes the following commits:

b1fd860 [Reynold Xin] Fix Mima
6d86578 [Reynold Xin] Ok one more attempt in fixing Python...
e8f1455 [Reynold Xin] Fix Python again...
3e53f91 [Reynold Xin] Fixed Python.
83735da [Reynold Xin] Fix BigDecimal test.
e9f1de3 [Reynold Xin] Use scala BigDecimal.
500d2c4 [Reynold Xin] Fix Decimal.
ba3bfa2 [Reynold Xin] Updated javadoc for RowFactory.
c4ae1c5 [Reynold Xin] [SPARK-5193][SQL] Remove Spark SQL Java-specific API.
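[Editor's note] For orientation, a minimal, self-contained sketch of what the unified API looks like from Java after this change. The calls mirror the updated examples in the diff below; the Person bean and the UnifiedApiSketch class name are illustrative, not part of the commit.

    import java.io.Serializable;
    import java.util.Arrays;

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SQLContext;
    import org.apache.spark.sql.SchemaRDD;

    public class UnifiedApiSketch {
      // A JavaBean, as used by applySchema in JavaSparkSQL.java below (illustrative).
      public static class Person implements Serializable {
        private String name;
        private int age;
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public int getAge() { return age; }
        public void setAge(int age) { this.age = age; }
      }

      public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("UnifiedApiSketch");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // The Scala SQLContext is constructed directly; no JavaSQLContext wrapper.
        SQLContext sqlCtx = new SQLContext(jsc);

        Person p = new Person();
        p.setName("Yin");
        p.setAge(15);

        // applySchema accepts a JavaRDD of beans and returns a (Scala) SchemaRDD.
        SchemaRDD people = sqlCtx.applySchema(jsc.parallelize(Arrays.asList(p)), Person.class);
        people.registerTempTable("people");

        // sql() also returns a SchemaRDD; Row access works as before.
        SchemaRDD teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
        for (Row r : teenagers.collect()) {
          System.out.println("Name: " + r.getString(0));
        }
        jsc.stop();
      }
    }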
Diffstat (limited to 'examples')
-rw-r--r-- examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java            | 15
-rw-r--r-- examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java              | 14
-rw-r--r-- examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java | 17
-rw-r--r-- examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java                        | 32
4 files changed, 38 insertions(+), 40 deletions(-)
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
index f4b4f8d8c7..247d2a5e31 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
@@ -33,9 +33,9 @@ import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.tuning.CrossValidator;
import org.apache.spark.ml.tuning.CrossValidatorModel;
import org.apache.spark.ml.tuning.ParamGridBuilder;
-import org.apache.spark.sql.api.java.JavaSQLContext;
-import org.apache.spark.sql.api.java.JavaSchemaRDD;
-import org.apache.spark.sql.api.java.Row;
+import org.apache.spark.sql.SchemaRDD;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.Row;
/**
* A simple example demonstrating model selection using CrossValidator.
@@ -55,7 +55,7 @@ public class JavaCrossValidatorExample {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("JavaCrossValidatorExample");
JavaSparkContext jsc = new JavaSparkContext(conf);
- JavaSQLContext jsql = new JavaSQLContext(jsc);
+ SQLContext jsql = new SQLContext(jsc);
// Prepare training documents, which are labeled.
List<LabeledDocument> localTraining = Lists.newArrayList(
@@ -71,8 +71,7 @@ public class JavaCrossValidatorExample {
new LabeledDocument(9L, "a e c l", 0.0),
new LabeledDocument(10L, "spark compile", 1.0),
new LabeledDocument(11L, "hadoop software", 0.0));
- JavaSchemaRDD training =
- jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class);
+ SchemaRDD training = jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class);
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
Tokenizer tokenizer = new Tokenizer()
@@ -113,11 +112,11 @@ public class JavaCrossValidatorExample {
new Document(5L, "l m n"),
new Document(6L, "mapreduce spark"),
new Document(7L, "apache hadoop"));
- JavaSchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), Document.class);
+ SchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), Document.class);
// Make predictions on test documents. cvModel uses the best model found (lrModel).
cvModel.transform(test).registerAsTable("prediction");
- JavaSchemaRDD predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction");
+ SchemaRDD predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction");
for (Row r: predictions.collect()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> score=" + r.get(2)
+ ", prediction=" + r.get(3));
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
index e25b271777..5b92655e2e 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
@@ -28,9 +28,9 @@ import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.sql.api.java.JavaSQLContext;
-import org.apache.spark.sql.api.java.JavaSchemaRDD;
-import org.apache.spark.sql.api.java.Row;
+import org.apache.spark.sql.SchemaRDD;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.Row;
/**
* A simple example demonstrating ways to specify parameters for Estimators and Transformers.
@@ -44,7 +44,7 @@ public class JavaSimpleParamsExample {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("JavaSimpleParamsExample");
JavaSparkContext jsc = new JavaSparkContext(conf);
- JavaSQLContext jsql = new JavaSQLContext(jsc);
+ SQLContext jsql = new SQLContext(jsc);
// Prepare training data.
// We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
@@ -54,7 +54,7 @@ public class JavaSimpleParamsExample {
new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)));
- JavaSchemaRDD training = jsql.applySchema(jsc.parallelize(localTraining), LabeledPoint.class);
+ SchemaRDD training = jsql.applySchema(jsc.parallelize(localTraining), LabeledPoint.class);
// Create a LogisticRegression instance. This instance is an Estimator.
LogisticRegression lr = new LogisticRegression();
@@ -94,14 +94,14 @@ public class JavaSimpleParamsExample {
new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)));
- JavaSchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), LabeledPoint.class);
+ SchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), LabeledPoint.class);
// Make predictions on test documents using the Transformer.transform() method.
// LogisticRegression.transform will only use the 'features' column.
// Note that model2.transform() outputs a 'probability' column instead of the usual 'score'
// column since we renamed the lr.scoreCol parameter previously.
model2.transform(test).registerAsTable("results");
- JavaSchemaRDD results =
+ SchemaRDD results =
jsql.sql("SELECT features, label, probability, prediction FROM results");
for (Row r: results.collect()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
index 54f18014e4..74db449fad 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
@@ -21,6 +21,7 @@ import java.util.List;
import com.google.common.collect.Lists;
+import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
@@ -28,10 +29,9 @@ import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.sql.api.java.JavaSQLContext;
-import org.apache.spark.sql.api.java.JavaSchemaRDD;
-import org.apache.spark.sql.api.java.Row;
-import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SchemaRDD;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.Row;
/**
* A simple text classification pipeline that recognizes "spark" from input text. It uses the Java
@@ -46,7 +46,7 @@ public class JavaSimpleTextClassificationPipeline {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("JavaSimpleTextClassificationPipeline");
JavaSparkContext jsc = new JavaSparkContext(conf);
- JavaSQLContext jsql = new JavaSQLContext(jsc);
+ SQLContext jsql = new SQLContext(jsc);
// Prepare training documents, which are labeled.
List<LabeledDocument> localTraining = Lists.newArrayList(
@@ -54,8 +54,7 @@ public class JavaSimpleTextClassificationPipeline {
new LabeledDocument(1L, "b d", 0.0),
new LabeledDocument(2L, "spark f g h", 1.0),
new LabeledDocument(3L, "hadoop mapreduce", 0.0));
- JavaSchemaRDD training =
- jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class);
+ SchemaRDD training = jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class);
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
Tokenizer tokenizer = new Tokenizer()
@@ -80,11 +79,11 @@ public class JavaSimpleTextClassificationPipeline {
new Document(5L, "l m n"),
new Document(6L, "mapreduce spark"),
new Document(7L, "apache hadoop"));
- JavaSchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), Document.class);
+ SchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), Document.class);
// Make predictions on test documents.
model.transform(test).registerAsTable("prediction");
- JavaSchemaRDD predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction");
+ SchemaRDD predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction");
for (Row r: predictions.collect()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> score=" + r.get(2)
+ ", prediction=" + r.get(3));
diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
index 01c77bd443..b70804635d 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
@@ -26,9 +26,9 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
-import org.apache.spark.sql.api.java.JavaSQLContext;
-import org.apache.spark.sql.api.java.JavaSchemaRDD;
-import org.apache.spark.sql.api.java.Row;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SchemaRDD;
+import org.apache.spark.sql.Row;
public class JavaSparkSQL {
public static class Person implements Serializable {
@@ -55,7 +55,7 @@ public class JavaSparkSQL {
public static void main(String[] args) throws Exception {
SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL");
JavaSparkContext ctx = new JavaSparkContext(sparkConf);
- JavaSQLContext sqlCtx = new JavaSQLContext(ctx);
+ SQLContext sqlCtx = new SQLContext(ctx);
System.out.println("=== Data source: RDD ===");
// Load a text file and convert each line to a Java Bean.
@@ -74,15 +74,15 @@ public class JavaSparkSQL {
});
// Apply a schema to an RDD of Java Beans and register it as a table.
- JavaSchemaRDD schemaPeople = sqlCtx.applySchema(people, Person.class);
+ SchemaRDD schemaPeople = sqlCtx.applySchema(people, Person.class);
schemaPeople.registerTempTable("people");
// SQL can be run over RDDs that have been registered as tables.
- JavaSchemaRDD teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
+ SchemaRDD teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
// The results of SQL queries are SchemaRDDs and support all the normal RDD operations.
// The columns of a row in the result can be accessed by ordinal.
- List<String> teenagerNames = teenagers.map(new Function<Row, String>() {
+ List<String> teenagerNames = teenagers.toJavaRDD().map(new Function<Row, String>() {
@Override
public String call(Row row) {
return "Name: " + row.getString(0);
@@ -99,13 +99,13 @@ public class JavaSparkSQL {
// Read in the parquet file created above.
// Parquet files are self-describing so the schema is preserved.
// The result of loading a parquet file is also a JavaSchemaRDD.
- JavaSchemaRDD parquetFile = sqlCtx.parquetFile("people.parquet");
+ SchemaRDD parquetFile = sqlCtx.parquetFile("people.parquet");
//Parquet files can also be registered as tables and then used in SQL statements.
parquetFile.registerTempTable("parquetFile");
- JavaSchemaRDD teenagers2 =
+ SchemaRDD teenagers2 =
sqlCtx.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
- teenagerNames = teenagers2.map(new Function<Row, String>() {
+ teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() {
@Override
public String call(Row row) {
return "Name: " + row.getString(0);
@@ -120,7 +120,7 @@ public class JavaSparkSQL {
// The path can be either a single text file or a directory storing text files.
String path = "examples/src/main/resources/people.json";
// Create a JavaSchemaRDD from the file(s) pointed by path
- JavaSchemaRDD peopleFromJsonFile = sqlCtx.jsonFile(path);
+ SchemaRDD peopleFromJsonFile = sqlCtx.jsonFile(path);
// Because the schema of a JSON dataset is automatically inferred, to write queries,
// it is better to take a look at what is the schema.
@@ -134,11 +134,11 @@ public class JavaSparkSQL {
peopleFromJsonFile.registerTempTable("people");
// SQL statements can be run by using the sql methods provided by sqlCtx.
- JavaSchemaRDD teenagers3 = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
+ SchemaRDD teenagers3 = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
// The results of SQL queries are JavaSchemaRDDs and support all the normal RDD operations.
// The columns of a row in the result can be accessed by ordinal.
- teenagerNames = teenagers3.map(new Function<Row, String>() {
+ teenagerNames = teenagers3.toJavaRDD().map(new Function<Row, String>() {
@Override
public String call(Row row) { return "Name: " + row.getString(0); }
}).collect();
@@ -151,7 +151,7 @@ public class JavaSparkSQL {
List<String> jsonData = Arrays.asList(
"{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
- JavaSchemaRDD peopleFromJsonRDD = sqlCtx.jsonRDD(anotherPeopleRDD);
+ SchemaRDD peopleFromJsonRDD = sqlCtx.jsonRDD(anotherPeopleRDD.rdd());
// Take a look at the schema of this new JavaSchemaRDD.
peopleFromJsonRDD.printSchema();
@@ -164,8 +164,8 @@ public class JavaSparkSQL {
peopleFromJsonRDD.registerTempTable("people2");
- JavaSchemaRDD peopleWithCity = sqlCtx.sql("SELECT name, address.city FROM people2");
- List<String> nameAndCity = peopleWithCity.map(new Function<Row, String>() {
+ SchemaRDD peopleWithCity = sqlCtx.sql("SELECT name, address.city FROM people2");
+ List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() {
@Override
public String call(Row row) {
return "Name: " + row.getString(0) + ", City: " + row.getString(1);