aboutsummaryrefslogtreecommitdiff
path: root/examples/src
diff options
context:
space:
mode:
Diffstat (limited to 'examples/src')
-rw-r--r--examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java78
-rw-r--r--examples/src/main/resources/people.json3
2 files changed, 79 insertions, 2 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
index ad5ec84b71..607df3eddd 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java
@@ -18,6 +18,7 @@
package org.apache.spark.examples.sql;
import java.io.Serializable;
+import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
@@ -56,6 +57,7 @@ public class JavaSparkSQL {
JavaSparkContext ctx = new JavaSparkContext(sparkConf);
JavaSQLContext sqlCtx = new JavaSQLContext(ctx);
+ System.out.println("=== Data source: RDD ===");
// Load a text file and convert each line to a Java Bean.
JavaRDD<Person> people = ctx.textFile("examples/src/main/resources/people.txt").map(
new Function<String, Person>() {
@@ -84,16 +86,88 @@ public class JavaSparkSQL {
return "Name: " + row.getString(0);
}
}).collect();
+ for (String name: teenagerNames) {
+ System.out.println(name);
+ }
+ System.out.println("=== Data source: Parquet File ===");
// JavaSchemaRDDs can be saved as parquet files, maintaining the schema information.
schemaPeople.saveAsParquetFile("people.parquet");
- // Read in the parquet file created above. Parquet files are self-describing so the schema is preserved.
+ // Read in the parquet file created above.
+ // Parquet files are self-describing so the schema is preserved.
// The result of loading a parquet file is also a JavaSchemaRDD.
JavaSchemaRDD parquetFile = sqlCtx.parquetFile("people.parquet");
//Parquet files can also be registered as tables and then used in SQL statements.
parquetFile.registerAsTable("parquetFile");
- JavaSchemaRDD teenagers2 = sqlCtx.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
+ JavaSchemaRDD teenagers2 =
+ sqlCtx.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
+ teenagerNames = teenagers2.map(new Function<Row, String>() {
+ public String call(Row row) {
+ return "Name: " + row.getString(0);
+ }
+ }).collect();
+ for (String name: teenagerNames) {
+ System.out.println(name);
+ }
+
+ System.out.println("=== Data source: JSON Dataset ===");
+ // A JSON dataset is pointed by path.
+ // The path can be either a single text file or a directory storing text files.
+ String path = "examples/src/main/resources/people.json";
+ // Create a JavaSchemaRDD from the file(s) pointed by path
+ JavaSchemaRDD peopleFromJsonFile = sqlCtx.jsonFile(path);
+
+ // Because the schema of a JSON dataset is automatically inferred, to write queries,
+ // it is better to take a look at what is the schema.
+ peopleFromJsonFile.printSchema();
+ // The schema of people is ...
+ // root
+ // |-- age: IntegerType
+ // |-- name: StringType
+
+ // Register this JavaSchemaRDD as a table.
+ peopleFromJsonFile.registerAsTable("people");
+
+ // SQL statements can be run by using the sql methods provided by sqlCtx.
+ JavaSchemaRDD teenagers3 = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
+
+ // The results of SQL queries are JavaSchemaRDDs and support all the normal RDD operations.
+ // The columns of a row in the result can be accessed by ordinal.
+ teenagerNames = teenagers3.map(new Function<Row, String>() {
+ public String call(Row row) { return "Name: " + row.getString(0); }
+ }).collect();
+ for (String name: teenagerNames) {
+ System.out.println(name);
+ }
+
+ // Alternatively, a JavaSchemaRDD can be created for a JSON dataset represented by
+ // a RDD[String] storing one JSON object per string.
+ List<String> jsonData = Arrays.asList(
+ "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
+ JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
+ JavaSchemaRDD peopleFromJsonRDD = sqlCtx.jsonRDD(anotherPeopleRDD);
+
+ // Take a look at the schema of this new JavaSchemaRDD.
+ peopleFromJsonRDD.printSchema();
+ // The schema of anotherPeople is ...
+ // root
+ // |-- address: StructType
+ // | |-- city: StringType
+ // | |-- state: StringType
+ // |-- name: StringType
+
+ peopleFromJsonRDD.registerAsTable("people2");
+
+ JavaSchemaRDD peopleWithCity = sqlCtx.sql("SELECT name, address.city FROM people2");
+ List<String> nameAndCity = peopleWithCity.map(new Function<Row, String>() {
+ public String call(Row row) {
+ return "Name: " + row.getString(0) + ", City: " + row.getString(1);
+ }
+ }).collect();
+ for (String name: nameAndCity) {
+ System.out.println(name);
+ }
}
}
diff --git a/examples/src/main/resources/people.json b/examples/src/main/resources/people.json
new file mode 100644
index 0000000000..50a859cbd7
--- /dev/null
+++ b/examples/src/main/resources/people.json
@@ -0,0 +1,3 @@
+{"name":"Michael"}
+{"name":"Andy", "age":30}
+{"name":"Justin", "age":19}