author      Cheng Lian <lian@databricks.com>        2016-08-02 15:02:40 +0800
committer   Wenchen Fan <wenchen@databricks.com>    2016-08-02 15:02:40 +0800
commit      10e1c0e638774f5d746771b6dd251de2480f94eb (patch)
tree        9aa40fef6c863aceb18243bc0ff8c7a824818cf7 /examples
parent      5184df06b347f86776c8ac87415b8002a5942a35 (diff)
download    spark-10e1c0e638774f5d746771b6dd251de2480f94eb.tar.gz
            spark-10e1c0e638774f5d746771b6dd251de2480f94eb.tar.bz2
            spark-10e1c0e638774f5d746771b6dd251de2480f94eb.zip
[SPARK-16734][EXAMPLES][SQL] Revise examples of all language bindings
## What changes were proposed in this pull request?

This PR makes various minor updates to examples of all language bindings to make sure they are consistent with each other. Some typos and missing parts (JDBC example in Scala/Java/Python) are also fixed.

## How was this patch tested?

Manually tested.

Author: Cheng Lian <lian@databricks.com>

Closes #14368 from liancheng/revise-examples.
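For context, the previously missing JDBC example that this commit adds to the Scala, Java, and Python bindings follows the pattern in the sketch below. This is a minimal, self-contained Scala sketch of that pattern, not part of the diff itself: the object name and app name are illustrative, the connection URL, table name, and credentials are the same placeholders used in the examples, and running it assumes the matching JDBC driver (here PostgreSQL's) is on Spark's classpath.

import org.apache.spark.sql.SparkSession

// Minimal sketch of the JDBC data source read added by this commit.
// All connection values below are placeholders taken from the examples.
object JdbcReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("JDBC read sketch")  // illustrative name
      .getOrCreate()

    // Load a table from an external database as a DataFrame.
    val jdbcDF = spark.read
      .format("jdbc")
      .option("url", "jdbc:postgresql:dbserver")  // placeholder JDBC URL
      .option("dbtable", "schema.tablename")      // placeholder table
      .option("user", "username")                 // placeholder credentials
      .option("password", "password")
      .load()

    jdbcDF.printSchema()
    spark.stop()
  }
}

The R binding exposes the same operation through read.jdbc, as the RSparkSQLExample.R hunk further below shows.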
Diffstat (limited to 'examples')
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java |  23
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java      |   2
-rw-r--r--  examples/src/main/python/sql/basic.py                                              |   2
-rw-r--r--  examples/src/main/python/sql/datasource.py                                         |  32
-rw-r--r--  examples/src/main/python/sql/hive.py                                               |   2
-rw-r--r--  examples/src/main/r/RSparkSQLExample.R                                             | 113
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala   |  22
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala        |   2
8 files changed, 123 insertions(+), 75 deletions(-)
diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
index ec02c8bbb8..52e3b62b79 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
@@ -25,7 +25,6 @@ import java.util.List;
// $example on:basic_parquet_example$
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
-// import org.apache.spark.sql.Encoders;
// $example on:schema_merging$
// $example on:json_dataset$
import org.apache.spark.sql.Dataset;
@@ -92,7 +91,7 @@ public class JavaSQLDataSourceExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
- .appName("Java Spark SQL Data Sources Example")
+ .appName("Java Spark SQL data sources example")
.config("spark.some.config.option", "some-value")
.getOrCreate();
@@ -100,6 +99,7 @@ public class JavaSQLDataSourceExample {
runBasicParquetExample(spark);
runParquetSchemaMergingExample(spark);
runJsonDatasetExample(spark);
+ runJdbcDatasetExample(spark);
spark.stop();
}
@@ -183,10 +183,10 @@ public class JavaSQLDataSourceExample {
// The final schema consists of all 3 columns in the Parquet files together
// with the partitioning column appeared in the partition directory paths
// root
- // |-- value: int (nullable = true)
- // |-- square: int (nullable = true)
- // |-- cube: int (nullable = true)
- // |-- key : int (nullable = true)
+ // |-- value: int (nullable = true)
+ // |-- square: int (nullable = true)
+ // |-- cube: int (nullable = true)
+ // |-- key: int (nullable = true)
// $example off:schema_merging$
}
@@ -216,4 +216,15 @@ public class JavaSQLDataSourceExample {
// $example off:json_dataset$
}
+ private static void runJdbcDatasetExample(SparkSession spark) {
+ // $example on:jdbc_dataset$
+ Dataset<Row> jdbcDF = spark.read()
+ .format("jdbc")
+ .option("url", "jdbc:postgresql:dbserver")
+ .option("dbtable", "schema.tablename")
+ .option("user", "username")
+ .option("password", "password")
+ .load();
+ // $example off:jdbc_dataset$
+ }
}
diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java
index afc18078d4..cff9032f52 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java
@@ -88,7 +88,7 @@ public class JavaSparkSQLExample {
// $example on:init_session$
SparkSession spark = SparkSession
.builder()
- .appName("Java Spark SQL Example")
+ .appName("Java Spark SQL basic example")
.config("spark.some.config.option", "some-value")
.getOrCreate();
// $example off:init_session$
diff --git a/examples/src/main/python/sql/basic.py b/examples/src/main/python/sql/basic.py
index 74f5009581..fdc017aed9 100644
--- a/examples/src/main/python/sql/basic.py
+++ b/examples/src/main/python/sql/basic.py
@@ -182,7 +182,7 @@ if __name__ == "__main__":
# $example on:init_session$
spark = SparkSession \
.builder \
- .appName("PythonSQL") \
+ .appName("Python Spark SQL basic example") \
.config("spark.some.config.option", "some-value") \
.getOrCreate()
# $example off:init_session$
diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py
index 0bdc3d66ff..b36c901d2b 100644
--- a/examples/src/main/python/sql/datasource.py
+++ b/examples/src/main/python/sql/datasource.py
@@ -92,14 +92,14 @@ def parquet_schema_merging_example(spark):
# The final schema consists of all 3 columns in the Parquet files together
# with the partitioning column appeared in the partition directory paths.
# root
- # |-- double: long (nullable = true)
- # |-- single: long (nullable = true)
- # |-- triple: long (nullable = true)
- # |-- key: integer (nullable = true)
+ # |-- double: long (nullable = true)
+ # |-- single: long (nullable = true)
+ # |-- triple: long (nullable = true)
+ # |-- key: integer (nullable = true)
# $example off:schema_merging$
-def json_dataset_examplg(spark):
+def json_dataset_example(spark):
# $example on:json_dataset$
# spark is from the previous example.
sc = spark.sparkContext
@@ -112,8 +112,8 @@ def json_dataset_examplg(spark):
# The inferred schema can be visualized using the printSchema() method
peopleDF.printSchema()
# root
- # |-- age: long (nullable = true)
- # |-- name: string (nullable = true)
+ # |-- age: long (nullable = true)
+ # |-- name: string (nullable = true)
# Creates a temporary view using the DataFrame
peopleDF.createOrReplaceTempView("people")
@@ -140,15 +140,29 @@ def json_dataset_examplg(spark):
# +---------------+----+
# $example off:json_dataset$
+
+def jdbc_dataset_example(spark):
+ # $example on:jdbc_dataset$
+ jdbcDF = spark.read \
+ .format("jdbc") \
+ .option("url", "jdbc:postgresql:dbserver") \
+ .option("dbtable", "schema.tablename") \
+ .option("user", "username") \
+ .option("password", "password") \
+ .load()
+ # $example off:jdbc_dataset$
+
+
if __name__ == "__main__":
spark = SparkSession \
.builder \
- .appName("PythonSQL") \
+ .appName("Python Spark SQL data source example") \
.getOrCreate()
basic_datasource_example(spark)
parquet_example(spark)
parquet_schema_merging_example(spark)
- json_dataset_examplg(spark)
+ json_dataset_example(spark)
+ jdbc_dataset_example(spark)
spark.stop()
diff --git a/examples/src/main/python/sql/hive.py b/examples/src/main/python/sql/hive.py
index d9ce5cef1f..9b2a2c4e6a 100644
--- a/examples/src/main/python/sql/hive.py
+++ b/examples/src/main/python/sql/hive.py
@@ -38,7 +38,7 @@ if __name__ == "__main__":
spark = SparkSession \
.builder \
- .appName("PythonSQL") \
+ .appName("Python Spark SQL Hive integration example") \
.config("spark.sql.warehouse.dir", warehouse_location) \
.enableHiveSupport() \
.getOrCreate()
diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R
index 33e88e15fd..de489e1bda 100644
--- a/examples/src/main/r/RSparkSQLExample.R
+++ b/examples/src/main/r/RSparkSQLExample.R
@@ -18,31 +18,43 @@
library(SparkR)
# $example on:init_session$
-sparkR.session(appName = "MyApp", sparkConfig = list(spark.executor.memory = "1g"))
+sparkR.session(appName = "MyApp", sparkConfig = list(spark.some.config.option = "some-value"))
# $example off:init_session$
-# $example on:create_DataFrames$
+# $example on:create_df$
df <- read.json("examples/src/main/resources/people.json")
# Displays the content of the DataFrame
head(df)
+## age name
+## 1 NA Michael
+## 2 30 Andy
+## 3 19 Justin
# Another method to print the first few rows and optionally truncate the printing of long values
showDF(df)
-# $example off:create_DataFrames$
+## +----+-------+
+## | age| name|
+## +----+-------+
+## |null|Michael|
+## | 30| Andy|
+## | 19| Justin|
+## +----+-------+
+# $example off:create_df$
-# $example on:dataframe_operations$
+# $example on:untyped_ops$
# Create the DataFrame
df <- read.json("examples/src/main/resources/people.json")
# Show the content of the DataFrame
head(df)
-## age name
-## null Michael
-## 30 Andy
-## 19 Justin
+## age name
+## 1 NA Michael
+## 2 30 Andy
+## 3 19 Justin
+
# Print the schema in a tree format
printSchema(df)
@@ -52,58 +64,58 @@ printSchema(df)
# Select only the "name" column
head(select(df, "name"))
-## name
-## Michael
-## Andy
-## Justin
+## name
+## 1 Michael
+## 2 Andy
+## 3 Justin
# Select everybody, but increment the age by 1
head(select(df, df$name, df$age + 1))
-## name (age + 1)
-## Michael null
-## Andy 31
-## Justin 20
+## name (age + 1.0)
+## 1 Michael NA
+## 2 Andy 31
+## 3 Justin 20
# Select people older than 21
head(where(df, df$age > 21))
-## age name
-## 30 Andy
+## age name
+## 1 30 Andy
# Count people by age
head(count(groupBy(df, "age")))
-## age count
-## null 1
-## 19 1
-## 30 1
-# $example off:dataframe_operations$
+## age count
+## 1 19 1
+## 2 NA 1
+## 3 30 1
+# $example off:untyped_ops$
# Register this DataFrame as a table.
createOrReplaceTempView(df, "table")
-# $example on:sql_query$
+# $example on:run_sql$
df <- sql("SELECT * FROM table")
-# $example off:sql_query$
+# $example off:run_sql$
-# $example on:source_parquet$
+# $example on:generic_load_save_functions$
df <- read.df("examples/src/main/resources/users.parquet")
write.df(select(df, "name", "favorite_color"), "namesAndFavColors.parquet")
-# $example off:source_parquet$
+# $example off:generic_load_save_functions$
-# $example on:source_json$
+# $example on:manual_load_options$
df <- read.df("examples/src/main/resources/people.json", "json")
namesAndAges <- select(df, "name", "age")
write.df(namesAndAges, "namesAndAges.parquet", "parquet")
-# $example off:source_json$
+# $example off:manual_load_options$
-# $example on:direct_query$
+# $example on:direct_sql$
df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
-# $example off:direct_query$
+# $example off:direct_sql$
-# $example on:load_programmatically$
+# $example on:basic_parquet_example$
df <- read.df("examples/src/main/resources/people.json", "json")
# SparkDataFrame can be saved as Parquet files, maintaining the schema information.
@@ -117,7 +129,7 @@ parquetFile <- read.parquet("people.parquet")
createOrReplaceTempView(parquetFile, "parquetFile")
teenagers <- sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
head(teenagers)
-## name
+## name
## 1 Justin
# We can also run custom R-UDFs on Spark DataFrames. Here we prefix all the names with "Name:"
@@ -129,7 +141,7 @@ for (teenName in collect(teenNames)$name) {
## Name: Michael
## Name: Andy
## Name: Justin
-# $example off:load_programmatically$
+# $example off:basic_parquet_example$
# $example on:schema_merging$
@@ -146,18 +158,17 @@ write.df(df2, "data/test_table/key=2", "parquet", "overwrite")
# Read the partitioned table
df3 <- read.df("data/test_table", "parquet", mergeSchema = "true")
printSchema(df3)
-
# The final schema consists of all 3 columns in the Parquet files together
-# with the partitioning column appeared in the partition directory paths.
-# root
-# |-- single: double (nullable = true)
-# |-- double: double (nullable = true)
-# |-- triple: double (nullable = true)
-# |-- key : int (nullable = true)
+# with the partitioning column appeared in the partition directory paths
+## root
+## |-- single: double (nullable = true)
+## |-- double: double (nullable = true)
+## |-- triple: double (nullable = true)
+## |-- key: integer (nullable = true)
# $example off:schema_merging$
-# $example on:load_json_file$
+# $example on:json_dataset$
# A JSON dataset is pointed to by path.
# The path can be either a single text file or a directory storing text files.
path <- "examples/src/main/resources/people.json"
@@ -166,9 +177,9 @@ people <- read.json(path)
# The inferred schema can be visualized using the printSchema() method.
printSchema(people)
-# root
-# |-- age: long (nullable = true)
-# |-- name: string (nullable = true)
+## root
+## |-- age: long (nullable = true)
+## |-- name: string (nullable = true)
# Register this DataFrame as a table.
createOrReplaceTempView(people, "people")
@@ -176,12 +187,12 @@ createOrReplaceTempView(people, "people")
# SQL statements can be run by using the sql methods.
teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
head(teenagers)
-## name
+## name
## 1 Justin
-# $example off:load_json_file$
+# $example off:json_dataset$
-# $example on:hive_table$
+# $example on:spark_hive$
# enableHiveSupport defaults to TRUE
sparkR.session(enableHiveSupport = TRUE)
sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
@@ -189,12 +200,12 @@ sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src
# Queries can be expressed in HiveQL.
results <- collect(sql("FROM src SELECT key, value"))
-# $example off:hive_table$
+# $example off:spark_hive$
-# $example on:jdbc$
+# $example on:jdbc_dataset$
df <- read.jdbc("jdbc:postgresql:dbserver", "schema.tablename", user = "username", password = "password")
-# $example off:jdbc$
+# $example off:jdbc_dataset$
# Stop the SparkSession now
sparkR.session.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
index 0caba12af0..dc3915a488 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
@@ -25,7 +25,7 @@ object SQLDataSourceExample {
def main(args: Array[String]) {
val spark = SparkSession
.builder()
- .appName("Spark SQL Data Soures Example")
+ .appName("Spark SQL data sources example")
.config("spark.some.config.option", "some-value")
.getOrCreate()
@@ -33,6 +33,7 @@ object SQLDataSourceExample {
runBasicParquetExample(spark)
runParquetSchemaMergingExample(spark)
runJsonDatasetExample(spark)
+ runJdbcDatasetExample(spark)
spark.stop()
}
@@ -99,10 +100,10 @@ object SQLDataSourceExample {
// The final schema consists of all 3 columns in the Parquet files together
// with the partitioning column appeared in the partition directory paths
// root
- // |-- value: int (nullable = true)
- // |-- square: int (nullable = true)
- // |-- cube: int (nullable = true)
- // |-- key : int (nullable = true)
+ // |-- value: int (nullable = true)
+ // |-- square: int (nullable = true)
+ // |-- cube: int (nullable = true)
+ // |-- key: int (nullable = true)
// $example off:schema_merging$
}
@@ -145,4 +146,15 @@ object SQLDataSourceExample {
// $example off:json_dataset$
}
+ private def runJdbcDatasetExample(spark: SparkSession): Unit = {
+ // $example on:jdbc_dataset$
+ val jdbcDF = spark.read
+ .format("jdbc")
+ .option("url", "jdbc:postgresql:dbserver")
+ .option("dbtable", "schema.tablename")
+ .option("user", "username")
+ .option("password", "password")
+ .load()
+ // $example off:jdbc_dataset$
+ }
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala
index 952c074d03..5cd437d017 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala
@@ -42,7 +42,7 @@ object SparkSQLExample {
// $example on:init_session$
val spark = SparkSession
.builder()
- .appName("Spark SQL Example")
+ .appName("Spark SQL basic example")
.config("spark.some.config.option", "some-value")
.getOrCreate()