diff options
author | Xin Ren <iamshrek@126.com> | 2016-07-11 20:05:28 +0800 |
---|---|---|
committer | Cheng Lian <lian@databricks.com> | 2016-07-11 20:05:28 +0800 |
commit | 9cb1eb7af779e74165552977002158a7dad9bb09 (patch) | |
tree | c4a3ae1c35526fcc7c05faf2a27954749166dc95 /examples/src/main/r/RSparkSQLExample.R | |
parent | e22627894126dceb7491300b63f1fe028b1e2e2c (diff) | |
download | spark-9cb1eb7af779e74165552977002158a7dad9bb09.tar.gz spark-9cb1eb7af779e74165552977002158a7dad9bb09.tar.bz2 spark-9cb1eb7af779e74165552977002158a7dad9bb09.zip |
[SPARK-16381][SQL][SPARKR] Update SQL examples and programming guide for R language binding
https://issues.apache.org/jira/browse/SPARK-16381
## What changes were proposed in this pull request?
Update SQL examples and programming guide for R language binding.
Here I just follow example https://github.com/apache/spark/compare/master...liancheng:example-snippet-extraction, created a separate R file to store all the example code.
## How was this patch tested?
Manual test on my local machine.
Screenshot as below:
![screen shot 2016-07-06 at 4 52 25 pm](https://cloud.githubusercontent.com/assets/3925641/16638180/13925a58-439a-11e6-8d57-8451a63dcae9.png)
Author: Xin Ren <iamshrek@126.com>
Closes #14082 from keypointt/SPARK-16381.
Diffstat (limited to 'examples/src/main/r/RSparkSQLExample.R')
-rw-r--r-- | examples/src/main/r/RSparkSQLExample.R | 197 |
1 files changed, 197 insertions, 0 deletions
diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R new file mode 100644 index 0000000000..eba3f1b91e --- /dev/null +++ b/examples/src/main/r/RSparkSQLExample.R @@ -0,0 +1,197 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +library(SparkR) + +# $example on:init_session$ +sparkR.session(appName = "MyApp", sparkConfig = list(spark.executor.memory = "1g")) +# $example off:init_session$ + + +# $example on:create_DataFrames$ +df <- read.json("examples/src/main/resources/people.json") + +# Displays the content of the DataFrame +head(df) + +# Another method to print the first few rows and optionally truncate the printing of long values +showDF(df) +# $example off:create_DataFrames$ + + +# $example on:dataframe_operations$ +# Create the DataFrame +df <- read.json("examples/src/main/resources/people.json") + +# Show the content of the DataFrame +head(df) +## age name +## null Michael +## 30 Andy +## 19 Justin + +# Print the schema in a tree format +printSchema(df) +## root +## |-- age: long (nullable = true) +## |-- name: string (nullable = true) + +# Select only the "name" column +head(select(df, "name")) +## name +## Michael +## Andy +## Justin + +# Select everybody, but increment the age by 1 +head(select(df, df$name, df$age + 1)) +## name (age + 1) +## Michael null +## Andy 31 +## Justin 20 + +# Select people older than 21 +head(where(df, df$age > 21)) +## age name +## 30 Andy + +# Count people by age +head(count(groupBy(df, "age"))) +## age count +## null 1 +## 19 1 +## 30 1 +# $example off:dataframe_operations$ + + +# Register this DataFrame as a table. +createOrReplaceTempView(df, "table") +# $example on:sql_query$ +df <- sql("SELECT * FROM table") +# $example off:sql_query$ + + +# $example on:source_parquet$ +df <- read.df("examples/src/main/resources/users.parquet") +write.df(select(df, "name", "favorite_color"), "namesAndFavColors.parquet") +# $example off:source_parquet$ + + +# $example on:source_json$ +df <- read.df("examples/src/main/resources/people.json", "json") +namesAndAges <- select(df, "name", "age") +write.df(namesAndAges, "namesAndAges.parquet", "parquet") +# $example off:source_json$ + + +# $example on:direct_query$ +df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") +# $example off:direct_query$ + + +# $example on:load_programmatically$ +df <- read.df("examples/src/main/resources/people.json", "json") + +# SparkDataFrame can be saved as Parquet files, maintaining the schema information. +write.parquet(df, "people.parquet") + +# Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved. +# The result of loading a parquet file is also a DataFrame. +parquetFile <- read.parquet("people.parquet") + +# Parquet files can also be used to create a temporary view and then used in SQL statements. +createOrReplaceTempView(parquetFile, "parquetFile") +teenagers <- sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19") +head(teenagers) +## name +## 1 Justin + +# We can also run custom R-UDFs on Spark DataFrames. Here we prefix all the names with "Name:" +schema <- structType(structField("name", "string")) +teenNames <- dapply(df, function(p) { cbind(paste("Name:", p$name)) }, schema) +for (teenName in collect(teenNames)$name) { + cat(teenName, "\n") +} +## Name: Michael +## Name: Andy +## Name: Justin +# $example off:load_programmatically$ + + +# $example on:schema_merging$ +df1 <- createDataFrame(data.frame(single=c(12, 29), double=c(19, 23))) +df2 <- createDataFrame(data.frame(double=c(19, 23), triple=c(23, 18))) + +# Create a simple DataFrame, stored into a partition directory +write.df(df1, "data/test_table/key=1", "parquet", "overwrite") + +# Create another DataFrame in a new partition directory, +# adding a new column and dropping an existing column +write.df(df2, "data/test_table/key=2", "parquet", "overwrite") + +# Read the partitioned table +df3 <- read.df("data/test_table", "parquet", mergeSchema="true") +printSchema(df3) + +# The final schema consists of all 3 columns in the Parquet files together +# with the partitioning column appeared in the partition directory paths. +# root +# |-- single: double (nullable = true) +# |-- double: double (nullable = true) +# |-- triple: double (nullable = true) +# |-- key : int (nullable = true) +# $example off:schema_merging$ + + +# $example on:load_json_file$ +# A JSON dataset is pointed to by path. +# The path can be either a single text file or a directory storing text files. +path <- "examples/src/main/resources/people.json" +# Create a DataFrame from the file(s) pointed to by path +people <- read.json(path) + +# The inferred schema can be visualized using the printSchema() method. +printSchema(people) +# root +# |-- age: long (nullable = true) +# |-- name: string (nullable = true) + +# Register this DataFrame as a table. +createOrReplaceTempView(people, "people") + +# SQL statements can be run by using the sql methods. +teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") +head(teenagers) +## name +## 1 Justin +# $example off:load_json_file$ + + +# $example on:hive_table$ +# enableHiveSupport defaults to TRUE +sparkR.session(enableHiveSupport = TRUE) +sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") +sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src") + +# Queries can be expressed in HiveQL. +results <- collect(sql("FROM src SELECT key, value")) +# $example off:hive_table$ + + +# $example on:jdbc$ +df <- read.jdbc("jdbc:postgresql:dbserver", "schema.tablename", user = "username", password = "password") +# $example off:jdbc$ |