aboutsummaryrefslogtreecommitdiff
path: root/examples/src/main/r
diff options
context:
space:
mode:
authorXin Ren <iamshrek@126.com>2016-07-11 20:05:28 +0800
committerCheng Lian <lian@databricks.com>2016-07-11 20:05:28 +0800
commit9cb1eb7af779e74165552977002158a7dad9bb09 (patch)
treec4a3ae1c35526fcc7c05faf2a27954749166dc95 /examples/src/main/r
parente22627894126dceb7491300b63f1fe028b1e2e2c (diff)
downloadspark-9cb1eb7af779e74165552977002158a7dad9bb09.tar.gz
spark-9cb1eb7af779e74165552977002158a7dad9bb09.tar.bz2
spark-9cb1eb7af779e74165552977002158a7dad9bb09.zip
[SPARK-16381][SQL][SPARKR] Update SQL examples and programming guide for R language binding
https://issues.apache.org/jira/browse/SPARK-16381 ## What changes were proposed in this pull request? Update SQL examples and programming guide for R language binding. Here I just follow example https://github.com/apache/spark/compare/master...liancheng:example-snippet-extraction, created a separate R file to store all the example code. ## How was this patch tested? Manual test on my local machine. Screenshot as below: ![screen shot 2016-07-06 at 4 52 25 pm](https://cloud.githubusercontent.com/assets/3925641/16638180/13925a58-439a-11e6-8d57-8451a63dcae9.png) Author: Xin Ren <iamshrek@126.com> Closes #14082 from keypointt/SPARK-16381.
Diffstat (limited to 'examples/src/main/r')
-rw-r--r--examples/src/main/r/RSparkSQLExample.R197
-rw-r--r--examples/src/main/r/dataframe.R2
-rw-r--r--examples/src/main/r/ml.R2
3 files changed, 199 insertions, 2 deletions
diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R
new file mode 100644
index 0000000000..eba3f1b91e
--- /dev/null
+++ b/examples/src/main/r/RSparkSQLExample.R
@@ -0,0 +1,197 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(SparkR)
+
+# $example on:init_session$
+sparkR.session(appName = "MyApp", sparkConfig = list(spark.executor.memory = "1g"))
+# $example off:init_session$
+
+
+# $example on:create_DataFrames$
+df <- read.json("examples/src/main/resources/people.json")
+
+# Displays the content of the DataFrame
+head(df)
+
+# Another method to print the first few rows and optionally truncate the printing of long values
+showDF(df)
+# $example off:create_DataFrames$
+
+
+# $example on:dataframe_operations$
+# Create the DataFrame
+df <- read.json("examples/src/main/resources/people.json")
+
+# Show the content of the DataFrame
+head(df)
+## age name
+## null Michael
+## 30 Andy
+## 19 Justin
+
+# Print the schema in a tree format
+printSchema(df)
+## root
+## |-- age: long (nullable = true)
+## |-- name: string (nullable = true)
+
+# Select only the "name" column
+head(select(df, "name"))
+## name
+## Michael
+## Andy
+## Justin
+
+# Select everybody, but increment the age by 1
+head(select(df, df$name, df$age + 1))
+## name (age + 1)
+## Michael null
+## Andy 31
+## Justin 20
+
+# Select people older than 21
+head(where(df, df$age > 21))
+## age name
+## 30 Andy
+
+# Count people by age
+head(count(groupBy(df, "age")))
+## age count
+## null 1
+## 19 1
+## 30 1
+# $example off:dataframe_operations$
+
+
+# Register this DataFrame as a table.
+createOrReplaceTempView(df, "table")
+# $example on:sql_query$
+df <- sql("SELECT * FROM table")
+# $example off:sql_query$
+
+
+# $example on:source_parquet$
+df <- read.df("examples/src/main/resources/users.parquet")
+write.df(select(df, "name", "favorite_color"), "namesAndFavColors.parquet")
+# $example off:source_parquet$
+
+
+# $example on:source_json$
+df <- read.df("examples/src/main/resources/people.json", "json")
+namesAndAges <- select(df, "name", "age")
+write.df(namesAndAges, "namesAndAges.parquet", "parquet")
+# $example off:source_json$
+
+
+# $example on:direct_query$
+df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
+# $example off:direct_query$
+
+
+# $example on:load_programmatically$
+df <- read.df("examples/src/main/resources/people.json", "json")
+
+# SparkDataFrame can be saved as Parquet files, maintaining the schema information.
+write.parquet(df, "people.parquet")
+
+# Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.
+# The result of loading a parquet file is also a DataFrame.
+parquetFile <- read.parquet("people.parquet")
+
+# Parquet files can also be used to create a temporary view and then used in SQL statements.
+createOrReplaceTempView(parquetFile, "parquetFile")
+teenagers <- sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
+head(teenagers)
+## name
+## 1 Justin
+
+# We can also run custom R-UDFs on Spark DataFrames. Here we prefix all the names with "Name:"
+schema <- structType(structField("name", "string"))
+teenNames <- dapply(df, function(p) { cbind(paste("Name:", p$name)) }, schema)
+for (teenName in collect(teenNames)$name) {
+ cat(teenName, "\n")
+}
+## Name: Michael
+## Name: Andy
+## Name: Justin
+# $example off:load_programmatically$
+
+
+# $example on:schema_merging$
+df1 <- createDataFrame(data.frame(single=c(12, 29), double=c(19, 23)))
+df2 <- createDataFrame(data.frame(double=c(19, 23), triple=c(23, 18)))
+
+# Create a simple DataFrame, stored into a partition directory
+write.df(df1, "data/test_table/key=1", "parquet", "overwrite")
+
+# Create another DataFrame in a new partition directory,
+# adding a new column and dropping an existing column
+write.df(df2, "data/test_table/key=2", "parquet", "overwrite")
+
+# Read the partitioned table
+df3 <- read.df("data/test_table", "parquet", mergeSchema="true")
+printSchema(df3)
+
+# The final schema consists of all 3 columns in the Parquet files together
+# with the partitioning column appeared in the partition directory paths.
+# root
+# |-- single: double (nullable = true)
+# |-- double: double (nullable = true)
+# |-- triple: double (nullable = true)
+# |-- key : int (nullable = true)
+# $example off:schema_merging$
+
+
+# $example on:load_json_file$
+# A JSON dataset is pointed to by path.
+# The path can be either a single text file or a directory storing text files.
+path <- "examples/src/main/resources/people.json"
+# Create a DataFrame from the file(s) pointed to by path
+people <- read.json(path)
+
+# The inferred schema can be visualized using the printSchema() method.
+printSchema(people)
+# root
+# |-- age: long (nullable = true)
+# |-- name: string (nullable = true)
+
+# Register this DataFrame as a table.
+createOrReplaceTempView(people, "people")
+
+# SQL statements can be run by using the sql methods.
+teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
+head(teenagers)
+## name
+## 1 Justin
+# $example off:load_json_file$
+
+
+# $example on:hive_table$
+# enableHiveSupport defaults to TRUE
+sparkR.session(enableHiveSupport = TRUE)
+sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
+sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")
+
+# Queries can be expressed in HiveQL.
+results <- collect(sql("FROM src SELECT key, value"))
+# $example off:hive_table$
+
+
+# $example on:jdbc$
+df <- read.jdbc("jdbc:postgresql:dbserver", "schema.tablename", user = "username", password = "password")
+# $example off:jdbc$
diff --git a/examples/src/main/r/dataframe.R b/examples/src/main/r/dataframe.R
index a377d6e864..295f9b4276 100644
--- a/examples/src/main/r/dataframe.R
+++ b/examples/src/main/r/dataframe.R
@@ -18,7 +18,7 @@
library(SparkR)
# Initialize SparkSession
-sc <- sparkR.session(appName="SparkR-DataFrame-example")
+sc <- sparkR.session(appName = "SparkR-DataFrame-example")
# Create a simple local data.frame
localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18))
diff --git a/examples/src/main/r/ml.R b/examples/src/main/r/ml.R
index 940c98dcb9..65242e68b3 100644
--- a/examples/src/main/r/ml.R
+++ b/examples/src/main/r/ml.R
@@ -22,7 +22,7 @@
library(SparkR)
# Initialize SparkSession
-sparkR.session(appName="SparkR-ML-example")
+sparkR.session(appName = "SparkR-ML-example")
# $example on$
############################ spark.glm and glm ##############################################