diff options
Diffstat (limited to 'examples/src/main/r')
-rw-r--r-- | examples/src/main/r/RSparkSQLExample.R | 197 | ||||
-rw-r--r-- | examples/src/main/r/dataframe.R | 2 | ||||
-rw-r--r-- | examples/src/main/r/ml.R | 2 |
3 files changed, 199 insertions, 2 deletions
diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R new file mode 100644 index 0000000000..eba3f1b91e --- /dev/null +++ b/examples/src/main/r/RSparkSQLExample.R @@ -0,0 +1,197 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

# SparkR SQL example: session setup, creating SparkDataFrames from JSON,
# and basic DataFrame operations. The "$example on/off$" markers are
# consumed by the Spark documentation build — keep them intact.
library(SparkR)

# $example on:init_session$
sparkR.session(appName = "MyApp", sparkConfig = list(spark.executor.memory = "1g"))
# $example off:init_session$


# $example on:create_DataFrames$
df <- read.json("examples/src/main/resources/people.json")

# Displays the content of the DataFrame
head(df)

# Another method to print the first few rows and optionally truncate the printing of long values
showDF(df)
# $example off:create_DataFrames$


# $example on:dataframe_operations$
# Create the DataFrame
df <- read.json("examples/src/main/resources/people.json")

# Show the content of the DataFrame
head(df)
## age name
## null Michael
## 30 Andy
## 19 Justin

# Print the schema in a tree format
printSchema(df)
## root
## |-- age: long (nullable = true)
## |-- name: string (nullable = true)

# Select only the "name" column
head(select(df, "name"))
## name
## Michael
## Andy
## Justin

# Select everybody, but increment the age by 1
head(select(df, df$name, df$age + 1))
## name (age + 1)
## Michael null
## Andy 31
## Justin 20

# Select people older than 21
head(where(df, df$age > 21))
## age name
## 30 Andy

# Count people by age
head(count(groupBy(df, "age")))
## age count
## null 1
## 19 1
## 30 1
# $example off:dataframe_operations$


# Register this DataFrame as a table.
createOrReplaceTempView(df, "table")
# $example on:sql_query$
df <- sql("SELECT * FROM table")
# $example off:sql_query$


# $example on:source_parquet$
# Parquet is the default data source, so no format needs to be specified.
df <- read.df("examples/src/main/resources/users.parquet")
write.df(select(df, "name", "favorite_color"), "namesAndFavColors.parquet")
# $example off:source_parquet$


# $example on:source_json$
# For other formats, pass the source name explicitly.
df <- read.df("examples/src/main/resources/people.json", "json")
namesAndAges <- select(df, "name", "age")
write.df(namesAndAges, "namesAndAges.parquet", "parquet")
# $example off:source_json$


# $example on:direct_query$
# Files can be queried directly with SQL instead of loading them first.
df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
# $example off:direct_query$


# $example on:load_programmatically$
df <- read.df("examples/src/main/resources/people.json", "json")

# SparkDataFrame can be saved as Parquet files, maintaining the schema information.
write.parquet(df, "people.parquet")

# Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.
# The result of loading a parquet file is also a DataFrame.
parquetFile <- read.parquet("people.parquet")

# Parquet files can also be used to create a temporary view and then used in SQL statements.
createOrReplaceTempView(parquetFile, "parquetFile")
teenagers <- sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
head(teenagers)
## name
## 1 Justin

# We can also run custom R-UDFs on Spark DataFrames.
# Here we prefix all the names with "Name:"
schema <- structType(structField("name", "string"))
teenNames <- dapply(df, function(p) { cbind(paste("Name:", p$name)) }, schema)
for (teenName in collect(teenNames)$name) {
  cat(teenName, "\n")
}
## Name: Michael
## Name: Andy
## Name: Justin
# $example off:load_programmatically$


# $example on:schema_merging$
df1 <- createDataFrame(data.frame(single = c(12, 29), double = c(19, 23)))
df2 <- createDataFrame(data.frame(double = c(19, 23), triple = c(23, 18)))

# Create a simple DataFrame, stored into a partition directory
write.df(df1, "data/test_table/key=1", "parquet", "overwrite")

# Create another DataFrame in a new partition directory,
# adding a new column and dropping an existing column
write.df(df2, "data/test_table/key=2", "parquet", "overwrite")

# Read the partitioned table
df3 <- read.df("data/test_table", "parquet", mergeSchema = "true")
printSchema(df3)

# The final schema consists of all 3 columns in the Parquet files together
# with the partitioning column appeared in the partition directory paths.
# root
# |-- single: double (nullable = true)
# |-- double: double (nullable = true)
# |-- triple: double (nullable = true)
# |-- key : int (nullable = true)
# $example off:schema_merging$


# $example on:load_json_file$
# A JSON dataset is pointed to by path.
# The path can be either a single text file or a directory storing text files.
path <- "examples/src/main/resources/people.json"
# Create a DataFrame from the file(s) pointed to by path
people <- read.json(path)

# The inferred schema can be visualized using the printSchema() method.
printSchema(people)
# root
# |-- age: long (nullable = true)
# |-- name: string (nullable = true)

# Register this DataFrame as a table.
createOrReplaceTempView(people, "people")

# SQL statements can be run by using the sql methods.
teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
head(teenagers)
## name
## 1 Justin
# $example off:load_json_file$


# $example on:hive_table$
# enableHiveSupport defaults to TRUE
sparkR.session(enableHiveSupport = TRUE)
sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")

# Queries can be expressed in HiveQL.
results <- collect(sql("FROM src SELECT key, value"))
# $example off:hive_table$


# $example on:jdbc$
df <- read.jdbc("jdbc:postgresql:dbserver", "schema.tablename", user = "username", password = "password")
# $example off:jdbc$
diff --git a/examples/src/main/r/dataframe.R b/examples/src/main/r/dataframe.R
index a377d6e864..295f9b4276 100644
--- a/examples/src/main/r/dataframe.R
+++ b/examples/src/main/r/dataframe.R
@@ -18,7 +18,7 @@
 library(SparkR)

 # Initialize SparkSession
-sc <- sparkR.session(appName="SparkR-DataFrame-example")
+sc <- sparkR.session(appName = "SparkR-DataFrame-example")

 # Create a simple local data.frame
 localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18))
diff --git a/examples/src/main/r/ml.R b/examples/src/main/r/ml.R
index 940c98dcb9..65242e68b3 100644
--- a/examples/src/main/r/ml.R
+++ b/examples/src/main/r/ml.R
@@ -22,7 +22,7 @@
 library(SparkR)

 # Initialize SparkSession
-sparkR.session(appName="SparkR-ML-example")
+sparkR.session(appName = "SparkR-ML-example")

 # $example on$
 ############################ spark.glm and glm ##############################################