author     Daniel Emaasit (PhD Student) <daniel.emaasit@gmail.com>  2015-07-06 10:36:02 -0700
committer  Shivaram Venkataraman <shivaram@cs.berkeley.edu>  2015-07-06 11:08:36 -0700
commit     293225e0cd9318ad368dde30ac6a17725d33ebb6 (patch)
tree       a261f16ad078c5612cf1742eec12f3c1b6ca9e07 /examples
parent     39e4e7e4d89077a637c4cad3a986e0e3447d1ae7 (diff)
[SPARK-8124] [SPARKR] Created more examples on SparkR DataFrames
Here are more examples on SparkR DataFrames, including creating a Spark Context and a SQL context, loading data, and simple data manipulation.

Author: Daniel Emaasit (PhD Student) <daniel.emaasit@gmail.com>

Closes #6668 from Emaasit/dan-dev and squashes the following commits:

3a97867 [Daniel Emaasit (PhD Student)] Used fewer rows for createDataFrame
f7227f9 [Daniel Emaasit (PhD Student)] Using command line arguments
a550f70 [Daniel Emaasit (PhD Student)] Used base R functions
33f9882 [Daniel Emaasit (PhD Student)] Renamed file
b6603e3 [Daniel Emaasit (PhD Student)] changed "Describe" function to "describe"
90565dd [Daniel Emaasit (PhD Student)] Deleted the getting-started file
b95a103 [Daniel Emaasit (PhD Student)] Deleted this file
cc55cd8 [Daniel Emaasit (PhD Student)] combined all the code into one .R file
c6933af [Daniel Emaasit (PhD Student)] changed variable name to SQLContext
8e0fe14 [Daniel Emaasit (PhD Student)] provided two options for creating DataFrames
2653573 [Daniel Emaasit (PhD Student)] Updates to a comment and variable name
275b787 [Daniel Emaasit (PhD Student)] Added the Apache License at the top of the file
2e8f724 [Daniel Emaasit (PhD Student)] Added the Apache License at the top of the file
486f44e [Daniel Emaasit (PhD Student)] Added the Apache License at the file
d705112 [Daniel Emaasit (PhD Student)] Created more examples on SparkR DataFrames
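The commit message mentions two options for creating DataFrames. Below is a minimal sketch of both, assuming a SparkR 1.4-era session launched with the spark-csv package; the "flights.csv" path and the small local data.frame are placeholders for illustration only:

library(SparkR)

sc <- sparkR.init(appName = "sketch")   # create a Spark context
sqlContext <- sparkRSQL.init(sc)        # create a SQL context

# Option 1: convert a local R data.frame into a SparkR DataFrame
localDF <- data.frame(dest = c("SFO", "JFK"), dep_delay = c(5, 12))
df1 <- createDataFrame(sqlContext, localDF)

# Option 2: load the source file directly through the spark-csv data source
df2 <- read.df(sqlContext, "flights.csv", source = "com.databricks.spark.csv", header = "true")

sparkR.stop()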
Diffstat (limited to 'examples')
-rw-r--r--  examples/src/main/r/data-manipulation.R  |  107
1 file changed, 107 insertions(+), 0 deletions(-)
diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R
new file mode 100644
index 0000000000..aa2336e300
--- /dev/null
+++ b/examples/src/main/r/data-manipulation.R
@@ -0,0 +1,107 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# For this example, we use the "flights" dataset,
+# which consists of every flight departing Houston in 2011.
+# The dataset has 227,496 rows and 14 columns.
+
+# To run this example use
+# ./bin/sparkR --packages com.databricks:spark-csv_2.10:1.0.3
+# examples/src/main/r/data-manipulation.R <path_to_csv>
+
+# Load SparkR library into your R session
+library(SparkR)
+
+args <- commandArgs(trailing = TRUE)
+
+if (length(args) != 1) {
+ print("Usage: data-manipulation.R <path-to-flights.csv")
+ print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv ")
+ q("no")
+}
+
+## Initialize SparkContext
+sc <- sparkR.init(appName = "SparkR-data-manipulation-example")
+
+## Initialize SQLContext
+sqlContext <- sparkRSQL.init(sc)
+
+flightsCsvPath <- args[[1]]
+
+# Create a local R dataframe
+flights_df <- read.csv(flightsCsvPath, header = TRUE)
+flights_df$date <- as.Date(flights_df$date)
+
+## Filter flights whose destination is San Francisco and write to a local data frame
+SFO_df <- flights_df[flights_df$dest == "SFO", ]
+
+# Convert the local data frame into a SparkR DataFrame
+SFO_DF <- createDataFrame(sqlContext, SFO_df)
+
+# Directly create a SparkR DataFrame from the source data
+flightsDF <- read.df(sqlContext, flightsCsvPath, source = "com.databricks.spark.csv", header = "true")
+
+# Print the schema of this Spark DataFrame
+printSchema(flightsDF)
+
+# Cache the DataFrame
+cache(flightsDF)
+
+# Print the first 6 rows of the DataFrame
+showDF(flightsDF, numRows = 6) ## Or
+head(flightsDF)
+
+# Show the column names in the DataFrame
+columns(flightsDF)
+
+# Show the number of rows in the DataFrame
+count(flightsDF)
+
+# Select specific columns
+destDF <- select(flightsDF, "dest", "cancelled")
+
+# Using SQL to select columns of data
+# First, register the flights DataFrame as a table
+registerTempTable(flightsDF, "flightsTable")
+destDF <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable")
+
+# Use collect to create a local R data frame
+local_df <- collect(destDF)
+
+# Print the newly created local data frame
+head(local_df)
+
+# Filter flights whose destination is JFK
+jfkDF <- filter(flightsDF, "dest = \"JFK\"") ## Or
+jfkDF <- filter(flightsDF, flightsDF$dest == "JFK")
+
+# If the magrittr library is available, we can use it to
+# chain data frame operations
+if("magrittr" %in% rownames(installed.packages())) {
+ library(magrittr)
+
+ # Group the flights by date and then find the average daily delay
+ # Write the result into a DataFrame
+ groupBy(flightsDF, flightsDF$date) %>%
+ summarize(avg(flightsDF$dep_delay), avg(flightsDF$arr_delay)) -> dailyDelayDF
+
+ # Print the computed data frame
+ head(dailyDelayDF)
+}
+
+# Stop the SparkContext now
+sparkR.stop()
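
If magrittr is not installed, the chained aggregation near the end of the script can be written with plain nested SparkR calls instead. A minimal sketch, assuming the same flightsDF as above:

# Group the flights by date and compute the average departure and arrival delays
dailyDelayDF <- summarize(groupBy(flightsDF, flightsDF$date),
                          avg(flightsDF$dep_delay), avg(flightsDF$arr_delay))

# Inspect the first few rows of the result
head(dailyDelayDF)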