author     Dongjoon Hyun <dongjoon@apache.org>                2016-04-24 22:10:27 -0700
committer  Shivaram Venkataraman <shivaram@cs.berkeley.edu>   2016-04-24 22:10:27 -0700
commit     6ab4d9e0c76b69b4d6d5f39037a77bdfb042be19 (patch)
tree       494b601ba783d7b025b805504bde8f3f92b7667b
parent     35319d326488b3bf9235dfcf9ac4533ce846f21f (diff)
[SPARK-14883][DOCS] Fix wrong R examples and make them up-to-date
## What changes were proposed in this pull request?

This issue aims to fix some errors in the R examples and bring them up-to-date in the docs and example modules.

- Remove the wrong usage of `map`. We need to use `lapply` in `sparkR` if needed. However, `lapply` is private so far; the corrected example will be added later.
- Fix the wrong example in the `Generic Load/Save Functions` section of `docs/sql-programming-guide.md` for consistency.
- Fix datatypes in `sparkr.md`.
- Update a data result in `sparkr.md`.
- Replace deprecated functions to remove warnings: `jsonFile` -> `read.json`, `parquetFile` -> `read.parquet`.
- Use up-to-date R-like functions: `loadDF` -> `read.df`, `saveDF` -> `write.df`, `saveAsParquetFile` -> `write.parquet`.
- Replace `SparkR DataFrame` with `SparkDataFrame` in `dataframe.R` and `data-manipulation.R`.
- Other minor syntax fixes and a typo.

## How was this patch tested?

Manual.

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #12649 from dongjoon-hyun/SPARK-14883.
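For quick reference, the sketch below collects the deprecated-to-current API mapping in one place. It is a minimal sketch assembled only from calls that appear in the diff below, assuming a SparkR shell of this era where `sc` is already initialized:

```r
# Minimal sketch of the non-deprecated SparkR read/write API this patch
# standardizes on; assumes a SparkR shell where `sc` already exists.
sqlContext <- sparkRSQL.init(sc)

# jsonFile() and parquetFile() are deprecated in favor of read.json()/read.parquet()
people <- read.json(sqlContext, "examples/src/main/resources/people.json")

# loadDF()/saveDF() are deprecated in favor of read.df()/write.df();
# read.df() falls back to the default source (Parquet) when none is given
users <- read.df(sqlContext, "examples/src/main/resources/users.parquet")
write.df(select(users, "name", "favorite_color"), "namesAndFavColors.parquet")

# saveAsParquetFile() is deprecated in favor of write.parquet()
write.parquet(people, "people.parquet")
parquetDF <- read.parquet(sqlContext, "people.parquet")
```

Note that all of these are the SparkR 1.x-style signatures that take `sqlContext` as the first argument, matching the examples changed in this patch.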
-rw-r--r--  R/pkg/R/DataFrame.R                        2
-rw-r--r--  docs/sparkr.md                            11
-rw-r--r--  docs/sql-programming-guide.md             30
-rw-r--r--  examples/src/main/r/data-manipulation.R   22
-rw-r--r--  examples/src/main/r/dataframe.R            2
5 files changed, 31 insertions, 36 deletions
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 3b2fd73375..890d15dfee 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -845,7 +845,7 @@ setMethod("ncol",
length(columns(x))
})
-#' Returns the dimentions (number of rows and columns) of a SparkDataFrame
+#' Returns the dimensions (number of rows and columns) of a SparkDataFrame
#' @param x a SparkDataFrame
#'
#' @family SparkDataFrame functions
diff --git a/docs/sparkr.md b/docs/sparkr.md
index a0b4f93776..760534ae14 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -141,7 +141,7 @@ head(people)
# SparkR automatically infers the schema from the JSON file
printSchema(people)
# root
-# |-- age: integer (nullable = true)
+# |-- age: long (nullable = true)
# |-- name: string (nullable = true)
{% endhighlight %}
@@ -195,7 +195,7 @@ df <- createDataFrame(sqlContext, faithful)
# Get basic information about the DataFrame
df
-## DataFrame[eruptions:double, waiting:double]
+## SparkDataFrame[eruptions:double, waiting:double]
# Select only the "eruptions" column
head(select(df, df$eruptions))
@@ -228,14 +228,13 @@ SparkR data frames support a number of commonly used functions to aggregate data
# We use the `n` operator to count the number of times each waiting time appears
head(summarize(groupBy(df, df$waiting), count = n(df$waiting)))
## waiting count
-##1 81 13
-##2 60 6
-##3 68 1
+##1 70 4
+##2 67 1
+##3 69 2
# We can also sort the output from the aggregation to get the most common waiting times
waiting_counts <- summarize(groupBy(df, df$waiting), count = n(df$waiting))
head(arrange(waiting_counts, desc(waiting_counts$count)))
-
## waiting count
##1 78 15
##2 83 14
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 77887f4ca3..9a3db9c3f9 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -173,7 +173,7 @@ df.show()
{% highlight r %}
sqlContext <- SQLContext(sc)
-df <- jsonFile(sqlContext, "examples/src/main/resources/people.json")
+df <- read.json(sqlContext, "examples/src/main/resources/people.json")
# Displays the content of the DataFrame to stdout
showDF(df)
@@ -366,7 +366,7 @@ In addition to simple column references and expressions, DataFrames also have a
sqlContext <- sparkRSQL.init(sc)
# Create the DataFrame
-df <- jsonFile(sqlContext, "examples/src/main/resources/people.json")
+df <- read.json(sqlContext, "examples/src/main/resources/people.json")
# Show the content of the DataFrame
showDF(df)
@@ -889,8 +889,8 @@ df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")
<div data-lang="r" markdown="1">
{% highlight r %}
-df <- loadDF(sqlContext, "people.parquet")
-saveDF(select(df, "name", "age"), "namesAndAges.parquet")
+df <- read.df(sqlContext, "examples/src/main/resources/users.parquet")
+write.df(select(df, "name", "favorite_color"), "namesAndFavColors.parquet")
{% endhighlight %}
</div>
@@ -939,8 +939,8 @@ df.select("name", "age").write.save("namesAndAges.parquet", format="parquet")
{% highlight r %}
-df <- loadDF(sqlContext, "people.json", "json")
-saveDF(select(df, "name", "age"), "namesAndAges.parquet", "parquet")
+df <- read.df(sqlContext, "examples/src/main/resources/people.json", "json")
+write.df(select(df, "name", "age"), "namesAndAges.parquet", "parquet")
{% endhighlight %}
@@ -1138,19 +1138,15 @@ for teenName in teenNames.collect():
schemaPeople # The DataFrame from the previous example.
# DataFrames can be saved as Parquet files, maintaining the schema information.
-saveAsParquetFile(schemaPeople, "people.parquet")
+write.parquet(schemaPeople, "people.parquet")
# Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.
# The result of loading a parquet file is also a DataFrame.
-parquetFile <- parquetFile(sqlContext, "people.parquet")
+parquetFile <- read.parquet(sqlContext, "people.parquet")
# Parquet files can also be registered as tables and then used in SQL statements.
-registerTempTable(parquetFile, "parquetFile");
+registerTempTable(parquetFile, "parquetFile")
teenagers <- sql(sqlContext, "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
-teenNames <- map(teenagers, function(p) { paste("Name:", p$name)})
-for (teenName in collect(teenNames)) {
- cat(teenName, "\n")
-}
{% endhighlight %}
</div>
@@ -1318,14 +1314,14 @@ df3.printSchema()
# sqlContext from the previous example is used in this example.
# Create a simple DataFrame, stored into a partition directory
-saveDF(df1, "data/test_table/key=1", "parquet", "overwrite")
+write.df(df1, "data/test_table/key=1", "parquet", "overwrite")
# Create another DataFrame in a new partition directory,
# adding a new column and dropping an existing column
-saveDF(df2, "data/test_table/key=2", "parquet", "overwrite")
+write.df(df2, "data/test_table/key=2", "parquet", "overwrite")
# Read the partitioned table
-df3 <- loadDF(sqlContext, "data/test_table", "parquet", mergeSchema="true")
+df3 <- read.df(sqlContext, "data/test_table", "parquet", mergeSchema="true")
printSchema(df3)
# The final schema consists of all 3 columns in the Parquet files together
@@ -1612,7 +1608,7 @@ sqlContext <- sparkRSQL.init(sc)
# The path can be either a single text file or a directory storing text files.
path <- "examples/src/main/resources/people.json"
# Create a DataFrame from the file(s) pointed to by path
-people <- jsonFile(sqlContext, path)
+people <- read.json(sqlContext, path)
# The inferred schema can be visualized using the printSchema() method.
printSchema(people)
diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R
index aa2336e300..594bf49d60 100644
--- a/examples/src/main/r/data-manipulation.R
+++ b/examples/src/main/r/data-manipulation.R
@@ -30,7 +30,7 @@ args <- commandArgs(trailing = TRUE)
if (length(args) != 1) {
print("Usage: data-manipulation.R <path-to-flights.csv")
- print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv ")
+ print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv")
q("no")
}
@@ -49,33 +49,33 @@ flights_df$date <- as.Date(flights_df$date)
## Filter flights whose destination is San Francisco and write to a local data frame
SFO_df <- flights_df[flights_df$dest == "SFO", ]
-# Convert the local data frame into a SparkR DataFrame
+# Convert the local data frame into a SparkDataFrame
SFO_DF <- createDataFrame(sqlContext, SFO_df)
-# Directly create a SparkR DataFrame from the source data
+# Directly create a SparkDataFrame from the source data
flightsDF <- read.df(sqlContext, flightsCsvPath, source = "com.databricks.spark.csv", header = "true")
-# Print the schema of this Spark DataFrame
+# Print the schema of this SparkDataFrame
printSchema(flightsDF)
-# Cache the DataFrame
+# Cache the SparkDataFrame
cache(flightsDF)
-# Print the first 6 rows of the DataFrame
+# Print the first 6 rows of the SparkDataFrame
showDF(flightsDF, numRows = 6) ## Or
head(flightsDF)
-# Show the column names in the DataFrame
+# Show the column names in the SparkDataFrame
columns(flightsDF)
-# Show the number of rows in the DataFrame
+# Show the number of rows in the SparkDataFrame
count(flightsDF)
# Select specific columns
destDF <- select(flightsDF, "dest", "cancelled")
# Using SQL to select columns of data
-# First, register the flights DataFrame as a table
+# First, register the flights SparkDataFrame as a table
registerTempTable(flightsDF, "flightsTable")
destDF <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable")
@@ -95,11 +95,11 @@ if("magrittr" %in% rownames(installed.packages())) {
library(magrittr)
# Group the flights by date and then find the average daily delay
- # Write the result into a DataFrame
+ # Write the result into a SparkDataFrame
groupBy(flightsDF, flightsDF$date) %>%
summarize(avg(flightsDF$dep_delay), avg(flightsDF$arr_delay)) -> dailyDelayDF
- # Print the computed data frame
+ # Print the computed SparkDataFrame
head(dailyDelayDF)
}
diff --git a/examples/src/main/r/dataframe.R b/examples/src/main/r/dataframe.R
index 62f60e57ee..436bac6aaf 100644
--- a/examples/src/main/r/dataframe.R
+++ b/examples/src/main/r/dataframe.R
@@ -24,7 +24,7 @@ sqlContext <- sparkRSQL.init(sc)
# Create a simple local data.frame
localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18))
-# Convert local data frame to a SparkR DataFrame
+# Convert local data frame to a SparkDataFrame
df <- createDataFrame(sqlContext, localDF)
# Print its schema