| author | Dongjoon Hyun <dongjoon@apache.org> | 2016-04-24 22:10:27 -0700 |
|---|---|---|
| committer | Shivaram Venkataraman <shivaram@cs.berkeley.edu> | 2016-04-24 22:10:27 -0700 |
| commit | 6ab4d9e0c76b69b4d6d5f39037a77bdfb042be19 (patch) | |
| tree | 494b601ba783d7b025b805504bde8f3f92b7667b /docs/sql-programming-guide.md | |
| parent | 35319d326488b3bf9235dfcf9ac4533ce846f21f (diff) | |
[SPARK-14883][DOCS] Fix wrong R examples and make them up-to-date
## What changes were proposed in this pull request?
This issue fixes several errors in the R examples and brings them up to date across the docs and example modules.
- Remove the wrong usage of `map`. SparkR would use `lapply` here, but `lapply` is still private, so a corrected example will be added later.
- Fix the wrong example in the `Generic Load/Save Functions` section of `docs/sql-programming-guide.md` for consistency.
- Fix data types in `sparkr.md`.
- Update a data result in `sparkr.md`.
- Replace deprecated functions to remove warnings: `jsonFile` -> `read.json`, `parquetFile` -> `read.parquet`.
- Use up-to-date R-like functions: `loadDF` -> `read.df`, `saveDF` -> `write.df`, `saveAsParquetFile` -> `write.parquet` (see the sketch after this list).
- Replace `SparkR DataFrame` with `SparkDataFrame` in `dataframe.R` and `data-manipulation.R`.
- Other minor syntax fixes and a typo fix.
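
To make the renames concrete, here is a minimal SparkR sketch of the replacement calls, assuming a local SparkR 1.x session and the sample files under `examples/src/main/resources/`; the function names and arguments follow this patch's diff, not an independently verified API:

```r
library(SparkR)
sc <- sparkR.init()                 # assumes a local SparkR 1.x session
sqlContext <- sparkRSQL.init(sc)

# jsonFile(sqlContext, ...)  ->  read.json(sqlContext, ...)
people <- read.json(sqlContext, "examples/src/main/resources/people.json")

# loadDF / saveDF  ->  read.df / write.df
users <- read.df(sqlContext, "examples/src/main/resources/users.parquet")
write.df(select(users, "name", "favorite_color"), "namesAndFavColors.parquet")

# saveAsParquetFile(df, ...)  ->  write.parquet(df, ...)
write.parquet(people, "people.parquet")

# parquetFile(sqlContext, ...)  ->  read.parquet(sqlContext, ...)
parquetDF <- read.parquet(sqlContext, "people.parquet")
```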
## How was this patch tested?
Manual.
Author: Dongjoon Hyun <dongjoon@apache.org>
Closes #12649 from dongjoon-hyun/SPARK-14883.
Diffstat (limited to 'docs/sql-programming-guide.md')
-rw-r--r-- | docs/sql-programming-guide.md | 30 |
1 file changed, 13 insertions, 17 deletions
```diff
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 77887f4ca3..9a3db9c3f9 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -173,7 +173,7 @@ df.show()
 {% highlight r %}
 sqlContext <- SQLContext(sc)

-df <- jsonFile(sqlContext, "examples/src/main/resources/people.json")
+df <- read.json(sqlContext, "examples/src/main/resources/people.json")

 # Displays the content of the DataFrame to stdout
 showDF(df)
@@ -366,7 +366,7 @@ In addition to simple column references and expressions, DataFrames also have a
 sqlContext <- sparkRSQL.init(sc)

 # Create the DataFrame
-df <- jsonFile(sqlContext, "examples/src/main/resources/people.json")
+df <- read.json(sqlContext, "examples/src/main/resources/people.json")

 # Show the content of the DataFrame
 showDF(df)
@@ -889,8 +889,8 @@ df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")
 <div data-lang="r" markdown="1">

 {% highlight r %}
-df <- loadDF(sqlContext, "people.parquet")
-saveDF(select(df, "name", "age"), "namesAndAges.parquet")
+df <- read.df(sqlContext, "examples/src/main/resources/users.parquet")
+write.df(select(df, "name", "favorite_color"), "namesAndFavColors.parquet")
 {% endhighlight %}

 </div>
@@ -939,8 +939,8 @@ df.select("name", "age").write.save("namesAndAges.parquet", format="parquet")

 {% highlight r %}
-df <- loadDF(sqlContext, "people.json", "json")
-saveDF(select(df, "name", "age"), "namesAndAges.parquet", "parquet")
+df <- read.df(sqlContext, "examples/src/main/resources/people.json", "json")
+write.df(select(df, "name", "age"), "namesAndAges.parquet", "parquet")
 {% endhighlight %}

@@ -1138,19 +1138,15 @@ for teenName in teenNames.collect():
 schemaPeople # The DataFrame from the previous example.

 # DataFrames can be saved as Parquet files, maintaining the schema information.
-saveAsParquetFile(schemaPeople, "people.parquet")
+write.parquet(schemaPeople, "people.parquet")

 # Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.
 # The result of loading a parquet file is also a DataFrame.
-parquetFile <- parquetFile(sqlContext, "people.parquet")
+parquetFile <- read.parquet(sqlContext, "people.parquet")

 # Parquet files can also be registered as tables and then used in SQL statements.
-registerTempTable(parquetFile, "parquetFile");
+registerTempTable(parquetFile, "parquetFile")
 teenagers <- sql(sqlContext, "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
-teenNames <- map(teenagers, function(p) { paste("Name:", p$name)})
-for (teenName in collect(teenNames)) {
-  cat(teenName, "\n")
-}
 {% endhighlight %}

 </div>
@@ -1318,14 +1314,14 @@ df3.printSchema()
 # sqlContext from the previous example is used in this example.

 # Create a simple DataFrame, stored into a partition directory
-saveDF(df1, "data/test_table/key=1", "parquet", "overwrite")
+write.df(df1, "data/test_table/key=1", "parquet", "overwrite")

 # Create another DataFrame in a new partition directory,
 # adding a new column and dropping an existing column
-saveDF(df2, "data/test_table/key=2", "parquet", "overwrite")
+write.df(df2, "data/test_table/key=2", "parquet", "overwrite")

 # Read the partitioned table
-df3 <- loadDF(sqlContext, "data/test_table", "parquet", mergeSchema="true")
+df3 <- read.df(sqlContext, "data/test_table", "parquet", mergeSchema="true")
 printSchema(df3)

 # The final schema consists of all 3 columns in the Parquet files together
@@ -1612,7 +1608,7 @@ sqlContext <- sparkRSQL.init(sc)
 # The path can be either a single text file or a directory storing text files.
 path <- "examples/src/main/resources/people.json"
 # Create a DataFrame from the file(s) pointed to by path
-people <- jsonFile(sqlContext, path)
+people <- read.json(sqlContext, path)

 # The inferred schema can be visualized using the printSchema() method.
 printSchema(people)
```
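
Note that the patch drops the `map`-based loop from the Parquet example rather than replacing it, since `lapply` is still private. One possible local stand-in (an assumption for illustration, not part of this patch) is to `collect()` the query result to the driver and format it with base R:

```r
# Hypothetical local replacement for the removed map() loop; this patch
# simply deletes the example until a public distributed API is available.
teenagers <- sql(sqlContext, "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
teenNames <- paste("Name:", collect(teenagers)$name)  # collect() pulls rows to the driver
for (teenName in teenNames) {
  cat(teenName, "\n")
}
```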