From 4ae9fe091c2cb8388c581093d62d3deaef40993e Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Fri, 29 Apr 2016 16:41:07 -0700 Subject: [SPARK-12919][SPARKR] Implement dapply() on DataFrame in SparkR. ## What changes were proposed in this pull request? dapply() applies an R function on each partition of a DataFrame and returns a new DataFrame. The function signature is: dapply(df, function(localDF) {}, schema = NULL) R function input: local data.frame from the partition on local node R function output: local data.frame Schema specifies the Row format of the resulting DataFrame. It must match the R function's output. If schema is not specified, each partition of the result DataFrame will be serialized in R into a single byte array. Such resulting DataFrame can be processed by successive calls to dapply(). ## How was this patch tested? SparkR unit tests. Author: Sun Rui Author: Sun Rui Closes #12493 from sun-rui/SPARK-12919. --- docs/sql-programming-guide.md | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'docs') diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 9a3db9c3f9..a16a6bb1d9 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1147,6 +1147,11 @@ parquetFile <- read.parquet(sqlContext, "people.parquet") # Parquet files can also be registered as tables and then used in SQL statements. registerTempTable(parquetFile, "parquetFile") teenagers <- sql(sqlContext, "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19") +schema <- structType(structField("name", "string")) +teenNames <- dapply(teenagers, function(p) { cbind(paste("Name:", p$name)) }, schema) +for (teenName in collect(teenNames)$name) { + cat(teenName, "\n") +} {% endhighlight %} -- cgit v1.2.3