aboutsummaryrefslogtreecommitdiff
path: root/R
diff options
context:
space:
mode:
authorHossein <hossein@databricks.com>2015-07-31 14:07:41 -0700
committerShivaram Venkataraman <shivaram@cs.berkeley.edu>2015-07-31 14:08:18 -0700
commit710c2b5dd2dc6b8d947303ad8dfae4539b63fe11 (patch)
treea7f581717a60841fbbf621306acaf2e4df11b0a2 /R
parent82f47b811607a1eeeecba437fe0ffc15d4e5f9ec (diff)
downloadspark-710c2b5dd2dc6b8d947303ad8dfae4539b63fe11.tar.gz
spark-710c2b5dd2dc6b8d947303ad8dfae4539b63fe11.tar.bz2
spark-710c2b5dd2dc6b8d947303ad8dfae4539b63fe11.zip
[SPARK-9324] [SPARK-9322] [SPARK-9321] [SPARKR] Some aliases for R-like functions in DataFrames
Adds following aliases: * unique (distinct) * rbind (unionAll): accepts many DataFrames * nrow (count) * ncol * dim * names (columns): along with the replacement function to change names Author: Hossein <hossein@databricks.com> Closes #7764 from falaki/sparkR-alias and squashes the following commits: 56016f5 [Hossein] Updated R documentation 5e4a4d0 [Hossein] Removed extra code f51cbef [Hossein] Merge branch 'master' into sparkR-alias c1b88bd [Hossein] Moved setGeneric and other comments applied d9307f8 [Hossein] Added tests b5aa988 [Hossein] Added dim, ncol, nrow, names, rbind, and unique functions to DataFrames
Diffstat (limited to 'R')
-rw-r--r--R/pkg/NAMESPACE6
-rw-r--r--R/pkg/R/DataFrame.R90
-rw-r--r--R/pkg/R/generics.R4
-rw-r--r--R/pkg/inst/tests/test_sparkSQL.R22
4 files changed, 119 insertions, 3 deletions
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index a329e14f25..ff116cb1fb 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -29,6 +29,7 @@ exportMethods("arrange",
"count",
"crosstab",
"describe",
+ "dim",
"distinct",
"dropna",
"dtypes",
@@ -45,11 +46,15 @@ exportMethods("arrange",
"isLocal",
"join",
"limit",
+ "names",
+ "ncol",
+ "nrow",
"orderBy",
"mutate",
"names",
"persist",
"printSchema",
+ "rbind",
"registerTempTable",
"rename",
"repartition",
@@ -66,6 +71,7 @@ exportMethods("arrange",
"summarize",
"take",
"unionAll",
+ "unique",
"unpersist",
"where",
"withColumn",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index b31ad3729e..b4065d2944 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -255,6 +255,16 @@ setMethod("names",
columns(x)
})
+#' @rdname columns
+setMethod("names<-",
+ signature(x = "DataFrame"),
+ function(x, value) {
+ if (!is.null(value)) {
+ sdf <- callJMethod(x@sdf, "toDF", listToSeq(as.list(value)))
+ dataFrame(sdf)
+ }
+ })
+
#' Register Temporary Table
#'
#' Registers a DataFrame as a Temporary Table in the SQLContext
@@ -473,6 +483,18 @@ setMethod("distinct",
dataFrame(sdf)
})
+#' @title Distinct rows in a DataFrame
+#'
+#' @description Returns a new DataFrame containing distinct rows in this DataFrame
+#'
+#' @rdname unique
+#' @aliases unique
+setMethod("unique",
+ signature(x = "DataFrame"),
+ function(x) {
+ distinct(x)
+ })
+
#' Sample
#'
#' Return a sampled subset of this DataFrame using a random seed.
@@ -534,6 +556,58 @@ setMethod("count",
callJMethod(x@sdf, "count")
})
+#' @title Number of rows for a DataFrame
+#' @description Returns the number of rows in a DataFrame
+#'
+#' @name nrow
+#'
+#' @rdname nrow
+#' @aliases count
+setMethod("nrow",
+ signature(x = "DataFrame"),
+ function(x) {
+ count(x)
+ })
+
+#' Returns the number of columns in a DataFrame
+#'
+#' @param x a SparkSQL DataFrame
+#'
+#' @rdname ncol
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlContext <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlContext, path)
+#' ncol(df)
+#' }
+setMethod("ncol",
+ signature(x = "DataFrame"),
+ function(x) {
+ length(columns(x))
+ })
+
+#' Returns the dimensions (number of rows and columns) of a DataFrame
+#' @param x a SparkSQL DataFrame
+#'
+#' @rdname dim
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlContext <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlContext, path)
+#' dim(df)
+#' }
+setMethod("dim",
+ signature(x = "DataFrame"),
+ function(x) {
+ c(count(x), ncol(x))
+ })
+
#' Collects all the elements of a Spark DataFrame and coerces them into an R data.frame.
#'
#' @param x A SparkSQL DataFrame
@@ -1231,6 +1305,22 @@ setMethod("unionAll",
dataFrame(unioned)
})
+#' @title Union two or more DataFrames
+#'
+#' @description Returns a new DataFrame containing rows of all parameters.
+#'
+#' @rdname rbind
+#' @aliases unionAll
+setMethod("rbind",
+ signature(... = "DataFrame"),
+ function(x, ..., deparse.level = 1) {
+ if (nargs() == 3) {
+ unionAll(x, ...)
+ } else {
+ unionAll(x, Recall(..., deparse.level = 1))
+ }
+ })
+
#' Intersect
#'
#' Return a new DataFrame containing rows only in both this DataFrame
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index a3a121058e..71d1e348c4 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -669,3 +669,7 @@ setGeneric("upper", function(x) { standardGeneric("upper") })
#' @rdname glm
#' @export
setGeneric("glm")
+
+#' @rdname rbind
+#' @export
+setGeneric("rbind", signature = "...")
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 25f697314f..9faee8d59c 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -88,6 +88,9 @@ test_that("create DataFrame from RDD", {
df <- createDataFrame(sqlContext, rdd, list("a", "b"))
expect_is(df, "DataFrame")
expect_equal(count(df), 10)
+ expect_equal(nrow(df), 10)
+ expect_equal(ncol(df), 2)
+ expect_equal(dim(df), c(10, 2))
expect_equal(columns(df), c("a", "b"))
expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
@@ -491,7 +494,7 @@ test_that("head() and first() return the correct data", {
expect_equal(nrow(testFirst), 1)
})
-test_that("distinct() on DataFrames", {
+test_that("distinct() and unique() on DataFrames", {
lines <- c("{\"name\":\"Michael\"}",
"{\"name\":\"Andy\", \"age\":30}",
"{\"name\":\"Justin\", \"age\":19}",
@@ -503,6 +506,10 @@ test_that("distinct() on DataFrames", {
uniques <- distinct(df)
expect_is(uniques, "DataFrame")
expect_equal(count(uniques), 3)
+
+ uniques2 <- unique(df)
+ expect_is(uniques2, "DataFrame")
+ expect_equal(count(uniques2), 3)
})
test_that("sample on a DataFrame", {
@@ -815,7 +822,7 @@ test_that("isLocal()", {
expect_false(isLocal(df))
})
-test_that("unionAll(), except(), and intersect() on a DataFrame", {
+test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", {
df <- jsonFile(sqlContext, jsonPath)
lines <- c("{\"name\":\"Bob\", \"age\":24}",
@@ -830,6 +837,11 @@ test_that("unionAll(), except(), and intersect() on a DataFrame", {
expect_equal(count(unioned), 6)
expect_equal(first(unioned)$name, "Michael")
+ unioned2 <- arrange(rbind(unioned, df, df2), df$age)
+ expect_is(unioned2, "DataFrame")
+ expect_equal(count(unioned2), 12)
+ expect_equal(first(unioned2)$name, "Michael")
+
excepted <- arrange(except(df, df2), desc(df$age))
expect_is(unioned, "DataFrame")
expect_equal(count(excepted), 2)
@@ -853,7 +865,7 @@ test_that("withColumn() and withColumnRenamed()", {
expect_equal(columns(newDF2)[1], "newerAge")
})
-test_that("mutate() and rename()", {
+test_that("mutate(), rename() and names()", {
df <- jsonFile(sqlContext, jsonPath)
newDF <- mutate(df, newAge = df$age + 2)
expect_equal(length(columns(newDF)), 3)
@@ -863,6 +875,10 @@ test_that("mutate() and rename()", {
newDF2 <- rename(df, newerAge = df$age)
expect_equal(length(columns(newDF2)), 2)
expect_equal(columns(newDF2)[1], "newerAge")
+
+ names(newDF2) <- c("newerName", "evenNewerAge")
+ expect_equal(length(names(newDF2)), 2)
+ expect_equal(names(newDF2)[1], "newerName")
})
test_that("write.df() on DataFrame and works with parquetFile", {