aboutsummaryrefslogtreecommitdiff
path: root/R/pkg
diff options
context:
space:
mode:
author Sun Rui <rui.sun@intel.com> 2015-10-30 10:56:06 -0700
committer Shivaram Venkataraman <shivaram@cs.berkeley.edu> 2015-10-30 10:56:06 -0700
commit 40c77fb23a1ee0a4e69d735ee6247f83b7e13b92 (patch)
tree 316592c9adced89216c64d2e4d8fab0cf1f96529 /R/pkg
parent fab710a9171932f01ac81d100db8523dbd314925 (diff)
download spark-40c77fb23a1ee0a4e69d735ee6247f83b7e13b92.tar.gz
spark-40c77fb23a1ee0a4e69d735ee6247f83b7e13b92.tar.bz2
spark-40c77fb23a1ee0a4e69d735ee6247f83b7e13b92.zip
[SPARK-11210][SPARKR] Add window functions into SparkR [step 2].
Author: Sun Rui <rui.sun@intel.com> Closes #9196 from sun-rui/SPARK-11210.
Diffstat (limited to 'R/pkg')
-rw-r--r-- R/pkg/NAMESPACE 4
-rw-r--r-- R/pkg/R/functions.R 92
-rw-r--r-- R/pkg/R/generics.R 16
-rw-r--r-- R/pkg/inst/tests/test_sparkSQL.R 5
4 files changed, 117 insertions, 0 deletions
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index b73bed3128..cd9537a265 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -126,6 +126,7 @@ exportMethods("%in%",
"datediff",
"dayofmonth",
"dayofyear",
+ "denseRank",
"desc",
"endsWith",
"exp",
@@ -182,16 +183,19 @@ exportMethods("%in%",
"next_day",
"ntile",
"otherwise",
+ "percentRank",
"pmod",
"quarter",
"rand",
"randn",
+ "rank",
"regexp_extract",
"regexp_replace",
"reverse",
"rint",
"rlike",
"round",
+ "rowNumber",
"rpad",
"rtrim",
"second",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 366290fe66..d7fd279279 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2038,6 +2038,28 @@ setMethod("cumeDist",
column(jc)
})
+#' denseRank
+#'
+#' Window function: returns the rank of rows within a window partition, without any gaps.
+#' The difference between rank and denseRank is that denseRank leaves no gaps in ranking
+#' sequence when there are ties. That is, if you were ranking a competition using denseRank
+#' and had three people tie for second place, you would say that all three were in second
+#' place and that the next person came in third.
+#'
+#' This is equivalent to the DENSE_RANK function in SQL.
+#'
+#' @rdname denseRank
+#' @name denseRank
+#' @family window_funcs
+#' @export
+#' @examples \dontrun{denseRank()}
+setMethod("denseRank",
+ signature(x = "missing"),
+ function() {
+ jc <- callJStatic("org.apache.spark.sql.functions", "denseRank")
+ column(jc)
+ })
+
#' lag
#'
#' Window function: returns the value that is `offset` rows before the current row, and
@@ -2111,3 +2133,73 @@ setMethod("ntile",
jc <- callJStatic("org.apache.spark.sql.functions", "ntile", as.integer(x))
column(jc)
})
+
+#' percentRank
+#'
+#' Window function: returns the relative rank (i.e. percentile) of rows within a window partition.
+#'
+#' This is computed by:
+#'
+#' (rank of row in its partition - 1) / (number of rows in the partition - 1)
+#'
+#' This is equivalent to the PERCENT_RANK function in SQL.
+#'
+#' @rdname percentRank
+#' @name percentRank
+#' @family window_funcs
+#' @export
+#' @examples \dontrun{percentRank()}
+setMethod("percentRank",
+ signature(x = "missing"),
+ function() {
+ jc <- callJStatic("org.apache.spark.sql.functions", "percentRank")
+ column(jc)
+ })
+
+#' rank
+#'
+#' Window function: returns the rank of rows within a window partition.
+#'
+#' The difference between rank and denseRank is that rank leaves gaps in the ranking
+#' sequence when there are ties, while denseRank does not. That is, if three people tie
+#' for second place in a competition, all three are ranked second by both functions, but
+#' the next person is ranked fifth by rank and third by denseRank.
+#'
+#' This is equivalent to the RANK function in SQL.
+#'
+#' @rdname rank
+#' @name rank
+#' @family window_funcs
+#' @export
+#' @examples \dontrun{rank()}
+setMethod("rank",
+ signature(x = "missing"),
+ function() {
+ jc <- callJStatic("org.apache.spark.sql.functions", "rank")
+ column(jc)
+ })
+
+# Keep rank() from the R base package usable: non-missing arguments go to base::rank()
+setMethod("rank",
+ signature(x = "ANY"),
+ function(x, ...) {
+ base::rank(x, ...)
+ })
+
+#' rowNumber
+#'
+#' Window function: returns a sequential number starting at 1 within a window partition.
+#'
+#' This is equivalent to the ROW_NUMBER function in SQL.
+#'
+#' @rdname rowNumber
+#' @name rowNumber
+#' @family window_funcs
+#' @export
+#' @examples \dontrun{rowNumber()}
+setMethod("rowNumber",
+ signature(x = "missing"),
+ function() {
+ jc <- callJStatic("org.apache.spark.sql.functions", "rowNumber")
+ column(jc)
+ })
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index c11c3c8d3e..0b35340e48 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -742,6 +742,10 @@ setGeneric("dayofmonth", function(x) { standardGeneric("dayofmonth") })
#' @export
setGeneric("dayofyear", function(x) { standardGeneric("dayofyear") })
+#' @rdname denseRank
+#' @export
+setGeneric("denseRank", function(x) { standardGeneric("denseRank") })
+
#' @rdname explode
#' @export
setGeneric("explode", function(x) { standardGeneric("explode") })
@@ -878,6 +882,10 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") })
#' @export
setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") })
+#' @rdname percentRank
+#' @export
+setGeneric("percentRank", function(x) { standardGeneric("percentRank") })
+
#' @rdname pmod
#' @export
setGeneric("pmod", function(y, x) { standardGeneric("pmod") })
@@ -894,6 +902,10 @@ setGeneric("rand", function(seed) { standardGeneric("rand") })
#' @export
setGeneric("randn", function(seed) { standardGeneric("randn") })
+#' @rdname rank
+#' @export
+setGeneric("rank", function(x, ...) { standardGeneric("rank") })
+
#' @rdname regexp_extract
#' @export
setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp_extract") })
@@ -911,6 +923,10 @@ setGeneric("reverse", function(x) { standardGeneric("reverse") })
#' @export
setGeneric("rint", function(x, ...) { standardGeneric("rint") })
+#' @rdname rowNumber
+#' @export
+setGeneric("rowNumber", function(x) { standardGeneric("rowNumber") })
+
#' @rdname rpad
#' @export
setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") })
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index e1d4499925..b4a4d03b26 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -831,6 +831,11 @@ test_that("column functions", {
c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
c12 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
c13 <- cumeDist() + ntile(1)
+ c14 <- denseRank() + percentRank() + rank() + rowNumber()
+
+ # Test that rank() with no arguments yields a Column and base::rank() is still exposed
+ expect_equal(class(rank())[[1]], "Column")
+ expect_equal(rank(1:3), as.numeric(c(1:3)))
df <- jsonFile(sqlContext, jsonPath)
df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20)))