aboutsummaryrefslogtreecommitdiff
path: root/R/pkg
diff options
context:
space:
mode:
authorDongjoon Hyun <dongjoon@apache.org>2016-06-20 11:12:41 -0700
committerShivaram Venkataraman <shivaram@cs.berkeley.edu>2016-06-20 11:12:41 -0700
commit9613424898fd2a586156bc4eb48e255749774f20 (patch)
tree937563432cb0a98bdbe8c3733af72222e96c21b5 /R/pkg
parent5cfabec8724714b897d6e23e670c39e58f554ea2 (diff)
downloadspark-9613424898fd2a586156bc4eb48e255749774f20.tar.gz
spark-9613424898fd2a586156bc4eb48e255749774f20.tar.bz2
spark-9613424898fd2a586156bc4eb48e255749774f20.zip
[SPARK-16059][R] Add `monotonically_increasing_id` function in SparkR
## What changes were proposed in this pull request? This PR adds `monotonically_increasing_id` column function in SparkR for API parity. After this PR, SparkR supports the followings. ```r > df <- read.json("examples/src/main/resources/people.json") > collect(select(df, monotonically_increasing_id(), df$name, df$age)) monotonically_increasing_id() name age 1 0 Michael NA 2 1 Andy 30 3 2 Justin 19 ``` ## How was this patch tested? Pass the Jenkins tests (with added testcase). Author: Dongjoon Hyun <dongjoon@apache.org> Closes #13774 from dongjoon-hyun/SPARK-16059.
Diffstat (limited to 'R/pkg')
-rw-r--r--R/pkg/NAMESPACE1
-rw-r--r--R/pkg/R/functions.R27
-rw-r--r--R/pkg/R/generics.R5
-rw-r--r--R/pkg/inst/tests/testthat/test_sparkSQL.R2
4 files changed, 34 insertions, 1 deletions
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 82e56ca437..0cfe190279 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -218,6 +218,7 @@ exportMethods("%in%",
"mean",
"min",
"minute",
+ "monotonically_increasing_id",
"month",
"months_between",
"n",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index a779127b37..0fb38bc289 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -911,6 +911,33 @@ setMethod("minute",
column(jc)
})
+#' monotonically_increasing_id
+#'
+#' Return a column that generates monotonically increasing 64-bit integers.
+#'
+#' The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
+#' The current implementation puts the partition ID in the upper 31 bits, and the record number
+#' within each partition in the lower 33 bits. The assumption is that the SparkDataFrame has
+#' less than 1 billion partitions, and each partition has less than 8 billion records.
+#'
+#' As an example, consider a SparkDataFrame with two partitions, each with 3 records.
+#' This expression would return the following IDs:
+#' 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
+#'
+#' This is equivalent to the MONOTONICALLY_INCREASING_ID function in SQL.
+#'
+#' @rdname monotonically_increasing_id
+#' @name monotonically_increasing_id
+#' @family misc_funcs
+#' @export
+#' @examples \dontrun{select(df, monotonically_increasing_id())}
+setMethod("monotonically_increasing_id",
+ signature(x = "missing"),
+ function() {
+ jc <- callJStatic("org.apache.spark.sql.functions", "monotonically_increasing_id")
+ column(jc)
+ })
+
#' month
#'
#' Extracts the month as an integer from a given date/timestamp/string.
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 6e754afab6..37d05560c3 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -993,6 +993,11 @@ setGeneric("md5", function(x) { standardGeneric("md5") })
#' @export
setGeneric("minute", function(x) { standardGeneric("minute") })
+#' @rdname monotonically_increasing_id
+#' @export
+setGeneric("monotonically_increasing_id",
+ function(x) { standardGeneric("monotonically_increasing_id") })
+
#' @rdname month
#' @export
setGeneric("month", function(x) { standardGeneric("month") })
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index fcc2ab3ed6..c5c5a069a8 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1047,7 +1047,7 @@ test_that("column functions", {
c5 <- hour(c) + initcap(c) + last(c) + last_day(c) + length(c)
c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c)
c7 <- mean(c) + min(c) + month(c) + negate(c) + quarter(c)
- c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c)
+ c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) + monotonically_increasing_id()
c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c)
c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c)
c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)