aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: zero323 <zero323@users.noreply.github.com> 2017-04-24 10:56:57 -0700
committer: Felix Cheung <felixcheung@apache.org> 2017-04-24 10:56:57 -0700
commit: 8a272ddc9d2359a724aa89ae2f8de121a4aa7ac2 (patch)
tree: 38742432d743167daddcf0ad030b553fe6caf23e
parent: 90264aced7cfdf265636517b91e5d1324fe60112 (diff)
download: spark-8a272ddc9d2359a724aa89ae2f8de121a4aa7ac2.tar.gz
spark-8a272ddc9d2359a724aa89ae2f8de121a4aa7ac2.tar.bz2
spark-8a272ddc9d2359a724aa89ae2f8de121a4aa7ac2.zip
[SPARK-20438][R] SparkR wrappers for split and repeat (HEAD, master)
## What changes were proposed in this pull request?

Add wrappers for `o.a.s.sql.functions`:

- `split` as `split_string`
- `repeat` as `repeat_string`

## How was this patch tested?

Existing tests, additional unit tests, `check-cran.sh`

Author: zero323 <zero323@users.noreply.github.com>

Closes #17729 from zero323/SPARK-20438.
-rw-r--r-- R/pkg/NAMESPACE | 2
-rw-r--r-- R/pkg/R/functions.R | 58
-rw-r--r-- R/pkg/R/generics.R | 8
-rw-r--r-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 34
4 files changed, 102 insertions, 0 deletions
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index e804e30e14..95d5cc6d1c 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -300,6 +300,7 @@ exportMethods("%in%",
"rank",
"regexp_extract",
"regexp_replace",
+ "repeat_string",
"reverse",
"rint",
"rlike",
@@ -323,6 +324,7 @@ exportMethods("%in%",
"sort_array",
"soundex",
"spark_partition_id",
+ "split_string",
"stddev",
"stddev_pop",
"stddev_samp",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index e7decb9186..752e4c5c71 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3745,3 +3745,61 @@ setMethod("collect_set",
jc <- callJStatic("org.apache.spark.sql.functions", "collect_set", x@jc)
column(jc)
})
+
+#' split_string
+#'
+#' Splits the string values of a column around matches of a regular expression.
+#'
+#' Equivalent to the \code{split} SQL function.
+#'
+#' @param x Column of strings to split
+#' @param pattern Java regular expression, given as a character string
+#'
+#' @rdname split_string
+#' @family string_funcs
+#' @aliases split_string,Column-method
+#' @export
+#' @examples \dontrun{
+#' df <- read.text("README.md")
+#'
+#' head(select(df, split_string(df$value, "\\s+")))
+#'
+#' # This is equivalent to the following SQL expression
+#' head(selectExpr(df, "split(value, '\\\\s+')"))
+#' }
+#' @note split_string 2.3.0
+setMethod("split_string",
+ signature(x = "Column", pattern = "character"),
+ function(x, pattern) {
+ jc <- callJStatic("org.apache.spark.sql.functions", "split", x@jc, pattern)
+ column(jc)
+ })
+
+#' repeat_string
+#'
+#' Repeats each string value of a column \code{n} times.
+#'
+#' Equivalent to the \code{repeat} SQL function.
+#'
+#' @param x Column of strings to repeat
+#' @param n Number of repetitions, given as a numeric value
+#'
+#' @rdname repeat_string
+#' @family string_funcs
+#' @aliases repeat_string,Column-method
+#' @export
+#' @examples \dontrun{
+#' df <- read.text("README.md")
+#'
+#' first(select(df, repeat_string(df$value, 3)))
+#'
+#' # This is equivalent to the following SQL expression
+#' first(selectExpr(df, "repeat(value, 3)"))
+#' }
+#' @note repeat_string 2.3.0
+setMethod("repeat_string",
+ signature(x = "Column", n = "numeric"),
+ function(x, n) {
+ jc <- callJStatic("org.apache.spark.sql.functions", "repeat", x@jc, numToInt(n))
+ column(jc)
+ })
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 61d248ebd2..5e7a1c60c2 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1192,6 +1192,10 @@ setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp
setGeneric("regexp_replace",
function(x, pattern, replacement) { standardGeneric("regexp_replace") })
+#' @rdname repeat_string
+#' @export
+setGeneric("repeat_string", function(x, n) { standardGeneric("repeat_string") })
+
#' @rdname reverse
#' @export
setGeneric("reverse", function(x) { standardGeneric("reverse") })
@@ -1257,6 +1261,10 @@ setGeneric("skewness", function(x) { standardGeneric("skewness") })
#' @export
setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") })
+#' @rdname split_string
+#' @export
+setGeneric("split_string", function(x, pattern) { standardGeneric("split_string") })
+
#' @rdname soundex
#' @export
setGeneric("soundex", function(x) { standardGeneric("soundex") })
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index bf2093fdc4..c21ba2f1a1 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1546,6 +1546,40 @@ test_that("string operators", {
expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b")
expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d")
expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d")
+
+ l4 <- list(list(a = "a.b@c.d 1\\b"))
+ df4 <- createDataFrame(l4)
+ expect_equal(
+ collect(select(df4, split_string(df4$a, "\\s+")))[1, 1],
+ list(list("a.b@c.d", "1\\b"))
+ )
+ expect_equal(
+ collect(select(df4, split_string(df4$a, "\\.")))[1, 1],
+ list(list("a", "b@c", "d 1\\b"))
+ )
+ expect_equal(
+ collect(select(df4, split_string(df4$a, "@")))[1, 1],
+ list(list("a.b", "c.d 1\\b"))
+ )
+ expect_equal(
+ collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
+ list(list("a.b@c.d 1", "b"))
+ )
+
+ l5 <- list(list(a = "abc"))
+ df5 <- createDataFrame(l5)
+ expect_equal(
+ collect(select(df5, repeat_string(df5$a, 1L)))[1, 1],
+ "abc"
+ )
+ expect_equal(
+ collect(select(df5, repeat_string(df5$a, 3)))[1, 1],
+ "abcabcabc"
+ )
+ expect_equal(
+ collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
+ ""
+ )
})
test_that("date functions on a DataFrame", {