aboutsummaryrefslogtreecommitdiff
path: root/R
diff options
context:
space:
mode:
authorYu ISHIKAWA <yuu.ishikawa@gmail.com>2015-08-12 18:33:27 -0700
committerShivaram Venkataraman <shivaram@cs.berkeley.edu>2015-08-12 18:33:27 -0700
commitf4bc01f1f33a93e6affe5c8a3e33ffbd92d03f38 (patch)
tree025d97eb1343830aa7f276bc0ec47bc1552cc3aa /R
parent0d1d146c220f0d47d0e62b368d5b94d3bd9dd197 (diff)
downloadspark-f4bc01f1f33a93e6affe5c8a3e33ffbd92d03f38.tar.gz
spark-f4bc01f1f33a93e6affe5c8a3e33ffbd92d03f38.tar.bz2
spark-f4bc01f1f33a93e6affe5c8a3e33ffbd92d03f38.zip
[SPARK-9855] [SPARKR] Add expression functions into SparkR whose params are simple
I added lots of expression functions for SparkR. This PR includes only functions whose params are only `(Column)` or `(Column, Column)`. And I think we need to improve how to test those functions. However, it would be better to do that in a separate issue. ## Diff Summary - Add lots of functions in `functions.R` and their generics in `generics.R` - Add aliases for `ceiling` and `sign` - Move expression functions from `column.R` to `functions.R` - Modify `rdname` from `column` to `functions` I haven't supported the `not` function, because its name has a collision with the `testthat` package, and I haven't figured out a way to define it. ## New Supported Functions ``` approxCountDistinct ascii base64 bin bitwiseNOT ceil (alias: ceiling) crc32 dayofmonth dayofyear explode factorial hex hour initcap isNaN last_day length log2 ltrim md5 minute month negate quarter reverse round rtrim second sha1 signum (alias: sign) size soundex to_date trim unbase64 unhex weekofyear year datediff levenshtein months_between nanvl pmod ``` ## JIRA [[SPARK-9855] Add expression functions into SparkR whose params are simple - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-9855) Author: Yu ISHIKAWA <yuu.ishikawa@gmail.com> Closes #8123 from yu-iskw/SPARK-9855.
Diffstat (limited to 'R')
-rw-r--r--R/pkg/DESCRIPTION1
-rw-r--r--R/pkg/R/column.R81
-rw-r--r--R/pkg/R/functions.R123
-rw-r--r--R/pkg/R/generics.R185
-rw-r--r--R/pkg/inst/tests/test_sparkSQL.R21
5 files changed, 309 insertions, 102 deletions
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 4949d86d20..83e6489721 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -29,6 +29,7 @@ Collate:
'client.R'
'context.R'
'deserialize.R'
+ 'functions.R'
'mllib.R'
'serialize.R'
'sparkR.R'
diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index eeaf9f193b..328f595d08 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -60,12 +60,6 @@ operators <- list(
)
column_functions1 <- c("asc", "desc", "isNull", "isNotNull")
column_functions2 <- c("like", "rlike", "startsWith", "endsWith", "getField", "getItem", "contains")
-functions <- c("min", "max", "sum", "avg", "mean", "count", "abs", "sqrt",
- "first", "last", "lower", "upper", "sumDistinct",
- "acos", "asin", "atan", "cbrt", "ceiling", "cos", "cosh", "exp",
- "expm1", "floor", "log", "log10", "log1p", "rint", "sign",
- "sin", "sinh", "tan", "tanh", "toDegrees", "toRadians")
-binary_mathfunctions <- c("atan2", "hypot")
createOperator <- function(op) {
setMethod(op,
@@ -111,33 +105,6 @@ createColumnFunction2 <- function(name) {
})
}
-createStaticFunction <- function(name) {
- setMethod(name,
- signature(x = "Column"),
- function(x) {
- if (name == "ceiling") {
- name <- "ceil"
- }
- if (name == "sign") {
- name <- "signum"
- }
- jc <- callJStatic("org.apache.spark.sql.functions", name, x@jc)
- column(jc)
- })
-}
-
-createBinaryMathfunctions <- function(name) {
- setMethod(name,
- signature(y = "Column"),
- function(y, x) {
- if (class(x) == "Column") {
- x <- x@jc
- }
- jc <- callJStatic("org.apache.spark.sql.functions", name, y@jc, x)
- column(jc)
- })
-}
-
createMethods <- function() {
for (op in names(operators)) {
createOperator(op)
@@ -148,12 +115,6 @@ createMethods <- function() {
for (name in column_functions2) {
createColumnFunction2(name)
}
- for (x in functions) {
- createStaticFunction(x)
- }
- for (name in binary_mathfunctions) {
- createBinaryMathfunctions(name)
- }
}
createMethods()
@@ -242,45 +203,3 @@ setMethod("%in%",
jc <- callJMethod(x@jc, "in", table)
return(column(jc))
})
-
-#' Approx Count Distinct
-#'
-#' @rdname column
-#' @return the approximate number of distinct items in a group.
-setMethod("approxCountDistinct",
- signature(x = "Column"),
- function(x, rsd = 0.95) {
- jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc, rsd)
- column(jc)
- })
-
-#' Count Distinct
-#'
-#' @rdname column
-#' @return the number of distinct items in a group.
-setMethod("countDistinct",
- signature(x = "Column"),
- function(x, ...) {
- jcol <- lapply(list(...), function (x) {
- x@jc
- })
- jc <- callJStatic("org.apache.spark.sql.functions", "countDistinct", x@jc,
- listToSeq(jcol))
- column(jc)
- })
-
-#' @rdname column
-#' @aliases countDistinct
-setMethod("n_distinct",
- signature(x = "Column"),
- function(x, ...) {
- countDistinct(x, ...)
- })
-
-#' @rdname column
-#' @aliases count
-setMethod("n",
- signature(x = "Column"),
- function(x) {
- count(x)
- })
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
new file mode 100644
index 0000000000..a15d2d5da5
--- /dev/null
+++ b/R/pkg/R/functions.R
@@ -0,0 +1,123 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#' @include generics.R column.R
+NULL
+
+#' @title S4 expression functions for DataFrame column(s)
+#' @description These are expression functions on DataFrame columns
+
+functions1 <- c(
+  # Names of one-argument functions; createFunctions() hands each of them to
+  # createFunction1(), which registers a Column method of the same name.
+  # "approxCountDistinct" is intentionally not listed: this file registers an
+  # explicit method for it (with an extra `rsd` argument), which would replace
+  # a generated method for the same signature anyway.
+  "abs", "acos", "ascii", "asin", "atan",
+  "avg", "base64", "bin", "bitwiseNOT", "cbrt", "ceil", "cos", "cosh", "count",
+  "crc32", "dayofmonth", "dayofyear", "exp", "explode", "expm1", "factorial",
+  "first", "floor", "hex", "hour", "initcap", "isNaN", "last", "last_day",
+  "length", "log", "log10", "log1p", "log2", "lower", "ltrim", "max", "md5",
+  "mean", "min", "minute", "month", "negate", "quarter", "reverse",
+  "rint", "round", "rtrim", "second", "sha1", "signum", "sin", "sinh", "size",
+  "soundex", "sqrt", "sum", "sumDistinct", "tan", "tanh", "toDegrees",
+  "toRadians", "to_date", "trim", "unbase64", "unhex", "upper", "weekofyear",
+  "year")
+functions2 <- c(
+  # Names of two-argument functions; createFunctions() hands each of them to
+  # createFunction2().
+  "atan2", "datediff", "hypot", "levenshtein", "months_between", "nanvl", "pmod")
+
+# Registers an S4 method called `name` for a single Column argument `x`.
+# The generated method calls the static function `name` on
+# org.apache.spark.sql.functions with the `jc` slot of `x` and passes the
+# result to column().
+createFunction1 <- function(name) {
+  setMethod(name,
+            signature(x = "Column"),
+            function(x) {
+              column(callJStatic("org.apache.spark.sql.functions", name, x@jc))
+            })
+}
+
+# Registers an S4 method called `name` that dispatches on `y` being a Column.
+# The generated method calls the static function `name` on
+# org.apache.spark.sql.functions with the `jc` slot of `y` and with `x`, then
+# passes the result to column().
+#
+# `x` may itself be a Column, in which case its `jc` slot is sent instead;
+# any other value is forwarded to the call unchanged.
+createFunction2 <- function(name) {
+  setMethod(name,
+            signature(y = "Column"),
+            function(y, x) {
+              # inherits() always yields a single TRUE/FALSE, whereas
+              # class(x) == "Column" produces a vector condition for objects
+              # with several classes and does not recognise subclasses.
+              if (inherits(x, "Column")) {
+                x <- x@jc
+              }
+              jc <- callJStatic("org.apache.spark.sql.functions", name, y@jc, x)
+              column(jc)
+            })
+}
+
+# Registers all generated Column methods: createFunction1() for every name in
+# functions1, then createFunction2() for every name in functions2.
+createFunctions <- function() {
+  lapply(functions1, createFunction1)
+  lapply(functions2, createFunction2)
+  # Nothing useful to return; the registrations are the side effect.
+  invisible(NULL)
+}
+
+createFunctions()
+
+#' Approx Count Distinct
+#'
+#' @rdname functions
+#' @return the approximate number of distinct items in a group.
+setMethod("approxCountDistinct",
+          signature(x = "Column"),
+          function(x, rsd = 0.95) {
+            # `rsd` is forwarded untouched to the static approxCountDistinct call.
+            # NOTE(review): presumably `rsd` is the allowed relative error of the
+            # estimate; a default of 0.95 looks unusually large - confirm against
+            # the Spark SQL functions API before relying on it.
+            column(callJStatic("org.apache.spark.sql.functions",
+                               "approxCountDistinct", x@jc, rsd))
+          })
+
+#' Count Distinct
+#'
+#' @rdname functions
+#' @return the number of distinct items in a group.
+setMethod("countDistinct",
+          signature(x = "Column"),
+          function(x, ...) {
+            # Take the `jc` slot of every additional column supplied via `...`.
+            extraCols <- lapply(list(...), function(col) col@jc)
+            column(callJStatic("org.apache.spark.sql.functions", "countDistinct",
+                               x@jc, listToSeq(extraCols)))
+          })
+
+#' @rdname functions
+#' @aliases ceil
+setMethod("ceiling",
+          signature(x = "Column"),
+          # Alias: hands the column straight to ceil().
+          function(x) ceil(x))
+
+#' @rdname functions
+#' @aliases signum
+setMethod("sign",
+          signature(x = "Column"),
+          # Alias: hands the column straight to signum().
+          function(x) signum(x))
+
+#' @rdname functions
+#' @aliases countDistinct
+setMethod("n_distinct",
+          signature(x = "Column"),
+          # Alias: forwards the column and any extra arguments to countDistinct().
+          function(x, ...) countDistinct(x, ...))
+
+#' @rdname functions
+#' @aliases count
+setMethod("n",
+          signature(x = "Column"),
+          # Alias: hands the column straight to count().
+          function(x) count(x))
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 379a78b1d8..f11e7fcb6a 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -577,10 +577,6 @@ setGeneric("asc", function(x) { standardGeneric("asc") })
#' @rdname column
#' @export
-setGeneric("avg", function(x, ...) { standardGeneric("avg") })
-
-#' @rdname column
-#' @export
setGeneric("between", function(x, bounds) { standardGeneric("between") })
#' @rdname column
@@ -589,11 +585,8 @@ setGeneric("cast", function(x, dataType) { standardGeneric("cast") })
#' @rdname column
#' @export
-setGeneric("cbrt", function(x) { standardGeneric("cbrt") })
-
-#' @rdname column
-#' @export
setGeneric("contains", function(x, ...) { standardGeneric("contains") })
+
#' @rdname column
#' @export
setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct") })
@@ -658,22 +651,190 @@ setGeneric("rlike", function(x, ...) { standardGeneric("rlike") })
#' @export
setGeneric("startsWith", function(x, ...) { standardGeneric("startsWith") })
-#' @rdname column
+
+###################### Expression Function Methods ##########################
+
+#' @rdname functions
+#' @export
+setGeneric("ascii", function(x) { standardGeneric("ascii") })
+
+#' @rdname functions
+#' @export
+setGeneric("avg", function(x, ...) { standardGeneric("avg") })
+
+#' @rdname functions
+#' @export
+setGeneric("base64", function(x) { standardGeneric("base64") })
+
+#' @rdname functions
+#' @export
+setGeneric("bin", function(x) { standardGeneric("bin") })
+
+#' @rdname functions
+#' @export
+setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") })
+
+#' @rdname functions
+#' @export
+setGeneric("cbrt", function(x) { standardGeneric("cbrt") })
+
+#' @rdname functions
+#' @export
+setGeneric("ceil", function(x) { standardGeneric("ceil") })
+
+#' @rdname functions
+#' @export
+setGeneric("crc32", function(x) { standardGeneric("crc32") })
+
+#' @rdname functions
+#' @export
+setGeneric("datediff", function(y, x) { standardGeneric("datediff") })
+
+#' @rdname functions
+#' @export
+setGeneric("dayofmonth", function(x) { standardGeneric("dayofmonth") })
+
+#' @rdname functions
+#' @export
+setGeneric("dayofyear", function(x) { standardGeneric("dayofyear") })
+
+#' @rdname functions
+#' @export
+setGeneric("explode", function(x) { standardGeneric("explode") })
+
+#' @rdname functions
+#' @export
+setGeneric("hex", function(x) { standardGeneric("hex") })
+
+#' @rdname functions
+#' @export
+setGeneric("hour", function(x) { standardGeneric("hour") })
+
+#' @rdname functions
+#' @export
+setGeneric("initcap", function(x) { standardGeneric("initcap") })
+
+#' @rdname functions
+#' @export
+setGeneric("isNaN", function(x) { standardGeneric("isNaN") })
+
+#' @rdname functions
+#' @export
+setGeneric("last_day", function(x) { standardGeneric("last_day") })
+
+#' @rdname functions
+#' @export
+setGeneric("levenshtein", function(y, x) { standardGeneric("levenshtein") })
+
+#' @rdname functions
+#' @export
+setGeneric("lower", function(x) { standardGeneric("lower") })
+
+#' @rdname functions
+#' @export
+setGeneric("ltrim", function(x) { standardGeneric("ltrim") })
+
+#' @rdname functions
+#' @export
+setGeneric("md5", function(x) { standardGeneric("md5") })
+
+#' @rdname functions
+#' @export
+setGeneric("minute", function(x) { standardGeneric("minute") })
+
+#' @rdname functions
+#' @export
+setGeneric("month", function(x) { standardGeneric("month") })
+
+#' @rdname functions
+#' @export
+setGeneric("months_between", function(y, x) { standardGeneric("months_between") })
+
+#' @rdname functions
+#' @export
+setGeneric("nanvl", function(y, x) { standardGeneric("nanvl") })
+
+#' @rdname functions
+#' @export
+setGeneric("negate", function(x) { standardGeneric("negate") })
+
+#' @rdname functions
+#' @export
+setGeneric("pmod", function(y, x) { standardGeneric("pmod") })
+
+#' @rdname functions
+#' @export
+setGeneric("quarter", function(x) { standardGeneric("quarter") })
+
+#' @rdname functions
+#' @export
+setGeneric("reverse", function(x) { standardGeneric("reverse") })
+
+#' @rdname functions
+#' @export
+setGeneric("rtrim", function(x) { standardGeneric("rtrim") })
+
+#' @rdname functions
+#' @export
+setGeneric("second", function(x) { standardGeneric("second") })
+
+#' @rdname functions
+#' @export
+setGeneric("sha1", function(x) { standardGeneric("sha1") })
+
+#' @rdname functions
+#' @export
+setGeneric("signum", function(x) { standardGeneric("signum") })
+
+#' @rdname functions
+#' @export
+setGeneric("size", function(x) { standardGeneric("size") })
+
+#' @rdname functions
+#' @export
+setGeneric("soundex", function(x) { standardGeneric("soundex") })
+
+#' @rdname functions
#' @export
setGeneric("sumDistinct", function(x) { standardGeneric("sumDistinct") })
-#' @rdname column
+#' @rdname functions
#' @export
setGeneric("toDegrees", function(x) { standardGeneric("toDegrees") })
-#' @rdname column
+#' @rdname functions
#' @export
setGeneric("toRadians", function(x) { standardGeneric("toRadians") })
-#' @rdname column
+#' @rdname functions
+#' @export
+setGeneric("to_date", function(x) { standardGeneric("to_date") })
+
+#' @rdname functions
+#' @export
+setGeneric("trim", function(x) { standardGeneric("trim") })
+
+#' @rdname functions
+#' @export
+setGeneric("unbase64", function(x) { standardGeneric("unbase64") })
+
+#' @rdname functions
+#' @export
+setGeneric("unhex", function(x) { standardGeneric("unhex") })
+
+#' @rdname functions
#' @export
setGeneric("upper", function(x) { standardGeneric("upper") })
+#' @rdname functions
+#' @export
+setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
+
+#' @rdname functions
+#' @export
+setGeneric("year", function(x) { standardGeneric("year") })
+
+
#' @rdname glm
#' @export
setGeneric("glm")
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 7377fc8f1c..e6d3b21ff8 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -640,15 +640,18 @@ test_that("column operators", {
test_that("column functions", {
c <- SparkR:::col("a")
- c2 <- min(c) + max(c) + sum(c) + avg(c) + count(c) + abs(c) + sqrt(c)
- c3 <- lower(c) + upper(c) + first(c) + last(c)
- c4 <- approxCountDistinct(c) + countDistinct(c) + cast(c, "string")
- c5 <- n(c) + n_distinct(c)
- c5 <- acos(c) + asin(c) + atan(c) + cbrt(c)
- c6 <- ceiling(c) + cos(c) + cosh(c) + exp(c) + expm1(c)
- c7 <- floor(c) + log(c) + log10(c) + log1p(c) + rint(c)
- c8 <- sign(c) + sin(c) + sinh(c) + tan(c) + tanh(c)
- c9 <- toDegrees(c) + toRadians(c)
+  # Call every single-argument expression function on the Column `c` and
+  # combine the results with `+`; these lines assert no values themselves.
+  c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + atan(c)
+  c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c)
+  c3 <- cosh(c) + count(c) + crc32(c) + dayofmonth(c) + dayofyear(c) + exp(c)
+  c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
+  c5 <- hour(c) + initcap(c) + isNaN(c) + last(c) + last_day(c) + length(c)
+  # log10(c) was previously written as a bare `(c)`, leaving log10 unexercised.
+  c6 <- log(c) + log10(c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c)
+  c7 <- mean(c) + min(c) + minute(c) + month(c) + negate(c) + quarter(c)
+  c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + second(c) + sha1(c)
+  c9 <- signum(c) + sin(c) + sinh(c) + size(c) + soundex(c) + sqrt(c) + sum(c)
+  c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c)
+  c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c) + weekofyear(c)
+  c12 <- year(c)
df <- jsonFile(sqlContext, jsonPath)
df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20)))