From 70f44ad2d836236c74e1336a7368982d5fe3abff Mon Sep 17 00:00:00 2001 From: Rerngvit Yanggratoke Date: Fri, 9 Oct 2015 09:36:40 -0700 Subject: [SPARK-10905] [SPARKR] Export freqItems() for DataFrameStatFunctions [SPARK-10905][SparkR]: Export freqItems() for DataFrameStatFunctions - Add function (together with roxygen2 doc) to DataFrame.R and generics.R - Expose the function in NAMESPACE - Add unit test for the function Author: Rerngvit Yanggratoke Closes #8962 from rerngvit/SPARK-10905. --- R/pkg/NAMESPACE | 1 + R/pkg/R/generics.R | 4 ++++ R/pkg/R/stats.R | 27 +++++++++++++++++++++++++++ R/pkg/inst/tests/test_sparkSQL.R | 21 +++++++++++++++++++++ 4 files changed, 53 insertions(+) (limited to 'R') diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 9aad35469b..255be2e76f 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -40,6 +40,7 @@ exportMethods("arrange", "fillna", "filter", "first", + "freqItems", "group_by", "groupBy", "head", diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index e9086fdbd1..c447413180 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -63,6 +63,10 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") }) # @export setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") }) +# @rdname statfunctions +# @export +setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") }) + # @rdname distinct # @export setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") }) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 06382d55d0..4928cf4d43 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -100,3 +100,30 @@ setMethod("corr", statFunctions <- callJMethod(x@sdf, "stat") callJMethod(statFunctions, "corr", col1, col2, method) }) + +#' freqItems +#' +#' Finding frequent items for columns, possibly with false positives. 
+#' Using the frequent element count algorithm described in +#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou. +#' +#' @param x A SparkSQL DataFrame. +#' @param cols A vector of column names to search frequent items in. +#' @param support (Optional) The minimum frequency for an item to be considered `frequent`. +#' Should be greater than 1e-4. Default support = 0.01. +#' @return a local R data.frame with the frequent items in each column +#' +#' @rdname statfunctions +#' @name freqItems +#' @export +#' @examples +#' \dontrun{ +#' df <- jsonFile(sqlContext, "/path/to/file.json") +#' fi = freqItems(df, c("title", "gender")) +#' } +setMethod("freqItems", signature(x = "DataFrame", cols = "character"), + function(x, cols, support = 0.01) { + statFunctions <- callJMethod(x@sdf, "stat") + sct <- callJMethod(statFunctions, "freqItems", as.list(cols), support) + collect(dataFrame(sct)) + }) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index e85de25070..4804ecf177 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1350,6 +1350,27 @@ test_that("cov() and corr() on a DataFrame", { expect_true(abs(result - 1.0) < 1e-12) }) +test_that("freqItems() on a DataFrame", { + input <- 1:1000 + rdf <- data.frame(numbers = input, letters = as.character(input), + negDoubles = input * -1.0, stringsAsFactors = F) + rdf[ input %% 3 == 0, ] <- c(1, "1", -1) + df <- createDataFrame(sqlContext, rdf) + multiColResults <- freqItems(df, c("numbers", "letters"), support=0.1) + expect_true(1 %in% multiColResults$numbers[[1]]) + expect_true("1" %in% multiColResults$letters[[1]]) + singleColResult <- freqItems(df, "negDoubles", support=0.1) + expect_true(-1 %in% head(singleColResult$negDoubles)[[1]]) + + l <- lapply(c(0:99), function(i) { + if (i %% 2 == 0) { list(1L, -1.0) } + else { list(i, i * -1.0) }}) + df <- createDataFrame(sqlContext, l, c("a", "b")) + result <- freqItems(df, 
c("a", "b"), 0.4) + expect_identical(result[[1]], list(list(1L, 99L))) + expect_identical(result[[2]], list(list(-1, -99))) +}) + test_that("SQL error message is returned from JVM", { retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e) expect_equal(grepl("Table Not Found: blah", retError), TRUE) -- cgit v1.2.3