aboutsummaryrefslogtreecommitdiff
path: root/R
diff options
context:
space:
mode:
authorJunyang Qian <junyangq@databricks.com>2016-09-03 12:26:30 -0700
committerFelix Cheung <felixcheung@apache.org>2016-09-03 12:26:30 -0700
commitabb2f921036d97d8cab033838ae559eb731bf0fd (patch)
tree0bfe7656e726402591cba3def5fe77ff09526f74 /R
parentc2a1576c230697f56f282b6388c79835377e0f2f (diff)
downloadspark-abb2f921036d97d8cab033838ae559eb731bf0fd.tar.gz
spark-abb2f921036d97d8cab033838ae559eb731bf0fd.tar.bz2
spark-abb2f921036d97d8cab033838ae559eb731bf0fd.zip
[SPARK-17315][SPARKR] Kolmogorov-Smirnov test SparkR wrapper
## What changes were proposed in this pull request? This PR tries to add Kolmogorov-Smirnov Test wrapper to SparkR. This wrapper implementation only supports one sample test against normal distribution. ## How was this patch tested? R unit test. Author: Junyang Qian <junyangq@databricks.com> Closes #14881 from junyangq/SPARK-17315.
Diffstat (limited to 'R')
-rw-r--r--R/pkg/NAMESPACE7
-rw-r--r--R/pkg/R/generics.R4
-rw-r--r--R/pkg/R/mllib.R105
-rw-r--r--R/pkg/inst/tests/testthat/test_mllib.R34
4 files changed, 148 insertions, 2 deletions
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index ce41b512a4..a5e9cbdc37 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -42,7 +42,8 @@ exportMethods("glm",
"spark.perplexity",
"spark.isoreg",
"spark.gaussianMixture",
- "spark.als")
+ "spark.als",
+ "spark.kstest")
# Job group lifecycle management methods
export("setJobGroup",
@@ -342,7 +343,8 @@ export("as.DataFrame",
"tables",
"uncacheTable",
"print.summary.GeneralizedLinearRegressionModel",
- "read.ml")
+ "read.ml",
+ "print.summary.KSTest")
export("structField",
"structField.jobj",
@@ -366,6 +368,7 @@ S3method(print, jobj)
S3method(print, structField)
S3method(print, structType)
S3method(print, summary.GeneralizedLinearRegressionModel)
+S3method(print, summary.KSTest)
S3method(structField, character)
S3method(structField, jobj)
S3method(structType, jobj)
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 7e626be508..67a999da9b 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1375,3 +1375,7 @@ setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml")
#' @rdname spark.als
#' @export
setGeneric("spark.als", function(data, ...) { standardGeneric("spark.als") })
+
+#' @rdname spark.kstest
+#' @export
+setGeneric("spark.kstest",
+           function(data, ...) {
+             standardGeneric("spark.kstest")
+           })
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 9a53f757b4..f321fd19b3 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -88,6 +88,13 @@ setClass("GaussianMixtureModel", representation(jobj = "jobj"))
#' @note ALSModel since 2.1.0
setClass("ALSModel", representation(jobj = "jobj"))
+#' S4 class that represents a KSTest
+#'
+#' @param jobj a Java object reference to the backing Scala KSTestWrapper
+#' @export
+#' @note KSTest since 2.1.0
+setClass("KSTest", representation(jobj = "jobj"))
+
#' Saves the MLlib model to the input path
#'
#' Saves the MLlib model to the input path. For more information, see the specific
@@ -1310,3 +1317,101 @@ setMethod("write.ml", signature(object = "ALSModel", path = "character"),
function(object, path, overwrite = FALSE) {
write_internal(object, path, overwrite)
})
+
+#' (One-Sample) Kolmogorov-Smirnov Test
+#'
+#' @description
+#' \code{spark.kstest} Conduct the two-sided Kolmogorov-Smirnov (KS) test for data sampled from a
+#' continuous distribution.
+#'
+#' By comparing the largest difference between the empirical cumulative
+#' distribution of the sample data and the theoretical distribution we can provide a test for the
+#' null hypothesis that the sample data comes from that theoretical distribution.
+#'
+#' Users can call \code{summary} to obtain a summary of the test, and \code{print.summary.KSTest}
+#' to print out a summary result.
+#'
+#' @param data a SparkDataFrame of user data.
+#' @param testCol column name where the test data is from. It should be a column of double type.
+#' @param nullHypothesis name of the theoretical distribution tested against. Currently only
+#' \code{"norm"} for normal distribution is supported.
+#' @param distParams parameter(s) of the distribution. For \code{nullHypothesis = "norm"},
+#' we can provide as a vector the mean and standard deviation of
+#' the distribution. If none is provided, then standard normal will be used.
+#' If only one is provided, then the standard deviation will be set to be one.
+#' @param ... additional argument(s) passed to the method.
+#' @return \code{spark.kstest} returns a test result object.
+#' @rdname spark.kstest
+#' @aliases spark.kstest,SparkDataFrame-method
+#' @name spark.kstest
+#' @seealso \href{http://spark.apache.org/docs/latest/mllib-statistics.html#hypothesis-testing}{
+#' MLlib: Hypothesis Testing}
+#' @export
+#' @examples
+#' \dontrun{
+#' data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25))
+#' df <- createDataFrame(data)
+#' test <- spark.kstest(df, "test", "norm", c(0, 1))
+#'
+#' # get a summary of the test result
+#' testSummary <- summary(test)
+#' testSummary
+#'
+#' # print out the summary in an organized way
+#' print.summary.KSTest(test)
+#' }
+#' @note spark.kstest since 2.1.0
+setMethod("spark.kstest", signature(data = "SparkDataFrame"),
+          function(data, testCol = "test", nullHypothesis = c("norm"), distParams = c(0, 1)) {
+            # Validate the requested distribution and keep the resolved name. match.arg
+            # accepts abbreviations (e.g. "no" for "norm"); without storing its result such
+            # input would pass validation, fail the comparison below and return NULL.
+            nullHypothesis <- tryCatch(
+              match.arg(nullHypothesis),
+              error = function(e) {
+                msg <- paste("Distribution", nullHypothesis, "is not supported.")
+                stop(msg)
+              })
+            if (nullHypothesis == "norm") {
+              distParams <- as.numeric(distParams)
+              # Missing parameters fall back to the standard normal: mean 0, sd 1.
+              # `[[` extracts the bare value (no names), as the scalar ifelse() did.
+              mu <- if (length(distParams) < 1) 0 else distParams[[1]]
+              sigma <- if (length(distParams) < 2) 1 else distParams[[2]]
+              # Run the test on the JVM side and wrap the returned reference.
+              jobj <- callJStatic("org.apache.spark.ml.r.KSTestWrapper",
+                                  "test", data@sdf, testCol, nullHypothesis,
+                                  as.array(c(mu, sigma)))
+              new("KSTest", jobj = jobj)
+            }
+          })
+
+# Get the summary of Kolmogorov-Smirnov (KS) Test.
+#' @param object test result object of KSTest by \code{spark.kstest}.
+#' @return \code{summary} returns a list containing the p-value, test statistic computed for the
+#' test, the null hypothesis with its parameters tested against
+#' and degrees of freedom of the test.
+#' @rdname spark.kstest
+#' @aliases summary,KSTest-method
+#' @export
+#' @note summary(KSTest) since 2.1.0
+setMethod("summary", signature(object = "KSTest"),
+          function(object) {
+            # Every field is fetched from the Java object held in the jobj slot; list()
+            # evaluates its arguments in order, so the calls happen top to bottom.
+            wrapper <- object@jobj
+            list(p.value = callJMethod(wrapper, "pValue"),
+                 statistic = callJMethod(wrapper, "statistic"),
+                 nullHypothesis = callJMethod(wrapper, "nullHypothesis"),
+                 nullHypothesis.name = callJMethod(wrapper, "distName"),
+                 nullHypothesis.parameters = unlist(callJMethod(wrapper, "distParams")),
+                 degreesOfFreedom = callJMethod(wrapper, "degreesOfFreedom"))
+          })
+
+# Prints the summary of KSTest
+
+#' @rdname spark.kstest
+#' @param x test result object of KSTest by \code{spark.kstest}.
+#' @export
+#' @note print.summary.KSTest since 2.1.0
+print.summary.KSTest <- function(x, ...) {
+  # x must expose a jobj slot; the text to print is produced by its "summary" method.
+  summaryText <- callJMethod(x@jobj, "summary")
+  cat(summaryText)
+  # Hand the printed text back invisibly so callers can inspect it.
+  invisible(summaryText)
+}
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 825a24073b..ca25f2c7e8 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -742,4 +742,38 @@ test_that("spark.als", {
unlink(modelPath)
})
+test_that("spark.kstest", {
+  # Sample shared by both scenarios.
+  localData <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25, -1, -0.5))
+  sparkDF <- createDataFrame(localData)
+
+  # Scenario 1: no distribution parameters supplied; compare against R's ks.test
+  # with pnorm defaults.
+  ksResult <- spark.kstest(sparkDF, "test", "norm")
+  sparkStats <- summary(ksResult)
+
+  baseStats <- ks.test(localData$test, "pnorm", alternative = "two.sided")
+
+  expect_equal(sparkStats$p.value, baseStats$p.value, tolerance = 1e-4)
+  expect_equal(sparkStats$statistic, unname(baseStats$statistic), tolerance = 1e-4)
+
+  printedText <- print.summary.KSTest(ksResult)
+  defaultPattern <- paste0("Kolmogorov-Smirnov test summary:\\n",
+                           "degrees of freedom = 0 \\n",
+                           "statistic = 0.38208[0-9]* \\n",
+                           "pValue = 0.19849[0-9]* \\n",
+                           ".*")
+  expect_match(printedText, defaultPattern, perl = TRUE)
+
+  # Scenario 2: only the mean (-0.5) is supplied; the R reference uses sd = 1.
+  ksResult <- spark.kstest(sparkDF, "test", "norm", -0.5)
+  sparkStats <- summary(ksResult)
+
+  baseStats <- ks.test(localData$test, "pnorm", -0.5, 1, alternative = "two.sided")
+
+  expect_equal(sparkStats$p.value, baseStats$p.value, tolerance = 1e-4)
+  expect_equal(sparkStats$statistic, unname(baseStats$statistic), tolerance = 1e-4)
+
+  printedText <- print.summary.KSTest(ksResult)
+  shiftedPattern <- paste0("Kolmogorov-Smirnov test summary:\\n",
+                           "degrees of freedom = 0 \\n",
+                           "statistic = 0.44003[0-9]* \\n",
+                           "pValue = 0.09470[0-9]* \\n",
+                           ".*")
+  expect_match(printedText, shiftedPattern, perl = TRUE)
+})
+
sparkR.session.stop()