aboutsummaryrefslogtreecommitdiff
path: root/R
diff options
context:
space:
mode:
authoractuaryzhang <actuaryzhang10@gmail.com>2017-03-14 00:50:38 -0700
committerFelix Cheung <felixcheung@apache.org>2017-03-14 00:50:38 -0700
commitf6314eab4b494bd5b5e9e41c6f582d4f22c0967a (patch)
treeff067df4be9eb6f3b660abf8332136d778201146 /R
parent415f9f3423aacc395097e40427364c921a2ed7f1 (diff)
downloadspark-f6314eab4b494bd5b5e9e41c6f582d4f22c0967a.tar.gz
spark-f6314eab4b494bd5b5e9e41c6f582d4f22c0967a.tar.bz2
spark-f6314eab4b494bd5b5e9e41c6f582d4f22c0967a.zip
[SPARK-19391][SPARKR][ML] Tweedie GLM API for SparkR
## What changes were proposed in this pull request? Port Tweedie GLM #16344 to SparkR felixcheung yanboliang ## How was this patch tested? new test in SparkR Author: actuaryzhang <actuaryzhang10@gmail.com> Closes #16729 from actuaryzhang/sparkRTweedie.
Diffstat (limited to 'R')
-rw-r--r--R/pkg/R/mllib_regression.R55
-rw-r--r--R/pkg/inst/tests/testthat/test_mllib_regression.R38
-rw-r--r--R/pkg/vignettes/sparkr-vignettes.Rmd19
3 files changed, 102 insertions, 10 deletions
diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 648d363f1a..d59c890f3e 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -53,12 +53,23 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
#' the result of a call to a family function. Refer R family at
#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
#' Currently these families are supported: \code{binomial}, \code{gaussian},
-#' \code{Gamma}, and \code{poisson}.
+#' \code{Gamma}, \code{poisson} and \code{tweedie}.
+#'
+#' Note that there are two ways to specify the tweedie family.
+#' \itemize{
+#' \item Set \code{family = "tweedie"} and specify the var.power and link.power;
+#' \item When package \code{statmod} is loaded, the tweedie family is specified using the
+#' family definition therein, i.e., \code{tweedie(var.power, link.power)}.
+#' }
#' @param tol positive convergence tolerance of iterations.
#' @param maxIter integer giving the maximal number of IRLS iterations.
#' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
#' weights as 1.0.
#' @param regParam regularization parameter for L2 regularization.
+#' @param var.power the power in the variance function of the Tweedie distribution which provides
+#' the relationship between the variance and mean of the distribution. Only
+#' applicable to the Tweedie family.
+#' @param link.power the index in the power link function. Only applicable to the Tweedie family.
#' @param ... additional arguments passed to the method.
#' @aliases spark.glm,SparkDataFrame,formula-method
#' @return \code{spark.glm} returns a fitted generalized linear model.
@@ -84,14 +95,30 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
#' # can also read back the saved model and print
#' savedModel <- read.ml(path)
#' summary(savedModel)
+#'
+#' # fit tweedie model
+#' model <- spark.glm(df, Freq ~ Sex + Age, family = "tweedie",
+#' var.power = 1.2, link.power = 0)
+#' summary(model)
+#'
+#' # use the tweedie family from statmod
+#' library(statmod)
+#' model <- spark.glm(df, Freq ~ Sex + Age, family = tweedie(1.2, 0))
+#' summary(model)
#' }
#' @note spark.glm since 2.0.0
#' @seealso \link{glm}, \link{read.ml}
setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
- regParam = 0.0) {
+ regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power) {
+
if (is.character(family)) {
- family <- get(family, mode = "function", envir = parent.frame())
+ # Handle when family = "tweedie"
+ if (tolower(family) == "tweedie") {
+ family <- list(family = "tweedie", link = NULL)
+ } else {
+ family <- get(family, mode = "function", envir = parent.frame())
+ }
}
if (is.function(family)) {
family <- family()
@@ -100,6 +127,12 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
print(family)
stop("'family' not recognized")
}
+ # Handle when family = statmod::tweedie()
+ if (tolower(family$family) == "tweedie" && !is.null(family$variance)) {
+ var.power <- log(family$variance(exp(1)))
+ link.power <- log(family$linkfun(exp(1)))
+ family <- list(family = "tweedie", link = NULL)
+ }
formula <- paste(deparse(formula), collapse = "")
if (!is.null(weightCol) && weightCol == "") {
@@ -111,7 +144,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
# For known families, Gamma is upper-cased
jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
"fit", formula, data@sdf, tolower(family$family), family$link,
- tol, as.integer(maxIter), weightCol, regParam)
+ tol, as.integer(maxIter), weightCol, regParam,
+ as.double(var.power), as.double(link.power))
new("GeneralizedLinearRegressionModel", jobj = jobj)
})
@@ -126,11 +160,13 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
#' the result of a call to a family function. Refer R family at
#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
#' Currently these families are supported: \code{binomial}, \code{gaussian},
-#' \code{Gamma}, and \code{poisson}.
+#' \code{poisson}, \code{Gamma}, and \code{tweedie}.
#' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
#' weights as 1.0.
#' @param epsilon positive convergence tolerance of iterations.
#' @param maxit integer giving the maximal number of IRLS iterations.
+#' @param var.power the index of the power variance function in the Tweedie family.
+#' @param link.power the index of the power link function in the Tweedie family.
#' @return \code{glm} returns a fitted generalized linear model.
#' @rdname glm
#' @export
@@ -145,8 +181,10 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
#' @note glm since 1.5.0
#' @seealso \link{spark.glm}
setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
- function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL) {
- spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol)
+ function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL,
+ var.power = 0.0, link.power = 1.0 - var.power) {
+ spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol,
+ var.power = var.power, link.power = link.power)
})
# Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().
@@ -172,9 +210,10 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
deviance <- callJMethod(jobj, "rDeviance")
df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull")
df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom")
- aic <- callJMethod(jobj, "rAic")
iter <- callJMethod(jobj, "rNumIterations")
family <- callJMethod(jobj, "rFamily")
+ aic <- callJMethod(jobj, "rAic")
+ if (family == "tweedie" && aic == 0) aic <- NA
deviance.resid <- if (is.loaded) {
NULL
} else {
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index 81a5bdc414..3e9ad77198 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -77,6 +77,24 @@ test_that("spark.glm and predict", {
out <- capture.output(print(summary(model)))
expect_true(any(grepl("Dispersion parameter for gamma family", out)))
+ # tweedie family
+ model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
+ family = "tweedie", var.power = 1.2, link.power = 0.0)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+
+ # manual calculation of the R predicted values to avoid dependence on statmod
+ #' library(statmod)
+ #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
+ #' family = tweedie(var.power = 1.2, link.power = 0.0))
+ #' print(coef(rModel))
+
+ rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
+ rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
+ data = iris) %*% rCoef))
+ expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
+
# Test stats::predict is working
x <- rnorm(15)
y <- x + rnorm(15)
@@ -233,7 +251,7 @@ test_that("glm and predict", {
training <- suppressWarnings(createDataFrame(iris))
# gaussian family
model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
- prediction <- predict(model, training)
+ prediction <- predict(model, training)
expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
vals <- collect(select(prediction, "prediction"))
rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
@@ -249,6 +267,24 @@ test_that("glm and predict", {
data = iris, family = poisson(link = identity)), iris))
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+ # tweedie family
+ model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
+ family = "tweedie", var.power = 1.2, link.power = 0.0)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+
+ # manual calculation of the R predicted values to avoid dependence on statmod
+ #' library(statmod)
+ #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
+ #' family = tweedie(var.power = 1.2, link.power = 0.0))
+ #' print(coef(rModel))
+
+ rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
+ rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
+ data = iris) %*% rCoef))
+ expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
+
# Test stats::predict is working
x <- rnorm(15)
y <- x + rnorm(15)
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 43c255cff3..a6ff650c33 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -672,6 +672,7 @@ gaussian | identity, log, inverse
binomial | logit, probit, cloglog (complementary log-log)
poisson | log, identity, sqrt
gamma | inverse, identity, log
+tweedie | power link function
There are three ways to specify the `family` argument.
@@ -679,7 +680,11 @@ There are three ways to specify the `family` argument.
* Family function, e.g. `family = binomial`.
-* Result returned by a family function, e.g. `family = poisson(link = log)`
+* Result returned by a family function, e.g. `family = poisson(link = log)`.
+
+* Note that there are two ways to specify the tweedie family:
+ a) Set `family = "tweedie"` and specify the `var.power` and `link.power`
+ b) When package `statmod` is loaded, the tweedie family is specified using the family definition therein, i.e., `tweedie()`.
For more information regarding the families and their link functions, see the Wikipedia page [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model).
@@ -695,6 +700,18 @@ gaussianFitted <- predict(gaussianGLM, carsDF)
head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp"))
```
+The following is the same fit using the tweedie family:
+```{r}
+tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", var.power = 0.0)
+summary(tweedieGLM1)
+```
+We can try other distributions in the tweedie family, for example, a compound Poisson distribution with a log link:
+```{r}
+tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie",
+ var.power = 1.2, link.power = 0.0)
+summary(tweedieGLM2)
+```
+
#### Isotonic Regression
`spark.isoreg` fits an [Isotonic Regression](https://en.wikipedia.org/wiki/Isotonic_regression) model against a `SparkDataFrame`. It solves a weighted univariate a regression problem under a complete order constraint. Specifically, given a set of real observed responses $y_1, \ldots, y_n$, corresponding real features $x_1, \ldots, x_n$, and optionally positive weights $w_1, \ldots, w_n$, we want to find a monotone (piecewise linear) function $f$ to minimize