about summary refs log tree commit diff
path: root/R
diff options
context:
space:
mode:
authorXiangrui Meng <meng@databricks.com>2016-06-21 08:31:15 -0700
committerXiangrui Meng <meng@databricks.com>2016-06-21 08:31:15 -0700
commit4f83ca1059a3b580fca3f006974ff5ac4d5212a1 (patch)
tree0d1c12a4c1f67574acdad034bc909e17117fa10b /R
parentf3a768b7b96f00f33d2fe4e6c0bf4acf373ad4f4 (diff)
downloadspark-4f83ca1059a3b580fca3f006974ff5ac4d5212a1.tar.gz
spark-4f83ca1059a3b580fca3f006974ff5ac4d5212a1.tar.bz2
spark-4f83ca1059a3b580fca3f006974ff5ac4d5212a1.zip
[SPARK-15177][.1][R] make SparkR model params and default values consistent with MLlib
## What changes were proposed in this pull request? This PR is a subset of #13023 by yanboliang to make SparkR model param names and default values consistent with MLlib. I tried to avoid other changes from #13023 to keep this PR minimal. I will send a follow-up PR to improve the documentation. Main changes: * `spark.glm`: epsilon -> tol, maxit -> maxIter * `spark.kmeans`: default k -> 2, default maxIter -> 20, default initMode -> "k-means||" * `spark.naiveBayes`: laplace -> smoothing, default 1.0 ## How was this patch tested? Existing unit tests. Author: Xiangrui Meng <meng@databricks.com> Closes #13801 from mengxr/SPARK-15177.1.
Diffstat (limited to 'R')
-rw-r--r--R/pkg/R/mllib.R74
-rw-r--r--R/pkg/inst/tests/testthat/test_mllib.R4
2 files changed, 38 insertions, 40 deletions
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 74dba8fe96..b83b3b3d3f 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -64,8 +64,8 @@ setClass("KMeansModel", representation(jobj = "jobj"))
#' This can be a character string naming a family function, a family function or
#' the result of a call to a family function. Refer R family at
#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
-#' @param epsilon Positive convergence tolerance of iterations.
-#' @param maxit Integer giving the maximal number of IRLS iterations.
+#' @param tol Positive convergence tolerance of iterations.
+#' @param maxIter Integer giving the maximal number of IRLS iterations.
#' @return a fitted generalized linear model
#' @rdname spark.glm
#' @export
@@ -74,32 +74,30 @@ setClass("KMeansModel", representation(jobj = "jobj"))
#' sparkR.session()
#' data(iris)
#' df <- createDataFrame(iris)
-#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family="gaussian")
+#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
#' summary(model)
#' }
#' @note spark.glm since 2.0.0
-setMethod(
- "spark.glm",
- signature(data = "SparkDataFrame", formula = "formula"),
- function(data, formula, family = gaussian, epsilon = 1e-06, maxit = 25) {
- if (is.character(family)) {
- family <- get(family, mode = "function", envir = parent.frame())
- }
- if (is.function(family)) {
- family <- family()
- }
- if (is.null(family$family)) {
- print(family)
- stop("'family' not recognized")
- }
+setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
+ function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25) {
+ if (is.character(family)) {
+ family <- get(family, mode = "function", envir = parent.frame())
+ }
+ if (is.function(family)) {
+ family <- family()
+ }
+ if (is.null(family$family)) {
+ print(family)
+ stop("'family' not recognized")
+ }
- formula <- paste(deparse(formula), collapse = "")
+ formula <- paste(deparse(formula), collapse = "")
- jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
- "fit", formula, data@sdf, family$family, family$link,
- epsilon, as.integer(maxit))
- return(new("GeneralizedLinearRegressionModel", jobj = jobj))
-})
+ jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
+ "fit", formula, data@sdf, family$family, family$link,
+ tol, as.integer(maxIter))
+ return(new("GeneralizedLinearRegressionModel", jobj = jobj))
+ })
#' Fits a generalized linear model (R-compliant).
#'
@@ -122,13 +120,13 @@ setMethod(
#' sparkR.session()
#' data(iris)
#' df <- createDataFrame(iris)
-#' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussian")
+#' model <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian")
#' summary(model)
#' }
#' @note glm since 1.5.0
setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
- function(formula, family = gaussian, data, epsilon = 1e-06, maxit = 25) {
- spark.glm(data, formula, family, epsilon, maxit)
+ function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25) {
+ spark.glm(data, formula, family, tol = epsilon, maxIter = maxit)
})
#' Get the summary of a generalized linear model
@@ -296,17 +294,17 @@ setMethod("summary", signature(object = "NaiveBayesModel"),
#' @export
#' @examples
#' \dontrun{
-#' model <- spark.kmeans(data, ~ ., k=2, initMode="random")
+#' model <- spark.kmeans(data, ~ ., k = 4, initMode = "random")
#' }
#' @note spark.kmeans since 2.0.0
setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula"),
- function(data, formula, k, maxIter = 10, initMode = c("random", "k-means||")) {
+ function(data, formula, k = 2, maxIter = 20, initMode = c("k-means||", "random")) {
formula <- paste(deparse(formula), collapse = "")
initMode <- match.arg(initMode)
jobj <- callJStatic("org.apache.spark.ml.r.KMeansWrapper", "fit", data@sdf, formula,
as.integer(k), as.integer(maxIter), initMode)
return(new("KMeansModel", jobj = jobj))
- })
+ })
#' Get fitted result from a k-means model
#'
@@ -397,7 +395,7 @@ setMethod("predict", signature(object = "KMeansModel"),
#' @param data SparkDataFrame for training
#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
#' operators are supported, including '~', '.', ':', '+', and '-'.
-#' @param laplace Smoothing parameter
+#' @param smoothing Smoothing parameter
#' @return a fitted naive Bayes model
#' @rdname spark.naiveBayes
#' @seealso e1071: \url{https://cran.r-project.org/web/packages/e1071/}
@@ -405,16 +403,16 @@ setMethod("predict", signature(object = "KMeansModel"),
#' @examples
#' \dontrun{
#' df <- createDataFrame(infert)
-#' model <- spark.naiveBayes(df, education ~ ., laplace = 0)
+#' model <- spark.naiveBayes(df, education ~ ., smoothing = 0)
#'}
#' @note spark.naiveBayes since 2.0.0
setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"),
- function(data, formula, laplace = 0, ...) {
- formula <- paste(deparse(formula), collapse = "")
- jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
- formula, data@sdf, laplace)
- return(new("NaiveBayesModel", jobj = jobj))
- })
+ function(data, formula, smoothing = 1.0, ...) {
+ formula <- paste(deparse(formula), collapse = "")
+ jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
+ formula, data@sdf, smoothing)
+ return(new("NaiveBayesModel", jobj = jobj))
+ })
#' Save fitted MLlib model to the input path
#'
@@ -431,7 +429,7 @@ setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "form
#' @examples
#' \dontrun{
#' df <- createDataFrame(infert)
-#' model <- spark.naiveBayes(df, education ~ ., laplace = 0)
+#' model <- spark.naiveBayes(df, education ~ ., smoothing = 0)
#' path <- "path/to/model"
#' write.ml(model, path)
#' }
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index c8c5ef2476..753da81760 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -288,7 +288,7 @@ test_that("spark.kmeans", {
take(training, 1)
- model <- spark.kmeans(data = training, ~ ., k = 2)
+ model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
sample <- take(select(predict(model, training), "prediction"), 1)
expect_equal(typeof(sample$prediction), "integer")
expect_equal(sample$prediction, 1)
@@ -363,7 +363,7 @@ test_that("spark.naiveBayes", {
t <- as.data.frame(Titanic)
t1 <- t[t$Freq > 0, -5]
df <- suppressWarnings(createDataFrame(t1))
- m <- spark.naiveBayes(df, Survived ~ .)
+ m <- spark.naiveBayes(df, Survived ~ ., smoothing = 0.0)
s <- summary(m)
expect_equal(as.double(s$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6)
expect_equal(sum(s$apriori), 1)