author    Yanbo Liang <ybliang8@gmail.com>  2016-11-16 01:04:18 -0800
committer Yanbo Liang <ybliang8@gmail.com>  2016-11-16 01:04:18 -0800
commit    95eb06bd7d0f7110ef62c8d1cb6337c72b10d99f (patch)
tree      24610489c5c3c41a4f6e29da144abcc28dcbdfc0 /R/pkg
parent    4ac9759f807d217b6f67badc6d5f6b7138eb92d2 (diff)
[SPARK-18438][SPARKR][ML] spark.mlp should support RFormula.
## What changes were proposed in this pull request?

```spark.mlp``` should support ```RFormula``` like the other ML algorithm wrappers. This patch also includes some cleanup and improvement of ```spark.mlp```.

## How was this patch tested?

Unit tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #15883 from yanboliang/spark-18438.
Diffstat (limited to 'R/pkg')
-rw-r--r--  R/pkg/R/generics.R                       2
-rw-r--r--  R/pkg/R/mllib.R                         30
-rw-r--r--  R/pkg/inst/tests/testthat/test_mllib.R  63
3 files changed, 62 insertions(+), 33 deletions(-)
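In practice, the change means `spark.mlp` is now invoked with a formula as its second argument, in line with the other SparkR wrappers. A minimal usage sketch, assuming a running SparkR session; the dataset path, formula, and hyperparameters are taken from the updated roxygen example in this diff:

```r
library(SparkR)

# Assumes Spark is installed locally and SPARK_HOME is configured.
sparkR.session()

df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm")

# After this patch, a formula is required, as with the other SparkR ML wrappers:
model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3),
                   solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)

# Predictions now come back as string labels ("0.0", "1.0", "2.0"), per the tests below.
head(collect(select(predict(model, df), "prediction")))
```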
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 7653ca7bcc..499c7b279e 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1373,7 +1373,7 @@ setGeneric("spark.logit", function(data, formula, ...) { standardGeneric("spark.
#' @rdname spark.mlp
#' @export
-setGeneric("spark.mlp", function(data, ...) { standardGeneric("spark.mlp") })
+setGeneric("spark.mlp", function(data, formula, ...) { standardGeneric("spark.mlp") })
#' @rdname spark.naiveBayes
#' @export
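The generics.R change adds `formula` to the generic's formals, which is what allows the mllib.R method below to dispatch on it. For context, a self-contained toy outside the Spark codebase illustrating the mechanism; `fitDemo` is a hypothetical name, not part of SparkR:

```r
library(methods)

# A formal must appear in the generic's signature before setMethod()
# can include it in the dispatch signature.
setGeneric("fitDemo", function(data, formula, ...) { standardGeneric("fitDemo") })

setMethod("fitDemo", signature(data = "data.frame", formula = "formula"),
          function(data, formula, ...) {
            # The same normalization the patch applies before handing off to the JVM:
            paste(deparse(formula), collapse = "")
          })

fitDemo(mtcars, mpg ~ wt + hp)  # "mpg ~ wt + hp"
```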
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 1065b4b37d..265e64e746 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -525,7 +525,7 @@ setMethod("write.ml", signature(object = "LDAModel", path = "character"),
#' @note spark.isoreg since 2.1.0
setMethod("spark.isoreg", signature(data = "SparkDataFrame", formula = "formula"),
function(data, formula, isotonic = TRUE, featureIndex = 0, weightCol = NULL) {
- formula <- paste0(deparse(formula), collapse = "")
+ formula <- paste(deparse(formula), collapse = "")
if (is.null(weightCol)) {
weightCol <- ""
@@ -775,7 +775,7 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
tol = 1E-6, fitIntercept = TRUE, family = "auto", standardization = TRUE,
thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
probabilityCol = "probability") {
- formula <- paste0(deparse(formula), collapse = "")
+ formula <- paste(deparse(formula), collapse = "")
if (is.null(weightCol)) {
weightCol <- ""
@@ -858,6 +858,8 @@ setMethod("summary", signature(object = "LogisticRegressionModel"),
#' Multilayer Perceptron}
#'
#' @param data a \code{SparkDataFrame} of observations and labels for model fitting.
+#' @param formula a symbolic description of the model to be fitted. Currently only a few formula
+#' operators are supported, including '~', '.', ':', '+', and '-'.
#' @param blockSize blockSize parameter.
#' @param layers integer vector containing the number of nodes for each layer
#' @param solver solver parameter, supported options: "gd" (minibatch gradient descent) or "l-bfgs".
@@ -870,7 +872,7 @@ setMethod("summary", signature(object = "LogisticRegressionModel"),
#' @param ... additional arguments passed to the method.
#' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model.
#' @rdname spark.mlp
-#' @aliases spark.mlp,SparkDataFrame-method
+#' @aliases spark.mlp,SparkDataFrame,formula-method
#' @name spark.mlp
#' @seealso \link{read.ml}
#' @export
@@ -879,7 +881,7 @@ setMethod("summary", signature(object = "LogisticRegressionModel"),
#' df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm")
#'
#' # fit a Multilayer Perceptron Classification Model
-#' model <- spark.mlp(df, blockSize = 128, layers = c(4, 3), solver = "l-bfgs",
+#' model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 3), solver = "l-bfgs",
#' maxIter = 100, tol = 0.5, stepSize = 1, seed = 1,
#' initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
#'
@@ -896,9 +898,10 @@ setMethod("summary", signature(object = "LogisticRegressionModel"),
#' summary(savedModel)
#' }
#' @note spark.mlp since 2.1.0
-setMethod("spark.mlp", signature(data = "SparkDataFrame"),
- function(data, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100,
+setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"),
+ function(data, formula, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100,
tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL) {
+ formula <- paste(deparse(formula), collapse = "")
if (is.null(layers)) {
stop ("layers must be a integer vector with length > 1.")
}
@@ -913,7 +916,7 @@ setMethod("spark.mlp", signature(data = "SparkDataFrame"),
initialWeights <- as.array(as.numeric(na.omit(initialWeights)))
}
jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper",
- "fit", data@sdf, as.integer(blockSize), as.array(layers),
+ "fit", data@sdf, formula, as.integer(blockSize), as.array(layers),
as.character(solver), as.integer(maxIter), as.numeric(tol),
as.numeric(stepSize), seed, initialWeights)
new("MultilayerPerceptronClassificationModel", jobj = jobj)
@@ -936,9 +939,10 @@ setMethod("predict", signature(object = "MultilayerPerceptronClassificationModel
# Returns the summary of a Multilayer Perceptron Classification Model produced by \code{spark.mlp}
#' @param object a Multilayer Perceptron Classification Model fitted by \code{spark.mlp}
-#' @return \code{summary} returns a list containing \code{labelCount}, \code{layers}, and
-#' \code{weights}. For \code{weights}, it is a numeric vector with length equal to
-#' the expected given the architecture (i.e., for 8-10-2 network, 100 connection weights).
+#' @return \code{summary} returns a list containing \code{numOfInputs}, \code{numOfOutputs},
+#' \code{layers}, and \code{weights}. For \code{weights}, it is a numeric vector with
+#' length equal to the expected given the architecture (i.e., for 8-10-2 network,
+#' 112 connection weights).
#' @rdname spark.mlp
#' @export
#' @aliases summary,MultilayerPerceptronClassificationModel-method
@@ -946,10 +950,12 @@ setMethod("predict", signature(object = "MultilayerPerceptronClassificationModel
setMethod("summary", signature(object = "MultilayerPerceptronClassificationModel"),
function(object) {
jobj <- object@jobj
- labelCount <- callJMethod(jobj, "labelCount")
layers <- unlist(callJMethod(jobj, "layers"))
+ numOfInputs <- head(layers, n = 1)
+ numOfOutputs <- tail(layers, n = 1)
weights <- callJMethod(jobj, "weights")
- list(labelCount = labelCount, layers = layers, weights = weights)
+ list(numOfInputs = numOfInputs, numOfOutputs = numOfOutputs,
+ layers = layers, weights = weights)
})
#' Naive Bayes Models
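For reference, a short sketch of consuming the reworked `summary` output after this change; `model` is assumed to be a fit from `spark.mlp` as in the example above, and the field names and sample values mirror the tests below:

```r
summary <- summary(model)
summary$numOfInputs      # first element of layers, e.g. 4
summary$numOfOutputs     # last element of layers, e.g. 3
summary$layers           # e.g. c(4, 5, 4, 3)
length(summary$weights)  # determined by the architecture; see the note at the end
```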
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 07df4b6d6f..2a97a51cfa 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -371,12 +371,13 @@ test_that("spark.kmeans", {
test_that("spark.mlp", {
df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
source = "libsvm")
- model <- spark.mlp(df, blockSize = 128, layers = c(4, 5, 4, 3), solver = "l-bfgs", maxIter = 100,
- tol = 0.5, stepSize = 1, seed = 1)
+ model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3),
+ solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)
# Test summary method
summary <- summary(model)
- expect_equal(summary$labelCount, 3)
+ expect_equal(summary$numOfInputs, 4)
+ expect_equal(summary$numOfOutputs, 3)
expect_equal(summary$layers, c(4, 5, 4, 3))
expect_equal(length(summary$weights), 64)
expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825),
@@ -385,7 +386,7 @@ test_that("spark.mlp", {
# Test predict method
mlpTestDF <- df
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 6), c(0, 1, 1, 1, 1, 1))
+ expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0"))
# Test model save/load
modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp")
@@ -395,46 +396,68 @@ test_that("spark.mlp", {
model2 <- read.ml(modelPath)
summary2 <- summary(model2)
- expect_equal(summary2$labelCount, 3)
+ expect_equal(summary2$numOfInputs, 4)
+ expect_equal(summary2$numOfOutputs, 3)
expect_equal(summary2$layers, c(4, 5, 4, 3))
expect_equal(length(summary2$weights), 64)
unlink(modelPath)
# Test default parameter
- model <- spark.mlp(df, layers = c(4, 5, 4, 3))
+ model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3))
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 10), c(1, 1, 1, 1, 0, 1, 2, 2, 1, 0))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
# Test illegal parameter
- expect_error(spark.mlp(df, layers = NULL), "layers must be a integer vector with length > 1.")
- expect_error(spark.mlp(df, layers = c()), "layers must be a integer vector with length > 1.")
- expect_error(spark.mlp(df, layers = c(3)), "layers must be a integer vector with length > 1.")
+ expect_error(spark.mlp(df, label ~ features, layers = NULL),
+ "layers must be a integer vector with length > 1.")
+ expect_error(spark.mlp(df, label ~ features, layers = c()),
+ "layers must be a integer vector with length > 1.")
+ expect_error(spark.mlp(df, label ~ features, layers = c(3)),
+ "layers must be a integer vector with length > 1.")
# Test random seed
# default seed
- model <- spark.mlp(df, layers = c(4, 5, 4, 3), maxIter = 10)
+ model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10)
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 2, 2, 1, 2, 0, 1))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
# seed equals 10
- model <- spark.mlp(df, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10)
+ model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10)
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 2, 1, 2, 2, 1, 0, 0, 1))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
# test initialWeights
- model <- spark.mlp(df, layers = c(4, 3), maxIter = 2, initialWeights =
+ model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights =
c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
- model <- spark.mlp(df, layers = c(4, 3), maxIter = 2, initialWeights =
+ model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights =
c(0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 5.0, 5.0, 5.0, 9.0, 9.0, 9.0, 9.0, 9.0))
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
- model <- spark.mlp(df, layers = c(4, 3), maxIter = 2)
+ model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2)
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 0, 2, 1, 0, 0, 1))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "0.0", "2.0", "1.0", "0.0"))
+
+ # Test formula works well
+ df <- suppressWarnings(createDataFrame(iris))
+ model <- spark.mlp(df, Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width,
+ layers = c(4, 3))
+ summary <- summary(model)
+ expect_equal(summary$numOfInputs, 4)
+ expect_equal(summary$numOfOutputs, 3)
+ expect_equal(summary$layers, c(4, 3))
+ expect_equal(length(summary$weights), 15)
+ expect_equal(head(summary$weights, 5), list(-1.1957257, -5.2693685, 7.4489734, -6.3751413,
+ -10.2376130), tolerance = 1e-6)
})
test_that("spark.naiveBayes", {