aboutsummaryrefslogtreecommitdiff
path: root/R
diff options
context:
space:
mode:
authorXin Ren <iamshrek@126.com>2016-08-24 11:18:10 -0700
committerFelix Cheung <felixcheung@apache.org>2016-08-24 11:18:10 -0700
commit2fbdb606392631b1dff88ec86f388cc2559c28f5 (patch)
tree002050c92864378c0c65a5d6c449420c8d604170 /R
parentd2932a0e987132c694ed59515b7c77adaad052e6 (diff)
downloadspark-2fbdb606392631b1dff88ec86f388cc2559c28f5.tar.gz
spark-2fbdb606392631b1dff88ec86f388cc2559c28f5.tar.bz2
spark-2fbdb606392631b1dff88ec86f388cc2559c28f5.zip
[SPARK-16445][MLLIB][SPARKR] Multilayer Perceptron Classifier wrapper in SparkR
https://issues.apache.org/jira/browse/SPARK-16445 ## What changes were proposed in this pull request? Create Multilayer Perceptron Classifier wrapper in SparkR ## How was this patch tested? Tested manually on local machine Author: Xin Ren <iamshrek@126.com> Closes #14447 from keypointt/SPARK-16445.
Diffstat (limited to 'R')
-rw-r--r--R/pkg/NAMESPACE1
-rw-r--r--R/pkg/R/generics.R4
-rw-r--r--R/pkg/R/mllib.R125
-rw-r--r--R/pkg/inst/tests/testthat/test_mllib.R32
4 files changed, 157 insertions, 5 deletions
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 709057675e..ad587a6b7d 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -27,6 +27,7 @@ exportMethods("glm",
"summary",
"spark.kmeans",
"fitted",
+ "spark.mlp",
"spark.naiveBayes",
"spark.survreg",
"spark.lda",
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 88884e6257..7e626be508 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1330,6 +1330,10 @@ setGeneric("spark.kmeans", function(data, formula, ...) { standardGeneric("spark
#' @export
setGeneric("fitted")
+#' @rdname spark.mlp
+#' @export
+setGeneric("spark.mlp", function(data, ...) { standardGeneric("spark.mlp") })
+
#' @rdname spark.naiveBayes
#' @export
setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("spark.naiveBayes") })
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index a40310d194..a670600ca6 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -60,6 +60,13 @@ setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj"))
#' @note KMeansModel since 2.0.0
setClass("KMeansModel", representation(jobj = "jobj"))
+#' S4 class that represents a MultilayerPerceptronClassificationModel
+#'
+#' @param jobj a Java object reference to the backing Scala MultilayerPerceptronClassifierWrapper
+#' @export
+#' @note MultilayerPerceptronClassificationModel since 2.1.0
+setClass("MultilayerPerceptronClassificationModel", representation(jobj = "jobj"))
+
#' S4 class that represents an IsotonicRegressionModel
#'
#' @param jobj a Java object reference to the backing Scala IsotonicRegressionModel
@@ -90,7 +97,7 @@ setClass("ALSModel", representation(jobj = "jobj"))
#' @export
#' @seealso \link{spark.glm}, \link{glm},
#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans},
-#' @seealso \link{spark.lda}, \link{spark.naiveBayes}, \link{spark.survreg},
+#' @seealso \link{spark.lda}, \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg}
#' @seealso \link{read.ml}
NULL
@@ -103,7 +110,7 @@ NULL
#' @export
#' @seealso \link{spark.glm}, \link{glm},
#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans},
-#' @seealso \link{spark.naiveBayes}, \link{spark.survreg},
+#' @seealso \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg}
NULL
write_internal <- function(object, path, overwrite = FALSE) {
@@ -631,6 +638,95 @@ setMethod("predict", signature(object = "KMeansModel"),
predict_internal(object, newData)
})
+#' Multilayer Perceptron Classification Model
+#'
+#' \code{spark.mlp} fits a multi-layer perceptron neural network model against a SparkDataFrame.
+#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make
+#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
+#' Only categorical data is supported.
+#' For more details, see
+#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html}{
+#' Multilayer Perceptron}
+#'
+#' @param data a \code{SparkDataFrame} of observations and labels for model fitting.
+#' @param blockSize blockSize parameter.
+#' @param layers integer vector containing the number of nodes for each layer
+#' @param solver solver parameter, supported options: "gd" (minibatch gradient descent) or "l-bfgs".
+#' @param maxIter maximum iteration number.
+#' @param tol convergence tolerance of iterations.
+#' @param stepSize stepSize parameter.
+#' @param seed seed parameter for weights initialization.
+#' @param ... additional arguments passed to the method.
+#' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model.
+#' @rdname spark.mlp
+#' @aliases spark.mlp,SparkDataFrame-method
+#' @name spark.mlp
+#' @seealso \link{read.ml}
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm")
+#'
+#' # fit a Multilayer Perceptron Classification Model
+#' model <- spark.mlp(df, blockSize = 128, layers = c(4, 5, 4, 3), solver = "l-bfgs",
+#' maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)
+#'
+#' # get the summary of the model
+#' summary(model)
+#'
+#' # make predictions
+#' predictions <- predict(model, df)
+#'
+#' # save and load the model
+#' path <- "path/to/model"
+#' write.ml(model, path)
+#' savedModel <- read.ml(path)
+#' summary(savedModel)
+#' }
+#' @note spark.mlp since 2.1.0
+setMethod("spark.mlp", signature(data = "SparkDataFrame"),
+ function(data, blockSize = 128, layers = c(3, 5, 2), solver = "l-bfgs", maxIter = 100,
+ tol = 0.5, stepSize = 1, seed = 1) {
+ jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper",
+ "fit", data@sdf, as.integer(blockSize), as.array(layers),
+ as.character(solver), as.integer(maxIter), as.numeric(tol),
+ as.numeric(stepSize), as.integer(seed))
+ new("MultilayerPerceptronClassificationModel", jobj = jobj)
+ })
+
+# Makes predictions from a model produced by spark.mlp().
+
+#' @param newData a SparkDataFrame for testing.
+#' @return \code{predict} returns a SparkDataFrame containing predicted labeled in a column named
+#' "prediction".
+#' @rdname spark.mlp
+#' @aliases predict,MultilayerPerceptronClassificationModel-method
+#' @export
+#' @note predict(MultilayerPerceptronClassificationModel) since 2.1.0
+setMethod("predict", signature(object = "MultilayerPerceptronClassificationModel"),
+ function(object, newData) {
+ predict_internal(object, newData)
+ })
+
+# Returns the summary of a Multilayer Perceptron Classification Model produced by \code{spark.mlp}
+
+#' @param object a Multilayer Perceptron Classification Model fitted by \code{spark.mlp}
+#' @return \code{summary} returns a list containing \code{layers}, the label distribution, and
+#' \code{tables}, conditional probabilities given the target label.
+#' @rdname spark.mlp
+#' @export
+#' @aliases summary,MultilayerPerceptronClassificationModel-method
+#' @note summary(MultilayerPerceptronClassificationModel) since 2.1.0
+setMethod("summary", signature(object = "MultilayerPerceptronClassificationModel"),
+ function(object) {
+ jobj <- object@jobj
+ labelCount <- callJMethod(jobj, "labelCount")
+ layers <- unlist(callJMethod(jobj, "layers"))
+ weights <- callJMethod(jobj, "weights")
+ weights <- matrix(weights, nrow = length(weights))
+ list(labelCount = labelCount, layers = layers, weights = weights)
+ })
+
#' Naive Bayes Models
#'
#' \code{spark.naiveBayes} fits a Bernoulli naive Bayes model against a SparkDataFrame.
@@ -685,7 +781,7 @@ setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "form
#'
#' @rdname spark.naiveBayes
#' @export
-#' @seealso \link{read.ml}
+#' @seealso \link{write.ml}
#' @note write.ml(NaiveBayesModel, character) since 2.0.0
setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"),
function(object, path, overwrite = FALSE) {
@@ -700,7 +796,7 @@ setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"),
#' @rdname spark.survreg
#' @export
#' @note write.ml(AFTSurvivalRegressionModel, character) since 2.0.0
-#' @seealso \link{read.ml}
+#' @seealso \link{write.ml}
setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = "character"),
function(object, path, overwrite = FALSE) {
write_internal(object, path, overwrite)
@@ -734,6 +830,23 @@ setMethod("write.ml", signature(object = "KMeansModel", path = "character"),
write_internal(object, path, overwrite)
})
+# Saves the Multilayer Perceptron Classification Model to the input path.
+
+#' @param path the directory where the model is saved.
+#' @param overwrite overwrites or not if the output path already exists. Default is FALSE
+#' which means throw exception if the output path exists.
+#'
+#' @rdname spark.mlp
+#' @aliases write.ml,MultilayerPerceptronClassificationModel,character-method
+#' @export
+#' @seealso \link{write.ml}
+#' @note write.ml(MultilayerPerceptronClassificationModel, character) since 2.1.0
+setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationModel",
+ path = "character"),
+ function(object, path, overwrite = FALSE) {
+ write_internal(object, path, overwrite)
+ })
+
# Save fitted IsotonicRegressionModel to the input path
#' @param path The directory where the model is saved
@@ -791,6 +904,8 @@ read.ml <- function(path) {
new("KMeansModel", jobj = jobj)
} else if (isInstanceOf(jobj, "org.apache.spark.ml.r.LDAWrapper")) {
new("LDAModel", jobj = jobj)
+ } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper")) {
+ new("MultilayerPerceptronClassificationModel", jobj = jobj)
} else if (isInstanceOf(jobj, "org.apache.spark.ml.r.IsotonicRegressionWrapper")) {
new("IsotonicRegressionModel", jobj = jobj)
} else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GaussianMixtureWrapper")) {
@@ -798,7 +913,7 @@ read.ml <- function(path) {
} else if (isInstanceOf(jobj, "org.apache.spark.ml.r.ALSWrapper")) {
new("ALSModel", jobj = jobj)
} else {
- stop(paste("Unsupported model: ", jobj))
+ stop("Unsupported model: ", jobj)
}
}
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index de9bd48662..1e6da650d1 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -347,6 +347,38 @@ test_that("spark.kmeans", {
unlink(modelPath)
})
+test_that("spark.mlp", {
+ df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm")
+ model <- spark.mlp(df, blockSize = 128, layers = c(4, 5, 4, 3), solver = "l-bfgs", maxIter = 100,
+ tol = 0.5, stepSize = 1, seed = 1)
+
+ # Test summary method
+ summary <- summary(model)
+ expect_equal(summary$labelCount, 3)
+ expect_equal(summary$layers, c(4, 5, 4, 3))
+ expect_equal(length(summary$weights), 64)
+
+ # Test predict method
+ mlpTestDF <- df
+ mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
+ expect_equal(head(mlpPredictions$prediction, 6), c(0, 1, 1, 1, 1, 1))
+
+ # Test model save/load
+ modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ summary2 <- summary(model2)
+
+ expect_equal(summary2$labelCount, 3)
+ expect_equal(summary2$layers, c(4, 5, 4, 3))
+ expect_equal(length(summary2$weights), 64)
+
+ unlink(modelPath)
+
+})
+
test_that("spark.naiveBayes", {
# R code to reproduce the result.
# We do not support instance weights yet. So we ignore the frequencies.