aboutsummaryrefslogtreecommitdiff
path: root/R/pkg/inst/tests
diff options
context:
space:
mode:
author wm624@hotmail.com <wm624@hotmail.com> 2016-10-26 16:12:55 -0700
committer Felix Cheung <felixcheung@apache.org> 2016-10-26 16:12:55 -0700
commit 29cea8f332aa3750f8ff7c3b9e705d107278da4b (patch)
tree e1a0d66e0e5a50cb5cdf5d6c1cdd7357c30c9409 /R/pkg/inst/tests
parent 5b7d403c1819c32a6a5b87d470f8de1a8ad7a987 (diff)
download spark-29cea8f332aa3750f8ff7c3b9e705d107278da4b.tar.gz
spark-29cea8f332aa3750f8ff7c3b9e705d107278da4b.tar.bz2
spark-29cea8f332aa3750f8ff7c3b9e705d107278da4b.zip
[SPARK-17157][SPARKR] Add multiclass logistic regression SparkR Wrapper
## What changes were proposed in this pull request?

As we discussed in #14818, I added a separate R wrapper, spark.logit, for logistic regression. This single interface supports both binary and multinomial logistic regression. It also provides "predict" and "summary" for binary logistic regression.

## How was this patch tested?

New unit tests were added.

Author: wm624@hotmail.com <wm624@hotmail.com>

Closes #15365 from wangmiao1981/glm.
Diffstat (limited to 'R/pkg/inst/tests')
-rw-r--r-- R/pkg/inst/tests/testthat/test_mllib.R 55
1 files changed, 55 insertions, 0 deletions
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 33cc069f14..6d1fccc7c0 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -602,6 +602,61 @@ test_that("spark.isotonicRegression", {
unlink(modelPath)
})
+test_that("spark.logit", {
+ # test binary logistic regression; thresholds 1.0 / 0.0 should force all-0 / all-1 predictions
+ label <- c(1.0, 1.0, 1.0, 0.0, 0.0)
+ feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
+ binary_data <- as.data.frame(cbind(label, feature))
+ binary_df <- createDataFrame(binary_data)
+
+ blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0)
+ blr_predict <- collect(select(predict(blr_model, binary_df), "prediction"))
+ expect_equal(blr_predict$prediction, c(0, 0, 0, 0, 0))
+ blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0)
+ blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction"))
+ expect_equal(blr_predict1$prediction, c(1, 1, 1, 1, 1))
+
+ # test summary of binary logistic regression (expected values compared with 1e-4 tolerance)
+ blr_summary <- summary(blr_model)
+ blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure"))
+ expect_equal(blr_fmeasure$threshold, c(0.8221347, 0.7884005, 0.6674709, 0.3785437, 0.3434487),
+ tolerance = 1e-4)
+ expect_equal(blr_fmeasure$"F-Measure", c(0.5000000, 0.8000000, 0.6666667, 0.8571429, 0.7500000),
+ tolerance = 1e-4)
+ blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision"))
+ expect_equal(blr_precision$precision, c(1.0000000, 1.0000000, 0.6666667, 0.7500000, 0.6000000),
+ tolerance = 1e-4)
+ blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall"))
+ expect_equal(blr_recall$recall, c(0.3333333, 0.6666667, 0.6666667, 1.0000000, 1.0000000),
+ tolerance = 1e-4)
+
+ # test model save and read; a second write without overwrite = TRUE is expected to fail
+ modelPath <- tempfile(pattern = "spark-logisticRegression", fileext = ".tmp")
+ write.ml(blr_model, modelPath)
+ expect_error(write.ml(blr_model, modelPath))
+ write.ml(blr_model, modelPath, overwrite = TRUE)
+ blr_model2 <- read.ml(modelPath)
+ blr_predict2 <- collect(select(predict(blr_model2, binary_df), "prediction"))
+ expect_equal(blr_predict$prediction, blr_predict2$prediction)
+ expect_error(summary(blr_model2)) # summary is expected to fail for a model read back from disk
+ unlink(modelPath, recursive = TRUE) # also removes modelPath if it is a directory
+
+ # test multinomial logistic regression; thresholds c(0, 1, 1) should force class 0 everywhere
+ label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
+ feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
+ feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
+ feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
+ feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
+ data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4))
+ df <- createDataFrame(data)
+
+ model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1))
+ predict1 <- collect(select(predict(model, df), "prediction"))
+ expect_equal(predict1$prediction, c(0, 0, 0, 0, 0))
+ # Summary of multinomial logistic regression is not implemented yet
+ expect_error(summary(model))
+})
+
test_that("spark.gaussianMixture", {
# R code to reproduce the result.
# nolint start