From 29cea8f332aa3750f8ff7c3b9e705d107278da4b Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com"
Date: Wed, 26 Oct 2016 16:12:55 -0700
Subject: [SPARK-17157][SPARKR] Add multiclass logistic regression SparkR Wrapper

## What changes were proposed in this pull request?
As we discussed in #14818, I added a separate R wrapper spark.logit for logistic regression.

This single interface supports both binary and multinomial logistic regression. It also has "predict" and "summary" for binary logistic regression.

## How was this patch tested?

New unit tests are added.

Author: wm624@hotmail.com

Closes #15365 from wangmiao1981/glm.
---
 R/pkg/inst/tests/testthat/test_mllib.R | 55 ++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'R/pkg/inst/tests')

diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 33cc069f14..6d1fccc7c0 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -602,6 +602,61 @@ test_that("spark.isotonicRegression", {
   unlink(modelPath)
 })
 
+test_that("spark.logit", {
+  # Binary case: threshold 1.0 should yield all-0 predictions, threshold 0.0 all-1
+  label <- c(1.0, 1.0, 1.0, 0.0, 0.0)
+  feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
+  binary_data <- as.data.frame(cbind(label, feature))
+  binary_df <- createDataFrame(binary_data)
+
+  blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0)
+  blr_predict <- collect(select(predict(blr_model, binary_df), "prediction"))
+  expect_equal(blr_predict$prediction, c(0, 0, 0, 0, 0))
+  blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0)
+  blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction"))
+  expect_equal(blr_predict1$prediction, c(1, 1, 1, 1, 1))
+
+  # Summary of the binary model: per-threshold F-measure, precision and recall
+  blr_summary <- summary(blr_model)
+  blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure"))
+  expect_equal(blr_fmeasure$threshold, c(0.8221347, 0.7884005, 0.6674709, 0.3785437, 0.3434487),
+               tolerance = 1e-4)
+  expect_equal(blr_fmeasure$"F-Measure", c(0.5000000, 0.8000000, 0.6666667, 0.8571429, 0.7500000),
+               tolerance = 1e-4)
+  blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision"))
+  expect_equal(blr_precision$precision, c(1.0000000, 1.0000000, 0.6666667, 0.7500000, 0.6000000),
+               tolerance = 1e-4)
+  blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall"))
+  expect_equal(blr_recall$recall, c(0.3333333, 0.6666667, 0.6666667, 1.0000000, 1.0000000),
+               tolerance = 1e-4)
+
+  # Save/load round trip: rewriting needs overwrite = TRUE; a loaded model has no summary
+  modelPath <- tempfile(pattern = "spark-logisticRegression", fileext = ".tmp")
+  write.ml(blr_model, modelPath)
+  expect_error(write.ml(blr_model, modelPath))
+  write.ml(blr_model, modelPath, overwrite = TRUE)
+  blr_model2 <- read.ml(modelPath)
+  blr_predict2 <- collect(select(predict(blr_model2, binary_df), "prediction"))
+  expect_equal(blr_predict$prediction, blr_predict2$prediction)
+  expect_error(summary(blr_model2))
+  unlink(modelPath, recursive = TRUE)  # saved model is a directory; plain unlink() keeps it
+
+  # Multinomial case: thresholds c(0, 1, 1) should make class 0 the prediction for every row
+  label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
+  feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
+  feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
+  feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
+  feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
+  data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4))
+  df <- createDataFrame(data)
+
+  model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1))
+  predict1 <- collect(select(predict(model, df), "prediction"))
+  expect_equal(predict1$prediction, c(0, 0, 0, 0, 0))
+  # Summary of multinomial logistic regression is not implemented yet
+  expect_error(summary(model))
+})
+
 test_that("spark.gaussianMixture", {
   # R code to reproduce the result.
   # nolint start
-- 
cgit v1.2.3