[SPARK-17157][SPARKR] Add multiclass logistic regression SparkR Wrapper

## What changes were proposed in this pull request?

As discussed in #14818, this adds a separate R wrapper, spark.logit, for logistic regression. This single interface supports both binary and multinomial logistic regression. It also provides "predict" and "summary" for binary logistic regression.

## How was this patch tested?

New unit tests were added.

Author: wm624@hotmail.com <wm624@hotmail.com>

Closes #15365 from wangmiao1981/glm.
author: wm624@hotmail.com <wm624@hotmail.com> 2016-10-26 16:12:55 -0700
committer: Felix Cheung <felixcheung@apache.org> 2016-10-26 16:12:55 -0700
commit: 29cea8f332aa3750f8ff7c3b9e705d107278da4b (patch)
tree: e1a0d66e0e5a50cb5cdf5d6c1cdd7357c30c9409 /R/pkg/inst/tests
parent: 5b7d403c1819c32a6a5b87d470f8de1a8ad7a987 (diff)
download: spark-29cea8f332aa3750f8ff7c3b9e705d107278da4b.tar.gz
spark-29cea8f332aa3750f8ff7c3b9e705d107278da4b.tar.bz2
spark-29cea8f332aa3750f8ff7c3b9e705d107278da4b.zip
1 files changed, 55 insertions, 0 deletions
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 33cc069f14..6d1fccc7c0 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -602,6 +602,61 @@ test_that("spark.isotonicRegression", {
   unlink(modelPath)
 })
 
+test_that("spark.logit", {
+  # Binary case: thresholds = 1.0 should yield all-0 predictions, thresholds = 0.0 all-1
+  label <- c(1.0, 1.0, 1.0, 0.0, 0.0)
+  feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
+  binary_data <- as.data.frame(cbind(label, feature))
+  binary_df <- createDataFrame(binary_data)
+
+  blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0)
+  blr_predict <- collect(select(predict(blr_model, binary_df), "prediction"))
+  expect_equal(blr_predict$prediction, c(0, 0, 0, 0, 0))
+  blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0)
+  blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction"))
+  expect_equal(blr_predict1$prediction, c(1, 1, 1, 1, 1))
+
+  # Summary of the binary model: per-threshold F-measure, precision and recall (tolerance 1e-4)
+  blr_summary <- summary(blr_model)
+  blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure"))
+  expect_equal(blr_fmeasure$threshold, c(0.8221347, 0.7884005, 0.6674709, 0.3785437, 0.3434487),
+               tolerance = 1e-4)
+  expect_equal(blr_fmeasure$"F-Measure", c(0.5000000, 0.8000000, 0.6666667, 0.8571429, 0.7500000),
+               tolerance = 1e-4)
+  blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision"))
+  expect_equal(blr_precision$precision, c(1.0000000, 1.0000000, 0.6666667, 0.7500000, 0.6000000),
+               tolerance = 1e-4)
+  blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall"))
+  expect_equal(blr_recall$recall, c(0.3333333, 0.6666667, 0.6666667, 1.0000000, 1.0000000),
+               tolerance = 1e-4)
+
+  # Save/load: rewriting a path needs overwrite = TRUE; summary() of a reloaded model errors
+  modelPath <- tempfile(pattern = "spark-logisticRegression", fileext = ".tmp")
+  write.ml(blr_model, modelPath)
+  expect_error(write.ml(blr_model, modelPath))
+  write.ml(blr_model, modelPath, overwrite = TRUE)
+  blr_model2 <- read.ml(modelPath)
+  blr_predict2 <- collect(select(predict(blr_model2, binary_df), "prediction"))
+  expect_equal(blr_predict$prediction, blr_predict2$prediction)
+  expect_error(summary(blr_model2))
+  unlink(modelPath)
+
+  # Multinomial case (labels 0/1/2): thresholds c(0, 1, 1) should predict 0 for every row
+  label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
+  feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
+  feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
+  feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
+  feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
+  data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4))
+  df <- createDataFrame(data)
+
+  model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1))
+  predict1 <- collect(select(predict(model, df), "prediction"))
+  expect_equal(predict1$prediction, c(0, 0, 0, 0, 0))
+  # Summary of multinomial logistic regression is not implemented yet
+  expect_error(summary(model))
+})
+
 test_that("spark.gaussianMixture", {
   # R code to reproduce the result.
   # nolint start
author	wm624@hotmail.com <wm624@hotmail.com>	2016-10-26 16:12:55 -0700
committer	Felix Cheung <felixcheung@apache.org>	2016-10-26 16:12:55 -0700
commit	29cea8f332aa3750f8ff7c3b9e705d107278da4b (patch)
tree	e1a0d66e0e5a50cb5cdf5d6c1cdd7357c30c9409 /R/pkg/inst/tests
parent	5b7d403c1819c32a6a5b87d470f8de1a8ad7a987 (diff)
download	spark-29cea8f332aa3750f8ff7c3b9e705d107278da4b.tar.gz spark-29cea8f332aa3750f8ff7c3b9e705d107278da4b.tar.bz2 spark-29cea8f332aa3750f8ff7c3b9e705d107278da4b.zip