[SPARK-18476][SPARKR][ML] SparkR Logistic Regression should should support output original label.

## What changes were proposed in this pull request? Similar to SPARK-18401, as a classification algorithm, logistic regression should support output original label instead of supporting index label. In this PR, original label output is supported and test cases are modified and added. Document is also modified. ## How was this patch tested? Unit tests. Author: wm624@hotmail.com <wm624@hotmail.com> Closes #15910 from wangmiao1981/audit.
author: wm624@hotmail.com <wm624@hotmail.com> 2016-11-30 20:32:17 -0800
committer: Yanbo Liang <ybliang8@gmail.com> 2016-11-30 20:32:17 -0800
commit: 2eb6764fbb23553fc17772d8a4a1cad55ff7ba6e (patch)
tree: 8d33015f0e154aec683c7fa7bc0770526c37c64e /R/pkg/inst/tests/testthat
parent: 0a811210f809eb5b80eae14694d484d45b48b3f6 (diff)
download: spark-2eb6764fbb23553fc17772d8a4a1cad55ff7ba6e.tar.gz
spark-2eb6764fbb23553fc17772d8a4a1cad55ff7ba6e.tar.bz2
spark-2eb6764fbb23553fc17772d8a4a1cad55ff7ba6e.zip
1 files changed, 18 insertions, 8 deletions
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 467e00cf79..0553e704bd 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -646,30 +646,30 @@ test_that("spark.isotonicRegression", {
 
 test_that("spark.logit", {
   # test binary logistic regression
-  label <- c(1.0, 1.0, 1.0, 0.0, 0.0)
+  label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
   feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
   binary_data <- as.data.frame(cbind(label, feature))
   binary_df <- createDataFrame(binary_data)
 
   blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0)
   blr_predict <- collect(select(predict(blr_model, binary_df), "prediction"))
-  expect_equal(blr_predict$prediction, c(0, 0, 0, 0, 0))
+  expect_equal(blr_predict$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0"))
   blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0)
   blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction"))
-  expect_equal(blr_predict1$prediction, c(1, 1, 1, 1, 1))
+  expect_equal(blr_predict1$prediction, c("1.0", "1.0", "1.0", "1.0", "1.0"))
 
   # test summary of binary logistic regression
   blr_summary <- summary(blr_model)
   blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure"))
-  expect_equal(blr_fmeasure$threshold, c(0.8221347, 0.7884005, 0.6674709, 0.3785437, 0.3434487),
+  expect_equal(blr_fmeasure$threshold, c(0.6565513, 0.6214563, 0.3325291, 0.2115995, 0.1778653),
                tolerance = 1e-4)
-  expect_equal(blr_fmeasure$"F-Measure", c(0.5000000, 0.8000000, 0.6666667, 0.8571429, 0.7500000),
+  expect_equal(blr_fmeasure$"F-Measure", c(0.6666667, 0.5000000, 0.8000000, 0.6666667, 0.5714286),
                tolerance = 1e-4)
   blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision"))
-  expect_equal(blr_precision$precision, c(1.0000000, 1.0000000, 0.6666667, 0.7500000, 0.6000000),
+  expect_equal(blr_precision$precision, c(1.0000000, 0.5000000, 0.6666667, 0.5000000, 0.4000000),
                tolerance = 1e-4)
   blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall"))
-  expect_equal(blr_recall$recall, c(0.3333333, 0.6666667, 0.6666667, 1.0000000, 1.0000000),
+  expect_equal(blr_recall$recall, c(0.5000000, 0.5000000, 1.0000000, 1.0000000, 1.0000000),
                tolerance = 1e-4)
 
   # test model save and read
@@ -683,6 +683,16 @@ test_that("spark.logit", {
   expect_error(summary(blr_model2))
   unlink(modelPath)
 
+  # test prediction label as text
+  training <- suppressWarnings(createDataFrame(iris))
+  binomial_training <- training[training$Species %in% c("versicolor", "virginica"), ]
+  binomial_model <- spark.logit(binomial_training, Species ~ Sepal_Length + Sepal_Width)
+  prediction <- predict(binomial_model, binomial_training)
+  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
+  expected <- c("virginica", "virginica", "virginica", "versicolor", "virginica",
+                "versicolor", "virginica", "versicolor", "virginica", "versicolor")
+  expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)
+
   # test multinomial logistic regression
   label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
   feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
@@ -694,7 +704,7 @@ test_that("spark.logit", {
 
   model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1))
   predict1 <- collect(select(predict(model, df), "prediction"))
-  expect_equal(predict1$prediction, c(0, 0, 0, 0, 0))
+  expect_equal(predict1$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0"))
   # Summary of multinomial logistic regression is not implemented yet
   expect_error(summary(model))
 })
author	wm624@hotmail.com <wm624@hotmail.com>	2016-11-30 20:32:17 -0800
committer	Yanbo Liang <ybliang8@gmail.com>	2016-11-30 20:32:17 -0800
commit	2eb6764fbb23553fc17772d8a4a1cad55ff7ba6e (patch)
tree	8d33015f0e154aec683c7fa7bc0770526c37c64e /R/pkg/inst/tests/testthat
parent	0a811210f809eb5b80eae14694d484d45b48b3f6 (diff)
download	spark-2eb6764fbb23553fc17772d8a4a1cad55ff7ba6e.tar.gz spark-2eb6764fbb23553fc17772d8a4a1cad55ff7ba6e.tar.bz2 spark-2eb6764fbb23553fc17772d8a4a1cad55ff7ba6e.zip