diff options
author | wm624@hotmail.com <wm624@hotmail.com> | 2016-11-30 20:32:17 -0800 |
---|---|---|
committer | Yanbo Liang <ybliang8@gmail.com> | 2016-11-30 20:32:17 -0800 |
commit | 2eb6764fbb23553fc17772d8a4a1cad55ff7ba6e (patch) | |
tree | 8d33015f0e154aec683c7fa7bc0770526c37c64e /R/pkg/inst/tests/testthat | |
parent | 0a811210f809eb5b80eae14694d484d45b48b3f6 (diff) | |
download | spark-2eb6764fbb23553fc17772d8a4a1cad55ff7ba6e.tar.gz spark-2eb6764fbb23553fc17772d8a4a1cad55ff7ba6e.tar.bz2 spark-2eb6764fbb23553fc17772d8a4a1cad55ff7ba6e.zip |
[SPARK-18476][SPARKR][ML] SparkR Logistic Regression should support outputting the original label.
## What changes were proposed in this pull request?
Similar to SPARK-18401, logistic regression, as a classification algorithm, should support outputting the original label instead of the indexed label.
In this PR, original-label output is supported, and test cases are modified and added. The documentation is also modified.
## How was this patch tested?
Unit tests.
Author: wm624@hotmail.com <wm624@hotmail.com>
Closes #15910 from wangmiao1981/audit.
Diffstat (limited to 'R/pkg/inst/tests/testthat')
-rw-r--r-- | R/pkg/inst/tests/testthat/test_mllib.R | 26 |
1 files changed, 18 insertions, 8 deletions
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 467e00cf79..0553e704bd 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -646,30 +646,30 @@ test_that("spark.isotonicRegression", { test_that("spark.logit", { # test binary logistic regression - label <- c(1.0, 1.0, 1.0, 0.0, 0.0) + label <- c(0.0, 0.0, 0.0, 1.0, 1.0) feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) binary_data <- as.data.frame(cbind(label, feature)) binary_df <- createDataFrame(binary_data) blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0) blr_predict <- collect(select(predict(blr_model, binary_df), "prediction")) - expect_equal(blr_predict$prediction, c(0, 0, 0, 0, 0)) + expect_equal(blr_predict$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0")) blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0) blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction")) - expect_equal(blr_predict1$prediction, c(1, 1, 1, 1, 1)) + expect_equal(blr_predict1$prediction, c("1.0", "1.0", "1.0", "1.0", "1.0")) # test summary of binary logistic regression blr_summary <- summary(blr_model) blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure")) - expect_equal(blr_fmeasure$threshold, c(0.8221347, 0.7884005, 0.6674709, 0.3785437, 0.3434487), + expect_equal(blr_fmeasure$threshold, c(0.6565513, 0.6214563, 0.3325291, 0.2115995, 0.1778653), tolerance = 1e-4) - expect_equal(blr_fmeasure$"F-Measure", c(0.5000000, 0.8000000, 0.6666667, 0.8571429, 0.7500000), + expect_equal(blr_fmeasure$"F-Measure", c(0.6666667, 0.5000000, 0.8000000, 0.6666667, 0.5714286), tolerance = 1e-4) blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision")) - expect_equal(blr_precision$precision, c(1.0000000, 1.0000000, 0.6666667, 0.7500000, 0.6000000), + expect_equal(blr_precision$precision, c(1.0000000, 
0.5000000, 0.6666667, 0.5000000, 0.4000000), tolerance = 1e-4) blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall")) - expect_equal(blr_recall$recall, c(0.3333333, 0.6666667, 0.6666667, 1.0000000, 1.0000000), + expect_equal(blr_recall$recall, c(0.5000000, 0.5000000, 1.0000000, 1.0000000, 1.0000000), tolerance = 1e-4) # test model save and read @@ -683,6 +683,16 @@ test_that("spark.logit", { expect_error(summary(blr_model2)) unlink(modelPath) + # test prediction label as text + training <- suppressWarnings(createDataFrame(iris)) + binomial_training <- training[training$Species %in% c("versicolor", "virginica"), ] + binomial_model <- spark.logit(binomial_training, Species ~ Sepal_Length + Sepal_Width) + prediction <- predict(binomial_model, binomial_training) + expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character") + expected <- c("virginica", "virginica", "virginica", "versicolor", "virginica", + "versicolor", "virginica", "versicolor", "virginica", "versicolor") + expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected) + # test multinomial logistic regression label <- c(0.0, 1.0, 2.0, 0.0, 0.0) feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667) @@ -694,7 +704,7 @@ test_that("spark.logit", { model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1)) predict1 <- collect(select(predict(model, df), "prediction")) - expect_equal(predict1$prediction, c(0, 0, 0, 0, 0)) + expect_equal(predict1$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0")) # Summary of multinomial logistic regression is not implemented yet expect_error(summary(model)) }) |