aboutsummaryrefslogtreecommitdiff
path: root/R/pkg/inst/tests
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2016-04-15 08:23:51 -0700
committerXiangrui Meng <meng@databricks.com>2016-04-15 08:23:51 -0700
commit83af297ac42546580983f91079f74e3a4cf25050 (patch)
treeabaa00d9f381bcd4fa4adae7a2bc79b54ad325b4 /R/pkg/inst/tests
parent06b9d623e8f58d7bd450a50d938f83b4b3472a32 (diff)
downloadspark-83af297ac42546580983f91079f74e3a4cf25050.tar.gz
spark-83af297ac42546580983f91079f74e3a4cf25050.tar.bz2
spark-83af297ac42546580983f91079f74e3a4cf25050.zip
[SPARK-13925][ML][SPARKR] Expose R-like summary statistics in SparkR::glm for more family and link functions
## What changes were proposed in this pull request? Expose R-like summary statistics in SparkR::glm for more family and link functions. Note: Not all values in R [summary.glm](http://stat.ethz.ch/R-manual/R-patched/library/stats/html/summary.glm.html) are exposed, we only provide the most commonly used statistics in this PR. More statistics can be added in the followup work. ## How was this patch tested? Unit tests. SparkR Output: ``` Deviance Residuals: (Note: These are approximate quantiles with relative error <= 0.01) Min 1Q Median 3Q Max -0.95096 -0.16585 -0.00232 0.17410 0.72918 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.6765 0.23536 7.1231 4.4561e-11 Sepal_Length 0.34988 0.046301 7.5566 4.1873e-12 Species_versicolor -0.98339 0.072075 -13.644 0 Species_virginica -1.0075 0.093306 -10.798 0 (Dispersion parameter for gaussian family taken to be 0.08351462) Null deviance: 28.307 on 149 degrees of freedom Residual deviance: 12.193 on 146 degrees of freedom AIC: 59.22 Number of Fisher Scoring iterations: 1 ``` R output: ``` Deviance Residuals: Min 1Q Median 3Q Max -0.95096 -0.16522 0.00171 0.18416 0.72918 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.67650 0.23536 7.123 4.46e-11 *** Sepal.Length 0.34988 0.04630 7.557 4.19e-12 *** Speciesversicolor -0.98339 0.07207 -13.644 < 2e-16 *** Speciesvirginica -1.00751 0.09331 -10.798 < 2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for gaussian family taken to be 0.08351462) Null deviance: 28.307 on 149 degrees of freedom Residual deviance: 12.193 on 146 degrees of freedom AIC: 59.217 Number of Fisher Scoring iterations: 2 ``` cc mengxr Author: Yanbo Liang <ybliang8@gmail.com> Closes #12393 from yanboliang/spark-13925.
Diffstat (limited to 'R/pkg/inst/tests')
-rw-r--r--R/pkg/inst/tests/testthat/test_mllib.R49
1 files changed, 49 insertions, 0 deletions
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index a9dbd2bdc4..47bbf7e5bd 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -77,6 +77,55 @@ test_that("glm and predict", {
expect_equal(length(predict(lm(y ~ x))), 15)
})
+test_that("glm summary", {
+ # gaussian family
+ training <- suppressWarnings(createDataFrame(sqlContext, iris))
+ stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
+
+ rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+
+ coefs <- unlist(stats$coefficients)
+ rCoefs <- unlist(rStats$coefficients)
+ expect_true(all(abs(rCoefs - coefs) < 1e-4))
+ expect_true(all(
+ rownames(stats$coefficients) ==
+ c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
+ expect_equal(stats$dispersion, rStats$dispersion)
+ expect_equal(stats$null.deviance, rStats$null.deviance)
+ expect_equal(stats$deviance, rStats$deviance)
+ expect_equal(stats$df.null, rStats$df.null)
+ expect_equal(stats$df.residual, rStats$df.residual)
+ expect_equal(stats$aic, rStats$aic)
+
+ # binomial family
+ df <- suppressWarnings(createDataFrame(sqlContext, iris))
+ training <- df[df$Species %in% c("versicolor", "virginica"), ]
+ stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
+ family = binomial(link = "logit")))
+
+ rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+ rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
+ family = binomial(link = "logit")))
+
+ coefs <- unlist(stats$coefficients)
+ rCoefs <- unlist(rStats$coefficients)
+ expect_true(all(abs(rCoefs - coefs) < 1e-4))
+ expect_true(all(
+ rownames(stats$coefficients) ==
+ c("(Intercept)", "Sepal_Length", "Sepal_Width")))
+ expect_equal(stats$dispersion, rStats$dispersion)
+ expect_equal(stats$null.deviance, rStats$null.deviance)
+ expect_equal(stats$deviance, rStats$deviance)
+ expect_equal(stats$df.null, rStats$df.null)
+ expect_equal(stats$df.residual, rStats$df.residual)
+ expect_equal(stats$aic, rStats$aic)
+
+ # Test summary works on base GLM models
+ baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
+ baseSummary <- summary(baseModel)
+ expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
+})
+
test_that("kmeans", {
newIris <- iris
newIris$Species <- NULL