From 75e05a5a964c9585dd09a2ef6178881929bab1f1 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Tue, 12 Apr 2016 10:51:07 -0700
Subject: [SPARK-12566][SPARK-14324][ML] GLM model family, link function
 support in SparkR:::glm

* SparkR glm supports families and link functions which match R's signature for family.
* SparkR glm API refactor. The comparative standard of the new API is R glm, so I only expose the arguments that R glm supports: ```formula, family, data, epsilon and maxit```.
* This PR is focus on glm() and predict(), summary statistics will be done in a separate PR after this get in.
* This PR depends on #12287 which make GLMs support link prediction at Scala side. After that merged, I will add more tests for predict() to this PR.

Unit tests.

cc mengxr jkbradley hhbyyh

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #12294 from yanboliang/spark-12566.
---
 R/pkg/inst/tests/testthat/test_mllib.R | 95 +++++++++++-----------------------
 1 file changed, 29 insertions(+), 66 deletions(-)

(limited to 'R/pkg/inst/tests/testthat/test_mllib.R')

diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index fdb591756e..a9dbd2bdc4 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -25,20 +25,21 @@ sc <- sparkR.init()
 
 sqlContext <- sparkRSQL.init(sc)
 
-test_that("glm and predict", {
+test_that("formula of glm", {
   training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  test <- select(training, "Sepal_Length")
-  model <- glm(Sepal_Width ~ Sepal_Length, training, family = "gaussian")
-  prediction <- predict(model, test)
-  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+  # dot minus and intercept vs native glm
+  model <- glm(Sepal_Width ~ . - Species + 0, data = training)
+  vals <- collect(select(predict(model, training), "prediction"))
+  rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
+  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
-  # Test stats::predict is working
-  x <- rnorm(15)
-  y <- x + rnorm(15)
-  expect_equal(length(predict(lm(y ~ x))), 15)
-})
+  # feature interaction vs native glm
+  model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
+  vals <- collect(select(predict(model, training), "prediction"))
+  rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
+  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
-test_that("glm should work with long formula", {
+  # glm should work with long formula
   training <- suppressWarnings(createDataFrame(sqlContext, iris))
   training$LongLongLongLongLongName <- training$Sepal_Width
   training$VeryLongLongLongLonLongName <- training$Sepal_Length
@@ -50,68 +51,30 @@ test_that("glm should work with long formula", {
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 })
 
-test_that("predictions match with native glm", {
+test_that("glm and predict", {
   training <- suppressWarnings(createDataFrame(sqlContext, iris))
+  # gaussian family
   model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
-  vals <- collect(select(predict(model, training), "prediction"))
+  prediction <- predict(model, training)
+  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+  vals <- collect(select(prediction, "prediction"))
   rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
-
-test_that("dot minus and intercept vs native glm", {
-  training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  model <- glm(Sepal_Width ~ . - Species + 0, data = training)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
 
-test_that("feature interaction vs native glm", {
-  training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
+  # poisson family
+  model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
+               family = poisson(link = identity))
+  prediction <- predict(model, training)
+  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+  vals <- collect(select(prediction, "prediction"))
+  rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
+           data = iris, family = poisson(link = identity)), iris))
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
 
-test_that("summary coefficients match with native glm", {
-  training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training, solver = "normal"))
-  coefs <- unlist(stats$coefficients)
-  devianceResiduals <- unlist(stats$devianceResiduals)
-
-  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
-  rCoefs <- unlist(rStats$coefficients)
-  rDevianceResiduals <- c(-0.95096, 0.72918)
-
-  expect_true(all(abs(rCoefs - coefs) < 1e-5))
-  expect_true(all(abs(rDevianceResiduals - devianceResiduals) < 1e-5))
-  expect_true(all(
-    rownames(stats$coefficients) ==
-    c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
-})
-
-test_that("summary coefficients match with native glm of family 'binomial'", {
-  df <- suppressWarnings(createDataFrame(sqlContext, iris))
-  training <- filter(df, df$Species != "setosa")
-  stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
-    family = "binomial"))
-  coefs <- as.vector(stats$coefficients[, 1])
-
-  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
-  rCoefs <- as.vector(coef(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
-    family = binomial(link = "logit"))))
-
-  expect_true(all(abs(rCoefs - coefs) < 1e-4))
-  expect_true(all(
-    rownames(stats$coefficients) ==
-    c("(Intercept)", "Sepal_Length", "Sepal_Width")))
-})
-
-test_that("summary works on base GLM models", {
-  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
-  baseSummary <- summary(baseModel)
-  expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
+  # Test stats::predict is working
+  x <- rnorm(15)
+  y <- x + rnorm(15)
+  expect_equal(length(predict(lm(y ~ x))), 15)
 })
 
 test_that("kmeans", {
-- 
cgit v1.2.3