2 files changed, 25 insertions, 5 deletions
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 265e64e746..02bc6456de 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -278,8 +278,10 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat
 
 #' @param object a fitted generalized linear model.
 #' @return \code{summary} returns a summary object of the fitted model, a list of components
-#'         including at least the coefficients, null/residual deviance, null/residual degrees
-#'         of freedom, AIC and number of iterations IRLS takes.
+#'         including at least the coefficients matrix (which includes coefficients, standard error
+#'         of coefficients, t value and p value), null/residual deviance, null/residual degrees of
+#'         freedom, AIC and number of iterations IRLS takes. If there are collinear columns
+#'         in your data, the coefficients matrix only provides coefficients.
 #'
 #' @rdname spark.glm
 #' @export
@@ -303,9 +305,18 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
             } else {
               dataFrame(callJMethod(jobj, "rDevianceResiduals"))
             }
-            coefficients <- matrix(coefficients, ncol = 4)
-            colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
-            rownames(coefficients) <- unlist(features)
+            # If the underlying WeightedLeastSquares uses the "normal" solver, we can provide
+            # coefficients, standard error of coefficients, t value and p value. Otherwise,
+            # the model is fitted by local "l-bfgs" and we can only provide coefficients.
+            if (length(features) == length(coefficients)) {
+              coefficients <- matrix(coefficients, ncol = 1)
+              colnames(coefficients) <- c("Estimate")
+              rownames(coefficients) <- unlist(features)
+            } else {
+              coefficients <- matrix(coefficients, ncol = 4)
+              colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
+              rownames(coefficients) <- unlist(features)
+            }
             ans <- list(deviance.resid = deviance.resid, coefficients = coefficients,
                         dispersion = dispersion, null.deviance = null.deviance,
                         deviance = deviance, df.null = df.null, df.residual = df.residual,
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 2a97a51cfa..467e00cf79 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -169,6 +169,15 @@ test_that("spark.glm summary", {
   df <- suppressWarnings(createDataFrame(data))
   regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0))
   expect_equal(regStats$aic, 14.00976, tolerance = 1e-4) # 14.00976 is from summary() result
+
+  # Test spark.glm works on collinear data
+  A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2)
+  b <- c(1, 2, 3, 4)
+  data <- as.data.frame(cbind(A, b))
+  df <- createDataFrame(data)
+  stats <- summary(spark.glm(df, b ~ . - 1))
+  coefs <- unlist(stats$coefficients)
+  expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4))
 })
 
 test_that("spark.glm save/load", {