[SPARK-13925][ML][SPARKR] Expose R-like summary statistics in SparkR::glm for more family and link functions

## What changes were proposed in this pull request?

Expose R-like summary statistics in SparkR::glm for more family and link functions.
Note: Not all values in R [summary.glm](http://stat.ethz.ch/R-manual/R-patched/library/stats/html/summary.glm.html) are exposed; we only provide the most commonly used statistics in this PR. More statistics can be added in follow-up work.

## How was this patch tested?

Unit tests.

SparkR output:
```
Deviance Residuals:
(Note: These are approximate quantiles with relative error <= 0.01)
     Min        1Q    Median        3Q       Max
-0.95096  -0.16585  -0.00232   0.17410   0.72918

Coefficients:
                    Estimate  Std. Error  t value  Pr(>|t|)
(Intercept)         1.6765    0.23536     7.1231   4.4561e-11
Sepal_Length        0.34988   0.046301    7.5566   4.1873e-12
Species_versicolor  -0.98339  0.072075    -13.644  0
Species_virginica   -1.0075   0.093306    -10.798  0

(Dispersion parameter for gaussian family taken to be 0.08351462)

    Null deviance: 28.307  on 149  degrees of freedom
Residual deviance: 12.193  on 146  degrees of freedom
AIC: 59.22

Number of Fisher Scoring iterations: 1
```

R output:
```
Deviance Residuals:
     Min        1Q    Median        3Q       Max
-0.95096  -0.16522   0.00171   0.18416   0.72918

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)
(Intercept)        1.67650    0.23536   7.123 4.46e-11 ***
Sepal.Length       0.34988    0.04630   7.557 4.19e-12 ***
Speciesversicolor -0.98339    0.07207 -13.644  < 2e-16 ***
Speciesvirginica  -1.00751    0.09331 -10.798  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.08351462)

    Null deviance: 28.307  on 149  degrees of freedom
Residual deviance: 12.193  on 146  degrees of freedom
AIC: 59.217

Number of Fisher Scoring iterations: 2
```

cc mengxr

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #12393 from yanboliang/spark-13925.
author: Yanbo Liang <ybliang8@gmail.com> 2016-04-15 08:23:51 -0700
committer: Xiangrui Meng <meng@databricks.com> 2016-04-15 08:23:51 -0700
commit: 83af297ac42546580983f91079f74e3a4cf25050 (patch)
tree: abaa00d9f381bcd4fa4adae7a2bc79b54ad325b4 /mllib/src
parent: 06b9d623e8f58d7bd450a50d938f83b4b3472a32 (diff)
download: spark-83af297ac42546580983f91079f74e3a4cf25050.tar.gz
spark-83af297ac42546580983f91079f74e3a4cf25050.tar.bz2
spark-83af297ac42546580983f91079f74e3a4cf25050.zip
1 files changed, 46 insertions, 6 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
index 475a308385..f66323e36c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -30,19 +30,59 @@ private[r] class GeneralizedLinearRegressionWrapper private (
   private val glm: GeneralizedLinearRegressionModel =
     pipeline.stages(1).asInstanceOf[GeneralizedLinearRegressionModel]
 
+  lazy val rFeatures: Array[String] = if (glm.getFitIntercept) {
+    Array("(Intercept)") ++ features
+  } else {
+    features
+  }
+
   lazy val rCoefficients: Array[Double] = if (glm.getFitIntercept) {
-    Array(glm.intercept) ++ glm.coefficients.toArray
+    Array(glm.intercept) ++ glm.coefficients.toArray ++
+      rCoefficientStandardErrors ++ rTValues ++ rPValues
   } else {
-    glm.coefficients.toArray
+    glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues
   }
 
-  lazy val rFeatures: Array[String] = if (glm.getFitIntercept) {
-    Array("(Intercept)") ++ features
+  private lazy val rCoefficientStandardErrors = if (glm.getFitIntercept) {
+    Array(glm.summary.coefficientStandardErrors.last) ++
+      glm.summary.coefficientStandardErrors.dropRight(1)
   } else {
-    features
+    glm.summary.coefficientStandardErrors
+  }
+
+  private lazy val rTValues = if (glm.getFitIntercept) {
+    Array(glm.summary.tValues.last) ++ glm.summary.tValues.dropRight(1)
+  } else {
+    glm.summary.tValues
   }
 
-  def transform(dataset: DataFrame): DataFrame = {
+  private lazy val rPValues = if (glm.getFitIntercept) {
+    Array(glm.summary.pValues.last) ++ glm.summary.pValues.dropRight(1)
+  } else {
+    glm.summary.pValues
+  }
+
+  lazy val rDispersion: Double = glm.summary.dispersion
+
+  lazy val rNullDeviance: Double = glm.summary.nullDeviance
+
+  lazy val rDeviance: Double = glm.summary.deviance
+
+  lazy val rResidualDegreeOfFreedomNull: Long = glm.summary.residualDegreeOfFreedomNull
+
+  lazy val rResidualDegreeOfFreedom: Long = glm.summary.residualDegreeOfFreedom
+
+  lazy val rAic: Double = glm.summary.aic
+
+  lazy val rNumIterations: Int = glm.summary.numIterations
+
+  lazy val rDevianceResiduals: DataFrame = glm.summary.residuals()
+
+  lazy val rFamily: String = glm.getFamily
+
+  def residuals(residualsType: String): DataFrame = glm.summary.residuals(residualsType)
+
+  def transform(dataset: Dataset[_]): DataFrame = {
     pipeline.transform(dataset).drop(glm.getFeaturesCol)
   }
 }
author	Yanbo Liang <ybliang8@gmail.com>	2016-04-15 08:23:51 -0700
committer	Xiangrui Meng <meng@databricks.com>	2016-04-15 08:23:51 -0700
commit	83af297ac42546580983f91079f74e3a4cf25050 (patch)
tree	abaa00d9f381bcd4fa4adae7a2bc79b54ad325b4 /mllib/src
parent	06b9d623e8f58d7bd450a50d938f83b4b3472a32 (diff)
download	spark-83af297ac42546580983f91079f74e3a4cf25050.tar.gz spark-83af297ac42546580983f91079f74e3a4cf25050.tar.bz2 spark-83af297ac42546580983f91079f74e3a4cf25050.zip