aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala18
-rw-r--r--mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala57
2 files changed, 43 insertions, 32 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index f137c8cb41..3891ae63a4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -215,6 +215,8 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
* Sets the value of param [[weightCol]].
* If this is not set or empty, we treat all instance weights as 1.0.
* Default is not set, so all instances have weight one.
+ * In the Binomial family, weights correspond to number of trials and should be integer.
+ * Non-integer weights are rounded to integer in AIC calculation.
*
* @group setParam
*/
@@ -467,10 +469,12 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
override def variance(mu: Double): Double = mu * (1.0 - mu)
+ private def ylogy(y: Double, mu: Double): Double = {
+ if (y == 0) 0.0 else y * math.log(y / mu)
+ }
+
override def deviance(y: Double, mu: Double, weight: Double): Double = {
- val my = 1.0 - y
- 2.0 * weight * (y * math.log(math.max(y, 1.0) / mu) +
- my * math.log(math.max(my, 1.0) / (1.0 - mu)))
+ 2.0 * weight * (ylogy(y, mu) + ylogy(1.0 - y, 1.0 - mu))
}
override def aic(
@@ -479,7 +483,13 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
numInstances: Double,
weightSum: Double): Double = {
-2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) =>
- weight * dist.Binomial(1, mu).logProbabilityOf(math.round(y).toInt)
+ // weights for Binomial distribution correspond to number of trials
+ val wt = math.round(weight).toInt
+ if (wt == 0) {
+ 0.0
+ } else {
+ dist.Binomial(wt, mu).logProbabilityOf(math.round(y * weight).toInt)
+ }
}.sum()
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 3e9e1fced8..ed24c1e16a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -711,16 +711,17 @@ class GeneralizedLinearRegressionSuite
R code:
A <- matrix(c(0, 1, 2, 3, 5, 2, 1, 3), 4, 2)
- b <- c(1, 0, 1, 0)
- w <- c(1, 2, 3, 4)
+ b <- c(1, 0.5, 1, 0)
+ w <- c(1, 2.0, 0.3, 4.7)
df <- as.data.frame(cbind(A, b))
*/
val datasetWithWeight = Seq(
Instance(1.0, 1.0, Vectors.dense(0.0, 5.0).toSparse),
- Instance(0.0, 2.0, Vectors.dense(1.0, 2.0)),
- Instance(1.0, 3.0, Vectors.dense(2.0, 1.0)),
- Instance(0.0, 4.0, Vectors.dense(3.0, 3.0))
+ Instance(0.5, 2.0, Vectors.dense(1.0, 2.0)),
+ Instance(1.0, 0.3, Vectors.dense(2.0, 1.0)),
+ Instance(0.0, 4.7, Vectors.dense(3.0, 3.0))
).toDF()
+
/*
R code:
@@ -728,34 +729,34 @@ class GeneralizedLinearRegressionSuite
summary(model)
Deviance Residuals:
- 1 2 3 4
- 1.273 -1.437 2.533 -1.556
+ 1 2 3 4
+ 0.2404 0.1965 1.2824 -0.6916
Coefficients:
Estimate Std. Error z value Pr(>|z|)
- V1 -0.30217 0.46242 -0.653 0.513
- V2 -0.04452 0.37124 -0.120 0.905
+ x1 -1.6901 1.2764 -1.324 0.185
+ x2 0.7059 0.9449 0.747 0.455
(Dispersion parameter for binomial family taken to be 1)
- Null deviance: 13.863 on 4 degrees of freedom
- Residual deviance: 12.524 on 2 degrees of freedom
- AIC: 16.524
+ Null deviance: 8.3178 on 4 degrees of freedom
+ Residual deviance: 2.2193 on 2 degrees of freedom
+ AIC: 5.9915
Number of Fisher Scoring iterations: 5
residuals(model, type="pearson")
1 2 3 4
- 1.117731 -1.162962 2.395838 -1.189005
+ 0.171217 0.197406 2.085864 -0.495332
residuals(model, type="working")
1 2 3 4
- 2.249324 -1.676240 2.913346 -1.353433
+ 1.029315 0.281881 15.502768 -1.052203
residuals(model, type="response")
- 1 2 3 4
- 0.5554219 -0.4034267 0.6567520 -0.2611382
- */
+ 1 2 3 4
+ 0.028480 0.069123 0.935495 -0.049613
+ */
val trainer = new GeneralizedLinearRegression()
.setFamily("binomial")
.setWeightCol("weight")
@@ -763,21 +764,21 @@ class GeneralizedLinearRegressionSuite
val model = trainer.fit(datasetWithWeight)
- val coefficientsR = Vectors.dense(Array(-0.30217, -0.04452))
+ val coefficientsR = Vectors.dense(Array(-1.690134, 0.705929))
val interceptR = 0.0
- val devianceResidualsR = Array(1.273, -1.437, 2.533, -1.556)
- val pearsonResidualsR = Array(1.117731, -1.162962, 2.395838, -1.189005)
- val workingResidualsR = Array(2.249324, -1.676240, 2.913346, -1.353433)
- val responseResidualsR = Array(0.5554219, -0.4034267, 0.6567520, -0.2611382)
- val seCoefR = Array(0.46242, 0.37124)
- val tValsR = Array(-0.653, -0.120)
- val pValsR = Array(0.513, 0.905)
+ val devianceResidualsR = Array(0.2404, 0.1965, 1.2824, -0.6916)
+ val pearsonResidualsR = Array(0.171217, 0.197406, 2.085864, -0.495332)
+ val workingResidualsR = Array(1.029315, 0.281881, 15.502768, -1.052203)
+ val responseResidualsR = Array(0.02848, 0.069123, 0.935495, -0.049613)
+ val seCoefR = Array(1.276417, 0.944934)
+ val tValsR = Array(-1.324124, 0.747068)
+ val pValsR = Array(0.185462, 0.455023)
val dispersionR = 1.0
- val nullDevianceR = 13.863
- val residualDevianceR = 12.524
+ val nullDevianceR = 8.3178
+ val residualDevianceR = 2.2193
val residualDegreeOfFreedomNullR = 4
val residualDegreeOfFreedomR = 2
- val aicR = 16.524
+ val aicR = 5.991537
val summary = model.summary
val devianceResiduals = summary.residuals()