From 9da7ceed81b0afce7deb8f39f3a6d565d401a391 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 5 Nov 2015 09:56:18 -0800 Subject: [SPARK-11473][ML] R-like summary statistics with intercept for OLS via normal equation solver Follow up [SPARK-9836](https://issues.apache.org/jira/browse/SPARK-9836), we should also support summary statistics for ```intercept```. Author: Yanbo Liang Closes #9485 from yanboliang/spark-11473. --- .../spark/ml/optim/WeightedLeastSquares.scala | 35 ++++++++++++---------- .../spark/ml/regression/LinearRegression.scala | 22 ++++++++------ .../ml/regression/LinearRegressionSuite.scala | 16 +++++----- 3 files changed, 40 insertions(+), 33 deletions(-) (limited to 'mllib') diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala index e612a2122e..8617722ae5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala @@ -75,7 +75,7 @@ private[ml] class WeightedLeastSquares( val summary = instances.treeAggregate(new Aggregator)(_.add(_), _.merge(_)) summary.validate() logInfo(s"Number of instances: ${summary.count}.") - val k = summary.k + val k = if (fitIntercept) summary.k + 1 else summary.k val triK = summary.triK val wSum = summary.wSum val bBar = summary.bBar @@ -86,14 +86,6 @@ private[ml] class WeightedLeastSquares( val aaBar = summary.aaBar val aaValues = aaBar.values - if (fitIntercept) { - // shift centers - // A^T A - aBar aBar^T - BLAS.spr(-1.0, aBar, aaValues) - // A^T b - bBar aBar - BLAS.axpy(-bBar, aBar, abBar) - } - // add regularization to diagonals var i = 0 var j = 2 @@ -111,21 +103,32 @@ private[ml] class WeightedLeastSquares( j += 1 } - val x = new DenseVector(CholeskyDecomposition.solve(aaBar.values, abBar.values)) + val aa = if (fitIntercept) { + Array.concat(aaBar.values, aBar.values, Array(1.0)) + } else { + aaBar.values + } + val ab = if (fitIntercept) { + Array.concat(abBar.values, Array(bBar)) + } else { + abBar.values + } + + val x = CholeskyDecomposition.solve(aa, ab) + + val aaInv = CholeskyDecomposition.inverse(aa, k) - val aaInv = CholeskyDecomposition.inverse(aaBar.values, k) // aaInv is a packed upper triangular matrix, here we get all elements on diagonal val diagInvAtWA = new DenseVector((1 to k).map { i => aaInv(i + (i - 1) * i / 2 - 1) / wSum }.toArray) - // compute intercept - val intercept = if (fitIntercept) { - bBar - BLAS.dot(aBar, x) + val (coefficients, intercept) = if (fitIntercept) { + (new DenseVector(x.slice(0, x.length - 1)), x.last) } else { - 0.0 + (new DenseVector(x), 0.0) } - new WeightedLeastSquaresModel(x, intercept, diagInvAtWA) + new WeightedLeastSquaresModel(coefficients, intercept, diagInvAtWA) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index c51e30483a..6638313818 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -511,8 +511,7 @@ class LinearRegressionSummary private[regression] ( } /** - * Standard error of estimated coefficients. - * Note that standard error of estimated intercept is not supported currently. + * Standard error of estimated coefficients and intercept. */ lazy val coefficientStandardErrors: Array[Double] = { if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) { @@ -532,21 +531,26 @@ class LinearRegressionSummary private[regression] ( } } - /** T-statistic of estimated coefficients. - * Note that t-statistic of estimated intercept is not supported currently. - */ + /** + * T-statistic of estimated coefficients and intercept. + */ lazy val tValues: Array[Double] = { if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) { throw new UnsupportedOperationException( "No t-statistic available for this LinearRegressionModel") } else { - model.coefficients.toArray.zip(coefficientStandardErrors).map { x => x._1 / x._2 } + val estimate = if (model.getFitIntercept) { + Array.concat(model.coefficients.toArray, Array(model.intercept)) + } else { + model.coefficients.toArray + } + estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 } } } - /** Two-sided p-value of estimated coefficients. - * Note that p-value of estimated intercept is not supported currently. - */ + /** + * Two-sided p-value of estimated coefficients and intercept. + */ lazy val pValues: Array[Double] = { if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) { throw new UnsupportedOperationException( diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index fbf83e8922..a1d86fe8fe 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -621,13 +621,13 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model.summary.objectiveHistory.length == 1) assert(model.summary.objectiveHistory(0) == 0.0) val devianceResidualsR = Array(-0.35566, 0.34504) - val seCoefR = Array(0.0011756, 0.0009032) - val tValsR = Array(3998, 7971) - val pValsR = Array(0, 0) + val seCoefR = Array(0.0011756, 0.0009032, 0.0018489) + val tValsR = Array(3998, 7971, 3407) + val pValsR = Array(0, 0, 0) model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x => - assert(x._1 ~== x._2 absTol 1E-3) } + assert(x._1 ~== x._2 absTol 1E-5) } model.summary.coefficientStandardErrors.zip(seCoefR).foreach{ x => - assert(x._1 ~== x._2 absTol 1E-3) } + assert(x._1 ~== x._2 absTol 1E-5) } model.summary.tValues.map(_.round).zip(tValsR).foreach{ x => assert(x._1 === x._2) } model.summary.pValues.map(_.round).zip(pValsR).foreach{ x => assert(x._1 === x._2) } } @@ -789,9 +789,9 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val coefficientsR = Vectors.dense(Array(6.080, -0.600)) val interceptR = 18.080 val devianceResidualsR = Array(-1.358, 1.920) - val seCoefR = Array(5.556, 1.960) - val tValsR = Array(1.094, -0.306) - val pValsR = Array(0.471, 0.811) + val seCoefR = Array(5.556, 1.960, 9.608) + val tValsR = Array(1.094, -0.306, 1.882) + val pValsR = Array(0.471, 0.811, 0.311) assert(model.coefficients ~== coefficientsR absTol 1E-3) assert(model.intercept ~== interceptR absTol 1E-3) -- cgit v1.2.3