aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorDB Tsai <dbtsai@alpinenow.com>2014-12-22 16:42:55 -0800
committerXiangrui Meng <meng@databricks.com>2014-12-22 16:42:55 -0800
commita96b72781ae40bb303613990b8d8b4721b84e1c3 (patch)
tree69ed3021cbc056f925c7214a824c1ade622ad878 /mllib
parentc233ab3d8d75a33495298964fe73dbf7dd8fe305 (diff)
downloadspark-a96b72781ae40bb303613990b8d8b4721b84e1c3.tar.gz
spark-a96b72781ae40bb303613990b8d8b4721b84e1c3.tar.bz2
spark-a96b72781ae40bb303613990b8d8b4721b84e1c3.zip
[SPARK-4907][MLlib] Inconsistent loss and gradient in LeastSquaresGradient compared with R
In most academic papers and algorithm implementations, people use L = 1/(2n) ||A weights - y||^2 instead of L = 1/n ||A weights - y||^2 for the least-squares loss. See Eq. (1) in http://web.stanford.edu/~hastie/Papers/glmnet.pdf. Since MLlib uses a different convention, this results in different residuals, and all the statistical properties differ from those of the GLMNET package in R. The model coefficients will still be the same under this change. Author: DB Tsai <dbtsai@alpinenow.com> Closes #3746 from dbtsai/lir and squashes the following commits: 19c2e85 [DB Tsai] make stepsize twice to converge to the same solution 0b2c29c [DB Tsai] first commit
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala10
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala6
2 files changed, 8 insertions, 8 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
index 45dbf6044f..5a419d1640 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
@@ -94,16 +94,16 @@ class LogisticGradient extends Gradient {
* :: DeveloperApi ::
* Compute gradient and loss for a Least-squared loss function, as used in linear regression.
* This is correct for the averaged least squares loss function (mean squared error)
- * L = 1/n ||A weights-y||^2
+ * L = 1/2n ||A weights-y||^2
* See also the documentation for the precise formulation.
*/
@DeveloperApi
class LeastSquaresGradient extends Gradient {
override def compute(data: Vector, label: Double, weights: Vector): (Vector, Double) = {
val diff = dot(data, weights) - label
- val loss = diff * diff
+ val loss = diff * diff / 2.0
val gradient = data.copy
- scal(2.0 * diff, gradient)
+ scal(diff, gradient)
(gradient, loss)
}
@@ -113,8 +113,8 @@ class LeastSquaresGradient extends Gradient {
weights: Vector,
cumGradient: Vector): Double = {
val diff = dot(data, weights) - label
- axpy(2.0 * diff, data, cumGradient)
- diff * diff
+ axpy(diff, data, cumGradient)
+ diff * diff / 2.0
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala
index 03b71301e9..70b43ddb7d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala
@@ -52,7 +52,7 @@ class StreamingLinearRegressionSuite extends FunSuite with TestSuiteBase {
// create model
val model = new StreamingLinearRegressionWithSGD()
.setInitialWeights(Vectors.dense(0.0, 0.0))
- .setStepSize(0.1)
+ .setStepSize(0.2)
.setNumIterations(25)
// generate sequence of simulated data
@@ -84,7 +84,7 @@ class StreamingLinearRegressionSuite extends FunSuite with TestSuiteBase {
// create model
val model = new StreamingLinearRegressionWithSGD()
.setInitialWeights(Vectors.dense(0.0))
- .setStepSize(0.1)
+ .setStepSize(0.2)
.setNumIterations(25)
// generate sequence of simulated data
@@ -118,7 +118,7 @@ class StreamingLinearRegressionSuite extends FunSuite with TestSuiteBase {
// create model initialized with true weights
val model = new StreamingLinearRegressionWithSGD()
.setInitialWeights(Vectors.dense(10.0, 10.0))
- .setStepSize(0.1)
+ .setStepSize(0.2)
.setNumIterations(25)
// generate sequence of simulated data for testing