aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Feynman Liang <fliang@databricks.com> 2015-07-17 14:00:53 -0700
committer Joseph K. Bradley <joseph@databricks.com> 2015-07-17 14:00:53 -0700
commit 6da1069696186572c66cbd83947c1a1dbd2bc827 (patch)
tree 5bb7ed475b06d0025f3ae377f9b7fade5017843f
parent ad0954f6de29761e0e7e543212c5bfe1fdcbed9f (diff)
download spark-6da1069696186572c66cbd83947c1a1dbd2bc827.tar.gz
spark-6da1069696186572c66cbd83947c1a1dbd2bc827.tar.bz2
spark-6da1069696186572c66cbd83947c1a1dbd2bc827.zip
[SPARK-9090] [ML] Fix definition of residual in LinearRegressionSummary, EnsembleTestHelper, and SquaredError
Make the definition of residuals in Spark consistent with literature. We have been using `prediction - label` for residuals, but literature usually defines `residual = label - prediction`.

Author: Feynman Liang <fliang@databricks.com>

Closes #7435 from feynmanliang/SPARK-9090-Fix-LinearRegressionSummary-Residuals and squashes the following commits:

f4b39d8 [Feynman Liang] Fix doc
bc12a92 [Feynman Liang] Tweak EnsembleTestHelper and SquaredError residuals
63f0d60 [Feynman Liang] Fix definition of residual
-rw-r--r-- mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala | 4
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala | 4
-rw-r--r-- mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala | 4
-rw-r--r-- mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala | 2
4 files changed, 7 insertions, 7 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 8fc9860566..89718e0f3e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -355,9 +355,9 @@ class LinearRegressionSummary private[regression] (
*/
val r2: Double = metrics.r2
- /** Residuals (predicted value - label value) */
+ /** Residuals (label - predicted value) */
@transient lazy val residuals: DataFrame = {
- val t = udf { (pred: Double, label: Double) => pred - label}
+ val t = udf { (pred: Double, label: Double) => label - pred }
predictions.select(t(col(predictionCol), col(labelCol)).as("residuals"))
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
index a5582d3ef3..011a5d5742 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
@@ -42,11 +42,11 @@ object SquaredError extends Loss {
* @return Loss gradient
*/
override def gradient(prediction: Double, label: Double): Double = {
- 2.0 * (prediction - label)
+ - 2.0 * (label - prediction)
}
override private[mllib] def computeError(prediction: Double, label: Double): Double = {
- val err = prediction - label
+ val err = label - prediction
err * err
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index cf120cf2a4..374002c5b4 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -302,7 +302,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
.map { case Row(features: DenseVector, label: Double) =>
val prediction =
features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept
- prediction - label
+ label - prediction
}
.zip(model.summary.residuals.map(_.getDouble(0)))
.collect()
@@ -314,7 +314,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
Use the following R code to generate model training results.
predictions <- predict(fit, newx=features)
- residuals <- predictions - label
+ residuals <- label - predictions
> mean(residuals^2) # MSE
[1] 0.009720325
> mean(abs(residuals)) # MAD
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
index 8972c229b7..334bf3790f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
@@ -70,7 +70,7 @@ object EnsembleTestHelper {
metricName: String = "mse") {
val predictions = input.map(x => model.predict(x.features))
val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) =>
- prediction - label
+ label - prediction
}
val metric = metricName match {
case "mse" =>