author     Yanbo Liang <yanbohappy@gmail.com>      2014-10-30 12:00:56 -0700
committer  Xiangrui Meng <meng@databricks.com>     2014-10-30 12:00:56 -0700
commit     d9327192eee7f18e92381c59a42b0e1770f1f8f4 (patch)
tree       0fdaedec62eadc943abdc27a2b197da07c3c9866 /mllib
parent     c7ad0852084dc28f3ebc144adfd4928b23f1c8ea (diff)
SPARK-4111 [MLlib] add regression metrics
Add RegressionMetrics.scala, which implements common regression evaluation metrics, along with the corresponding test suite RegressionMetricsSuite.scala.

Author: Yanbo Liang <yanbohappy@gmail.com>
Author: liangyanbo <liangyanbo@meituan.com>

Closes #2978 from yanbohappy/regression_metrics and squashes the following commits:

730d0a9 [Yanbo Liang] clearer annotation
3d0bec1 [Yanbo Liang] rename and keep code style
a8ad3e3 [Yanbo Liang] simplify code for keeping style
d454909 [Yanbo Liang] rename parameter and function names, delete unused columns, add reference
2e56282 [liangyanbo] rename r2_score() and remove unused column
43bb12b [liangyanbo] add regression metrics
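A quick usage sketch (not part of this patch): constructing RegressionMetrics from an RDD of (prediction, observation) pairs, e.g. in the spark-shell where sc is the ambient SparkContext; the sample values are the ones used in RegressionMetricsSuite below.

    import org.apache.spark.mllib.evaluation.RegressionMetrics

    // (prediction, observation) pairs; values taken from the first test case below
    val predictionAndObservations = sc.parallelize(
      Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)))
    val metrics = new RegressionMetrics(predictionAndObservations)
    println(s"MAE  = ${metrics.meanAbsoluteError}")    // 0.5
    println(s"MSE  = ${metrics.meanSquaredError}")     // 0.375
    println(s"RMSE = ${metrics.rootMeanSquaredError}") // ~0.61237
    println(s"R^2  = ${metrics.r2}")                   // ~0.94861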
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala       |  89
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala  |  52
2 files changed, 141 insertions(+), 0 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
new file mode 100644
index 0000000000..693117d820
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.evaluation
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.rdd.RDD
+import org.apache.spark.Logging
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, MultivariateOnlineSummarizer}
+
+/**
+ * :: Experimental ::
+ * Evaluator for regression.
+ *
+ * @param predictionAndObservations an RDD of (prediction, observation) pairs.
+ */
+@Experimental
+class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extends Logging {
+
+ /**
+ * Use MultivariateOnlineSummarizer to calculate summary statistics of observations and errors.
+ */
+ private lazy val summary: MultivariateStatisticalSummary = {
+ val summary: MultivariateStatisticalSummary = predictionAndObservations.map {
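+ // each pair becomes a two-element vector: column 0 holds the observation y, column 1 the error y - yHat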
+ case (prediction, observation) => Vectors.dense(observation, observation - prediction)
+ }.aggregate(new MultivariateOnlineSummarizer())(
+ (summary, v) => summary.add(v),
+ (sum1, sum2) => sum1.merge(sum2)
+ )
+ summary
+ }
+
+ /**
+ * Returns the explained variance regression score.
+ * explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
+ * Reference: [[http://en.wikipedia.org/wiki/Explained_variation]]
+ */
+ def explainedVariance: Double = {
+ 1 - summary.variance(1) / summary.variance(0)
+ }
+
+ /**
+ * Returns the mean absolute error, which is a risk function corresponding to the
+ * expected value of the absolute error loss or l1-norm loss.
+ */
+ def meanAbsoluteError: Double = {
+ summary.normL1(1) / summary.count
+ }
+
+ /**
+ * Returns the mean squared error, which is a risk function corresponding to the
+ * expected value of the squared error loss or quadratic loss.
+ */
+ def meanSquaredError: Double = {
+ val rmse = summary.normL2(1) / math.sqrt(summary.count)
+ rmse * rmse
+ }
+
+ /**
+ * Returns the root mean squared error, which is defined as the square root of
+ * the mean squared error.
+ */
+ def rootMeanSquaredError: Double = {
+ summary.normL2(1) / math.sqrt(summary.count)
+ }
+
+ /**
+ * Returns R^2^, the coefficient of determination.
+ * Reference: [[http://en.wikipedia.org/wiki/Coefficient_of_determination]]
+ */
+ def r2: Double = {
+ 1 - math.pow(summary.normL2(1), 2) / (summary.variance(0) * (summary.count - 1))
+ }
+}
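All five metrics are read off the two summarizer columns. Below is a minimal, Spark-free sketch of the same arithmetic (illustrative only; the object and method names are not part of the patch, and sampleVariance divides by n - 1 to match what MultivariateOnlineSummarizer reports).

    object RegressionMetricsSketch {
      // xs: (prediction, observation) pairs
      def metrics(xs: Seq[(Double, Double)]): Map[String, Double] = {
        val n = xs.size.toDouble
        val errors = xs.map { case (p, y) => y - p }   // column 1 of the summarizer
        val ys = xs.map(_._2)                          // column 0 of the summarizer
        def sampleVariance(v: Seq[Double]): Double = {
          val mean = v.sum / v.size
          v.map(x => (x - mean) * (x - mean)).sum / (v.size - 1)
        }
        val ssRes = errors.map(e => e * e).sum         // = normL2(1)^2
        Map(
          "explainedVariance"    -> (1.0 - sampleVariance(errors) / sampleVariance(ys)),
          "meanAbsoluteError"    -> (errors.map(e => math.abs(e)).sum / n),          // normL1(1) / count
          "meanSquaredError"     -> (ssRes / n),
          "rootMeanSquaredError" -> math.sqrt(ssRes / n),
          "r2"                   -> (1.0 - ssRes / (sampleVariance(ys) * (n - 1)))   // 1 - SS_res / SS_tot
        )
      }
    }

Running this on the first test's data reproduces the expected constants used in RegressionMetricsSuite (0.95717, 0.5, 0.375, 0.61237, 0.94861).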
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
new file mode 100644
index 0000000000..5396d7b2b7
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.evaluation
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.util.LocalSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+
+class RegressionMetricsSuite extends FunSuite with LocalSparkContext {
+
+ test("regression metrics") {
+ val predictionAndObservations = sc.parallelize(
+ Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)), 2)
+ val metrics = new RegressionMetrics(predictionAndObservations)
+ assert(metrics.explainedVariance ~== 0.95717 absTol 1E-5,
+ "explained variance regression score mismatch")
+ assert(metrics.meanAbsoluteError ~== 0.5 absTol 1E-5, "mean absolute error mismatch")
+ assert(metrics.meanSquaredError ~== 0.375 absTol 1E-5, "mean squared error mismatch")
+ assert(metrics.rootMeanSquaredError ~== 0.61237 absTol 1E-5,
+ "root mean squared error mismatch")
+ assert(metrics.r2 ~== 0.94861 absTol 1E-5, "r2 score mismatch")
+ }
+
+ test("regression metrics with complete fitting") {
+ val predictionAndObservations = sc.parallelize(
+ Seq((3.0, 3.0), (0.0, 0.0), (2.0, 2.0), (8.0, 8.0)), 2)
+ val metrics = new RegressionMetrics(predictionAndObservations)
+ assert(metrics.explainedVariance ~== 1.0 absTol 1E-5,
+ "explained variance regression score mismatch")
+ assert(metrics.meanAbsoluteError ~== 0.0 absTol 1E-5, "mean absolute error mismatch")
+ assert(metrics.meanSquaredError ~== 0.0 absTol 1E-5, "mean squared error mismatch")
+ assert(metrics.rootMeanSquaredError ~== 0.0 absTol 1E-5,
+ "root mean squared error mismatch")
+ assert(metrics.r2 ~== 1.0 absTol 1E-5, "r2 score mismatch")
+ }
+}
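As a hand check of the constants in the first test (a sketch of the arithmetic, using the unbiased n - 1 sample variance that MultivariateOnlineSummarizer reports): the errors y - yHat are (0.5, -0.5, 0.0, -1.0), so meanAbsoluteError = 2.0 / 4 = 0.5, meanSquaredError = 1.5 / 4 = 0.375, and rootMeanSquaredError = sqrt(0.375) ≈ 0.61237. With mean(y) = 2.875 the total sum of squares is 29.1875, giving r2 = 1 - 1.5 / 29.1875 ≈ 0.94861 and explainedVariance = 1 - (1.25 / 3) / (29.1875 / 3) ≈ 0.95717, matching the expected values above.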