aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/main
diff options
context:
space:
mode:
author Alexander Ulanov <nashb@yandex.ru> 2014-07-15 08:40:22 -0700
committer Xiangrui Meng <meng@databricks.com> 2014-07-15 08:40:22 -0700
commit 04b01bb101eeaf76c2e7c94c291669f0b2372c9a (patch)
tree 5939b35b6371d1386e9930bb8cd78ce9d4eacec7 /mllib/src/main
parent 6555618c8f39b4e7da9402c3fd9da7a75bf7794e (diff)
download spark-04b01bb101eeaf76c2e7c94c291669f0b2372c9a.tar.gz
spark-04b01bb101eeaf76c2e7c94c291669f0b2372c9a.tar.bz2
spark-04b01bb101eeaf76c2e7c94c291669f0b2372c9a.zip
[MLLIB] [SPARK-2222] Add multiclass evaluation metrics
Adding two classes: 1) MulticlassMetrics implements various multiclass evaluation metrics 2) MulticlassMetricsSuite implements unit tests for MulticlassMetrics Author: Alexander Ulanov <nashb@yandex.ru> Author: unknown <ulanov@ULANOV1.emea.hpqcorp.net> Author: Xiangrui Meng <meng@databricks.com> Closes #1155 from avulanov/master and squashes the following commits: 2eae80f [Alexander Ulanov] Merge pull request #1 from mengxr/avulanov-master 5ebeb08 [Xiangrui Meng] minor updates 79c3555 [Alexander Ulanov] Addressing reviewers comments mengxr 0fa9511 [Alexander Ulanov] Addressing reviewers comments mengxr f0dadc9 [Alexander Ulanov] Addressing reviewers comments mengxr 4811378 [Alexander Ulanov] Removing println 87fb11f [Alexander Ulanov] Addressing reviewers comments mengxr. Added confusion matrix e3db569 [Alexander Ulanov] Addressing reviewers comments mengxr. Added true positive rate and false positive rate. Test suite code style. a7e8bf0 [Alexander Ulanov] Addressing reviewers comments mengxr c3a77ad [Alexander Ulanov] Addressing reviewers comments mengxr e2c91c3 [Alexander Ulanov] Fixes to mutliclass metics d5ce981 [unknown] Comments about Double a5c8ba4 [unknown] Unit tests. Class rename fcee82d [unknown] Unit tests. Class rename d535d62 [unknown] Multiclass evaluation
Diffstat (limited to 'mllib/src/main')
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala 190
1 files changed, 190 insertions, 0 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
new file mode 100644
index 0000000000..666362ae67
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.evaluation
+
+import scala.collection.Map
+
+import org.apache.spark.SparkContext._
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.mllib.linalg.{Matrices, Matrix}
+import org.apache.spark.rdd.RDD
+
+/**
+ * ::Experimental::
+ * Evaluator for multiclass classification.
+ *
+ * Every statistic is computed lazily, on first access, from `predictionAndLabels`.
+ * Per-class counts are gathered into local maps keyed by class value, and the set of
+ * known classes (see `labels`) is the set of values that occur as a true label.
+ *
+ * @param predictionAndLabels an RDD of (prediction, label) pairs.
+ */
+@Experimental
+class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
+
+  // Number of records per true label, and the total number of records.
+  private lazy val labelCountByClass: Map[Double, Long] = predictionAndLabels.values.countByValue()
+  private lazy val labelCount: Long = labelCountByClass.values.sum
+
+  // The counters below are accumulated as Long, like labelCountByClass, so that they
+  // cannot overflow when a single key is seen more than Int.MaxValue times.
+
+  // Per true label: number of records whose prediction equals the label. Every true
+  // label gets an entry, with count 0 when it is never predicted correctly.
+  private lazy val tpByClass: Map[Double, Long] = predictionAndLabels
+    .map { case (prediction, label) =>
+      (label, if (label == prediction) 1L else 0L)
+    }.reduceByKey(_ + _)
+    .collectAsMap()
+
+  // Per predicted value: number of records whose prediction differs from the label.
+  private lazy val fpByClass: Map[Double, Long] = predictionAndLabels
+    .map { case (prediction, label) =>
+      (prediction, if (prediction != label) 1L else 0L)
+    }.reduceByKey(_ + _)
+    .collectAsMap()
+
+  // Number of records for each observed (label, prediction) combination.
+  private lazy val confusions = predictionAndLabels
+    .map { case (prediction, label) =>
+      ((label, prediction), 1L)
+    }.reduceByKey(_ + _)
+    .collectAsMap()
+
+  /**
+   * Returns confusion matrix:
+   * actual classes are in rows, predicted classes are in columns,
+   * both ordered by class label ascending, as in "labels".
+   * Predictions whose value never occurs as a true label are not part of "labels"
+   * and therefore do not appear in the matrix.
+   */
+  def confusionMatrix: Matrix = {
+    val n = labels.size
+    val values = Array.ofDim[Double](n * n)
+    var i = 0
+    while (i < n) {
+      var j = 0
+      while (j < n) {
+        // Column-major layout: entry (row i, column j) is stored at index i + j * n.
+        values(i + j * n) = confusions.getOrElse((labels(i), labels(j)), 0L).toDouble
+        j += 1
+      }
+      i += 1
+    }
+    Matrices.dense(n, n, values)
+  }
+
+  /**
+   * Returns true positive rate for a given label (category).
+   * This is the same value as `recall(label)`.
+   * @param label the label.
+   */
+  def truePositiveRate(label: Double): Double = recall(label)
+
+  /**
+   * Returns false positive rate for a given label (category): the number of records
+   * wrongly predicted as `label` divided by the number of records whose true label
+   * is not `label`.
+   * Returns 0 when there are no such records (every record carries `label`), instead
+   * of the NaN a 0/0 division would produce.
+   * The label must occur as a true label, otherwise the count lookup fails
+   * (NoSuchElementException from `Map.apply`).
+   * @param label the label.
+   */
+  def falsePositiveRate(label: Double): Double = {
+    val fp = fpByClass.getOrElse(label, 0L)
+    val negatives = labelCount - labelCountByClass(label)
+    if (negatives == 0) 0.0 else fp.toDouble / negatives
+  }
+
+  /**
+   * Returns precision for a given label (category): tp / (tp + fp).
+   * Returns 0 when the label is never predicted (tp + fp == 0).
+   * The label must occur as a true label, otherwise the count lookup fails
+   * (NoSuchElementException from `Map.apply`).
+   * @param label the label.
+   */
+  def precision(label: Double): Double = {
+    val tp = tpByClass(label)
+    val fp = fpByClass.getOrElse(label, 0L)
+    if (tp + fp == 0) 0.0 else tp.toDouble / (tp + fp)
+  }
+
+  /**
+   * Returns recall for a given label (category): the number of correctly predicted
+   * records of that label divided by the number of records with that true label.
+   * The label must occur as a true label, otherwise the count lookup fails
+   * (NoSuchElementException from `Map.apply`).
+   * @param label the label.
+   */
+  def recall(label: Double): Double = tpByClass(label).toDouble / labelCountByClass(label)
+
+  /**
+   * Returns f-measure for a given label (category):
+   * (1 + beta^2) * p * r / (beta^2 * p + r), where p and r are the precision and
+   * recall of the label. Returns 0 when both precision and recall are 0.
+   * @param label the label.
+   * @param beta the beta parameter.
+   */
+  def fMeasure(label: Double, beta: Double): Double = {
+    val p = precision(label)
+    val r = recall(label)
+    val betaSqrd = beta * beta
+    if (p + r == 0) 0 else (1 + betaSqrd) * p * r / (betaSqrd * p + r)
+  }
+
+  /**
+   * Returns f1-measure for a given label (category),
+   * i.e. the f-measure with beta = 1.
+   * @param label the label.
+   */
+  def fMeasure(label: Double): Double = fMeasure(label, 1.0)
+
+  /**
+   * Returns precision over all classes:
+   * the number of correctly classified records divided by the total number of records.
+   */
+  lazy val precision: Double = tpByClass.values.sum.toDouble / labelCount
+
+  /**
+   * Returns recall
+   * (equals to precision for multiclass classifier
+   * because sum of all false positives is equal to sum
+   * of all false negatives)
+   */
+  lazy val recall: Double = precision
+
+  /**
+   * Returns f-measure
+   * (equals to precision and recall because precision equals recall)
+   */
+  lazy val fMeasure: Double = precision
+
+  /**
+   * Returns weighted true positive rate
+   * (same value as the weighted averaged recall)
+   */
+  lazy val weightedTruePositiveRate: Double = weightedRecall
+
+  /**
+   * Returns weighted false positive rate:
+   * per-class false positive rates averaged with each class weighted by its share
+   * of the true labels.
+   */
+  lazy val weightedFalsePositiveRate: Double = labelCountByClass.map { case (category, count) =>
+    falsePositiveRate(category) * count.toDouble / labelCount
+  }.sum
+
+  /**
+   * Returns weighted averaged recall:
+   * per-class recalls averaged with each class weighted by its share of the true labels.
+   */
+  lazy val weightedRecall: Double = labelCountByClass.map { case (category, count) =>
+    recall(category) * count.toDouble / labelCount
+  }.sum
+
+  /**
+   * Returns weighted averaged precision:
+   * per-class precisions averaged with each class weighted by its share of the true labels.
+   */
+  lazy val weightedPrecision: Double = labelCountByClass.map { case (category, count) =>
+    precision(category) * count.toDouble / labelCount
+  }.sum
+
+  /**
+   * Returns weighted averaged f-measure:
+   * per-class f-measures averaged with each class weighted by its share of the true labels.
+   * @param beta the beta parameter.
+   */
+  def weightedFMeasure(beta: Double): Double = labelCountByClass.map { case (category, count) =>
+    fMeasure(category, beta) * count.toDouble / labelCount
+  }.sum
+
+  /**
+   * Returns weighted averaged f1-measure (beta = 1)
+   */
+  lazy val weightedFMeasure: Double = labelCountByClass.map { case (category, count) =>
+    fMeasure(category, 1.0) * count.toDouble / labelCount
+  }.sum
+
+  /**
+   * Returns the sequence of labels in ascending order.
+   * Only values that occur as a true label are included.
+   */
+  lazy val labels: Array[Double] = tpByClass.keys.toArray.sorted
+}