diff options
author | Alexander Ulanov <nashb@yandex.ru> | 2014-10-31 18:31:03 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2014-10-31 18:31:03 -0700 |
commit | 62d01d255c001a6d397cc166a10aba3894f43459 (patch) | |
tree | 3c7407e1bd3b47607dd14e85e21b36649bff2cab /mllib/src/main | |
parent | 23f73f525ce3d2b4a614e60f4f9170c860ab93da (diff) | |
download | spark-62d01d255c001a6d397cc166a10aba3894f43459.tar.gz spark-62d01d255c001a6d397cc166a10aba3894f43459.tar.bz2 spark-62d01d255c001a6d397cc166a10aba3894f43459.zip |
[MLLIB] SPARK-2329 Add multi-label evaluation metrics
Implementation of various multi-label classification measures, including: Hamming-loss, strict and default Accuracy, macro-averaged Precision, Recall and F1-measure based on documents and labels, micro-averaged measures: https://issues.apache.org/jira/browse/SPARK-2329
Multi-class measures are currently in the following pull request: https://github.com/apache/spark/pull/1155
Author: Alexander Ulanov <nashb@yandex.ru>
Author: avulanov <nashb@yandex.ru>
Closes #1270 from avulanov/multilabelmetrics and squashes the following commits:
fc8175e [Alexander Ulanov] Merge with previous updates
43a613e [Alexander Ulanov] Addressing reviewers comments: change Set to Array
517a594 [avulanov] Addressing reviewers comments: Scala style
cf4222bc [avulanov] Addressing reviewers comments: renaming. Added label method that returns the list of labels
1843f73 [Alexander Ulanov] Scala style fix
79e8476 [Alexander Ulanov] Replacing fold(_ + _) with sum as suggested by srowen
ca46765 [Alexander Ulanov] Cosmetic changes: Apache header and parameter explanation
40593f5 [Alexander Ulanov] Multi-label metrics: Hamming-loss, strict and normal accuracy, fix to macro measures, bunch of tests
ad62df0 [Alexander Ulanov] Comments and scala style check
154164b [Alexander Ulanov] Multilabel evaluation metics and tests: macro precision and recall averaged by docs, micro and per-class precision and recall averaged by class
Diffstat (limited to 'mllib/src/main')
-rw-r--r-- | mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala | 157 |
1 files changed, 157 insertions, 0 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala new file mode 100644 index 0000000000..ea10bde5fa --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.evaluation + +import org.apache.spark.rdd.RDD +import org.apache.spark.SparkContext._ + +/** + * Evaluator for multilabel classification. + * @param predictionAndLabels an RDD of (predictions, labels) pairs, + * both are non-null Arrays, each with unique elements. + */ +class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])]) { + + private lazy val numDocs: Long = predictionAndLabels.count() + + private lazy val numLabels: Long = predictionAndLabels.flatMap { case (_, labels) => + labels}.distinct().count() + + /** + * Returns subset accuracy + * (for equal sets of labels) + */ + lazy val subsetAccuracy: Double = predictionAndLabels.filter { case (predictions, labels) => + predictions.deep == labels.deep + }.count().toDouble / numDocs + + /** + * Returns accuracy + */ + lazy val accuracy: Double = predictionAndLabels.map { case (predictions, labels) => + labels.intersect(predictions).size.toDouble / + (labels.size + predictions.size - labels.intersect(predictions).size)}.sum / numDocs + + + /** + * Returns Hamming-loss + */ + lazy val hammingLoss: Double = predictionAndLabels.map { case (predictions, labels) => + labels.size + predictions.size - 2 * labels.intersect(predictions).size + }.sum / (numDocs * numLabels) + + /** + * Returns document-based precision averaged by the number of documents + */ + lazy val precision: Double = predictionAndLabels.map { case (predictions, labels) => + if (predictions.size > 0) { + predictions.intersect(labels).size.toDouble / predictions.size + } else { + 0 + } + }.sum / numDocs + + /** + * Returns document-based recall averaged by the number of documents + */ + lazy val recall: Double = predictionAndLabels.map { case (predictions, labels) => + labels.intersect(predictions).size.toDouble / labels.size + }.sum / numDocs + + /** + * Returns document-based f1-measure averaged by the number of documents + */ + lazy val f1Measure: Double = predictionAndLabels.map { case (predictions, labels) => + 2.0 * predictions.intersect(labels).size / (predictions.size + labels.size) + }.sum / numDocs + + private lazy val tpPerClass = predictionAndLabels.flatMap { case (predictions, labels) => + predictions.intersect(labels) + }.countByValue() + + private lazy val fpPerClass = predictionAndLabels.flatMap { case (predictions, labels) => + predictions.diff(labels) + }.countByValue() + + private lazy val fnPerClass = predictionAndLabels.flatMap { case(predictions, labels) => + labels.diff(predictions) + }.countByValue() + + /** + * Returns precision for a given label (category) + * @param label the label. + */ + def precision(label: Double) = { + val tp = tpPerClass(label) + val fp = fpPerClass.getOrElse(label, 0L) + if (tp + fp == 0) 0 else tp.toDouble / (tp + fp) + } + + /** + * Returns recall for a given label (category) + * @param label the label. + */ + def recall(label: Double) = { + val tp = tpPerClass(label) + val fn = fnPerClass.getOrElse(label, 0L) + if (tp + fn == 0) 0 else tp.toDouble / (tp + fn) + } + + /** + * Returns f1-measure for a given label (category) + * @param label the label. + */ + def f1Measure(label: Double) = { + val p = precision(label) + val r = recall(label) + if((p + r) == 0) 0 else 2 * p * r / (p + r) + } + + private lazy val sumTp = tpPerClass.foldLeft(0L) { case (sum, (_, tp)) => sum + tp } + private lazy val sumFpClass = fpPerClass.foldLeft(0L) { case (sum, (_, fp)) => sum + fp } + private lazy val sumFnClass = fnPerClass.foldLeft(0L) { case (sum, (_, fn)) => sum + fn } + + /** + * Returns micro-averaged label-based precision + * (equals to micro-averaged document-based precision) + */ + lazy val microPrecision = { + val sumFp = fpPerClass.foldLeft(0L){ case(cum, (_, fp)) => cum + fp} + sumTp.toDouble / (sumTp + sumFp) + } + + /** + * Returns micro-averaged label-based recall + * (equals to micro-averaged document-based recall) + */ + lazy val microRecall = { + val sumFn = fnPerClass.foldLeft(0.0){ case(cum, (_, fn)) => cum + fn} + sumTp.toDouble / (sumTp + sumFn) + } + + /** + * Returns micro-averaged label-based f1-measure + * (equals to micro-averaged document-based f1-measure) + */ + lazy val microF1Measure = 2.0 * sumTp / (2 * sumTp + sumFnClass + sumFpClass) + + /** + * Returns the sequence of labels in ascending order + */ + lazy val labels: Array[Double] = tpPerClass.keys.toArray.sorted +} |