diff options
author | shikai.tang <tar.sky06@gmail.com> | 2015-08-12 21:53:15 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-08-12 21:53:15 -0700 |
commit | df543892122342b97e5137b266959ba97589b3ef (patch) | |
tree | 525171bf0465cc95e220bfa30b672a5488428a18 /mllib/src/main/scala/org/apache | |
parent | 5fc058a1fc5d83ad53feec936475484aef3800b3 (diff) | |
download | spark-df543892122342b97e5137b266959ba97589b3ef.tar.gz spark-df543892122342b97e5137b266959ba97589b3ef.tar.bz2 spark-df543892122342b97e5137b266959ba97589b3ef.zip |
[SPARK-8922] [DOCUMENTATION, MLLIB] Add @since tags to mllib.evaluation
Author: shikai.tang <tar.sky06@gmail.com>
Closes #7429 from mosessky/master.
Diffstat (limited to 'mllib/src/main/scala/org/apache')
5 files changed, 50 insertions, 5 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index c1d1a22481..486741edd6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -41,6 +41,7 @@ import org.apache.spark.sql.DataFrame * of bins may not exactly equal numBins. The last bin in each partition may * be smaller as a result, meaning there may be an extra sample at * partition boundaries. + * @since 1.3.0 */ @Experimental class BinaryClassificationMetrics( @@ -51,6 +52,7 @@ class BinaryClassificationMetrics( /** * Defaults `numBins` to 0. + * @since 1.0.0 */ def this(scoreAndLabels: RDD[(Double, Double)]) = this(scoreAndLabels, 0) @@ -61,12 +63,18 @@ class BinaryClassificationMetrics( private[mllib] def this(scoreAndLabels: DataFrame) = this(scoreAndLabels.map(r => (r.getDouble(0), r.getDouble(1)))) - /** Unpersist intermediate RDDs used in the computation. */ + /** + * Unpersist intermediate RDDs used in the computation. + * @since 1.0.0 + */ def unpersist() { cumulativeCounts.unpersist() } - /** Returns thresholds in descending order. */ + /** + * Returns thresholds in descending order. + * @since 1.0.0 + */ def thresholds(): RDD[Double] = cumulativeCounts.map(_._1) /** @@ -74,6 +82,7 @@ class BinaryClassificationMetrics( * which is an RDD of (false positive rate, true positive rate) * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic + * @since 1.0.0 */ def roc(): RDD[(Double, Double)] = { val rocCurve = createCurve(FalsePositiveRate, Recall) @@ -85,6 +94,7 @@ class BinaryClassificationMetrics( /** * Computes the area under the receiver operating characteristic (ROC) curve. + * @since 1.0.0 */ def areaUnderROC(): Double = AreaUnderCurve.of(roc()) @@ -92,6 +102,7 @@ class BinaryClassificationMetrics( * Returns the precision-recall curve, which is an RDD of (recall, precision), * NOT (precision, recall), with (0.0, 1.0) prepended to it. * @see http://en.wikipedia.org/wiki/Precision_and_recall + * @since 1.0.0 */ def pr(): RDD[(Double, Double)] = { val prCurve = createCurve(Recall, Precision) @@ -102,6 +113,7 @@ class BinaryClassificationMetrics( /** * Computes the area under the precision-recall curve. + * @since 1.0.0 */ def areaUnderPR(): Double = AreaUnderCurve.of(pr()) @@ -110,16 +122,26 @@ class BinaryClassificationMetrics( * @param beta the beta factor in F-Measure computation. * @return an RDD of (threshold, F-Measure) pairs. * @see http://en.wikipedia.org/wiki/F1_score + * @since 1.0.0 */ def fMeasureByThreshold(beta: Double): RDD[(Double, Double)] = createCurve(FMeasure(beta)) - /** Returns the (threshold, F-Measure) curve with beta = 1.0. */ + /** + * Returns the (threshold, F-Measure) curve with beta = 1.0. + * @since 1.0.0 + */ def fMeasureByThreshold(): RDD[(Double, Double)] = fMeasureByThreshold(1.0) - /** Returns the (threshold, precision) curve. */ + /** + * Returns the (threshold, precision) curve. + * @since 1.0.0 + */ def precisionByThreshold(): RDD[(Double, Double)] = createCurve(Precision) - /** Returns the (threshold, recall) curve. */ + /** + * Returns the (threshold, recall) curve. + * @since 1.0.0 + */ def recallByThreshold(): RDD[(Double, Double)] = createCurve(Recall) private lazy val ( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 4628dc5690..dddfa3ea5b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.DataFrame * Evaluator for multiclass classification. * * @param predictionAndLabels an RDD of (prediction, label) pairs. + * @since 1.1.0 */ @Experimental class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { @@ -64,6 +65,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * predicted classes are in columns, * they are ordered by class label ascending, * as in "labels" + * @since 1.1.0 */ def confusionMatrix: Matrix = { val n = labels.size @@ -83,12 +85,14 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns true positive rate for a given label (category) * @param label the label. + * @since 1.1.0 */ def truePositiveRate(label: Double): Double = recall(label) /** * Returns false positive rate for a given label (category) * @param label the label. + * @since 1.1.0 */ def falsePositiveRate(label: Double): Double = { val fp = fpByClass.getOrElse(label, 0) @@ -98,6 +102,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns precision for a given label (category) * @param label the label. + * @since 1.1.0 */ def precision(label: Double): Double = { val tp = tpByClass(label) @@ -108,6 +113,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns recall for a given label (category) * @param label the label. + * @since 1.1.0 */ def recall(label: Double): Double = tpByClass(label).toDouble / labelCountByClass(label) @@ -115,6 +121,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * Returns f-measure for a given label (category) * @param label the label. * @param beta the beta parameter. + * @since 1.1.0 */ def fMeasure(label: Double, beta: Double): Double = { val p = precision(label) @@ -126,6 +133,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns f1-measure for a given label (category) * @param label the label. + * @since 1.1.0 */ def fMeasure(label: Double): Double = fMeasure(label, 1.0) @@ -179,6 +187,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns weighted averaged f-measure * @param beta the beta parameter. + * @since 1.1.0 */ def weightedFMeasure(beta: Double): Double = labelCountByClass.map { case (category, count) => fMeasure(category, beta) * count.toDouble / labelCount diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala index bf6eb1d5bd..77cb1e09bd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.DataFrame * Evaluator for multilabel classification. * @param predictionAndLabels an RDD of (predictions, labels) pairs, * both are non-null Arrays, each with unique elements. + * @since 1.2.0 */ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])]) { @@ -103,6 +104,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns precision for a given label (category) * @param label the label. + * @since 1.2.0 */ def precision(label: Double): Double = { val tp = tpPerClass(label) @@ -113,6 +115,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns recall for a given label (category) * @param label the label. + * @since 1.2.0 */ def recall(label: Double): Double = { val tp = tpPerClass(label) @@ -123,6 +126,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])] /** * Returns f1-measure for a given label (category) * @param label the label. + * @since 1.2.0 */ def f1Measure(label: Double): Double = { val p = precision(label) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala index 5b5a2a1450..063fbed8cd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala @@ -34,6 +34,7 @@ import org.apache.spark.rdd.RDD * Java users should use [[RankingMetrics$.of]] to create a [[RankingMetrics]] instance. * * @param predictionAndLabels an RDD of (predicted ranking, ground truth set) pairs. + * @since 1.2.0 */ @Experimental class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])]) @@ -55,6 +56,7 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])] * * @param k the position to compute the truncated precision, must be positive * @return the average precision at the first k ranking positions + * @since 1.2.0 */ def precisionAt(k: Int): Double = { require(k > 0, "ranking position k should be positive") @@ -124,6 +126,7 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])] * * @param k the position to compute the truncated ndcg, must be positive * @return the average ndcg at the first k ranking positions + * @since 1.2.0 */ def ndcgAt(k: Int): Double = { require(k > 0, "ranking position k should be positive") @@ -162,6 +165,7 @@ object RankingMetrics { /** * Creates a [[RankingMetrics]] instance (for Java users). * @param predictionAndLabels a JavaRDD of (predicted ranking, ground truth set) pairs + * @since 1.4.0 */ def of[E, T <: jl.Iterable[E]](predictionAndLabels: JavaRDD[(T, T)]): RankingMetrics[E] = { implicit val tag = JavaSparkContext.fakeClassTag[E] diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala index 408847afa8..54dfd8c099 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.DataFrame * Evaluator for regression. * * @param predictionAndObservations an RDD of (prediction, observation) pairs. + * @since 1.2.0 */ @Experimental class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extends Logging { @@ -66,6 +67,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend * Returns the variance explained by regression. * explainedVariance = \sum_i (\hat{y_i} - \bar{y})^2 / n * @see [[https://en.wikipedia.org/wiki/Fraction_of_variance_unexplained]] + * @since 1.2.0 */ def explainedVariance: Double = { SSreg / summary.count @@ -74,6 +76,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend /** * Returns the mean absolute error, which is a risk function corresponding to the * expected value of the absolute error loss or l1-norm loss. + * @since 1.2.0 */ def meanAbsoluteError: Double = { summary.normL1(1) / summary.count @@ -82,6 +85,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend /** * Returns the mean squared error, which is a risk function corresponding to the * expected value of the squared error loss or quadratic loss. + * @since 1.2.0 */ def meanSquaredError: Double = { SSerr / summary.count @@ -90,6 +94,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend /** * Returns the root mean squared error, which is defined as the square root of * the mean squared error. + * @since 1.2.0 */ def rootMeanSquaredError: Double = { math.sqrt(this.meanSquaredError) @@ -98,6 +103,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend /** * Returns R^2^, the unadjusted coefficient of determination. * @see [[http://en.wikipedia.org/wiki/Coefficient_of_determination]] + * @since 1.2.0 */ def r2: Double = { 1 - SSerr / SStot |