path: root/mllib
author     shikai.tang <tar.sky06@gmail.com>    2015-08-12 21:53:15 -0700
committer  Xiangrui Meng <meng@databricks.com>  2015-08-12 21:53:15 -0700
commit     df543892122342b97e5137b266959ba97589b3ef (patch)
tree       525171bf0465cc95e220bfa30b672a5488428a18 /mllib
parent     5fc058a1fc5d83ad53feec936475484aef3800b3 (diff)
download   spark-df543892122342b97e5137b266959ba97589b3ef.tar.gz
           spark-df543892122342b97e5137b266959ba97589b3ef.tar.bz2
           spark-df543892122342b97e5137b266959ba97589b3ef.zip
[SPARK-8922] [DOCUMENTATION, MLLIB] Add @since tags to mllib.evaluation
Author: shikai.tang <tar.sky06@gmail.com>

Closes #7429 from mosessky/master.
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala  32
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala             9
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala             4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala                4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala             6
5 files changed, 50 insertions, 5 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
index c1d1a22481..486741edd6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
@@ -41,6 +41,7 @@ import org.apache.spark.sql.DataFrame
* of bins may not exactly equal numBins. The last bin in each partition may
* be smaller as a result, meaning there may be an extra sample at
* partition boundaries.
+ * @since 1.3.0
*/
@Experimental
class BinaryClassificationMetrics(
@@ -51,6 +52,7 @@ class BinaryClassificationMetrics(
/**
* Defaults `numBins` to 0.
+ * @since 1.0.0
*/
def this(scoreAndLabels: RDD[(Double, Double)]) = this(scoreAndLabels, 0)
@@ -61,12 +63,18 @@ class BinaryClassificationMetrics(
private[mllib] def this(scoreAndLabels: DataFrame) =
this(scoreAndLabels.map(r => (r.getDouble(0), r.getDouble(1))))
- /** Unpersist intermediate RDDs used in the computation. */
+ /**
+ * Unpersist intermediate RDDs used in the computation.
+ * @since 1.0.0
+ */
def unpersist() {
cumulativeCounts.unpersist()
}
- /** Returns thresholds in descending order. */
+ /**
+ * Returns thresholds in descending order.
+ * @since 1.0.0
+ */
def thresholds(): RDD[Double] = cumulativeCounts.map(_._1)
/**
@@ -74,6 +82,7 @@ class BinaryClassificationMetrics(
* which is an RDD of (false positive rate, true positive rate)
* with (0.0, 0.0) prepended and (1.0, 1.0) appended to it.
* @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic
+ * @since 1.0.0
*/
def roc(): RDD[(Double, Double)] = {
val rocCurve = createCurve(FalsePositiveRate, Recall)
@@ -85,6 +94,7 @@ class BinaryClassificationMetrics(
/**
* Computes the area under the receiver operating characteristic (ROC) curve.
+ * @since 1.0.0
*/
def areaUnderROC(): Double = AreaUnderCurve.of(roc())
@@ -92,6 +102,7 @@ class BinaryClassificationMetrics(
* Returns the precision-recall curve, which is an RDD of (recall, precision),
* NOT (precision, recall), with (0.0, 1.0) prepended to it.
* @see http://en.wikipedia.org/wiki/Precision_and_recall
+ * @since 1.0.0
*/
def pr(): RDD[(Double, Double)] = {
val prCurve = createCurve(Recall, Precision)
@@ -102,6 +113,7 @@ class BinaryClassificationMetrics(
/**
* Computes the area under the precision-recall curve.
+ * @since 1.0.0
*/
def areaUnderPR(): Double = AreaUnderCurve.of(pr())
@@ -110,16 +122,26 @@ class BinaryClassificationMetrics(
* @param beta the beta factor in F-Measure computation.
* @return an RDD of (threshold, F-Measure) pairs.
* @see http://en.wikipedia.org/wiki/F1_score
+ * @since 1.0.0
*/
def fMeasureByThreshold(beta: Double): RDD[(Double, Double)] = createCurve(FMeasure(beta))
- /** Returns the (threshold, F-Measure) curve with beta = 1.0. */
+ /**
+ * Returns the (threshold, F-Measure) curve with beta = 1.0.
+ * @since 1.0.0
+ */
def fMeasureByThreshold(): RDD[(Double, Double)] = fMeasureByThreshold(1.0)
- /** Returns the (threshold, precision) curve. */
+ /**
+ * Returns the (threshold, precision) curve.
+ * @since 1.0.0
+ */
def precisionByThreshold(): RDD[(Double, Double)] = createCurve(Precision)
- /** Returns the (threshold, recall) curve. */
+ /**
+ * Returns the (threshold, recall) curve.
+ * @since 1.0.0
+ */
def recallByThreshold(): RDD[(Double, Double)] = createCurve(Recall)
private lazy val (
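For context, a minimal usage sketch of the BinaryClassificationMetrics API annotated above, assuming a spark-shell session where `sc` is the SparkContext; the (score, label) values are made up for illustration, and only the constructor and methods shown in these hunks are used.

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

// Assumed sample data: (score, label) pairs with labels in {0.0, 1.0}.
val scoreAndLabels = sc.parallelize(Seq(
  (0.9, 1.0), (0.8, 1.0), (0.6, 0.0), (0.4, 1.0), (0.2, 0.0)))

// Auxiliary constructor documented above; numBins defaults to 0 (no down-sampling).
val metrics = new BinaryClassificationMetrics(scoreAndLabels)

metrics.roc().collect()            // (FPR, TPR) points, (0.0, 0.0) prepended and (1.0, 1.0) appended
metrics.areaUnderROC()             // area under the ROC curve
metrics.pr().collect()             // (recall, precision) points, (0.0, 1.0) prepended
metrics.areaUnderPR()              // area under the precision-recall curve
metrics.fMeasureByThreshold(0.5)   // (threshold, F-measure) pairs with beta = 0.5
metrics.unpersist()                // release the cached intermediate RDDs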
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
index 4628dc5690..dddfa3ea5b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
@@ -30,6 +30,7 @@ import org.apache.spark.sql.DataFrame
* Evaluator for multiclass classification.
*
* @param predictionAndLabels an RDD of (prediction, label) pairs.
+ * @since 1.1.0
*/
@Experimental
class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
@@ -64,6 +65,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
* predicted classes are in columns,
* they are ordered by class label ascending,
* as in "labels"
+ * @since 1.1.0
*/
def confusionMatrix: Matrix = {
val n = labels.size
@@ -83,12 +85,14 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
/**
* Returns true positive rate for a given label (category)
* @param label the label.
+ * @since 1.1.0
*/
def truePositiveRate(label: Double): Double = recall(label)
/**
* Returns false positive rate for a given label (category)
* @param label the label.
+ * @since 1.1.0
*/
def falsePositiveRate(label: Double): Double = {
val fp = fpByClass.getOrElse(label, 0)
@@ -98,6 +102,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
/**
* Returns precision for a given label (category)
* @param label the label.
+ * @since 1.1.0
*/
def precision(label: Double): Double = {
val tp = tpByClass(label)
@@ -108,6 +113,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
/**
* Returns recall for a given label (category)
* @param label the label.
+ * @since 1.1.0
*/
def recall(label: Double): Double = tpByClass(label).toDouble / labelCountByClass(label)
@@ -115,6 +121,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
* Returns f-measure for a given label (category)
* @param label the label.
* @param beta the beta parameter.
+ * @since 1.1.0
*/
def fMeasure(label: Double, beta: Double): Double = {
val p = precision(label)
@@ -126,6 +133,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
/**
* Returns f1-measure for a given label (category)
* @param label the label.
+ * @since 1.1.0
*/
def fMeasure(label: Double): Double = fMeasure(label, 1.0)
@@ -179,6 +187,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
/**
* Returns weighted averaged f-measure
* @param beta the beta parameter.
+ * @since 1.1.0
*/
def weightedFMeasure(beta: Double): Double = labelCountByClass.map { case (category, count) =>
fMeasure(category, beta) * count.toDouble / labelCount
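Likewise, a hypothetical sketch for MulticlassMetrics (again assuming `sc` from a spark-shell session; the sample predictions and labels are invented). fMeasure(label, beta) follows the standard F-beta definition, (1 + beta^2) * p * r / (beta^2 * p + r), using the per-label precision p and recall r documented above.

import org.apache.spark.mllib.evaluation.MulticlassMetrics

// Assumed sample data: (prediction, label) pairs over the classes 0.0, 1.0 and 2.0.
val predictionAndLabels = sc.parallelize(Seq(
  (0.0, 0.0), (1.0, 1.0), (2.0, 2.0), (1.0, 0.0), (2.0, 1.0), (0.0, 0.0)))

val metrics = new MulticlassMetrics(predictionAndLabels)

metrics.confusionMatrix         // rows: actual classes, columns: predicted classes, labels ascending
metrics.precision(1.0)          // precision for class 1.0
metrics.recall(1.0)             // recall for class 1.0 (equal to truePositiveRate(1.0))
metrics.falsePositiveRate(1.0)  // false positive rate for class 1.0
metrics.fMeasure(1.0, 0.5)      // F-beta for class 1.0 with beta = 0.5
metrics.weightedFMeasure(0.5)   // per-class F-beta averaged with label-count weights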
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala
index bf6eb1d5bd..77cb1e09bd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala
@@ -25,6 +25,7 @@ import org.apache.spark.sql.DataFrame
* Evaluator for multilabel classification.
* @param predictionAndLabels an RDD of (predictions, labels) pairs,
* both are non-null Arrays, each with unique elements.
+ * @since 1.2.0
*/
class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])]) {
@@ -103,6 +104,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])]
/**
* Returns precision for a given label (category)
* @param label the label.
+ * @since 1.2.0
*/
def precision(label: Double): Double = {
val tp = tpPerClass(label)
@@ -113,6 +115,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])]
/**
* Returns recall for a given label (category)
* @param label the label.
+ * @since 1.2.0
*/
def recall(label: Double): Double = {
val tp = tpPerClass(label)
@@ -123,6 +126,7 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])]
/**
* Returns f1-measure for a given label (category)
* @param label the label.
+ * @since 1.2.0
*/
def f1Measure(label: Double): Double = {
val p = precision(label)
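A corresponding hypothetical sketch for MultilabelMetrics (sample data and `sc` are assumptions); each record pairs the predicted label set with the true label set, both as arrays of distinct doubles, as the constructor documented above requires.

import org.apache.spark.mllib.evaluation.MultilabelMetrics

// Assumed sample data: (predicted labels, true labels) pairs.
val predictionAndLabels = sc.parallelize(Seq(
  (Array(0.0, 1.0), Array(0.0, 2.0)),
  (Array(0.0, 2.0), Array(0.0, 1.0)),
  (Array(2.0), Array(2.0))))

val metrics = new MultilabelMetrics(predictionAndLabels)

metrics.precision(0.0)  // precision for label 0.0
metrics.recall(0.0)     // recall for label 0.0
metrics.f1Measure(0.0)  // F1 for label 0.0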
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
index 5b5a2a1450..063fbed8cd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
@@ -34,6 +34,7 @@ import org.apache.spark.rdd.RDD
* Java users should use [[RankingMetrics$.of]] to create a [[RankingMetrics]] instance.
*
* @param predictionAndLabels an RDD of (predicted ranking, ground truth set) pairs.
+ * @since 1.2.0
*/
@Experimental
class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])])
@@ -55,6 +56,7 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])]
*
* @param k the position to compute the truncated precision, must be positive
* @return the average precision at the first k ranking positions
+ * @since 1.2.0
*/
def precisionAt(k: Int): Double = {
require(k > 0, "ranking position k should be positive")
@@ -124,6 +126,7 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])]
*
* @param k the position to compute the truncated ndcg, must be positive
* @return the average ndcg at the first k ranking positions
+ * @since 1.2.0
*/
def ndcgAt(k: Int): Double = {
require(k > 0, "ranking position k should be positive")
@@ -162,6 +165,7 @@ object RankingMetrics {
/**
* Creates a [[RankingMetrics]] instance (for Java users).
* @param predictionAndLabels a JavaRDD of (predicted ranking, ground truth set) pairs
+ * @since 1.4.0
*/
def of[E, T <: jl.Iterable[E]](predictionAndLabels: JavaRDD[(T, T)]): RankingMetrics[E] = {
implicit val tag = JavaSparkContext.fakeClassTag[E]
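And a hypothetical Scala-side sketch for RankingMetrics (the document ids and `sc` are invented); Java users would go through RankingMetrics.of, as in the hunk above, instead of the constructor.

import org.apache.spark.mllib.evaluation.RankingMetrics

// Assumed sample data: (predicted ranking, ground truth set) pairs of document ids.
val predictionAndLabels = sc.parallelize(Seq(
  (Array(1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array(1, 2, 3, 4, 5)),
  (Array(4, 1, 5, 6, 2, 7, 3, 8, 9, 10), Array(1, 2, 3))))

val metrics = new RankingMetrics(predictionAndLabels)

metrics.precisionAt(5)  // average precision over the first 5 ranking positions
metrics.ndcgAt(5)       // average NDCG over the first 5 ranking positions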
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
index 408847afa8..54dfd8c099 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
@@ -29,6 +29,7 @@ import org.apache.spark.sql.DataFrame
* Evaluator for regression.
*
* @param predictionAndObservations an RDD of (prediction, observation) pairs.
+ * @since 1.2.0
*/
@Experimental
class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extends Logging {
@@ -66,6 +67,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend
* Returns the variance explained by regression.
* explainedVariance = \sum_i (\hat{y_i} - \bar{y})^2 / n
* @see [[https://en.wikipedia.org/wiki/Fraction_of_variance_unexplained]]
+ * @since 1.2.0
*/
def explainedVariance: Double = {
SSreg / summary.count
@@ -74,6 +76,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend
/**
* Returns the mean absolute error, which is a risk function corresponding to the
* expected value of the absolute error loss or l1-norm loss.
+ * @since 1.2.0
*/
def meanAbsoluteError: Double = {
summary.normL1(1) / summary.count
@@ -82,6 +85,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend
/**
* Returns the mean squared error, which is a risk function corresponding to the
* expected value of the squared error loss or quadratic loss.
+ * @since 1.2.0
*/
def meanSquaredError: Double = {
SSerr / summary.count
@@ -90,6 +94,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend
/**
* Returns the root mean squared error, which is defined as the square root of
* the mean squared error.
+ * @since 1.2.0
*/
def rootMeanSquaredError: Double = {
math.sqrt(this.meanSquaredError)
@@ -98,6 +103,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend
/**
* Returns R^2^, the unadjusted coefficient of determination.
* @see [[http://en.wikipedia.org/wiki/Coefficient_of_determination]]
+ * @since 1.2.0
*/
def r2: Double = {
1 - SSerr / SStot
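Finally, a hypothetical sketch for RegressionMetrics (the (prediction, observation) values and `sc` are assumptions), exercising the accessors annotated in this file.

import org.apache.spark.mllib.evaluation.RegressionMetrics

// Assumed sample data: (prediction, observation) pairs.
val predictionAndObservations = sc.parallelize(Seq(
  (2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)))

val metrics = new RegressionMetrics(predictionAndObservations)

metrics.explainedVariance    // \sum_i (\hat{y_i} - \bar{y})^2 / n, per the doc above
metrics.meanAbsoluteError    // expected absolute (L1) error
metrics.meanSquaredError     // expected squared (quadratic) error
metrics.rootMeanSquaredError // square root of the mean squared error
metrics.r2                   // 1 - SSerr / SStot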