diff options
author | Dongjoon Hyun <dongjoon@apache.org> | 2016-06-23 11:07:34 +0100 |
---|---|---|
committer | Sean Owen <sowen@cloudera.com> | 2016-06-23 11:07:34 +0100 |
commit | 5eef1e6c6a8b6202fc6db4a90c4caab5169e86c6 (patch) | |
tree | 70eff7993a955e5c37aa2fcea4b05693baf9b902 /core/src | |
parent | 4374a46bfc52ee4f3ae9f61ccedc77a62aa9d4ee (diff) | |
download | spark-5eef1e6c6a8b6202fc6db4a90c4caab5169e86c6.tar.gz spark-5eef1e6c6a8b6202fc6db4a90c4caab5169e86c6.tar.bz2 spark-5eef1e6c6a8b6202fc6db4a90c4caab5169e86c6.zip |
[SPARK-15660][CORE] Update RDD `variance/stdev` description and add popVariance/popStdev
## What changes were proposed in this pull request?
In SPARK-11490, `variance/stdev` were redefined as the **sample** `variance/stdev` instead of the population ones. This PR updates the remaining outdated documentation to prevent users from misunderstanding. This will update the following Scala/Java API docs.
- http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.api.java.JavaDoubleRDD
- http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.rdd.DoubleRDDFunctions
- http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.util.StatCounter
- http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/api/java/JavaDoubleRDD.html
- http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/rdd/DoubleRDDFunctions.html
- http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/util/StatCounter.html
Also, this PR explicitly adds the `popVariance` and `popStdev` functions.
## How was this patch tested?
Pass the updated Jenkins tests.
Author: Dongjoon Hyun <dongjoon@apache.org>
Closes #13403 from dongjoon-hyun/SPARK-15660.
Diffstat (limited to 'core/src')
5 files changed, 58 insertions, 8 deletions
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala index 0d3a5237d9..0026fc9dad 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala @@ -22,6 +22,7 @@ import java.lang.{Double => JDouble} import scala.language.implicitConversions import scala.reflect.ClassTag +import org.apache.spark.annotation.Since import org.apache.spark.Partitioner import org.apache.spark.api.java.function.{Function => JFunction} import org.apache.spark.partial.{BoundedDouble, PartialResult} @@ -184,10 +185,10 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) /** Compute the mean of this RDD's elements. */ def mean(): JDouble = srdd.mean() - /** Compute the variance of this RDD's elements. */ + /** Compute the population variance of this RDD's elements. */ def variance(): JDouble = srdd.variance() - /** Compute the standard deviation of this RDD's elements. */ + /** Compute the population standard deviation of this RDD's elements. */ def stdev(): JDouble = srdd.stdev() /** @@ -202,6 +203,18 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) */ def sampleVariance(): JDouble = srdd.sampleVariance() + /** + * Compute the population standard deviation of this RDD's elements. + */ + @Since("2.1.0") + def popStdev(): JDouble = srdd.popStdev() + + /** + * Compute the population variance of this RDD's elements. + */ + @Since("2.1.0") + def popVariance(): JDouble = srdd.popVariance() + /** Return the approximate mean of the elements in this RDD. 
*/ def meanApprox(timeout: Long, confidence: JDouble): PartialResult[BoundedDouble] = srdd.meanApprox(timeout, confidence) diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala index 368916a39e..a05a770b40 100644 --- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala @@ -17,6 +17,7 @@ package org.apache.spark.rdd +import org.apache.spark.annotation.Since import org.apache.spark.TaskContext import org.apache.spark.internal.Logging import org.apache.spark.partial.BoundedDouble @@ -47,12 +48,12 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable { stats().mean } - /** Compute the variance of this RDD's elements. */ + /** Compute the population variance of this RDD's elements. */ def variance(): Double = self.withScope { stats().variance } - /** Compute the standard deviation of this RDD's elements. */ + /** Compute the population standard deviation of this RDD's elements. */ def stdev(): Double = self.withScope { stats().stdev } @@ -74,6 +75,22 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable { } /** + * Compute the population standard deviation of this RDD's elements. + */ + @Since("2.1.0") + def popStdev(): Double = self.withScope { + stats().popStdev + } + + /** + * Compute the population variance of this RDD's elements. + */ + @Since("2.1.0") + def popVariance(): Double = self.withScope { + stats().popVariance + } + + /** * Approximate operation to return the mean within a timeout. 
*/ def meanApprox( diff --git a/core/src/main/scala/org/apache/spark/util/StatCounter.scala b/core/src/main/scala/org/apache/spark/util/StatCounter.scala index 8586da1996..45381365f1 100644 --- a/core/src/main/scala/org/apache/spark/util/StatCounter.scala +++ b/core/src/main/scala/org/apache/spark/util/StatCounter.scala @@ -17,6 +17,8 @@ package org.apache.spark.util +import org.apache.spark.annotation.Since + /** * A class for tracking the statistics of a set of numbers (count, mean and variance) in a * numerically robust way. Includes support for merging two StatCounters. Based on Welford @@ -104,8 +106,14 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable { def min: Double = minValue - /** Return the variance of the values. */ - def variance: Double = { + /** Return the population variance of the values. */ + def variance: Double = popVariance + + /** + * Return the population variance of the values. + */ + @Since("2.1.0") + def popVariance: Double = { if (n == 0) { Double.NaN } else { @@ -125,8 +133,14 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable { } } - /** Return the standard deviation of the values. */ - def stdev: Double = math.sqrt(variance) + /** Return the population standard deviation of the values. */ + def stdev: Double = popStdev + + /** + * Return the population standard deviation of the values. 
+ */ + @Since("2.1.0") + def popStdev: Double = math.sqrt(popVariance) /** * Return the sample standard deviation of the values, which corrects for bias in estimating the diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index 7bac068321..533025ba83 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -733,8 +733,10 @@ public class JavaAPISuite implements Serializable { assertEquals(20/6.0, rdd.mean(), 0.01); assertEquals(20/6.0, rdd.mean(), 0.01); assertEquals(6.22222, rdd.variance(), 0.01); + assertEquals(rdd.variance(), rdd.popVariance(), 1e-14); assertEquals(7.46667, rdd.sampleVariance(), 0.01); assertEquals(2.49444, rdd.stdev(), 0.01); + assertEquals(rdd.stdev(), rdd.popStdev(), 1e-14); assertEquals(2.73252, rdd.sampleStdev(), 0.01); rdd.first(); diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala index 3d31c7864e..c5d4968ef7 100644 --- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala +++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala @@ -244,6 +244,10 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva assert(abs(6.0/2 - rdd.mean) < 0.01) assert(abs(1.0 - rdd.variance) < 0.01) assert(abs(1.0 - rdd.stdev) < 0.01) + assert(abs(rdd.variance - rdd.popVariance) < 1e-14) + assert(abs(rdd.stdev - rdd.popStdev) < 1e-14) + assert(abs(2.0 - rdd.sampleVariance) < 1e-14) + assert(abs(Math.sqrt(2.0) - rdd.sampleStdev) < 1e-14) assert(stats.max === 4.0) assert(stats.min === 2.0) |