aboutsummaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorDongjoon Hyun <dongjoon@apache.org>2016-06-23 11:07:34 +0100
committerSean Owen <sowen@cloudera.com>2016-06-23 11:07:34 +0100
commit5eef1e6c6a8b6202fc6db4a90c4caab5169e86c6 (patch)
tree70eff7993a955e5c37aa2fcea4b05693baf9b902 /core
parent4374a46bfc52ee4f3ae9f61ccedc77a62aa9d4ee (diff)
downloadspark-5eef1e6c6a8b6202fc6db4a90c4caab5169e86c6.tar.gz
spark-5eef1e6c6a8b6202fc6db4a90c4caab5169e86c6.tar.bz2
spark-5eef1e6c6a8b6202fc6db4a90c4caab5169e86c6.zip
[SPARK-15660][CORE] Update RDD `variance/stdev` description and add popVariance/popStdev
## What changes were proposed in this pull request? In SPARK-11490, `variance/stdev` were redefined as the **sample** `variance/stdev` instead of the population ones. This PR updates the other old documentation to prevent users from misunderstanding. This will update the following Scala/Java API docs. - http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.api.java.JavaDoubleRDD - http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.rdd.DoubleRDDFunctions - http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.util.StatCounter - http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/api/java/JavaDoubleRDD.html - http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/rdd/DoubleRDDFunctions.html - http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/util/StatCounter.html Also, this PR explicitly adds the `popVariance` and `popStdev` functions. ## How was this patch tested? Pass the updated Jenkins tests. Author: Dongjoon Hyun <dongjoon@apache.org> Closes #13403 from dongjoon-hyun/SPARK-15660.
Diffstat (limited to 'core')
-rw-r--r--core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala17
-rw-r--r--core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala21
-rw-r--r--core/src/main/scala/org/apache/spark/util/StatCounter.scala22
-rw-r--r--core/src/test/java/org/apache/spark/JavaAPISuite.java2
-rw-r--r--core/src/test/scala/org/apache/spark/PartitioningSuite.scala4
5 files changed, 58 insertions, 8 deletions
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
index 0d3a5237d9..0026fc9dad 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
@@ -22,6 +22,7 @@ import java.lang.{Double => JDouble}
import scala.language.implicitConversions
import scala.reflect.ClassTag
+import org.apache.spark.annotation.Since
import org.apache.spark.Partitioner
import org.apache.spark.api.java.function.{Function => JFunction}
import org.apache.spark.partial.{BoundedDouble, PartialResult}
@@ -184,10 +185,10 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double])
/** Compute the mean of this RDD's elements. */
def mean(): JDouble = srdd.mean()
- /** Compute the variance of this RDD's elements. */
+ /** Compute the population variance of this RDD's elements. */
def variance(): JDouble = srdd.variance()
- /** Compute the standard deviation of this RDD's elements. */
+ /** Compute the population standard deviation of this RDD's elements. */
def stdev(): JDouble = srdd.stdev()
/**
@@ -202,6 +203,18 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double])
*/
def sampleVariance(): JDouble = srdd.sampleVariance()
+ /**
+ * Compute the population standard deviation of this RDD's elements.
+ */
+ @Since("2.1.0")
+ def popStdev(): JDouble = srdd.popStdev()
+
+ /**
+ * Compute the population variance of this RDD's elements.
+ */
+ @Since("2.1.0")
+ def popVariance(): JDouble = srdd.popVariance()
+
/** Return the approximate mean of the elements in this RDD. */
def meanApprox(timeout: Long, confidence: JDouble): PartialResult[BoundedDouble] =
srdd.meanApprox(timeout, confidence)
diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
index 368916a39e..a05a770b40 100644
--- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
@@ -17,6 +17,7 @@
package org.apache.spark.rdd
+import org.apache.spark.annotation.Since
import org.apache.spark.TaskContext
import org.apache.spark.internal.Logging
import org.apache.spark.partial.BoundedDouble
@@ -47,12 +48,12 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable {
stats().mean
}
- /** Compute the variance of this RDD's elements. */
+ /** Compute the population variance of this RDD's elements. */
def variance(): Double = self.withScope {
stats().variance
}
- /** Compute the standard deviation of this RDD's elements. */
+ /** Compute the population standard deviation of this RDD's elements. */
def stdev(): Double = self.withScope {
stats().stdev
}
@@ -74,6 +75,22 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable {
}
/**
+ * Compute the population standard deviation of this RDD's elements.
+ */
+ @Since("2.1.0")
+ def popStdev(): Double = self.withScope {
+ stats().popStdev
+ }
+
+ /**
+ * Compute the population variance of this RDD's elements.
+ */
+ @Since("2.1.0")
+ def popVariance(): Double = self.withScope {
+ stats().popVariance
+ }
+
+ /**
* Approximate operation to return the mean within a timeout.
*/
def meanApprox(
diff --git a/core/src/main/scala/org/apache/spark/util/StatCounter.scala b/core/src/main/scala/org/apache/spark/util/StatCounter.scala
index 8586da1996..45381365f1 100644
--- a/core/src/main/scala/org/apache/spark/util/StatCounter.scala
+++ b/core/src/main/scala/org/apache/spark/util/StatCounter.scala
@@ -17,6 +17,8 @@
package org.apache.spark.util
+import org.apache.spark.annotation.Since
+
/**
* A class for tracking the statistics of a set of numbers (count, mean and variance) in a
* numerically robust way. Includes support for merging two StatCounters. Based on Welford
@@ -104,8 +106,14 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
def min: Double = minValue
- /** Return the variance of the values. */
- def variance: Double = {
+ /** Return the population variance of the values. */
+ def variance: Double = popVariance
+
+ /**
+ * Return the population variance of the values.
+ */
+ @Since("2.1.0")
+ def popVariance: Double = {
if (n == 0) {
Double.NaN
} else {
@@ -125,8 +133,14 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
}
}
- /** Return the standard deviation of the values. */
- def stdev: Double = math.sqrt(variance)
+ /** Return the population standard deviation of the values. */
+ def stdev: Double = popStdev
+
+ /**
+ * Return the population standard deviation of the values.
+ */
+ @Since("2.1.0")
+ def popStdev: Double = math.sqrt(popVariance)
/**
* Return the sample standard deviation of the values, which corrects for bias in estimating the
diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java
index 7bac068321..533025ba83 100644
--- a/core/src/test/java/org/apache/spark/JavaAPISuite.java
+++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java
@@ -733,8 +733,10 @@ public class JavaAPISuite implements Serializable {
assertEquals(20/6.0, rdd.mean(), 0.01);
assertEquals(20/6.0, rdd.mean(), 0.01);
assertEquals(6.22222, rdd.variance(), 0.01);
+ assertEquals(rdd.variance(), rdd.popVariance(), 1e-14);
assertEquals(7.46667, rdd.sampleVariance(), 0.01);
assertEquals(2.49444, rdd.stdev(), 0.01);
+ assertEquals(rdd.stdev(), rdd.popStdev(), 1e-14);
assertEquals(2.73252, rdd.sampleStdev(), 0.01);
rdd.first();
diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
index 3d31c7864e..c5d4968ef7 100644
--- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
+++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
@@ -244,6 +244,10 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva
assert(abs(6.0/2 - rdd.mean) < 0.01)
assert(abs(1.0 - rdd.variance) < 0.01)
assert(abs(1.0 - rdd.stdev) < 0.01)
+ assert(abs(rdd.variance - rdd.popVariance) < 1e-14)
+ assert(abs(rdd.stdev - rdd.popStdev) < 1e-14)
+ assert(abs(2.0 - rdd.sampleVariance) < 1e-14)
+ assert(abs(Math.sqrt(2.0) - rdd.sampleStdev) < 1e-14)
assert(stats.max === 4.0)
assert(stats.min === 2.0)