From 5eef1e6c6a8b6202fc6db4a90c4caab5169e86c6 Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Thu, 23 Jun 2016 11:07:34 +0100
Subject: [SPARK-15660][CORE] Update RDD `variance/stdev` description and add popVariance/popStdev

## What changes were proposed in this pull request?

In SPARK-11490, `variance/stdev` are redefined as the **sample** `variance/stdev` instead of the population ones. This PR updates the remaining outdated documentation to prevent users from misunderstanding. It will update the following Scala/Java API docs:

- http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.api.java.JavaDoubleRDD
- http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.rdd.DoubleRDDFunctions
- http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.util.StatCounter
- http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/api/java/JavaDoubleRDD.html
- http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/rdd/DoubleRDDFunctions.html
- http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/util/StatCounter.html

Also, this PR explicitly adds `popVariance` and `popStdev` functions.

## How was this patch tested?

Pass the updated Jenkins tests.

Author: Dongjoon Hyun

Closes #13403 from dongjoon-hyun/SPARK-15660.
---
 .../org/apache/spark/api/java/JavaDoubleRDD.scala  | 17 +++++++++++++++--
 .../org/apache/spark/rdd/DoubleRDDFunctions.scala  | 21 +++++++++++++++++++--
 .../scala/org/apache/spark/util/StatCounter.scala  | 22 ++++++++++++++++++----
 .../test/java/org/apache/spark/JavaAPISuite.java   |  2 ++
 .../scala/org/apache/spark/PartitioningSuite.scala |  4 ++++
 5 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
index 0d3a5237d9..0026fc9dad 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
@@ -22,6 +22,7 @@ import java.lang.{Double => JDouble}
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
 
+import org.apache.spark.annotation.Since
 import org.apache.spark.Partitioner
 import org.apache.spark.api.java.function.{Function => JFunction}
 import org.apache.spark.partial.{BoundedDouble, PartialResult}
@@ -184,10 +185,10 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double])
   /** Compute the mean of this RDD's elements. */
   def mean(): JDouble = srdd.mean()
 
-  /** Compute the variance of this RDD's elements. */
+  /** Compute the population variance of this RDD's elements. */
   def variance(): JDouble = srdd.variance()
 
-  /** Compute the standard deviation of this RDD's elements. */
+  /** Compute the population standard deviation of this RDD's elements. */
   def stdev(): JDouble = srdd.stdev()
 
   /**
@@ -202,6 +203,18 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double])
    */
   def sampleVariance(): JDouble = srdd.sampleVariance()
 
+  /**
+   * Compute the population standard deviation of this RDD's elements.
+   */
+  @Since("2.1.0")
+  def popStdev(): JDouble = srdd.popStdev()
+
+  /**
+   * Compute the population variance of this RDD's elements.
+   */
+  @Since("2.1.0")
+  def popVariance(): JDouble = srdd.popVariance()
+
   /** Return the approximate mean of the elements in this RDD. */
   def meanApprox(timeout: Long, confidence: JDouble): PartialResult[BoundedDouble] =
     srdd.meanApprox(timeout, confidence)
diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
index 368916a39e..a05a770b40 100644
--- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.rdd
 
+import org.apache.spark.annotation.Since
 import org.apache.spark.TaskContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.partial.BoundedDouble
@@ -47,12 +48,12 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable {
     stats().mean
   }
 
-  /** Compute the variance of this RDD's elements. */
+  /** Compute the population variance of this RDD's elements. */
   def variance(): Double = self.withScope {
     stats().variance
   }
 
-  /** Compute the standard deviation of this RDD's elements. */
+  /** Compute the population standard deviation of this RDD's elements. */
   def stdev(): Double = self.withScope {
     stats().stdev
   }
@@ -73,6 +74,22 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable {
     stats().sampleVariance
   }
 
+  /**
+   * Compute the population standard deviation of this RDD's elements.
+   */
+  @Since("2.1.0")
+  def popStdev(): Double = self.withScope {
+    stats().popStdev
+  }
+
+  /**
+   * Compute the population variance of this RDD's elements.
+   */
+  @Since("2.1.0")
+  def popVariance(): Double = self.withScope {
+    stats().popVariance
+  }
+
   /**
    * Approximate operation to return the mean within a timeout.
   */
diff --git a/core/src/main/scala/org/apache/spark/util/StatCounter.scala b/core/src/main/scala/org/apache/spark/util/StatCounter.scala
index 8586da1996..45381365f1 100644
--- a/core/src/main/scala/org/apache/spark/util/StatCounter.scala
+++ b/core/src/main/scala/org/apache/spark/util/StatCounter.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.util
 
+import org.apache.spark.annotation.Since
+
 /**
  * A class for tracking the statistics of a set of numbers (count, mean and variance) in a
  * numerically robust way. Includes support for merging two StatCounters. Based on Welford
@@ -104,8 +106,14 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
 
   def min: Double = minValue
 
-  /** Return the variance of the values. */
-  def variance: Double = {
+  /** Return the population variance of the values. */
+  def variance: Double = popVariance
+
+  /**
+   * Return the population variance of the values.
+   */
+  @Since("2.1.0")
+  def popVariance: Double = {
     if (n == 0) {
       Double.NaN
     } else {
@@ -125,8 +133,14 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
     }
   }
 
-  /** Return the standard deviation of the values. */
-  def stdev: Double = math.sqrt(variance)
+  /** Return the population standard deviation of the values. */
+  def stdev: Double = popStdev
+
+  /**
+   * Return the population standard deviation of the values.
+   */
+  @Since("2.1.0")
+  def popStdev: Double = math.sqrt(popVariance)
 
   /**
    * Return the sample standard deviation of the values, which corrects for bias in estimating the
diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java
index 7bac068321..533025ba83 100644
--- a/core/src/test/java/org/apache/spark/JavaAPISuite.java
+++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java
@@ -733,8 +733,10 @@ public class JavaAPISuite implements Serializable {
     assertEquals(20/6.0, rdd.mean(), 0.01);
     assertEquals(20/6.0, rdd.mean(), 0.01);
     assertEquals(6.22222, rdd.variance(), 0.01);
+    assertEquals(rdd.variance(), rdd.popVariance(), 1e-14);
     assertEquals(7.46667, rdd.sampleVariance(), 0.01);
     assertEquals(2.49444, rdd.stdev(), 0.01);
+    assertEquals(rdd.stdev(), rdd.popStdev(), 1e-14);
     assertEquals(2.73252, rdd.sampleStdev(), 0.01);
 
     rdd.first();
diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
index 3d31c7864e..c5d4968ef7 100644
--- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
+++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
@@ -244,6 +244,10 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva
     assert(abs(6.0/2 - rdd.mean) < 0.01)
     assert(abs(1.0 - rdd.variance) < 0.01)
     assert(abs(1.0 - rdd.stdev) < 0.01)
+    assert(abs(rdd.variance - rdd.popVariance) < 1e-14)
+    assert(abs(rdd.stdev - rdd.popStdev) < 1e-14)
+    assert(abs(2.0 - rdd.sampleVariance) < 1e-14)
+    assert(abs(Math.sqrt(2.0) - rdd.sampleStdev) < 1e-14)
 
     assert(stats.max === 4.0)
     assert(stats.min === 2.0)
-- 
cgit v1.2.3
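
For readers comparing the two estimators this patch documents, here is a minimal, self-contained Scala sketch. It is illustrative only, not part of the commit; the sample values mirror the `PartitioningSuite` data above. The population estimators divide the summed squared deviations by `n`, while the sample estimators divide by `n - 1` (Bessel's correction):

```scala
// Population vs. sample variance/stdev, matching the semantics this patch
// documents. The data mirrors PartitioningSuite above: Seq(2.0, 4.0).
object VarianceSketch {
  def main(args: Array[String]): Unit = {
    val values = Seq(2.0, 4.0)
    val n = values.size
    val mean = values.sum / n                                 // 3.0, i.e. 6.0/2
    val sumSq = values.map(v => (v - mean) * (v - mean)).sum  // 2.0

    val popVariance = sumSq / n                    // 1.0    -> variance() / popVariance()
    val sampleVariance = sumSq / (n - 1)           // 2.0    -> sampleVariance()
    val popStdev = math.sqrt(popVariance)          // 1.0    -> stdev() / popStdev()
    val sampleStdev = math.sqrt(sampleVariance)    // ~1.414 -> sampleStdev()

    println(s"$popVariance $sampleVariance $popStdev $sampleStdev")
  }
}
```

These are exactly the values the new `PartitioningSuite` assertions check: `variance`/`stdev` equal their `pop*` aliases, `sampleVariance` is 2.0, and `sampleStdev` is `Math.sqrt(2.0)`.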
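
`StatCounter`'s doc comment mentions Welford's method. The sketch below (again illustrative, not the class's actual implementation) shows the streaming formulation, in which a single accumulated quantity `m2`, the running sum of squared deviations, yields both estimators:

```scala
// Hedged sketch of a Welford-style running accumulator: update count n,
// mean mu, and m2 (sum of squared deviations) per element, then derive
// both variance flavors from m2 at the end.
final class RunningStats {
  private var n: Long = 0L
  private var mu: Double = 0.0
  private var m2: Double = 0.0

  def add(value: Double): this.type = {
    n += 1
    val delta = value - mu
    mu += delta / n
    m2 += delta * (value - mu)  // second factor uses the updated mean
    this
  }

  def popVariance: Double = if (n == 0) Double.NaN else m2 / n
  def sampleVariance: Double = if (n <= 1) Double.NaN else m2 / (n - 1)
}

// Usage: new RunningStats().add(2.0).add(4.0).popVariance == 1.0
```

The numerical robustness comes from never materializing a raw `sum(x^2)`; `StatCounter` applies the same idea per element and adds a pairwise rule so per-partition counters can be merged.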