From 5eef1e6c6a8b6202fc6db4a90c4caab5169e86c6 Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun
Date: Thu, 23 Jun 2016 11:07:34 +0100
Subject: [SPARK-15660][CORE] Update RDD `variance/stdev` description and add popVariance/popStdev

## What changes were proposed in this pull request?

In SPARK-11490, `variance/stdev` are redefined as the **sample** `variance/stdev` instead of the population ones. This PR updates the remaining outdated documentation to prevent users from misunderstanding. It will update the following Scala/Java API docs:

- http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.api.java.JavaDoubleRDD
- http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.rdd.DoubleRDDFunctions
- http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.util.StatCounter
- http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/api/java/JavaDoubleRDD.html
- http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/rdd/DoubleRDDFunctions.html
- http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/util/StatCounter.html

Also, this PR explicitly adds `popVariance` and `popStdev` functions.

## How was this patch tested?

Pass the updated Jenkins tests.

Author: Dongjoon Hyun

Closes #13403 from dongjoon-hyun/SPARK-15660.
---
 .../org/apache/spark/api/java/JavaDoubleRDD.scala  | 17 +++++++++++++++--
 .../org/apache/spark/rdd/DoubleRDDFunctions.scala  | 21 +++++++++++++++++++--
 .../scala/org/apache/spark/util/StatCounter.scala  | 22 ++++++++++++++++++----
 .../test/java/org/apache/spark/JavaAPISuite.java   |  2 ++
 .../scala/org/apache/spark/PartitioningSuite.scala |  4 ++++
 5 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
index 0d3a5237d9..0026fc9dad 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
@@ -22,6 +22,7 @@ import java.lang.{Double => JDouble}
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
 
+import org.apache.spark.annotation.Since
 import org.apache.spark.Partitioner
 import org.apache.spark.api.java.function.{Function => JFunction}
 import org.apache.spark.partial.{BoundedDouble, PartialResult}
@@ -184,10 +185,10 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double])
   /** Compute the mean of this RDD's elements. */
   def mean(): JDouble = srdd.mean()
 
-  /** Compute the variance of this RDD's elements. */
+  /** Compute the population variance of this RDD's elements. */
   def variance(): JDouble = srdd.variance()
 
-  /** Compute the standard deviation of this RDD's elements. */
+  /** Compute the population standard deviation of this RDD's elements. */
   def stdev(): JDouble = srdd.stdev()
 
   /**
@@ -202,6 +203,18 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double])
    */
   def sampleVariance(): JDouble = srdd.sampleVariance()
 
+  /**
+   * Compute the population standard deviation of this RDD's elements.
+   */
+  @Since("2.1.0")
+  def popStdev(): JDouble = srdd.popStdev()
+
+  /**
+   * Compute the population variance of this RDD's elements.
+   */
+  @Since("2.1.0")
+  def popVariance(): JDouble = srdd.popVariance()
+
   /** Return the approximate mean of the elements in this RDD. */
   def meanApprox(timeout: Long, confidence: JDouble): PartialResult[BoundedDouble] =
     srdd.meanApprox(timeout, confidence)
diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
index 368916a39e..a05a770b40 100644
--- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.rdd
 
+import org.apache.spark.annotation.Since
 import org.apache.spark.TaskContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.partial.BoundedDouble
@@ -47,12 +48,12 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable {
     stats().mean
   }
 
-  /** Compute the variance of this RDD's elements. */
+  /** Compute the population variance of this RDD's elements. */
   def variance(): Double = self.withScope {
     stats().variance
   }
 
-  /** Compute the standard deviation of this RDD's elements. */
+  /** Compute the population standard deviation of this RDD's elements. */
   def stdev(): Double = self.withScope {
     stats().stdev
   }
@@ -73,6 +74,22 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable {
     stats().sampleVariance
   }
 
+  /**
+   * Compute the population standard deviation of this RDD's elements.
+   */
+  @Since("2.1.0")
+  def popStdev(): Double = self.withScope {
+    stats().popStdev
+  }
+
+  /**
+   * Compute the population variance of this RDD's elements.
+   */
+  @Since("2.1.0")
+  def popVariance(): Double = self.withScope {
+    stats().popVariance
+  }
+
   /**
    * Approximate operation to return the mean within a timeout.
   */
diff --git a/core/src/main/scala/org/apache/spark/util/StatCounter.scala b/core/src/main/scala/org/apache/spark/util/StatCounter.scala
index 8586da1996..45381365f1 100644
--- a/core/src/main/scala/org/apache/spark/util/StatCounter.scala
+++ b/core/src/main/scala/org/apache/spark/util/StatCounter.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.util
 
+import org.apache.spark.annotation.Since
+
 /**
  * A class for tracking the statistics of a set of numbers (count, mean and variance) in a
  * numerically robust way. Includes support for merging two StatCounters. Based on Welford
@@ -104,8 +106,14 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
 
   def min: Double = minValue
 
-  /** Return the variance of the values. */
-  def variance: Double = {
+  /** Return the population variance of the values. */
+  def variance: Double = popVariance
+
+  /**
+   * Return the population variance of the values.
+   */
+  @Since("2.1.0")
+  def popVariance: Double = {
     if (n == 0) {
       Double.NaN
     } else {
@@ -125,8 +133,14 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
     }
   }
 
-  /** Return the standard deviation of the values. */
-  def stdev: Double = math.sqrt(variance)
+  /** Return the population standard deviation of the values. */
+  def stdev: Double = popStdev
+
+  /**
+   * Return the population standard deviation of the values.
+   */
+  @Since("2.1.0")
+  def popStdev: Double = math.sqrt(popVariance)
 
   /**
    * Return the sample standard deviation of the values, which corrects for bias in estimating the
diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java
index 7bac068321..533025ba83 100644
--- a/core/src/test/java/org/apache/spark/JavaAPISuite.java
+++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java
@@ -733,8 +733,10 @@ public class JavaAPISuite implements Serializable {
     assertEquals(20/6.0, rdd.mean(), 0.01);
     assertEquals(20/6.0, rdd.mean(), 0.01);
     assertEquals(6.22222, rdd.variance(), 0.01);
+    assertEquals(rdd.variance(), rdd.popVariance(), 1e-14);
     assertEquals(7.46667, rdd.sampleVariance(), 0.01);
     assertEquals(2.49444, rdd.stdev(), 0.01);
+    assertEquals(rdd.stdev(), rdd.popStdev(), 1e-14);
     assertEquals(2.73252, rdd.sampleStdev(), 0.01);
 
     rdd.first();
diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
index 3d31c7864e..c5d4968ef7 100644
--- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
+++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
@@ -244,6 +244,10 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva
     assert(abs(6.0/2 - rdd.mean) < 0.01)
     assert(abs(1.0 - rdd.variance) < 0.01)
     assert(abs(1.0 - rdd.stdev) < 0.01)
+    assert(abs(rdd.variance - rdd.popVariance) < 1e-14)
+    assert(abs(rdd.stdev - rdd.popStdev) < 1e-14)
+    assert(abs(2.0 - rdd.sampleVariance) < 1e-14)
+    assert(abs(Math.sqrt(2.0) - rdd.sampleStdev) < 1e-14)
 
     assert(stats.max === 4.0)
     assert(stats.min === 2.0)
-- 
cgit v1.2.3
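
For readers comparing the two estimators this patch documents, here is a minimal, self-contained Scala sketch. It is illustrative only, not part of the commit; the sample values mirror the `PartitioningSuite` data above. The population estimators divide the summed squared deviations by `n`, while the sample estimators divide by `n - 1` (Bessel's correction):

```scala
// Population vs. sample variance/stdev, matching the semantics this patch
// documents. The data mirrors PartitioningSuite above: Seq(2.0, 4.0).
object VarianceSketch {
  def main(args: Array[String]): Unit = {
    val values = Seq(2.0, 4.0)
    val n = values.size
    val mean = values.sum / n                                 // 3.0, i.e. 6.0/2
    val sumSq = values.map(v => (v - mean) * (v - mean)).sum  // 2.0

    val popVariance = sumSq / n                    // 1.0    -> variance() / popVariance()
    val sampleVariance = sumSq / (n - 1)           // 2.0    -> sampleVariance()
    val popStdev = math.sqrt(popVariance)          // 1.0    -> stdev() / popStdev()
    val sampleStdev = math.sqrt(sampleVariance)    // ~1.414 -> sampleStdev()

    println(s"$popVariance $sampleVariance $popStdev $sampleStdev")
  }
}
```

These are exactly the values the new `PartitioningSuite` assertions check: `variance`/`stdev` equal their `pop*` aliases, `sampleVariance` is 2.0, and `sampleStdev` is `Math.sqrt(2.0)`.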
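
`StatCounter`'s doc comment mentions Welford's method. The sketch below (again illustrative, not the class's actual implementation) shows the streaming formulation, in which a single accumulated quantity `m2`, the running sum of squared deviations, yields both estimators:

```scala
// Hedged sketch of a Welford-style running accumulator: update count n,
// mean mu, and m2 (sum of squared deviations) per element, then derive
// both variance flavors from m2 at the end.
final class RunningStats {
  private var n: Long = 0L
  private var mu: Double = 0.0
  private var m2: Double = 0.0

  def add(value: Double): this.type = {
    n += 1
    val delta = value - mu
    mu += delta / n
    m2 += delta * (value - mu)  // second factor uses the updated mean
    this
  }

  def popVariance: Double = if (n == 0) Double.NaN else m2 / n
  def sampleVariance: Double = if (n <= 1) Double.NaN else m2 / (n - 1)
}

// Usage: new RunningStats().add(2.0).add(4.0).popVariance == 1.0
```

The numerical robustness comes from never materializing a raw `sum(x^2)`; `StatCounter` applies the same idea per element and adds a pairwise rule so per-partition counters can be merged.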