[SPARK-17768][CORE] Small (Sum,Count,Mean)Evaluator problems and suboptimalities

## What changes were proposed in this pull request?

Fix:

- GroupedMeanEvaluator and GroupedSumEvaluator are unused, as is the StudentTCacher support class
- CountEvaluator can return a lower bound < 0, even though counts can't be negative
- MeanEvaluator will actually fail on exactly 1 datum (yields a t-test with 0 DOF)
- CountEvaluator uses a normal distribution, which may be an inappropriate approximation (leading to the above)
- Test for SumEvaluator asserts incorrect expected sums – e.g. after observing that 10% of the data has a sum of 2, the expectation should be 20, not 38
- CountEvaluator and MeanEvaluator have no unit tests to catch these
- Distribution code is duplicated across CountEvaluator and GroupedCountEvaluator
- The stats in each could use a bit of documentation, as I had to guess at them
- (Code could use a few cleanups and optimizations too)

## How was this patch tested?

Existing and new tests

Author: Sean Owen <sowen@cloudera.com>

Closes #15341 from srowen/SPARK-17768.
author: Sean Owen <sowen@cloudera.com> 2016-10-08 11:31:12 +0100
committer: Sean Owen <sowen@cloudera.com> 2016-10-08 11:31:12 +0100
commit: 4201ddcc07ca2e9af78bf4a74fdb3900c1783347 (patch)
tree: ae50667b9ae7e8e8b57ccf431ad08181c40baaac /core/src/test
parent: 362ba4b6f8e8fc2355368742c5adced7573fec00 (diff)
download: spark-4201ddcc07ca2e9af78bf4a74fdb3900c1783347.tar.gz
spark-4201ddcc07ca2e9af78bf4a74fdb3900c1783347.tar.bz2
spark-4201ddcc07ca2e9af78bf4a74fdb3900c1783347.zip
3 files changed, 126 insertions, 56 deletions
diff --git a/core/src/test/scala/org/apache/spark/partial/CountEvaluatorSuite.scala b/core/src/test/scala/org/apache/spark/partial/CountEvaluatorSuite.scala
new file mode 100644
index 0000000000..da3256bd88
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/partial/CountEvaluatorSuite.scala
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.partial
+
+import org.apache.spark.SparkFunSuite
+
+class CountEvaluatorSuite extends SparkFunSuite {
+
+  test("test count 0") {
+    val evaluator = new CountEvaluator(10, 0.95)
+    assert(new BoundedDouble(0.0, 0.0, 0.0, Double.PositiveInfinity) == evaluator.currentResult())
+    evaluator.merge(1, 0)
+    assert(new BoundedDouble(0.0, 0.0, 0.0, Double.PositiveInfinity) == evaluator.currentResult())
+  }
+
+  test("test count >= 1") {
+    val evaluator = new CountEvaluator(10, 0.95)
+    evaluator.merge(1, 1)
+    assert(new BoundedDouble(10.0, 0.95, 1.0, 36.0) == evaluator.currentResult())
+    evaluator.merge(1, 3)
+    assert(new BoundedDouble(20.0, 0.95, 7.0, 41.0) == evaluator.currentResult())
+    evaluator.merge(1, 8)
+    assert(new BoundedDouble(40.0, 0.95, 24.0, 61.0) == evaluator.currentResult())
+    (4 to 10).foreach(_ => evaluator.merge(1, 10))
+    assert(new BoundedDouble(82.0, 1.0, 82.0, 82.0) == evaluator.currentResult())
+  }
+
+}
diff --git a/core/src/test/scala/org/apache/spark/partial/MeanEvaluatorSuite.scala b/core/src/test/scala/org/apache/spark/partial/MeanEvaluatorSuite.scala
new file mode 100644
index 0000000000..eaa1262b41
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/partial/MeanEvaluatorSuite.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.partial
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.util.StatCounter
+
+class MeanEvaluatorSuite extends SparkFunSuite {
+
+  test("test count 0") {
+    val evaluator = new MeanEvaluator(10, 0.95)
+    assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
+      evaluator.currentResult())
+    evaluator.merge(1, new StatCounter())
+    assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
+      evaluator.currentResult())
+    evaluator.merge(1, new StatCounter(Seq(0.0)))
+    assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
+      evaluator.currentResult())
+  }
+
+  test("test count 1") {
+    val evaluator = new MeanEvaluator(10, 0.95)
+    evaluator.merge(1, new StatCounter(Seq(1.0)))
+    assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
+      evaluator.currentResult())
+  }
+
+  test("test count > 1") {
+    val evaluator = new MeanEvaluator(10, 0.95)
+    evaluator.merge(1, new StatCounter(Seq(1.0)))
+    evaluator.merge(1, new StatCounter(Seq(3.0)))
+    assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) ==
+      evaluator.currentResult())
+    evaluator.merge(1, new StatCounter(Seq(8.0)))
+    assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) ==
+      evaluator.currentResult())
+    (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0))))
+    assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult())
+  }
+
+}
diff --git a/core/src/test/scala/org/apache/spark/partial/SumEvaluatorSuite.scala b/core/src/test/scala/org/apache/spark/partial/SumEvaluatorSuite.scala
index a79f5b4d74..e212db7362 100644
--- a/core/src/test/scala/org/apache/spark/partial/SumEvaluatorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/partial/SumEvaluatorSuite.scala
@@ -17,61 +17,34 @@
 
 package org.apache.spark.partial
 
-import org.apache.spark._
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.util.StatCounter
 
-class SumEvaluatorSuite extends SparkFunSuite with SharedSparkContext {
+class SumEvaluatorSuite extends SparkFunSuite {
 
   test("correct handling of count 1") {
+    // sanity check:
+    assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2))
 
-    // setup
-    val counter = new StatCounter(List(2.0))
     // count of 10 because it's larger than 1,
     // and 0.95 because that's the default
     val evaluator = new SumEvaluator(10, 0.95)
     // arbitrarily assign id 1
-    evaluator.merge(1, counter)
-
-    // execute
-    val res = evaluator.currentResult()
-    // 38.0 - 7.1E-15 because that's how the maths shakes out
-    val targetMean = 38.0 - 7.1E-15
-
-    // Sanity check that equality works on BoundedDouble
-    assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2))
-
-    // actual test
-    assert(res ==
-      new BoundedDouble(targetMean, 0.950, Double.NegativeInfinity, Double.PositiveInfinity))
+    evaluator.merge(1, new StatCounter(Seq(2.0)))
+    assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
+      evaluator.currentResult())
   }
 
   test("correct handling of count 0") {
-
-    // setup
-    val counter = new StatCounter(List())
-    // count of 10 because it's larger than 0,
-    // and 0.95 because that's the default
     val evaluator = new SumEvaluator(10, 0.95)
-    // arbitrarily assign id 1
-    evaluator.merge(1, counter)
-
-    // execute
-    val res = evaluator.currentResult()
-    // assert
-    assert(res == new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity))
+    evaluator.merge(1, new StatCounter())
+    assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
+      evaluator.currentResult())
   }
 
   test("correct handling of NaN") {
-
-    // setup
-    val counter = new StatCounter(List(1, Double.NaN, 2))
-    // count of 10 because it's larger than 0,
-    // and 0.95 because that's the default
     val evaluator = new SumEvaluator(10, 0.95)
-    // arbitrarily assign id 1
-    evaluator.merge(1, counter)
-
-    // execute
+    evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2)))
     val res = evaluator.currentResult()
     // assert - note semantics of == in face of NaN
     assert(res.mean.isNaN)
@@ -81,27 +54,24 @@ class SumEvaluatorSuite extends SparkFunSuite with SharedSparkContext {
   }
 
   test("correct handling of > 1 values") {
-
-    // setup
-    val counter = new StatCounter(List(1, 3, 2))
-    // count of 10 because it's larger than 0,
-    // and 0.95 because that's the default
     val evaluator = new SumEvaluator(10, 0.95)
-    // arbitrarily assign id 1
-    evaluator.merge(1, counter)
-
-    // execute
+    evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0)))
     val res = evaluator.currentResult()
+    assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) ==
+      evaluator.currentResult())
+  }
 
-    // These vals because that's how the maths shakes out
-    val targetMean = 78.0
-    val targetLow = -117.617 + 2.732357258139473E-5
-    val targetHigh = 273.617 - 2.7323572624027292E-5
-    val target = new BoundedDouble(targetMean, 0.95, targetLow, targetHigh)
-
-
-    // check that values are within expected tolerance of expectation
-    assert(res == target)
+  test("test count > 1") {
+    val evaluator = new SumEvaluator(10, 0.95)
+    evaluator.merge(1, new StatCounter().merge(1.0))
+    evaluator.merge(1, new StatCounter().merge(3.0))
+    assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) ==
+      evaluator.currentResult())
+    evaluator.merge(1, new StatCounter().merge(8.0))
+    assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) ==
+      evaluator.currentResult())
+    (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0)))
+    assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult())
   }
 
 }
author	Sean Owen <sowen@cloudera.com>	2016-10-08 11:31:12 +0100
committer	Sean Owen <sowen@cloudera.com>	2016-10-08 11:31:12 +0100
commit	4201ddcc07ca2e9af78bf4a74fdb3900c1783347 (patch)
tree	ae50667b9ae7e8e8b57ccf431ad08181c40baaac /core/src/test
parent	362ba4b6f8e8fc2355368742c5adced7573fec00 (diff)
download	spark-4201ddcc07ca2e9af78bf4a74fdb3900c1783347.tar.gz spark-4201ddcc07ca2e9af78bf4a74fdb3900c1783347.tar.bz2 spark-4201ddcc07ca2e9af78bf4a74fdb3900c1783347.zip