path: root/core/src/test
author     Sean Owen <sowen@cloudera.com>    2016-10-08 11:31:12 +0100
committer  Sean Owen <sowen@cloudera.com>    2016-10-08 11:31:12 +0100
commit     4201ddcc07ca2e9af78bf4a74fdb3900c1783347 (patch)
tree       ae50667b9ae7e8e8b57ccf431ad08181c40baaac /core/src/test
parent     362ba4b6f8e8fc2355368742c5adced7573fec00 (diff)
[SPARK-17768][CORE] Small (Sum,Count,Mean)Evaluator problems and suboptimalities
## What changes were proposed in this pull request?

Fix:

- GroupedMeanEvaluator and GroupedSumEvaluator are unused, as is the StudentTCacher support class
- CountEvaluator can return a lower bound < 0, when counts can't be negative
- MeanEvaluator will actually fail on exactly 1 datum (yields t-test with 0 DOF)
- CountEvaluator uses a normal distribution, which may be an inappropriate approximation (leading to above)
- Test for SumEvaluator asserts incorrect expected sums – e.g. after observing 10% of data has sum of 2, expectation should be 20, not 38 (see the sketch below)
- CountEvaluator, MeanEvaluator have no unit tests to catch these
- Duplication of distribution code across CountEvaluator, GroupedCountEvaluator
- The stats in each could use a bit of documentation as I had to guess at them
- (Code could use a few cleanups and optimizations too)

## How was this patch tested?

Existing and new tests

Author: Sean Owen <sowen@cloudera.com>

Closes #15341 from srowen/SPARK-17768.
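To make the expected-sum point above concrete: after observing 1 of 10 partitions with a sum of 2, the point estimate of the total is 2 * 10 / 1 = 20. The following is a minimal Scala sketch of that extrapolation only; the object and method names (`SumExtrapolationSketch`, `extrapolatedSum`) are made up for illustration, and the real `SumEvaluator` additionally derives the confidence bounds that this omits.

```scala
// A minimal, hypothetical sketch of the sum extrapolation discussed above.
// It scales the observed sum by the fraction of partitions seen so far;
// it is not the actual SumEvaluator implementation.
object SumExtrapolationSketch {
  def extrapolatedSum(totalPartitions: Int, observedPartitions: Int, observedSum: Double): Double =
    observedSum * totalPartitions / observedPartitions

  def main(args: Array[String]): Unit = {
    // After observing 1 of 10 partitions with sum 2.0, the estimate is 20.0, not 38.0.
    println(extrapolatedSum(totalPartitions = 10, observedPartitions = 1, observedSum = 2.0))
  }
}
```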
Diffstat (limited to 'core/src/test')
-rw-r--r--  core/src/test/scala/org/apache/spark/partial/CountEvaluatorSuite.scala | 43
-rw-r--r--  core/src/test/scala/org/apache/spark/partial/MeanEvaluatorSuite.scala  | 57
-rw-r--r--  core/src/test/scala/org/apache/spark/partial/SumEvaluatorSuite.scala   | 82
3 files changed, 126 insertions, 56 deletions
diff --git a/core/src/test/scala/org/apache/spark/partial/CountEvaluatorSuite.scala b/core/src/test/scala/org/apache/spark/partial/CountEvaluatorSuite.scala
new file mode 100644
index 0000000000..da3256bd88
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/partial/CountEvaluatorSuite.scala
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.partial
+
+import org.apache.spark.SparkFunSuite
+
+class CountEvaluatorSuite extends SparkFunSuite {
+
+ test("test count 0") {
+ val evaluator = new CountEvaluator(10, 0.95)
+ assert(new BoundedDouble(0.0, 0.0, 0.0, Double.PositiveInfinity) == evaluator.currentResult())
+ evaluator.merge(1, 0)
+ assert(new BoundedDouble(0.0, 0.0, 0.0, Double.PositiveInfinity) == evaluator.currentResult())
+ }
+
+ test("test count >= 1") {
+ val evaluator = new CountEvaluator(10, 0.95)
+ evaluator.merge(1, 1)
+ assert(new BoundedDouble(10.0, 0.95, 1.0, 36.0) == evaluator.currentResult())
+ evaluator.merge(1, 3)
+ assert(new BoundedDouble(20.0, 0.95, 7.0, 41.0) == evaluator.currentResult())
+ evaluator.merge(1, 8)
+ assert(new BoundedDouble(40.0, 0.95, 24.0, 61.0) == evaluator.currentResult())
+ (4 to 10).foreach(_ => evaluator.merge(1, 10))
+ assert(new BoundedDouble(82.0, 1.0, 82.0, 82.0) == evaluator.currentResult())
+ }
+
+}
diff --git a/core/src/test/scala/org/apache/spark/partial/MeanEvaluatorSuite.scala b/core/src/test/scala/org/apache/spark/partial/MeanEvaluatorSuite.scala
new file mode 100644
index 0000000000..eaa1262b41
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/partial/MeanEvaluatorSuite.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.partial
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.util.StatCounter
+
+class MeanEvaluatorSuite extends SparkFunSuite {
+
+ test("test count 0") {
+ val evaluator = new MeanEvaluator(10, 0.95)
+ assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
+ evaluator.currentResult())
+ evaluator.merge(1, new StatCounter())
+ assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
+ evaluator.currentResult())
+ evaluator.merge(1, new StatCounter(Seq(0.0)))
+ assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
+ evaluator.currentResult())
+ }
+
+ test("test count 1") {
+ val evaluator = new MeanEvaluator(10, 0.95)
+ evaluator.merge(1, new StatCounter(Seq(1.0)))
+ assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
+ evaluator.currentResult())
+ }
+
+ test("test count > 1") {
+ val evaluator = new MeanEvaluator(10, 0.95)
+ evaluator.merge(1, new StatCounter(Seq(1.0)))
+ evaluator.merge(1, new StatCounter(Seq(3.0)))
+ assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) ==
+ evaluator.currentResult())
+ evaluator.merge(1, new StatCounter(Seq(8.0)))
+ assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) ==
+ evaluator.currentResult())
+ (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0))))
+ assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult())
+ }
+
+}
diff --git a/core/src/test/scala/org/apache/spark/partial/SumEvaluatorSuite.scala b/core/src/test/scala/org/apache/spark/partial/SumEvaluatorSuite.scala
index a79f5b4d74..e212db7362 100644
--- a/core/src/test/scala/org/apache/spark/partial/SumEvaluatorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/partial/SumEvaluatorSuite.scala
@@ -17,61 +17,34 @@
package org.apache.spark.partial
-import org.apache.spark._
+import org.apache.spark.SparkFunSuite
import org.apache.spark.util.StatCounter
-class SumEvaluatorSuite extends SparkFunSuite with SharedSparkContext {
+class SumEvaluatorSuite extends SparkFunSuite {
test("correct handling of count 1") {
+ // sanity check:
+ assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2))
- // setup
- val counter = new StatCounter(List(2.0))
// count of 10 because it's larger than 1,
// and 0.95 because that's the default
val evaluator = new SumEvaluator(10, 0.95)
// arbitrarily assign id 1
- evaluator.merge(1, counter)
-
- // execute
- val res = evaluator.currentResult()
- // 38.0 - 7.1E-15 because that's how the maths shakes out
- val targetMean = 38.0 - 7.1E-15
-
- // Sanity check that equality works on BoundedDouble
- assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2))
-
- // actual test
- assert(res ==
- new BoundedDouble(targetMean, 0.950, Double.NegativeInfinity, Double.PositiveInfinity))
+ evaluator.merge(1, new StatCounter(Seq(2.0)))
+ assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) ==
+ evaluator.currentResult())
}
test("correct handling of count 0") {
-
- // setup
- val counter = new StatCounter(List())
- // count of 10 because it's larger than 0,
- // and 0.95 because that's the default
val evaluator = new SumEvaluator(10, 0.95)
- // arbitrarily assign id 1
- evaluator.merge(1, counter)
-
- // execute
- val res = evaluator.currentResult()
- // assert
- assert(res == new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity))
+ evaluator.merge(1, new StatCounter())
+ assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) ==
+ evaluator.currentResult())
}
test("correct handling of NaN") {
-
- // setup
- val counter = new StatCounter(List(1, Double.NaN, 2))
- // count of 10 because it's larger than 0,
- // and 0.95 because that's the default
val evaluator = new SumEvaluator(10, 0.95)
- // arbitrarily assign id 1
- evaluator.merge(1, counter)
-
- // execute
+ evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2)))
val res = evaluator.currentResult()
// assert - note semantics of == in face of NaN
assert(res.mean.isNaN)
@@ -81,27 +54,24 @@ class SumEvaluatorSuite extends SparkFunSuite with SharedSparkContext {
}
test("correct handling of > 1 values") {
-
- // setup
- val counter = new StatCounter(List(1, 3, 2))
- // count of 10 because it's larger than 0,
- // and 0.95 because that's the default
val evaluator = new SumEvaluator(10, 0.95)
- // arbitrarily assign id 1
- evaluator.merge(1, counter)
-
- // execute
+ evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0)))
val res = evaluator.currentResult()
+ assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) ==
+ evaluator.currentResult())
+ }
- // These vals because that's how the maths shakes out
- val targetMean = 78.0
- val targetLow = -117.617 + 2.732357258139473E-5
- val targetHigh = 273.617 - 2.7323572624027292E-5
- val target = new BoundedDouble(targetMean, 0.95, targetLow, targetHigh)
-
-
- // check that values are within expected tolerance of expectation
- assert(res == target)
+ test("test count > 1") {
+ val evaluator = new SumEvaluator(10, 0.95)
+ evaluator.merge(1, new StatCounter().merge(1.0))
+ evaluator.merge(1, new StatCounter().merge(3.0))
+ assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) ==
+ evaluator.currentResult())
+ evaluator.merge(1, new StatCounter().merge(8.0))
+ assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) ==
+ evaluator.currentResult())
+ (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0)))
+ assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult())
}
}