aboutsummaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorHossein Falaki <falaki@gmail.com>2013-12-30 19:32:05 -0800
committerHossein Falaki <falaki@gmail.com>2013-12-30 19:32:05 -0800
commitd6cded7155b36880f81544bdf6fc6c20dd52ad7d (patch)
tree13f614475d62e4124831245ad7190c6200f213a6 /core
parentc3073b6cf2a647451441e8dfc18fe4334497113c (diff)
downloadspark-d6cded7155b36880f81544bdf6fc6c20dd52ad7d.tar.gz
spark-d6cded7155b36880f81544bdf6fc6c20dd52ad7d.tar.bz2
spark-d6cded7155b36880f81544bdf6fc6c20dd52ad7d.zip
Added Java unit tests for countApproxDistinct and countApproxDistinctByKey
Diffstat (limited to 'core')
-rw-r--r--core/src/test/scala/org/apache/spark/JavaAPISuite.java32
1 files changed, 32 insertions, 0 deletions
diff --git a/core/src/test/scala/org/apache/spark/JavaAPISuite.java b/core/src/test/scala/org/apache/spark/JavaAPISuite.java
index 79913dc718..6398feb9f8 100644
--- a/core/src/test/scala/org/apache/spark/JavaAPISuite.java
+++ b/core/src/test/scala/org/apache/spark/JavaAPISuite.java
@@ -930,4 +930,36 @@ public class JavaAPISuite implements Serializable {
parts[1]);
}
+ @Test
+ public void countApproxDistinct() {
+ List<Integer> arrayData = new ArrayList<Integer>();
+ int size = 100;
+ for (int i = 0; i < 100000; i++) {
+ arrayData.add(i % size);
+ }
+ JavaRDD<Integer> simpleRdd = sc.parallelize(arrayData, 10);
+ Assert.assertTrue(Math.abs((simpleRdd.countApproxDistinct(0.2) - size) / (size * 1.0)) < 0.2);
+ Assert.assertTrue(Math.abs((simpleRdd.countApproxDistinct(0.05) - size) / (size * 1.0)) <= 0.05);
+ Assert.assertTrue(Math.abs((simpleRdd.countApproxDistinct(0.01) - size) / (size * 1.0)) <= 0.01);
+ }
+
+ @Test
+ public void countApproxDistinctByKey() {
+ double relativeSD = 0.001;
+
+ List<Tuple2<Integer, Integer>> arrayData = new ArrayList<Tuple2<Integer, Integer>>();
+ for (int i = 10; i < 100; i++)
+ for (int j = 0; j < i; j++)
+ arrayData.add(new Tuple2<Integer, Integer>(i, j));
+
+ JavaPairRDD<Integer, Integer> pairRdd = sc.parallelizePairs(arrayData);
+ List<Tuple2<Integer, Object>> res = pairRdd.countApproxDistinctByKey(relativeSD).collect();
+ for (Tuple2<Integer, Object> resItem : res) {
+ double count = (double)resItem._1();
+ Long resCount = (Long)resItem._2();
+ Double error = Math.abs((resCount - count) / count);
+ Assert.assertTrue(error < relativeSD);
+ }
+
+ }
}