diff options
author | Hossein Falaki <falaki@gmail.com> | 2013-12-30 19:32:05 -0800 |
---|---|---|
committer | Hossein Falaki <falaki@gmail.com> | 2013-12-30 19:32:05 -0800 |
commit | d6cded7155b36880f81544bdf6fc6c20dd52ad7d (patch) | |
tree | 13f614475d62e4124831245ad7190c6200f213a6 | |
parent | c3073b6cf2a647451441e8dfc18fe4334497113c (diff) | |
download | spark-d6cded7155b36880f81544bdf6fc6c20dd52ad7d.tar.gz spark-d6cded7155b36880f81544bdf6fc6c20dd52ad7d.tar.bz2 spark-d6cded7155b36880f81544bdf6fc6c20dd52ad7d.zip |
Added Java unit tests for countApproxDistinct and countApproxDistinctByKey
-rw-r--r-- | core/src/test/scala/org/apache/spark/JavaAPISuite.java | 32 |
1 files changed, 32 insertions, 0 deletions
diff --git a/core/src/test/scala/org/apache/spark/JavaAPISuite.java b/core/src/test/scala/org/apache/spark/JavaAPISuite.java index 79913dc718..6398feb9f8 100644 --- a/core/src/test/scala/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/scala/org/apache/spark/JavaAPISuite.java @@ -930,4 +930,36 @@ public class JavaAPISuite implements Serializable { parts[1]); } + @Test + public void countApproxDistinct() { + List<Integer> arrayData = new ArrayList<Integer>(); + int size = 100; + for (int i = 0; i < 100000; i++) { + arrayData.add(i % size); + } + JavaRDD<Integer> simpleRdd = sc.parallelize(arrayData, 10); + Assert.assertTrue(Math.abs((simpleRdd.countApproxDistinct(0.2) - size) / (size * 1.0)) < 0.2); + Assert.assertTrue(Math.abs((simpleRdd.countApproxDistinct(0.05) - size) / (size * 1.0)) <= 0.05); + Assert.assertTrue(Math.abs((simpleRdd.countApproxDistinct(0.01) - size) / (size * 1.0)) <= 0.01); + } + + @Test + public void countApproxDistinctByKey() { + double relativeSD = 0.001; + + List<Tuple2<Integer, Integer>> arrayData = new ArrayList<Tuple2<Integer, Integer>>(); + for (int i = 10; i < 100; i++) + for (int j = 0; j < i; j++) + arrayData.add(new Tuple2<Integer, Integer>(i, j)); + + JavaPairRDD<Integer, Integer> pairRdd = sc.parallelizePairs(arrayData); + List<Tuple2<Integer, Object>> res = pairRdd.countApproxDistinctByKey(relativeSD).collect(); + for (Tuple2<Integer, Object> resItem : res) { + double count = (double)resItem._1(); + Long resCount = (Long)resItem._2(); + Double error = Math.abs((resCount - count) / count); + Assert.assertTrue(error < relativeSD); + } + + } } |