aboutsummaryrefslogtreecommitdiff
path: root/core/src
diff options
context:
space:
mode:
author: Vinod K C <vinod.kc@huawei.com> 2015-05-09 10:03:15 +0100
committer: Sean Owen <sowen@cloudera.com> 2015-05-09 10:03:15 +0100
commit: dda6d9f4045fa2d1265abffa9d7dbdc967448417 (patch)
tree: cff2d6a2034b32f4dc71ba2f2927fb90a1cd1dda /core/src
parent: 29926238418223b0888d418d163feebf0217b35e (diff)
download: spark-dda6d9f4045fa2d1265abffa9d7dbdc967448417.tar.gz
spark-dda6d9f4045fa2d1265abffa9d7dbdc967448417.tar.bz2
spark-dda6d9f4045fa2d1265abffa9d7dbdc967448417.zip
[SPARK-7438] [SPARK CORE] Fixed validation of relativeSD in countApproxDistinct
Author: Vinod K C <vinod.kc@huawei.com>

Closes #5974 from vinodkc/fix_countApproxDistinct_Validation and squashes the following commits:

3a3d59c [Vinod K C] Reverted removal of validation relativeSD<0.000017
799976e [Vinod K C] Removed testcase to assert IAE when relativeSD>3.7
8ddbfae [Vinod K C] Remove blank line
b1b00a3 [Vinod K C] Removed relativeSD validation from python API,RDD.scala will do validation
122d378 [Vinod K C] Fixed validation of relativeSD in countApproxDistinct
Diffstat (limited to 'core/src')
-rw-r--r-- core/src/main/scala/org/apache/spark/rdd/RDD.scala 7
-rw-r--r-- core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala 2
2 files changed, 6 insertions, 3 deletions
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 8baf199f21..7dad30ecbd 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1161,8 +1161,8 @@ abstract class RDD[T: ClassTag](
*/
@Experimental
def countApproxDistinct(p: Int, sp: Int): Long = withScope {
- require(p >= 4, s"p ($p) must be at least 4")
- require(sp <= 32, s"sp ($sp) cannot be greater than 32")
+ require(p >= 4, s"p ($p) must be >= 4")
+ require(sp <= 32, s"sp ($sp) must be <= 32")
require(sp == 0 || p <= sp, s"p ($p) cannot be greater than sp ($sp)")
val zeroCounter = new HyperLogLogPlus(p, sp)
aggregate(zeroCounter)(
@@ -1187,8 +1187,9 @@ abstract class RDD[T: ClassTag](
* It must be greater than 0.000017.
*/
def countApproxDistinct(relativeSD: Double = 0.05): Long = withScope {
+ require(relativeSD > 0.000017, s"accuracy ($relativeSD) must be greater than 0.000017")
val p = math.ceil(2.0 * math.log(1.054 / relativeSD) / math.log(2)).toInt
- countApproxDistinct(p, 0)
+ countApproxDistinct(if (p < 4) 4 else p, 0)
}
/**
diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
index ef8c36a286..afc11bdc4d 100644
--- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
@@ -89,6 +89,8 @@ class RDDSuite extends FunSuite with SharedSparkContext {
val simpleRdd = sc.makeRDD(uniformDistro, 10)
assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2)
assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1)
+ assert(error(simpleRdd.countApproxDistinct(0.02), size) < 0.1)
+ assert(error(simpleRdd.countApproxDistinct(0.5), size) < 0.22)
}
test("SparkContext.union") {