diff options
author | Hossein Falaki <falaki@gmail.com> | 2013-12-30 19:31:06 -0800 |
---|---|---|
committer | Hossein Falaki <falaki@gmail.com> | 2013-12-30 19:31:06 -0800 |
commit | c3073b6cf2a647451441e8dfc18fe4334497113c (patch) | |
tree | 355d6c66c6b2de83043ec9d52a39c408cd91849c | |
parent | ed06500d300e93ae3129a035a364117adcb7d361 (diff) | |
download | spark-c3073b6cf2a647451441e8dfc18fe4334497113c.tar.gz spark-c3073b6cf2a647451441e8dfc18fe4334497113c.tar.bz2 spark-c3073b6cf2a647451441e8dfc18fe4334497113c.zip |
Added Java API for countApproxDistinct
-rw-r--r-- | core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala | 11 |
1 files changed, 11 insertions, 0 deletions
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index f344804b4c..924d8af060 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -444,4 +444,15 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[T]] takeOrdered(num, comp) } + + /** + * Return approximate number of distinct elements in the RDD. + * + * The accuracy of approximation can be controlled through the relative standard deviation + * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in + * more accurate counts but increase the memory footprint and vice versa. The default value of + * relativeSD is 0.05. + */ + def countApproxDistinct(relativeSD: Double = 0.05): Long = rdd.countApproxDistinct(relativeSD) + } |