aboutsummaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorHossein Falaki <falaki@gmail.com>2013-12-30 19:31:06 -0800
committerHossein Falaki <falaki@gmail.com>2013-12-30 19:31:06 -0800
commitc3073b6cf2a647451441e8dfc18fe4334497113c (patch)
tree355d6c66c6b2de83043ec9d52a39c408cd91849c /core
parented06500d300e93ae3129a035a364117adcb7d361 (diff)
downloadspark-c3073b6cf2a647451441e8dfc18fe4334497113c.tar.gz
spark-c3073b6cf2a647451441e8dfc18fe4334497113c.tar.bz2
spark-c3073b6cf2a647451441e8dfc18fe4334497113c.zip
Added Java API for countApproxDistinct
Diffstat (limited to 'core')
-rw-r--r--core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala11
1 files changed, 11 insertions, 0 deletions
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
index f344804b4c..924d8af060 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
@@ -444,4 +444,15 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[T]]
takeOrdered(num, comp)
}
+
+ /**
+ * Return the approximate number of distinct elements in the RDD.
+ *
+ * The accuracy of the approximation is controlled by the relative standard deviation
+ * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
+ * more accurate counts but increase the memory footprint, and vice versa. The default value of
+ * relativeSD is 0.05.
+ */
+ def countApproxDistinct(relativeSD: Double = 0.05): Long = rdd.countApproxDistinct(relativeSD)
+
}