Added Java API for countApproxDistinctByKey

author: Hossein Falaki <falaki@gmail.com> 2013-12-30 19:30:42 -0800
committer: Hossein Falaki <falaki@gmail.com> 2013-12-30 19:30:42 -0800
commit: ed06500d300e93ae3129a035a364117adcb7d361 (patch)
tree: 27eedf33b73f85d9f38dfa31fcff9b87cbf97f22
parent: b75d7c98bc94d42f11522162e30ae4fc546d5bf9 (diff)
download: spark-ed06500d300e93ae3129a035a364117adcb7d361.tar.gz
spark-ed06500d300e93ae3129a035a364117adcb7d361.tar.bz2
spark-ed06500d300e93ae3129a035a364117adcb7d361.zip
1 files changed, 36 insertions, 0 deletions
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
index 363667fa86..55c87450ac 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
@@ -611,6 +611,42 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kClassTag: ClassTag[K
    * Return an RDD with the values of each tuple.
    */
   def values(): JavaRDD[V] = JavaRDD.fromRDD[V](rdd.map(_._2))
+
+  /**
+   * Return approximate number of distinct values for each key in this RDD.
+   * The accuracy of approximation can be controlled through the relative standard deviation
+   * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
+   * more accurate counts but increase the memory footprint and vice versa. Uses the provided
+   * Partitioner to partition the output RDD.
+   */
+  def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): JavaRDD[(K, Long)] = {
+    rdd.countApproxDistinctByKey(relativeSD, partitioner)
+  }
+
+  /**
+   * Return approximate number of distinct values for each key in this RDD.
+   * The accuracy of approximation can be controlled through the relative standard deviation
+   * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
+   * more accurate counts but increase the memory footprint and vice versa. The default value of
+   * relativeSD is 0.05. Hash-partitions the output RDD using the existing partitioner/parallelism
+   * level.
+   */
+  def countApproxDistinctByKey(relativeSD: Double = 0.05): JavaRDD[(K, Long)] = {
+    rdd.countApproxDistinctByKey(relativeSD)
+  }
+
+
+  /**
+   * Return approximate number of distinct values for each key in this RDD.
+   * The accuracy of approximation can be controlled through the relative standard deviation
+   * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
+   * more accurate counts but increase the memory footprint and vice versa. Hash-partitions the
+   * output RDD into numPartitions partitions.
+   *
+   */
+  def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): JavaRDD[(K, Long)] = {
+    rdd.countApproxDistinctByKey(relativeSD, numPartitions)
+  }
 }
 
 object JavaPairRDD {
author	Hossein Falaki <falaki@gmail.com>	2013-12-30 19:30:42 -0800
committer	Hossein Falaki <falaki@gmail.com>	2013-12-30 19:30:42 -0800
commit	ed06500d300e93ae3129a035a364117adcb7d361 (patch)
tree	27eedf33b73f85d9f38dfa31fcff9b87cbf97f22
parent	b75d7c98bc94d42f11522162e30ae4fc546d5bf9 (diff)
download	spark-ed06500d300e93ae3129a035a364117adcb7d361.tar.gz spark-ed06500d300e93ae3129a035a364117adcb7d361.tar.bz2 spark-ed06500d300e93ae3129a035a364117adcb7d361.zip