diff options
author | Matthew Farrellee <matt@redhat.com> | 2014-09-16 11:39:57 -0700 |
---|---|---|
committer | Josh Rosen <joshrosen@apache.org> | 2014-09-16 11:39:57 -0700 |
commit | 9d5fa763d8559ac412a18d7a2f43c4368a0af897 (patch) | |
tree | e42e70e02055cedbce87e5ab7889743700c57ef4 /python/pyspark/rdd.py | |
parent | 86d253ec4e2ed94c68687d575f9e2dfbb44463e1 (diff) | |
download | spark-9d5fa763d8559ac412a18d7a2f43c4368a0af897.tar.gz spark-9d5fa763d8559ac412a18d7a2f43c4368a0af897.tar.bz2 spark-9d5fa763d8559ac412a18d7a2f43c4368a0af897.zip |
[SPARK-3519] add distinct(n) to PySpark
Added missing rdd.distinct(numPartitions) and associated tests
Author: Matthew Farrellee <matt@redhat.com>
Closes #2383 from mattf/SPARK-3519 and squashes the following commits:
30b837a [Matthew Farrellee] Combine test cases to save on JVM startups
6bc4a2c [Matthew Farrellee] [SPARK-3519] add distinct(n) to SchemaRDD in PySpark
7a17f2b [Matthew Farrellee] [SPARK-3519] add distinct(n) to PySpark
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r-- | python/pyspark/rdd.py | 4 |
1 file changed, 2 insertions, 2 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 21f182b0ff..cb09c191be 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -301,7 +301,7 @@ class RDD(object):
             return ifilter(f, iterator)
         return self.mapPartitions(func, True)
 
-    def distinct(self):
+    def distinct(self, numPartitions=None):
         """
         Return a new RDD containing the distinct elements in this RDD.
 
@@ -309,7 +309,7 @@ class RDD(object):
         [1, 2, 3]
         """
         return self.map(lambda x: (x, None)) \
-                   .reduceByKey(lambda x, _: x) \
+                   .reduceByKey(lambda x, _: x, numPartitions) \
                    .map(lambda (x, _): x)
 
     def sample(self, withReplacement, fraction, seed=None):