about | summary | refs | log | tree | commit | diff
path: root/python/pyspark/sql.py
diff options
context:
space:
mode:
author: Matthew Farrellee <matt@redhat.com>, 2014-09-16 11:39:57 -0700
committer: Josh Rosen <joshrosen@apache.org>, 2014-09-16 11:39:57 -0700
commit: 9d5fa763d8559ac412a18d7a2f43c4368a0af897 (patch)
tree: e42e70e02055cedbce87e5ab7889743700c57ef4 /python/pyspark/sql.py
parent: 86d253ec4e2ed94c68687d575f9e2dfbb44463e1 (diff)
download: spark-9d5fa763d8559ac412a18d7a2f43c4368a0af897.tar.gz
spark-9d5fa763d8559ac412a18d7a2f43c4368a0af897.tar.bz2
spark-9d5fa763d8559ac412a18d7a2f43c4368a0af897.zip
[SPARK-3519] add distinct(n) to PySpark
Added missing rdd.distinct(numPartitions) and associated tests.

Author: Matthew Farrellee <matt@redhat.com>

Closes #2383 from mattf/SPARK-3519 and squashes the following commits:

30b837a [Matthew Farrellee] Combine test cases to save on JVM startups
6bc4a2c [Matthew Farrellee] [SPARK-3519] add distinct(n) to SchemaRDD in PySpark
7a17f2b [Matthew Farrellee] [SPARK-3519] add distinct(n) to PySpark
Diffstat (limited to 'python/pyspark/sql.py')
-rw-r--r-- python/pyspark/sql.py | 7
1 file changed, 5 insertions, 2 deletions
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index fc9310fef3..eac55cbe15 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -1694,8 +1694,11 @@ class SchemaRDD(RDD):
rdd = self._jschema_rdd.coalesce(numPartitions, shuffle)
return SchemaRDD(rdd, self.sql_ctx)
- def distinct(self):
- rdd = self._jschema_rdd.distinct()
+ def distinct(self, numPartitions=None):
+ if numPartitions is None:
+ rdd = self._jschema_rdd.distinct()
+ else:
+ rdd = self._jschema_rdd.distinct(numPartitions)
return SchemaRDD(rdd, self.sql_ctx)
def intersection(self, other):