From 9d5fa763d8559ac412a18d7a2f43c4368a0af897 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 16 Sep 2014 11:39:57 -0700 Subject: [SPARK-3519] add distinct(n) to PySpark Added missing rdd.distinct(numPartitions) and associated tests Author: Matthew Farrellee Closes #2383 from mattf/SPARK-3519 and squashes the following commits: 30b837a [Matthew Farrellee] Combine test cases to save on JVM startups 6bc4a2c [Matthew Farrellee] [SPARK-3519] add distinct(n) to SchemaRDD in PySpark 7a17f2b [Matthew Farrellee] [SPARK-3519] add distinct(n) to PySpark --- python/pyspark/rdd.py | 4 ++-- python/pyspark/sql.py | 7 +++++-- python/pyspark/tests.py | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'python') diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 21f182b0ff..cb09c191be 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -301,7 +301,7 @@ class RDD(object): return ifilter(f, iterator) return self.mapPartitions(func, True) - def distinct(self): + def distinct(self, numPartitions=None): """ Return a new RDD containing the distinct elements in this RDD. @@ -309,7 +309,7 @@ class RDD(object): [1, 2, 3] """ return self.map(lambda x: (x, None)) \ - .reduceByKey(lambda x, _: x) \ + .reduceByKey(lambda x, _: x, numPartitions) \ .map(lambda (x, _): x) def sample(self, withReplacement, fraction, seed=None): diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index fc9310fef3..eac55cbe15 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1694,8 +1694,11 @@ class SchemaRDD(RDD): rdd = self._jschema_rdd.coalesce(numPartitions, shuffle) return SchemaRDD(rdd, self.sql_ctx) - def distinct(self): - rdd = self._jschema_rdd.distinct() + def distinct(self, numPartitions=None): + if numPartitions is None: + rdd = self._jschema_rdd.distinct() + else: + rdd = self._jschema_rdd.distinct(numPartitions) return SchemaRDD(rdd, self.sql_ctx) def intersection(self, other): diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index f255b44359..0b3854347a 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -587,6 +587,14 @@ class TestRDDFunctions(PySparkTestCase): self.assertEquals(partitions[0], [(0, 5), (0, 8), (2, 6)]) self.assertEquals(partitions[1], [(1, 3), (3, 8), (3, 8)]) + def test_distinct(self): + rdd = self.sc.parallelize((1, 2, 3)*10, 10) + self.assertEquals(rdd.getNumPartitions(), 10) + self.assertEquals(rdd.distinct().count(), 3) + result = rdd.distinct(5) + self.assertEquals(result.getNumPartitions(), 5) + self.assertEquals(result.count(), 3) + class TestSQL(PySparkTestCase): @@ -636,6 +644,15 @@ class TestSQL(PySparkTestCase): srdd.count() srdd.collect() + def test_distinct(self): + rdd = self.sc.parallelize(['{"a": 1}', '{"b": 2}', '{"c": 3}']*10, 10) + srdd = self.sqlCtx.jsonRDD(rdd) + self.assertEquals(srdd.getNumPartitions(), 10) + self.assertEquals(srdd.distinct().count(), 3) + result = srdd.distinct(5) + self.assertEquals(result.getNumPartitions(), 5) + self.assertEquals(result.count(), 3) + class TestIO(PySparkTestCase): -- cgit v1.2.3