aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorDavies Liu <davies@databricks.com>2014-11-07 20:53:03 -0800
committerJosh Rosen <joshrosen@databricks.com>2014-11-07 20:55:12 -0800
commit4895f65447aa2338729fccb5200efa29a9d62163 (patch)
tree79a606730f5a64a034af36c4af18efab1c8015ec /python
parent4fb26df8748ea7dda11db8c2b99f4b08da25bb4e (diff)
downloadspark-4895f65447aa2338729fccb5200efa29a9d62163.tar.gz
spark-4895f65447aa2338729fccb5200efa29a9d62163.tar.bz2
spark-4895f65447aa2338729fccb5200efa29a9d62163.zip
[SPARK-4304] [PySpark] Fix sort on empty RDD
This PR fix sortBy()/sortByKey() on empty RDD. This should be back ported into 1.1/1.2 Author: Davies Liu <davies@databricks.com> Closes #3162 from davies/fix_sort and squashes the following commits: 84f64b7 [Davies Liu] add tests 52995b5 [Davies Liu] fix sortByKey() on empty RDD (cherry picked from commit 7779109796c90d789464ab0be35917f963bbe867) Signed-off-by: Josh Rosen <joshrosen@databricks.com> Conflicts: python/pyspark/tests.py
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/rdd.py2
-rw-r--r--python/pyspark/tests.py3
2 files changed, 5 insertions, 0 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 3f81550bbb..ac8ceff055 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -598,6 +598,8 @@ class RDD(object):
# the key-space into bins such that the bins have roughly the same
# number of (key, value) pairs falling into them
rddSize = self.count()
+ if not rddSize:
+ return self # empty RDD
maxSampleSize = numPartitions * 20.0 # constant from Spark's RangePartitioner
fraction = min(maxSampleSize / max(rddSize, 1), 1.0)
samples = self.sample(False, fraction, 1).map(lambda (k, v): k).collect()
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 5cea1b03ea..b4a9c59790 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -470,6 +470,9 @@ class TestRDDFunctions(PySparkTestCase):
self.assertEquals(([1, "b"], [5]), rdd.histogram(1))
self.assertRaises(TypeError, lambda: rdd.histogram(2))
+ def test_sort_on_empty_rdd(self):
+ self.assertEqual([], self.sc.parallelize(zip([], [])).sortByKey().collect())
+
def test_sample(self):
rdd = self.sc.parallelize(range(0, 100), 4)
wo = rdd.sample(False, 0.1, 2).collect()