aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--python/pyspark/rdd.py2
-rw-r--r--python/pyspark/tests.py3
2 files changed, 5 insertions, 0 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 879655dc53..08d0474026 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -521,6 +521,8 @@ class RDD(object):
# the key-space into bins such that the bins have roughly the same
# number of (key, value) pairs falling into them
rddSize = self.count()
+ if not rddSize:
+ return self # empty RDD
maxSampleSize = numPartitions * 20.0 # constant from Spark's RangePartitioner
fraction = min(maxSampleSize / max(rddSize, 1), 1.0)
samples = self.sample(False, fraction, 1).map(lambda (k, v): k).collect()
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 9f625c5c6c..491e445a21 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -649,6 +649,9 @@ class RDDTests(ReusedPySparkTestCase):
self.assertEquals(result.getNumPartitions(), 5)
self.assertEquals(result.count(), 3)
+ def test_sort_on_empty_rdd(self):
+ self.assertEqual([], self.sc.parallelize(zip([], [])).sortByKey().collect())
+
def test_sample(self):
rdd = self.sc.parallelize(range(0, 100), 4)
wo = rdd.sample(False, 0.1, 2).collect()