aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/rdd.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r--python/pyspark/rdd.py5
1 files changed, 4 insertions, 1 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index e13bab946c..15be4bfec9 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1070,10 +1070,13 @@ class RDD(object):
# If we didn't find any rows after the previous iteration,
# quadruple and retry. Otherwise, interpolate the number of
# partitions we need to try, but overestimate it by 50%.
+ # We also cap the estimation in the end.
if len(items) == 0:
numPartsToTry = partsScanned * 4
else:
- numPartsToTry = int(1.5 * num * partsScanned / len(items))
+ # the first paramter of max is >=1 whenever partsScanned >= 2
+ numPartsToTry = int(1.5 * num * partsScanned / len(items)) - partsScanned
+ numPartsToTry = min(max(numPartsToTry, 1), partsScanned * 4)
left = num - len(items)