about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 7
-rw-r--r-- python/pyspark/rdd.py | 8
2 files changed, 7 insertions, 8 deletions
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index af9e31ba7b..1cf55e86f6 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1064,11 +1064,10 @@ abstract class RDD[T: ClassTag](
// greater than totalParts because we actually cap it at totalParts in runJob.
var numPartsToTry = 1
if (partsScanned > 0) {
- // If we didn't find any rows after the first iteration, just try all partitions next.
- // Otherwise, interpolate the number of partitions we need to try, but overestimate it
- // by 50%.
+ // If we didn't find any rows after the previous iteration, quadruple and retry. Otherwise,
+ // interpolate the number of partitions we need to try, but overestimate it by 50%.
if (buf.size == 0) {
- numPartsToTry = totalParts - 1
+ numPartsToTry = partsScanned * 4
} else {
numPartsToTry = (1.5 * num * partsScanned / buf.size).toInt
}
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index dff6fc26fc..04f13523b4 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1089,11 +1089,11 @@ class RDD(object):
# we actually cap it at totalParts in runJob.
numPartsToTry = 1
if partsScanned > 0:
- # If we didn't find any rows after the first iteration, just
- # try all partitions next. Otherwise, interpolate the number
- # of partitions we need to try, but overestimate it by 50%.
+ # If we didn't find any rows after the previous iteration,
+ # quadruple and retry. Otherwise, interpolate the number of
+ # partitions we need to try, but overestimate it by 50%.
if len(items) == 0:
- numPartsToTry = totalParts - 1
+ numPartsToTry = partsScanned * 4
else:
numPartsToTry = int(1.5 * num * partsScanned / len(items))