aboutsummaryrefslogtreecommitdiff
path: root/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala
diff options
context:
space:
mode:
Diffstat (limited to 'core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala')
-rw-r--r--core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala48
1 files changed, 25 insertions, 23 deletions
diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala
index 31e6fd519d..a97d2a01c8 100644
--- a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala
@@ -22,7 +22,8 @@ import java.io.{ObjectOutputStream, IOException}
private[spark] class ZippedPartitionsPartition(
idx: Int,
- @transient rdds: Seq[RDD[_]])
+ @transient rdds: Seq[RDD[_]],
+ @transient val preferredLocations: Seq[String])
extends Partition {
override val index: Int = idx
@@ -39,31 +40,29 @@ private[spark] class ZippedPartitionsPartition(
abstract class ZippedPartitionsBaseRDD[V: ClassManifest](
sc: SparkContext,
- var rdds: Seq[RDD[_]])
+ var rdds: Seq[RDD[_]],
+ preservesPartitioning: Boolean = false)
extends RDD[V](sc, rdds.map(x => new OneToOneDependency(x))) {
+ override val partitioner =
+ if (preservesPartitioning) firstParent[Any].partitioner else None
+
override def getPartitions: Array[Partition] = {
- val sizes = rdds.map(x => x.partitions.size)
- if (!sizes.forall(x => x == sizes(0))) {
+ val numParts = rdds.head.partitions.size
+ if (!rdds.forall(rdd => rdd.partitions.size == numParts)) {
throw new IllegalArgumentException("Can't zip RDDs with unequal numbers of partitions")
}
- val array = new Array[Partition](sizes(0))
- for (i <- 0 until sizes(0)) {
- array(i) = new ZippedPartitionsPartition(i, rdds)
+ Array.tabulate[Partition](numParts) { i =>
+ val prefs = rdds.map(rdd => rdd.preferredLocations(rdd.partitions(i)))
+ // Check whether there are any hosts that match all RDDs; otherwise return the union
+ val exactMatchLocations = prefs.reduce((x, y) => x.intersect(y))
+ val locs = if (!exactMatchLocations.isEmpty) exactMatchLocations else prefs.flatten.distinct
+ new ZippedPartitionsPartition(i, rdds, locs)
}
- array
}
override def getPreferredLocations(s: Partition): Seq[String] = {
- val parts = s.asInstanceOf[ZippedPartitionsPartition].partitions
- val prefs = rdds.zip(parts).map { case (rdd, p) => rdd.preferredLocations(p) }
- // Check whether there are any hosts that match all RDDs; otherwise return the union
- val exactMatchLocations = prefs.reduce((x, y) => x.intersect(y))
- if (!exactMatchLocations.isEmpty) {
- exactMatchLocations
- } else {
- prefs.flatten.distinct
- }
+ s.asInstanceOf[ZippedPartitionsPartition].preferredLocations
}
override def clearDependencies() {
@@ -76,8 +75,9 @@ class ZippedPartitionsRDD2[A: ClassManifest, B: ClassManifest, V: ClassManifest]
sc: SparkContext,
f: (Iterator[A], Iterator[B]) => Iterator[V],
var rdd1: RDD[A],
- var rdd2: RDD[B])
- extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2)) {
+ var rdd2: RDD[B],
+ preservesPartitioning: Boolean = false)
+ extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2), preservesPartitioning) {
override def compute(s: Partition, context: TaskContext): Iterator[V] = {
val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions
@@ -97,8 +97,9 @@ class ZippedPartitionsRDD3
f: (Iterator[A], Iterator[B], Iterator[C]) => Iterator[V],
var rdd1: RDD[A],
var rdd2: RDD[B],
- var rdd3: RDD[C])
- extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2, rdd3)) {
+ var rdd3: RDD[C],
+ preservesPartitioning: Boolean = false)
+ extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2, rdd3), preservesPartitioning) {
override def compute(s: Partition, context: TaskContext): Iterator[V] = {
val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions
@@ -122,8 +123,9 @@ class ZippedPartitionsRDD4
var rdd1: RDD[A],
var rdd2: RDD[B],
var rdd3: RDD[C],
- var rdd4: RDD[D])
- extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2, rdd3, rdd4)) {
+ var rdd4: RDD[D],
+ preservesPartitioning: Boolean = false)
+ extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2, rdd3, rdd4), preservesPartitioning) {
override def compute(s: Partition, context: TaskContext): Iterator[V] = {
val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions