author    Prashant Sharma <prashant.s@imaginea.com>    2013-10-10 09:42:23 +0530
committer Prashant Sharma <prashant.s@imaginea.com>    2013-10-10 09:42:23 +0530
commit    26860639c5fee7fc23db1e686f8eb202921e4314 (patch)
tree      e05e555fcd713a7eb15680ae078994d70f396135 /python/pyspark/rdd.py
parent    7d50f9f87baeb1f4b8d77d669d25649b97dd1d57 (diff)
parent    7be75682b931dd52014f3cfdc6887e54583ad0af (diff)
Merge branch 'scala-2.10' of github.com:ScrapCodes/spark into scala-2.10
Conflicts:
	core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManager.scala
	project/SparkBuild.scala
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r--    python/pyspark/rdd.py    10
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 7611b13e82..33dc865256 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -29,7 +29,7 @@ from threading import Thread
from pyspark import cloudpickle
from pyspark.serializers import batched, Batch, dump_pickle, load_pickle, \
- read_from_pickle_file
+ read_from_pickle_file, pack_long
from pyspark.join import python_join, python_left_outer_join, \
python_right_outer_join, python_cogroup
from pyspark.statcounter import StatCounter
@@ -690,11 +690,13 @@ class RDD(object):
# form the hash buckets in Python, transferring O(numPartitions) objects
# to Java. Each object is a (splitNumber, [objects]) pair.
def add_shuffle_key(split, iterator):
+
buckets = defaultdict(list)
+
for (k, v) in iterator:
buckets[partitionFunc(k) % numPartitions].append((k, v))
for (split, items) in buckets.iteritems():
- yield str(split)
+ yield pack_long(split)
yield dump_pickle(Batch(items))
keyed = PipelinedRDD(self, add_shuffle_key)
keyed._bypass_serializer = True
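The switch from str(split) to pack_long(split) writes each bucket's partition id as fixed-width binary rather than a decimal string. pack_long is a helper added to pyspark.serializers in the same branch; a minimal sketch of what it likely does, assuming the usual struct-based big-endian encoding (the actual implementation may differ):

import struct

def pack_long(value):
    # Pack the partition id as an 8-byte big-endian signed long, so the
    # JVM side can read a fixed-width header for each (split, batch) pair
    # instead of parsing a variable-length string.
    return struct.pack("!q", value)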
@@ -831,8 +833,8 @@ class RDD(object):
>>> sorted(x.subtractByKey(y).collect())
[('b', 4), ('b', 5)]
"""
- filter_func = lambda tpl: len(tpl[1][0]) > 0 and len(tpl[1][1]) == 0
- map_func = lambda tpl: [(tpl[0], val) for val in tpl[1][0]]
+ filter_func = lambda (key, vals): len(vals[0]) > 0 and len(vals[1]) == 0
+ map_func = lambda (key, vals): [(key, val) for val in vals[0]]
return self.cogroup(other, numPartitions).filter(filter_func).flatMap(map_func)
def subtract(self, other, numPartitions=None):
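The rewritten filter_func and map_func use Python 2 tuple parameter unpacking in lambdas (removed in Python 3 by PEP 3113), which names the (key, vals) pair directly instead of indexing tpl[0] and tpl[1]. A self-contained Python 2 sketch of the same logic over hypothetical cogroup output, where each element is (key, (values_from_self, values_from_other)):

# Hypothetical cogroup output, used only for illustration.
cogrouped = [
    ('a', ([1], [2])),    # key also present in other -> dropped
    ('b', ([4, 5], [])),  # key only in self -> kept, one pair per value
]

filter_func = lambda (key, vals): len(vals[0]) > 0 and len(vals[1]) == 0
map_func = lambda (key, vals): [(key, val) for val in vals[0]]

result = [pair for kv in cogrouped if filter_func(kv) for pair in map_func(kv)]
print(result)  # [('b', 4), ('b', 5)], matching the doctest above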