diff options
author | Prashant Sharma <prashant.s@imaginea.com> | 2013-10-10 09:42:23 +0530 |
---|---|---|
committer | Prashant Sharma <prashant.s@imaginea.com> | 2013-10-10 09:42:23 +0530 |
commit | 26860639c5fee7fc23db1e686f8eb202921e4314 (patch) | |
tree | e05e555fcd713a7eb15680ae078994d70f396135 /python/pyspark/rdd.py | |
parent | 7d50f9f87baeb1f4b8d77d669d25649b97dd1d57 (diff) | |
parent | 7be75682b931dd52014f3cfdc6887e54583ad0af (diff) | |
download | spark-26860639c5fee7fc23db1e686f8eb202921e4314.tar.gz spark-26860639c5fee7fc23db1e686f8eb202921e4314.tar.bz2 spark-26860639c5fee7fc23db1e686f8eb202921e4314.zip |
Merge branch 'scala-2.10' of github.com:ScrapCodes/spark into scala-2.10
Conflicts:
core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManager.scala
project/SparkBuild.scala
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r-- | python/pyspark/rdd.py | 10 |
1 files changed, 6 insertions, 4 deletions
```diff
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 7611b13e82..33dc865256 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -29,7 +29,7 @@ from threading import Thread
 
 from pyspark import cloudpickle
 from pyspark.serializers import batched, Batch, dump_pickle, load_pickle, \
-    read_from_pickle_file
+    read_from_pickle_file, pack_long
 from pyspark.join import python_join, python_left_outer_join, \
     python_right_outer_join, python_cogroup
 from pyspark.statcounter import StatCounter
@@ -690,11 +690,13 @@ class RDD(object):
         # form the hash buckets in Python, transferring O(numPartitions) objects
         # to Java. Each object is a (splitNumber, [objects]) pair.
         def add_shuffle_key(split, iterator):
+
             buckets = defaultdict(list)
+
             for (k, v) in iterator:
                 buckets[partitionFunc(k) % numPartitions].append((k, v))
             for (split, items) in buckets.iteritems():
-                yield str(split)
+                yield pack_long(split)
                 yield dump_pickle(Batch(items))
         keyed = PipelinedRDD(self, add_shuffle_key)
         keyed._bypass_serializer = True
@@ -831,8 +833,8 @@ class RDD(object):
         >>> sorted(x.subtractByKey(y).collect())
         [('b', 4), ('b', 5)]
         """
-        filter_func = lambda tpl: len(tpl[1][0]) > 0 and len(tpl[1][1]) == 0
-        map_func = lambda tpl: [(tpl[0], val) for val in tpl[1][0]]
+        filter_func = lambda (key, vals): len(vals[0]) > 0 and len(vals[1]) == 0
+        map_func = lambda (key, vals): [(key, val) for val in vals[0]]
         return self.cogroup(other, numPartitions).filter(filter_func).flatMap(map_func)
 
     def subtract(self, other, numPartitions=None):
```