| author | Andre Schumacher <schumach@icsi.berkeley.edu> | 2013-10-04 11:56:47 -0700 |
|---|---|---|
| committer | Andre Schumacher <schumach@icsi.berkeley.edu> | 2013-10-04 11:56:47 -0700 |
| commit | c84946fe210069259f5d42ab8fd22a5ddae91d12 (patch) | |
| tree | c57a9c9887f3a26334faa538b830880b8a035885 /python/pyspark/rdd.py | |
| parent | 232765f7b26d933caa14e0e1bc0e4937dae90523 (diff) | |
Fixing SPARK-602: PythonPartitioner
Currently, PythonPartitioner determines the partition ID by hashing a
byte-array representation of PySpark's key. This PR lets
PythonPartitioner use the actual partition ID instead, which is
required, for example, for sorting via PySpark.
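For background, the fix hinges on handing the JVM a partition ID in a fixed-width binary encoding it can read directly, rather than an opaque pickled key it would have to hash. Below is a minimal sketch of what a `pack_long` helper like the one imported in this diff does; the `struct` format string is an assumption, not taken from this commit:

```python
import struct

def pack_long(value):
    # Encode a Python int as an 8-byte big-endian signed long ("!q"),
    # the wire format a JVM DataInputStream.readLong() expects.
    return struct.pack("!q", value)

# e.g. partition 3 becomes the bytes 00 00 00 00 00 00 00 03
assert pack_long(3) == b"\x00\x00\x00\x00\x00\x00\x00\x03"
```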
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r-- | python/pyspark/rdd.py | 10 |
1 file changed, 6 insertions(+), 4 deletions(-)
```diff
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 58e1849cad..39c402b412 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -29,7 +29,7 @@ from threading import Thread
 
 from pyspark import cloudpickle
 from pyspark.serializers import batched, Batch, dump_pickle, load_pickle, \
-    read_from_pickle_file
+    read_from_pickle_file, pack_long
 from pyspark.join import python_join, python_left_outer_join, \
     python_right_outer_join, python_cogroup
 from pyspark.statcounter import StatCounter
@@ -690,11 +690,13 @@ class RDD(object):
         # form the hash buckets in Python, transferring O(numPartitions) objects
         # to Java.  Each object is a (splitNumber, [objects]) pair.
         def add_shuffle_key(split, iterator):
+
             buckets = defaultdict(list)
+
             for (k, v) in iterator:
                 buckets[partitionFunc(k) % numPartitions].append((k, v))
             for (split, items) in buckets.iteritems():
-                yield str(split)
+                yield pack_long(split)
                 yield dump_pickle(Batch(items))
         keyed = PipelinedRDD(self, add_shuffle_key)
         keyed._bypass_serializer = True
@@ -831,8 +833,8 @@ class RDD(object):
         >>> sorted(x.subtractByKey(y).collect())
         [('b', 4), ('b', 5)]
         """
-        filter_func = lambda tpl: len(tpl[1][0]) > 0 and len(tpl[1][1]) == 0
-        map_func = lambda tpl: [(tpl[0], val) for val in tpl[1][0]]
+        filter_func = lambda (key, vals): len(vals[0]) > 0 and len(vals[1]) == 0
+        map_func = lambda (key, vals): [(key, val) for val in vals[0]]
         return self.cogroup(other, numPartitions).filter(filter_func).flatMap(map_func)
 
     def subtract(self, other, numPartitions=None):
```
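To see what the patched `add_shuffle_key` emits, here is a self-contained sketch of the same bucketing scheme, written in modern Python 3 (the diff itself targets Python 2, hence `iteritems` and tuple-unpacking lambdas). Plain `pickle` and `struct` stand in for PySpark's `dump_pickle`/`Batch` and `pack_long`, and all names below are illustrative, not part of the PySpark API:

```python
import pickle
import struct
from collections import defaultdict

def add_shuffle_key_sketch(iterator, partition_func, num_partitions):
    # Bucket (k, v) pairs by target partition, then yield an alternating
    # stream: packed partition ID, followed by the pickled bucket.
    buckets = defaultdict(list)
    for k, v in iterator:
        buckets[partition_func(k) % num_partitions].append((k, v))
    for split, items in buckets.items():
        yield struct.pack("!q", split)  # fixed-width ID the JVM can read
        yield pickle.dumps(items)       # stand-in for dump_pickle(Batch(items))

pairs = [("a", 1), ("b", 2), ("c", 3), ("a", 4)]
stream = list(add_shuffle_key_sketch(iter(pairs), hash, 2))
# The stream alternates (ID, batch), so it always has even length.
assert len(stream) % 2 == 0
```

The substance of the change is visible in the first `yield`: the JVM-side PythonPartitioner can read those eight bytes as the partition ID itself, instead of hashing the variable-length byte string produced by `str(split)`, which per the commit message is what operations such as sorting via PySpark depend on.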