From b92d16b114fd49e881d09e7974ad57b2a0df2906 Mon Sep 17 00:00:00 2001 From: Andrew Ash Date: Tue, 17 Jun 2014 11:47:48 -0700 Subject: SPARK-1063 Add .sortBy(f) method on RDD This never got merged from the apache/incubator-spark repo (which is now deleted) but there had been several rounds of code review on this PR there. I think this is ready for merging. Author: Andrew Ash This patch had conflicts when merged, resolved by Committer: Reynold Xin Closes #369 from ash211/sortby and squashes the following commits: d09147a [Andrew Ash] Fix Ordering import 43d0a53 [Andrew Ash] Fix missing .collect() 29a54ed [Andrew Ash] Re-enable test by converting to a closure 5a95348 [Andrew Ash] Add license for RDDSuiteUtils 64ed6e3 [Andrew Ash] Remove leaked diff d4de69a [Andrew Ash] Remove scar tissue 63638b5 [Andrew Ash] Add Python version of .sortBy() 45e0fde [Andrew Ash] Add Java version of .sortBy() adf84c5 [Andrew Ash] Re-indent to keep line lengths under 100 chars 9d9b9d8 [Andrew Ash] Use parentheses on .collect() calls 0457b69 [Andrew Ash] Ignore failing test 99f0baf [Andrew Ash] Merge branch 'master' into sortby 222ae97 [Andrew Ash] Try moving Ordering objects out to a different class 3fd0dd3 [Andrew Ash] Add (failing) test for sortByKey with explicit Ordering b8b5bbc [Andrew Ash] Align remove extra spaces that were used to align ='s in test code 8c53298 [Andrew Ash] Actually use ascending and numPartitions parameters 381eef2 [Andrew Ash] Correct silly typo 7db3e84 [Andrew Ash] Support ascending and numPartitions params in sortBy() 0f685fd [Andrew Ash] Merge remote-tracking branch 'origin/master' into sortby ca4490d [Andrew Ash] Add .sortBy(f) method on RDD --- python/pyspark/rdd.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'python/pyspark/rdd.py') diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index bb4d035edc..65f63153cd 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -549,6 +549,18 @@ class RDD(object): .mapPartitions(mapFunc,preservesPartitioning=True) .flatMap(lambda x: x, preservesPartitioning=True)) + def sortBy(self, keyfunc, ascending=True, numPartitions=None): + """ + Sorts this RDD by the given keyfunc + + >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)] + >>> sc.parallelize(tmp).sortBy(lambda x: x[0]).collect() + [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)] + >>> sc.parallelize(tmp).sortBy(lambda x: x[1]).collect() + [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)] + """ + return self.keyBy(keyfunc).sortByKey(ascending, numPartitions).values() + def glom(self): """ Return an RDD created by coalescing all elements within each partition -- cgit v1.2.3