From a96558caa3c0feb20bbf0f3ec367673886fc78c6 Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Sat, 9 Jun 2012 14:44:18 -0700 Subject: Performance improvements to shuffle operations: in particular, preserve RDD partitioning in more cases where it's possible, and use iterators instead of materializing collections when doing joins. --- bagel/src/main/scala/spark/bagel/Bagel.scala | 3 +-- .../main/scala/spark/bagel/examples/WikipediaPageRankStandalone.scala | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'bagel') diff --git a/bagel/src/main/scala/spark/bagel/Bagel.scala b/bagel/src/main/scala/spark/bagel/Bagel.scala index 2f57c9c0fd..996ca2a877 100644 --- a/bagel/src/main/scala/spark/bagel/Bagel.scala +++ b/bagel/src/main/scala/spark/bagel/Bagel.scala @@ -30,8 +30,7 @@ object Bagel extends Logging { val aggregated = agg(verts, aggregator) val combinedMsgs = msgs.combineByKey( - combiner.createCombiner, combiner.mergeMsg, combiner.mergeCombiners, - splits, partitioner) + combiner.createCombiner _, combiner.mergeMsg _, combiner.mergeCombiners _, partitioner) val grouped = combinedMsgs.groupWith(verts) val (processed, numMsgs, numActiveVerts) = comp[K, V, M, C](sc, grouped, compute(_, _, aggregated, superstep)) diff --git a/bagel/src/main/scala/spark/bagel/examples/WikipediaPageRankStandalone.scala b/bagel/src/main/scala/spark/bagel/examples/WikipediaPageRankStandalone.scala index 7084ff97d9..8ce7abd03f 100644 --- a/bagel/src/main/scala/spark/bagel/examples/WikipediaPageRankStandalone.scala +++ b/bagel/src/main/scala/spark/bagel/examples/WikipediaPageRankStandalone.scala @@ -105,7 +105,6 @@ object WikipediaPageRankStandalone { ranks = (contribs.combineByKey((x: Double) => x, (x: Double, y: Double) => x + y, (x: Double, y: Double) => x + y, - numSplits, partitioner) .mapValues(sum => a/n + (1-a)*sum)) } -- cgit v1.2.3