aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorReynold Xin <rxin@databricks.com>2015-07-28 17:42:35 -0700
committerReynold Xin <rxin@databricks.com>2015-07-28 17:42:35 -0700
commit6662ee21244067180c1bcef0b16107b2979fd933 (patch)
treec99d004dd3a4cb2309e2f39537907e1859e54d4e /sql
parentb7f54119f86f916481aeccc67f07e77dc2a924c7 (diff)
downloadspark-6662ee21244067180c1bcef0b16107b2979fd933.tar.gz
spark-6662ee21244067180c1bcef0b16107b2979fd933.tar.bz2
spark-6662ee21244067180c1bcef0b16107b2979fd933.zip
[SPARK-9418][SQL] Use sort-merge join as the default shuffle join.
Sort-merge join is more robust in Spark since sorting can be made using the Tungsten sort operator. Author: Reynold Xin <rxin@databricks.com> Closes #7733 from rxin/smj and squashes the following commits: 61e4d34 [Reynold Xin] Fixed test case. 5ffd731 [Reynold Xin] Fixed JoinSuite. a137dc0 [Reynold Xin] [SPARK-9418][SQL] Use sort-merge join as the default shuffle join.
Diffstat (limited to 'sql')
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala2
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala6
-rw-r--r--sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala (renamed from sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala)8
-rw-r--r--sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala2
4 files changed, 9 insertions, 9 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 40eba33f59..cdb0c7a1c0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -322,7 +322,7 @@ private[spark] object SQLConf {
" memory.")
val SORTMERGE_JOIN = booleanConf("spark.sql.planner.sortMergeJoin",
- defaultValue = Some(false),
+ defaultValue = Some(true),
doc = "When true, use sort merge join (as opposed to hash join) by default for large joins.")
// This is only used for the thriftserver
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
index dfb2a7e099..666f26bf62 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
@@ -79,9 +79,9 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
("SELECT * FROM testData FULL OUTER JOIN testData2 WHERE key = 2", classOf[CartesianProduct]),
("SELECT * FROM testData JOIN testData2 WHERE key > a", classOf[CartesianProduct]),
("SELECT * FROM testData FULL OUTER JOIN testData2 WHERE key > a", classOf[CartesianProduct]),
- ("SELECT * FROM testData JOIN testData2 ON key = a", classOf[ShuffledHashJoin]),
- ("SELECT * FROM testData JOIN testData2 ON key = a and key = 2", classOf[ShuffledHashJoin]),
- ("SELECT * FROM testData JOIN testData2 ON key = a where key = 2", classOf[ShuffledHashJoin]),
+ ("SELECT * FROM testData JOIN testData2 ON key = a", classOf[SortMergeJoin]),
+ ("SELECT * FROM testData JOIN testData2 ON key = a and key = 2", classOf[SortMergeJoin]),
+ ("SELECT * FROM testData JOIN testData2 ON key = a where key = 2", classOf[SortMergeJoin]),
("SELECT * FROM testData LEFT JOIN testData2 ON key = a", classOf[ShuffledHashOuterJoin]),
("SELECT * FROM testData RIGHT JOIN testData2 ON key = a where key = 2",
classOf[ShuffledHashOuterJoin]),
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala
index 1fe4fe9629..1a5ba20404 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala
@@ -23,16 +23,16 @@ import org.apache.spark.sql.SQLConf
import org.apache.spark.sql.hive.test.TestHive
/**
- * Runs the test cases that are included in the hive distribution with sort merge join is true.
+ * Runs the test cases that are included in the hive distribution with hash joins.
*/
-class SortMergeCompatibilitySuite extends HiveCompatibilitySuite {
+class HashJoinCompatibilitySuite extends HiveCompatibilitySuite {
override def beforeAll() {
super.beforeAll()
- TestHive.setConf(SQLConf.SORTMERGE_JOIN, true)
+ TestHive.setConf(SQLConf.SORTMERGE_JOIN, false)
}
override def afterAll() {
- TestHive.setConf(SQLConf.SORTMERGE_JOIN, false)
+ TestHive.setConf(SQLConf.SORTMERGE_JOIN, true)
super.afterAll()
}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index f067ea0d4f..bc72b0172a 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -172,7 +172,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
bhj = df.queryExecution.sparkPlan.collect { case j: BroadcastHashJoin => j }
assert(bhj.isEmpty, "BroadcastHashJoin still planned even though it is switched off")
- val shj = df.queryExecution.sparkPlan.collect { case j: ShuffledHashJoin => j }
+ val shj = df.queryExecution.sparkPlan.collect { case j: SortMergeJoin => j }
assert(shj.size === 1,
"ShuffledHashJoin should be planned when BroadcastHashJoin is turned off")