author	wangxiaojing <u9jing@gmail.com>	2014-12-30 13:54:12 -0800
committer	Michael Armbrust <michael@databricks.com>	2014-12-30 13:54:12 -0800
commit	07fa1910d9c4092d670381c447403105f01c584e (patch)
tree	6552fd64d119b5333145b49200a598676145c534 /sql/hive
parent	8f29b7cafc2b6e802e4eb21f681d6369da2f30fa (diff)
[SPARK-4570][SQL] add BroadcastLeftSemiJoinHash
JIRA issue: [SPARK-4570](https://issues.apache.org/jira/browse/SPARK-4570)

We are planning to create a `BroadcastLeftSemiJoinHash` to implement the broadcast join for `left semi join`.

In a left semi join, if the size of the data on the right side is smaller than the user-settable threshold `AUTO_BROADCASTJOIN_THRESHOLD`, the planner marks it as the `broadcast` relation and marks the other relation as the stream side. The broadcast table is sent to all of the executors involved in the join as an `org.apache.spark.broadcast.Broadcast` object, and the join uses `joins.BroadcastLeftSemiJoinHash`; otherwise it uses `joins.LeftSemiJoinHash`.

The benchmark below suggests the optimized version is about 4x faster for a left semi join:
<pre><code>
Original:
left semi join : 9288 ms
Optimized:
left semi join : 1963 ms
</code></pre>
The micro benchmark loads `/data1/kv3.txt` into a normal Hive table.

Benchmark code:
<pre><code>
def benchmark(f: => Unit) = {
  val begin = System.currentTimeMillis()
  f
  val end = System.currentTimeMillis()
  end - begin
}

val sc = new SparkContext(
  new SparkConf()
    .setMaster("local")
    .setAppName(getClass.getSimpleName.stripSuffix("$")))
val hiveContext = new HiveContext(sc)
import hiveContext._

sql("drop table if exists left_table")
sql("drop table if exists right_table")
sql("""create table left_table (key int, value string)""")
sql(s"""load data local inpath "/data1/kv3.txt" into table left_table""")
sql("""create table right_table (key int, value string)""")
sql(
  """
    |from left_table
    |insert overwrite table right_table
    |select left_table.key, left_table.value
  """.stripMargin)

val leftSemiJoin = sql(
  """select a.key from left_table a
    |left semi join right_table b on a.key = b.key""".stripMargin)
val leftSemiJoinDuration = benchmark(leftSemiJoin.count())
println(s"left semi join : $leftSemiJoinDuration ms ")
</code></pre>

Author: wangxiaojing <u9jing@gmail.com>

Closes #3442 from wangxiaojing/SPARK-4570 and squashes the following commits:

a4a43c9 [wangxiaojing] rebase
f103983 [wangxiaojing] change style
fbe4887 [wangxiaojing] change style
ff2e618 [wangxiaojing] add testsuite
1a8da2a [wangxiaojing] add BroadcastLeftSemiJoinHash
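For illustration, here is a minimal sketch (not part of this patch) of how the threshold selects between the two physical operators, mirroring the test added to `StatisticsSuite` below. It assumes the `hiveContext` and the `left_table`/`right_table` created by the benchmark code above; the 10485760 (10 MB) value is just an illustrative threshold assumed to be larger than the right-side table.
<pre><code>
import org.apache.spark.sql.SQLConf
import org.apache.spark.sql.execution.joins.{BroadcastLeftSemiJoinHash, LeftSemiJoinHash}
import hiveContext._

val query =
  """select a.key from left_table a
    |left semi join right_table b on a.key = b.key""".stripMargin

// Right side below the threshold: the planner picks the broadcast operator.
sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=10485760")
val broadcastPlan = sql(query).queryExecution.sparkPlan
assert(broadcastPlan.collect { case j: BroadcastLeftSemiJoinHash => j }.nonEmpty)

// A threshold of -1 disables broadcasting: the shuffle-based operator is planned instead.
sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1")
val shufflePlan = sql(query).queryExecution.sparkPlan
assert(shufflePlan.collect { case j: LeftSemiJoinHash => j }.nonEmpty)
</code></pre>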
Diffstat (limited to 'sql/hive')
-rw-r--r--	sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala	50
1 file changed, 49 insertions(+), 1 deletion(-)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index ff4071d8e2..4b6a9308b9 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -22,7 +22,7 @@ import org.scalatest.BeforeAndAfterAll
import scala.reflect.ClassTag
import org.apache.spark.sql.{SQLConf, QueryTest}
-import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, ShuffledHashJoin}
+import org.apache.spark.sql.execution.joins._
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.hive.execution._
@@ -193,4 +193,52 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
)
}
+ test("auto converts to broadcast left semi join, by size estimate of a relation") {
+ val leftSemiJoinQuery =
+ """SELECT * FROM src a
+ |left semi JOIN src b ON a.key=86 and a.key = b.key""".stripMargin
+ val answer = (86, "val_86") :: Nil
+
+ var rdd = sql(leftSemiJoinQuery)
+
+ // Assert src has a size smaller than the threshold.
+ val sizes = rdd.queryExecution.analyzed.collect {
+ case r if implicitly[ClassTag[MetastoreRelation]].runtimeClass
+ .isAssignableFrom(r.getClass) =>
+ r.statistics.sizeInBytes
+ }
+ assert(sizes.size === 2 && sizes(1) <= autoBroadcastJoinThreshold
+ && sizes(0) <= autoBroadcastJoinThreshold,
+ s"query should contain two relations, each of which has size smaller than autoConvertSize")
+
+ // Using `sparkPlan` because for relevant patterns in HashJoin to be
+ // matched, other strategies need to be applied.
+ var bhj = rdd.queryExecution.sparkPlan.collect {
+ case j: BroadcastLeftSemiJoinHash => j
+ }
+ assert(bhj.size === 1,
+ s"actual query plans do not contain broadcast join: ${rdd.queryExecution}")
+
+ checkAnswer(rdd, answer) // check correctness of output
+
+ TestHive.settings.synchronized {
+ val tmp = autoBroadcastJoinThreshold
+
+ sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1")
+ rdd = sql(leftSemiJoinQuery)
+ bhj = rdd.queryExecution.sparkPlan.collect {
+ case j: BroadcastLeftSemiJoinHash => j
+ }
+ assert(bhj.isEmpty, "BroadcastHashJoin still planned even though it is switched off")
+
+ val shj = rdd.queryExecution.sparkPlan.collect {
+ case j: LeftSemiJoinHash => j
+ }
+ assert(shj.size === 1,
+ "LeftSemiJoinHash should be planned when BroadcastHashJoin is turned off")
+
+ sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=$tmp")
+ }
+
+ }
}