author    Rajesh Balamohan <rbalamohan@apache.org>  2016-01-20 11:30:03 -0800
committer Reynold Xin <rxin@databricks.com>         2016-01-20 11:30:03 -0800
commit    ab4a6bfd11b870428eb2a96aa213f7d34c0aa622
tree      ab5e253658f616f3c930167d8667e5873709e2d7
parent    e75e340a406b765608258b49f7e2f1107d4605fb
[SPARK-12898] Consider having dummyCallSite for HiveTableScan
Currently, HiveTableScan runs with getCallSite, which is quite expensive and shows up prominently when scanning a large partitioned table (e.g. TPC-DS), slowing down the overall runtime of the job. It would be good to use dummyCallSite in HiveTableScan instead.

Author: Rajesh Balamohan <rbalamohan@apache.org>

Closes #10825 from rajeshbalamohan/SPARK-12898.
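For context, a minimal sketch of the pattern the patch applies. Note that org.apache.spark.util.Utils is private[spark], so this only compiles from within Spark's own packages; the makeRdd thunk here is a hypothetical stand-in for the hadoopReader calls in the diff below.

    import org.apache.spark.SparkContext
    import org.apache.spark.rdd.RDD
    import org.apache.spark.util.Utils

    def buildRddWithoutCallSite[T](sc: SparkContext)(makeRdd: => RDD[T]): RDD[T] = {
      // Inside this block, new RDDs pick up an empty (dummy) call site instead
      // of having Spark walk the stack via getCallSite for every RDD created,
      // which is what gets expensive when a scan touches many partitions.
      Utils.withDummyCallSite(sc) {
        makeRdd
      }
    }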
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala | 13
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
index 1588728bdb..eff8833e92 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
@@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.hive._
 import org.apache.spark.sql.types.{BooleanType, DataType}
+import org.apache.spark.util.Utils
 
 /**
  * The Hive table scan operator. Column and partition pruning are both handled.
@@ -133,11 +134,17 @@ case class HiveTableScan(
   }
 
   protected override def doExecute(): RDD[InternalRow] = {
+    // Using dummyCallSite, as getCallSite can turn out to be expensive
+    // with multiple partitions.
     val rdd = if (!relation.hiveQlTable.isPartitioned) {
-      hadoopReader.makeRDDForTable(relation.hiveQlTable)
+      Utils.withDummyCallSite(sqlContext.sparkContext) {
+        hadoopReader.makeRDDForTable(relation.hiveQlTable)
+      }
     } else {
-      hadoopReader.makeRDDForPartitionedTable(
-        prunePartitions(relation.getHiveQlPartitions(partitionPruningPred)))
+      Utils.withDummyCallSite(sqlContext.sparkContext) {
+        hadoopReader.makeRDDForPartitionedTable(
+          prunePartitions(relation.getHiveQlPartitions(partitionPruningPred)))
+      }
     }
     rdd.mapPartitionsInternal { iter =>
       val proj = UnsafeProjection.create(schema)
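For reference, a rough sketch of what Utils.withDummyCallSite does internally, based on Spark's implementation around this release; the CallSite.SHORT_FORM / CallSite.LONG_FORM property keys are Spark-internal, so treat this as illustrative rather than authoritative.

    def withDummyCallSite[T](sc: SparkContext)(body: => T): T = {
      val oldShortCallSite = sc.getLocalProperty(CallSite.SHORT_FORM)
      val oldLongCallSite = sc.getLocalProperty(CallSite.LONG_FORM)
      try {
        // Empty call-site properties make RDD construction skip the
        // expensive stack walk in Utils.getCallSite.
        sc.setLocalProperty(CallSite.SHORT_FORM, "")
        sc.setLocalProperty(CallSite.LONG_FORM, "")
        body
      } finally {
        // Restore the previous call site so later RDDs are labeled normally.
        sc.setLocalProperty(CallSite.SHORT_FORM, oldShortCallSite)
        sc.setLocalProperty(CallSite.LONG_FORM, oldLongCallSite)
      }
    }

Wrapping each branch of the if/else separately, as the patch does, keeps the dummy call site scoped tightly to the RDD-creating calls; wrapping the whole if/else once would behave the same here and is a stylistic choice.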