aboutsummaryrefslogtreecommitdiff
path: root/sql/core
diff options
context:
space:
mode:
authorDavies Liu <davies@databricks.com>2016-01-16 10:29:27 -0800
committerDavies Liu <davies.liu@gmail.com>2016-01-16 10:29:27 -0800
commit3c0d2365d57fc49ac9bf0d7cc9bd2ef633fb5fb6 (patch)
tree2e05f1fbb9eec2c870081b00b9e60505db07e1b9 /sql/core
parent86972fa52152d2149b88ba75be048a6986006285 (diff)
downloadspark-3c0d2365d57fc49ac9bf0d7cc9bd2ef633fb5fb6.tar.gz
spark-3c0d2365d57fc49ac9bf0d7cc9bd2ef633fb5fb6.tar.bz2
spark-3c0d2365d57fc49ac9bf0d7cc9bd2ef633fb5fb6.zip
[SPARK-12796] [SQL] Whole stage codegen
This is the initial work for whole stage codegen. It supports Projection/Filter/Range; we will continue working on this to support more physical operators. A micro benchmark shows that a query with range, filter and projection could be 3X faster than before. It's turned on by default. For a tree that has at least two chained plans, a WholeStageCodegen will be inserted into it. For example, the following plan ``` Limit 10 +- Project [(id#5L + 1) AS (id + 1)#6L] +- Filter ((id#5L & 1) = 1) +- Range 0, 1, 4, 10, [id#5L] ``` will be translated into ``` Limit 10 +- WholeStageCodegen +- Project [(id#1L + 1) AS (id + 1)#2L] +- Filter ((id#1L & 1) = 1) +- Range 0, 1, 4, 10, [id#1L] ``` Here is the call graph to generate Java source for A and B (A supports codegen, but B does not): ``` * WholeStageCodegen Plan A FakeInput Plan B * ========================================================================= * * -> execute() * | * doExecute() --------> produce() * | * doProduce() -------> produce() * | * doProduce() ---> execute() * | * consume() * doConsume() ------------| * | * doConsume() <----- consume() ``` A SparkPlan that supports codegen needs to implement doProduce() and doConsume(): ``` def doProduce(ctx: CodegenContext): (RDD[InternalRow], String) def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String ``` Author: Davies Liu <davies@databricks.com> Closes #10735 from davies/whole2.
Diffstat (limited to 'sql/core')
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala3
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala9
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala3
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/execution/BufferedRowIterator.java64
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala1
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala299
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala114
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala6
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala6
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala2
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala6
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala6
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/execution/BenchmarkWholeStageCodegen.scala60
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala6
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala38
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala6
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala2
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala2
18 files changed, 604 insertions, 29 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 3422d0ead4..95e5fbb119 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -18,7 +18,6 @@
package org.apache.spark.sql
import java.io.CharArrayWriter
-import java.util.Properties
import scala.language.implicitConversions
import scala.reflect.ClassTag
@@ -39,12 +38,10 @@ import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, FileRelation, LogicalRDD, Queryable, QueryExecution, SQLExecution}
import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation}
import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
-import org.apache.spark.sql.sources.HadoopFsRelation
import org.apache.spark.sql.types._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.Utils
-
private[sql] object DataFrame {
def apply(sqlContext: SQLContext, logicalPlan: LogicalPlan): DataFrame = {
new DataFrame(sqlContext, logicalPlan)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 4e3662724c..4c1eb0b30b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -489,6 +489,13 @@ private[spark] object SQLConf {
isPublic = false,
doc = "This flag should be set to true to enable support for SQL2011 reserved keywords.")
+  // Internal (non-public) switch for whole stage codegen; it defaults to true.
+  val WHOLESTAGE_CODEGEN_ENABLED = booleanConf(
+    "spark.sql.codegen.wholeStage",
+    defaultValue = Some(true),
+    isPublic = false,
+    doc = "When true, the whole stage (of multiple operators) will be compiled into " +
+      "single java method")
+
+
object Deprecated {
val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
val EXTERNAL_SORT = "spark.sql.planner.externalSort"
@@ -561,6 +568,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with ParserCon
private[spark] def nativeView: Boolean = getConf(NATIVE_VIEW)
+ /** Reads the current value of the WHOLESTAGE_CODEGEN_ENABLED configuration entry. */
+ private[spark] def wholeStageEnabled: Boolean = getConf(WHOLESTAGE_CODEGEN_ENABLED)
+
def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE)
private[spark] def subexpressionEliminationEnabled: Boolean =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index a0939adb6d..18ddffe1be 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -904,7 +904,8 @@ class SQLContext private[sql](
@transient
protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] {
val batches = Seq(
- Batch("Add exchange", Once, EnsureRequirements(self))
+ Batch("Add exchange", Once, EnsureRequirements(self)),
+ Batch("Whole stage codegen", Once, CollapseCodegenStages(self))
)
}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/BufferedRowIterator.java b/sql/core/src/main/scala/org/apache/spark/sql/execution/BufferedRowIterator.java
new file mode 100644
index 0000000000..b1bbb1da10
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/BufferedRowIterator.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution;
+
+import scala.collection.Iterator;
+
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
+
+/**
+ * An iterator interface used to pull the output from generated function for multiple operators
+ * (whole stage codegen).
+ *
+ * TODO: replace it with a batched columnar format.
+ */
+public class BufferedRowIterator {
+  // Row produced by the most recent processNext() call that has not been handed out yet;
+  // null when nothing is buffered.
+  protected InternalRow currentRow;
+  // Source of upstream rows, assigned through setInput().
+  protected Iterator<InternalRow> input;
+  // used when there is no column in output
+  protected UnsafeRow unsafeRow = new UnsafeRow(0);
+
+  /**
+   * Returns true if a row is available, calling processNext() to buffer one when needed.
+   */
+  public boolean hasNext() {
+    if (currentRow == null) {
+      processNext();
+    }
+    return currentRow != null;
+  }
+
+  /**
+   * Returns the buffered row and clears the buffer.
+   *
+   * When nothing is buffered (for example, the caller did not call hasNext() first), a row is
+   * produced on demand, so rows are not lost to a spurious null. Returns null only when
+   * processNext() leaves currentRow unset, i.e. when no more rows are left.
+   */
+  public InternalRow next() {
+    if (currentRow == null) {
+      processNext();
+    }
+    InternalRow r = currentRow;
+    currentRow = null;
+    return r;
+  }
+
+  /**
+   * Sets the iterator that processNext() pulls rows from. It must be called before the first
+   * hasNext()/next(), because processNext() dereferences the input.
+   */
+  public void setInput(Iterator<InternalRow> iter) {
+    input = iter;
+  }
+
+  /**
+   * Processes the input until there is a row to output (stored in currentRow).
+   *
+   * After it's called, if currentRow is still null, it means no more rows are left.
+   * This default implementation passes the next input row through unchanged.
+   */
+  protected void processNext() {
+    if (input.hasNext()) {
+      currentRow = input.next();
+    }
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index 2355de3d05..75101ea0fc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -97,7 +97,6 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
/** Specifies sort order for each partition requirements on the input data for this operator. */
def requiredChildOrdering: Seq[Seq[SortOrder]] = Seq.fill(children.size)(Nil)
-
/**
* Returns the result of this query as an RDD[InternalRow] by delegating to doExecute
* after adding query plan information to created RDDs for visualization.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala
new file mode 100644
index 0000000000..c15fabab80
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala
@@ -0,0 +1,299 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, BoundReference, Expression, LeafExpression}
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.catalyst.rules.Rule
+
+/**
+ * An interface for those physical operators that support codegen.
+ */
+trait CodegenSupport extends SparkPlan {
+
+  /**
+   * Whether this SparkPlan supports whole stage codegen or not. Defaults to true.
+   */
+  def supportCodegen: Boolean = true
+
+  /**
+   * The operator that called produce() on this one. consume() forwards the generated
+   * columns to its doConsume(). It is null until produce() has been called.
+   */
+  private var parent: CodegenSupport = null
+
+  /**
+   * Remembers `parent` as the consumer of this operator's columns, then returns the input RDD
+   * of InternalRow together with the Java source code that processes its rows.
+   */
+  def produce(ctx: CodegenContext, parent: CodegenSupport): (RDD[InternalRow], String) = {
+    this.parent = parent
+    doProduce(ctx)
+  }
+
+  /**
+   * Generates the Java source code that produces rows; must be overridden by subclasses to
+   * support codegen.
+   *
+   * doProduce() usually generates the framework. For example, an aggregation could generate:
+   *
+   *   if (!initialized) {
+   *     # create a hash map, then build the aggregation hash map
+   *     # call child.produce()
+   *     initialized = true;
+   *   }
+   *   while (hashmap.hasNext()) {
+   *     row = hashmap.next();
+   *     # build the aggregation results
+   *     # create variables for results
+   *     # call consume(), which will call parent.doConsume()
+   *   }
+   */
+  protected def doProduce(ctx: CodegenContext): (RDD[InternalRow], String)
+
+  /**
+   * Passes the columns generated by this SparkPlan to its parent's doConsume() and returns
+   * the Java source the parent generates for them. There must be exactly one column per
+   * output attribute.
+   */
+  protected def consume(ctx: CodegenContext, columns: Seq[ExprCode]): String = {
+    assert(output.length == columns.length)
+    parent.doConsume(ctx, this, columns)
+  }
+
+  /**
+   * Generates the Java source code that processes the rows coming from a child SparkPlan.
+   *
+   * This must be overridden by subclasses to support codegen.
+   *
+   * For example, Filter generates code like this:
+   *
+   *   # code to evaluate the predicate expression, result is isNull1 and value2
+   *   if (!isNull1 && value2) {
+   *     # call consume(), which will call parent.doConsume()
+   *   }
+   */
+  def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String
+}
+
+
+/**
+ * InputAdapter is used to hide a SparkPlan from a subtree that support codegen.
+ *
+ * This is the leaf node of a tree with WholeStageCodegen, is used to generate code that consumes
+ * an RDD iterator of InternalRow.
+ */
+case class InputAdapter(child: SparkPlan) extends LeafNode with CodegenSupport {
+
+  override def output: Seq[Attribute] = child.output
+
+  override def supportCodegen: Boolean = true
+
+  /**
+   * Generates a loop that reads every row of the wrapped child's RDD, extracts each output
+   * column by ordinal and hands the columns to the parent through consume().
+   */
+  override def doProduce(ctx: CodegenContext): (RDD[InternalRow], String) = {
+    // One reference per output column, addressed by position and always treated as nullable.
+    val boundColumns = output.zipWithIndex.map { case (attr, ordinal) =>
+      new BoundReference(ordinal, attr.dataType, true)
+    }
+    val rowTerm = ctx.freshName("row")
+    // The context must be set up before gen() is called on the references. Presumably a null
+    // currentVars makes them read from INPUT_ROW -- TODO confirm against CodegenContext.
+    ctx.INPUT_ROW = rowTerm
+    ctx.currentVars = null
+    val columns = boundColumns.map(_.gen(ctx))
+    val loopSource = s"""
+       | while (input.hasNext()) {
+       | InternalRow $rowTerm = (InternalRow) input.next();
+       | ${columns.map(_.code).mkString("\n")}
+       | ${consume(ctx, columns)}
+       | }
+     """.stripMargin
+    (child.execute(), loopSource)
+  }
+
+  /** Not supported: this node is a leaf of the codegen subtree, so nothing feeds into it. */
+  def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String =
+    throw new UnsupportedOperationException
+
+  /** Not supported: the wrapped child is executed from doProduce() instead. */
+  override def doExecute(): RDD[InternalRow] =
+    throw new UnsupportedOperationException
+
+  override def simpleString: String = "INPUT"
+}
+
+/**
+ * WholeStageCodegen compiles a subtree of plans that support codegen together into a single
+ * Java function.
+ *
+ * Here is the call graph to generate Java source (plan A supports codegen, but plan B does not):
+ *
+ *   WholeStageCodegen       Plan A               InputAdapter     Plan B
+ * =========================================================================
+ *
+ * -> execute()
+ *     |
+ *  doExecute() -------->   produce()
+ *                             |
+ *                          doProduce()  -------> produce()
+ *                                                   |
+ *                                                doProduce() ---> execute()
+ *                                                   |
+ *                                                consume()
+ *                          doConsume()  ------------|
+ *                             |
+ *  doConsume()  <-----    consume()
+ *
+ * SparkPlan A should override doProduce() and doConsume().
+ *
+ * doExecute() creates a CodegenContext, which holds a list of variables for the input that is
+ * used to generate code for BoundReference.
+ */
+case class WholeStageCodegen(plan: CodegenSupport, children: Seq[SparkPlan])
+ extends SparkPlan with CodegenSupport {
+
+ // The stage exposes exactly the output of the wrapped plan.
+ override def output: Seq[Attribute] = plan.output
+
+ // Generates the Java source for the wrapped plan, then runs it over every partition of the
+ // input RDD returned by plan.produce().
+ override def doExecute(): RDD[InternalRow] = {
+ val ctx = new CodegenContext
+ // This node is passed as the parent, so the wrapped plan's consume() ends in doConsume() below.
+ val (rdd, code) = plan.produce(ctx, this)
+ // Objects collected in the context are passed to the generated class through generate().
+ val references = ctx.references.toArray
+ // `code` becomes the body of processNext() in a generated BufferedRowIterator subclass.
+ val source = s"""
+ public Object generate(Object[] references) {
+ return new GeneratedIterator(references);
+ }
+
+ class GeneratedIterator extends org.apache.spark.sql.execution.BufferedRowIterator {
+
+ private Object[] references;
+ ${ctx.declareMutableStates()}
+
+ public GeneratedIterator(Object[] references) {
+ this.references = references;
+ ${ctx.initMutableStates()}
+ }
+
+ protected void processNext() {
+ $code
+ }
+ }
+ """
+ // try to compile, helpful for debug
+ // println(s"${CodeFormatter.format(source)}")
+ // The result of this compile is discarded; it runs before (and outside) the mapPartitions
+ // closure, so a broken source fails here first.
+ CodeGenerator.compile(source)
+
+ rdd.mapPartitions { iter =>
+ // Compile inside the closure, instantiate the generated iterator and feed it this
+ // partition's rows.
+ val clazz = CodeGenerator.compile(source)
+ val buffer = clazz.generate(references).asInstanceOf[BufferedRowIterator]
+ buffer.setInput(iter)
+ // Expose the Java-side buffer as a Scala Iterator.
+ new Iterator[InternalRow] {
+ override def hasNext: Boolean = buffer.hasNext
+ override def next: InternalRow = buffer.next()
+ }
+ }
+ }
+
+ // Not supported: this node only starts code generation from doExecute().
+ override def doProduce(ctx: CodegenContext): (RDD[InternalRow], String) = {
+ throw new UnsupportedOperationException
+ }
+
+ // Generates the code that ends one processNext() call: it stores the result row in currentRow
+ // and returns, so the generated iterator hands rows out one at a time.
+ override def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String = {
+ if (input.nonEmpty) {
+ val colExprs = output.zipWithIndex.map { case (attr, i) =>
+ BoundReference(i, attr.dataType, attr.nullable)
+ }
+ // generate the code to create a UnsafeRow
+ ctx.currentVars = input
+ val code = GenerateUnsafeProjection.createCode(ctx, colExprs, false)
+ s"""
+ | ${code.code.trim}
+ | currentRow = ${code.value};
+ | return;
+ """.stripMargin
+ } else {
+ // There is no columns
+ // so the pre-allocated `unsafeRow` field of BufferedRowIterator is used as the row.
+ s"""
+ | currentRow = unsafeRow;
+ | return;
+ """.stripMargin
+ }
+ }
+
+ // Appends this node, then the wrapped plan, then the stage's children to `builder`.
+ override def generateTreeString(
+ depth: Int,
+ lastChildren: Seq[Boolean],
+ builder: StringBuilder): StringBuilder = {
+ if (depth > 0) {
+ // One prefix fragment per ancestor level, then the branch marker for this node.
+ lastChildren.init.foreach { isLast =>
+ val prefixFragment = if (isLast) " " else ": "
+ builder.append(prefixFragment)
+ }
+
+ val branch = if (lastChildren.last) "+- " else ":- "
+ builder.append(branch)
+ }
+
+ builder.append(simpleString)
+ builder.append("\n")
+
+ // Two flags are appended for the wrapped plan, so it gets one more prefix fragment than
+ // the children printed below. NOTE(review): confirm the extra nesting level is intended.
+ plan.generateTreeString(depth + 1, lastChildren :+children.isEmpty :+ true, builder)
+ if (children.nonEmpty) {
+ children.init.foreach(_.generateTreeString(depth + 1, lastChildren :+ false, builder))
+ children.last.generateTreeString(depth + 1, lastChildren :+ true, builder)
+ }
+
+ builder
+ }
+
+ override def simpleString: String = "WholeStageCodegen"
+}
+
+
+/**
+ * Find the chained plans that support codegen, collapse them together as WholeStageCodegen.
+ */
+private[sql] case class CollapseCodegenStages(sqlContext: SQLContext) extends Rule[SparkPlan] {
+
+  /**
+   * Returns true when `plan` can be part of a whole stage codegen subtree: it implements
+   * CodegenSupport and opts in, none of its expressions contains a non-leaf CodegenFallback,
+   * and it has at most 200 output columns.
+   */
+  private def supportCodegen(plan: SparkPlan): Boolean = plan match {
+    case candidate: CodegenSupport if candidate.supportCodegen =>
+      // Non-leaf with CodegenFallback does not work with whole stage codegen
+      val willFallback = candidate.expressions.exists(
+        _.find(e => e.isInstanceOf[CodegenFallback] && !e.isInstanceOf[LeafExpression]).isDefined
+      )
+      // the generated code will be huge if there are too many columns
+      val haveManyColumns = candidate.output.length > 200
+      !willFallback && !haveManyColumns
+    case _ => false
+  }
+
+  /**
+   * Wraps every chain of at least two codegen-capable operators in a WholeStageCodegen node.
+   * Returns `plan` unchanged when whole stage codegen is disabled in the configuration.
+   */
+  def apply(plan: SparkPlan): SparkPlan = {
+    if (sqlContext.conf.wholeStageEnabled) {
+      plan.transform {
+        case root: CodegenSupport if supportCodegen(root) &&
+            // Whole stage codegen is only useful when there are at least two levels of operators
+            // that support it (save at least one projection/iterator).
+            root.children.exists(supportCodegen) =>
+
+          // Plans that cannot be compiled are collected here, in the order they are hidden
+          // behind an InputAdapter leaf; they become the children of the new stage.
+          val inputs = ArrayBuffer[SparkPlan]()
+          val combined = root.transform {
+            case p if !supportCodegen(p) =>
+              inputs += p
+              InputAdapter(p)
+          }.asInstanceOf[CodegenSupport]
+          WholeStageCodegen(combined, inputs)
+      }
+    } else {
+      plan
+    }
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index 92c9a56131..9e2e0357c6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -22,19 +22,37 @@ import org.apache.spark.rdd.{PartitionwiseSampledRDD, RDD, ShuffledRDD}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, ExpressionCanonicalizer}
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.types.LongType
import org.apache.spark.util.MutablePair
import org.apache.spark.util.random.PoissonSampler
-case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryNode {
+case class Project(projectList: Seq[NamedExpression], child: SparkPlan)
+ extends UnaryNode with CodegenSupport {
override private[sql] lazy val metrics = Map(
"numRows" -> SQLMetrics.createLongMetric(sparkContext, "number of rows"))
override def output: Seq[Attribute] = projectList.map(_.toAttribute)
+  /**
+   * Project has no loop of its own: it asks its child to produce, registering itself as the
+   * child's parent. The child must implement CodegenSupport.
+   */
+  protected override def doProduce(ctx: CodegenContext): (RDD[InternalRow], String) = {
+    val upstream = child.asInstanceOf[CodegenSupport]
+    upstream.produce(ctx, this)
+  }
+
+  /**
+   * Generates the code that evaluates every expression of the project list against the
+   * child's columns, then passes the results on through consume().
+   */
+  override def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String = {
+    // Bind each projection to the child's output first, then canonicalize it.
+    val boundExprs = projectList.map { named =>
+      val bound = BindReferences.bindReference(named, child.output)
+      ExpressionCanonicalizer.execute(bound)
+    }
+    // The child's columns must be registered as the current variables before gen() is called.
+    ctx.currentVars = input
+    val projected = boundExprs.map(_.gen(ctx))
+    s"""
+       | ${projected.map(_.code).mkString("\n")}
+       |
+       | ${consume(ctx, projected)}
+     """.stripMargin
+  }
+
protected override def doExecute(): RDD[InternalRow] = {
val numRows = longMetric("numRows")
child.execute().mapPartitionsInternal { iter =>
@@ -51,13 +69,30 @@ case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends
}
-case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode {
+case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode with CodegenSupport {
override def output: Seq[Attribute] = child.output
private[sql] override lazy val metrics = Map(
"numInputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of input rows"),
"numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"))
+  /**
+   * Filter has no loop of its own: it asks its child to produce, registering itself as the
+   * child's parent. The child must implement CodegenSupport.
+   */
+  protected override def doProduce(ctx: CodegenContext): (RDD[InternalRow], String) = {
+    val upstream = child.asInstanceOf[CodegenSupport]
+    upstream.produce(ctx, this)
+  }
+
+  /**
+   * Generates the code that evaluates the filter condition against the child's columns and
+   * passes the columns on through consume() only when the result is non-null and true.
+   */
+  override def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String = {
+    // Bind the condition to the child's output first, then canonicalize it.
+    val bound = BindReferences.bindReference(condition, child.output)
+    val predicate = ExpressionCanonicalizer.execute(bound)
+    // The child's columns must be registered as the current variables before gen() is called.
+    ctx.currentVars = input
+    val predicateCode = predicate.gen(ctx)
+    s"""
+       | ${predicateCode.code}
+       | if (!${predicateCode.isNull} && ${predicateCode.value}) {
+       | ${consume(ctx, ctx.currentVars)}
+       | }
+     """.stripMargin
+  }
+
protected override def doExecute(): RDD[InternalRow] = {
val numInputRows = longMetric("numInputRows")
val numOutputRows = longMetric("numOutputRows")
@@ -116,7 +151,80 @@ case class Range(
numSlices: Int,
numElements: BigInt,
output: Seq[Attribute])
- extends LeafNode {
+ extends LeafNode with CodegenSupport {
+
+  /**
+   * Generates the code that emits this range's values for one partition.
+   *
+   * The returned RDD has `numSlices` partitions and carries one row per partition, holding the
+   * partition index. On its first run the generated code reads that index and computes, with
+   * BigInteger arithmetic clamped to the Long range, the first value and the end bound of the
+   * partition. It then emits values step by step, stopping at the end bound or when adding the
+   * step overflows a long.
+   */
+  protected override def doProduce(ctx: CodegenContext): (RDD[InternalRow], String) = {
+    val initTerm = ctx.freshName("range_initRange")
+    ctx.addMutableState("boolean", initTerm, s"$initTerm = false;")
+    val partitionEnd = ctx.freshName("range_partitionEnd")
+    ctx.addMutableState("long", partitionEnd, s"$partitionEnd = 0L;")
+    val number = ctx.freshName("range_number")
+    ctx.addMutableState("long", number, s"$number = 0L;")
+    val overflow = ctx.freshName("range_overflow")
+    ctx.addMutableState("boolean", overflow, s"$overflow = false;")
+
+    val value = ctx.freshName("range_value")
+    // The emitted column is never null, hence the constant "false" for isNull.
+    val ev = ExprCode("", "false", value)
+    // Fully qualified Java class name used in the generated source. It is deliberately not
+    // called `BigInt`, so that it does not shadow scala.BigInt inside this method.
+    val bigInteger = classOf[java.math.BigInteger].getName
+    val checkEnd = if (step > 0) {
+      s"$number < $partitionEnd"
+    } else {
+      s"$number > $partitionEnd"
+    }
+
+    val rdd = sqlContext.sparkContext.parallelize(0 until numSlices, numSlices)
+      .map(i => InternalRow(i))
+
+    // numElements is a BigInt and may not fit into a long, so it is embedded as a decimal
+    // string and parsed by the BigInteger constructor instead of being truncated by toLong.
+    val code = s"""
+      | // initialize Range
+      | if (!$initTerm) {
+      | $initTerm = true;
+      | if (input.hasNext()) {
+      | $bigInteger index = $bigInteger.valueOf(((InternalRow) input.next()).getInt(0));
+      | $bigInteger numSlice = $bigInteger.valueOf(${numSlices}L);
+      | $bigInteger numElement = new $bigInteger("${numElements.toString}");
+      | $bigInteger step = $bigInteger.valueOf(${step}L);
+      | $bigInteger start = $bigInteger.valueOf(${start}L);
+      |
+      | $bigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start);
+      | if (st.compareTo($bigInteger.valueOf(Long.MAX_VALUE)) > 0) {
+      | $number = Long.MAX_VALUE;
+      | } else if (st.compareTo($bigInteger.valueOf(Long.MIN_VALUE)) < 0) {
+      | $number = Long.MIN_VALUE;
+      | } else {
+      | $number = st.longValue();
+      | }
+      |
+      | $bigInteger end = index.add($bigInteger.ONE).multiply(numElement).divide(numSlice)
+      | .multiply(step).add(start);
+      | if (end.compareTo($bigInteger.valueOf(Long.MAX_VALUE)) > 0) {
+      | $partitionEnd = Long.MAX_VALUE;
+      | } else if (end.compareTo($bigInteger.valueOf(Long.MIN_VALUE)) < 0) {
+      | $partitionEnd = Long.MIN_VALUE;
+      | } else {
+      | $partitionEnd = end.longValue();
+      | }
+      | } else {
+      | return;
+      | }
+      | }
+      |
+      | while (!$overflow && $checkEnd) {
+      | long $value = $number;
+      | $number += ${step}L;
+      | if ($number < $value ^ ${step}L < 0) {
+      | $overflow = true;
+      | }
+      | ${consume(ctx, Seq(ev))}
+      | }
+     """.stripMargin
+
+    (rdd, code)
+  }
+
+  /** Not supported: Range is a leaf node, so there is no child whose rows it could consume. */
+  def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String =
+    throw new UnsupportedOperationException
protected override def doExecute(): RDD[InternalRow] = {
sqlContext
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala
index 7888e34e8a..72eb1f6cf0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala
@@ -143,14 +143,14 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera
private DataType[] columnTypes = null;
private int[] columnIndexes = null;
- ${declareMutableStates(ctx)}
+ ${ctx.declareMutableStates()}
public SpecificColumnarIterator() {
this.nativeOrder = ByteOrder.nativeOrder();
this.buffers = new byte[${columnTypes.length}][];
this.mutableRow = new MutableUnsafeRow(rowWriter);
- ${initMutableStates(ctx)}
+ ${ctx.initMutableStates()}
}
public void initialize(Iterator input, DataType[] columnTypes, int[] columnIndexes) {
@@ -190,6 +190,6 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera
logDebug(s"Generated ColumnarIterator: ${CodeFormatter.format(code)}")
- compile(code).generate(Array.empty).asInstanceOf[ColumnarIterator]
+ CodeGenerator.compile(code).generate(Array.empty).asInstanceOf[ColumnarIterator]
}
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
index 89b9a68768..e8d0678989 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
@@ -36,12 +36,12 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext
import testImplicits._
def rddIdOf(tableName: String): Int = {
- val executedPlan = sqlContext.table(tableName).queryExecution.executedPlan
- executedPlan.collect {
+ val plan = sqlContext.table(tableName).queryExecution.sparkPlan
+ plan.collect {
case InMemoryColumnarTableScan(_, _, relation) =>
relation.cachedColumnBuffers.id
case _ =>
- fail(s"Table $tableName is not cached\n" + executedPlan)
+ fail(s"Table $tableName is not cached\n" + plan)
}.head
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index eb4efcd1d4..b349bb6dc9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -629,7 +629,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext {
}
def checkNumProjects(df: DataFrame, expectedNumProjects: Int): Unit = {
- val projects = df.queryExecution.executedPlan.collect {
+ val projects = df.queryExecution.sparkPlan.collect {
case tungstenProject: Project => tungstenProject
}
assert(projects.size === expectedNumProjects)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
index 39a65413bd..c17be8ace9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
@@ -123,15 +123,15 @@ class DataFrameJoinSuite extends QueryTest with SharedSQLContext {
val df2 = Seq((1, "1"), (2, "2")).toDF("key", "value")
// equijoin - should be converted into broadcast join
- val plan1 = df1.join(broadcast(df2), "key").queryExecution.executedPlan
+ val plan1 = df1.join(broadcast(df2), "key").queryExecution.sparkPlan
assert(plan1.collect { case p: BroadcastHashJoin => p }.size === 1)
// no join key -- should not be a broadcast join
- val plan2 = df1.join(broadcast(df2)).queryExecution.executedPlan
+ val plan2 = df1.join(broadcast(df2)).queryExecution.sparkPlan
assert(plan2.collect { case p: BroadcastHashJoin => p }.size === 0)
// planner should not crash without a join
- broadcast(df1).queryExecution.executedPlan
+ broadcast(df1).queryExecution.sparkPlan
// SPARK-12275: no physical plan for BroadcastHint in some condition
withTempPath { path =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 75e81b9c91..bdb9421cc1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -247,7 +247,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
private def testCodeGen(sqlText: String, expectedResults: Seq[Row]): Unit = {
val df = sql(sqlText)
// First, check if we have GeneratedAggregate.
- val hasGeneratedAgg = df.queryExecution.executedPlan
+ val hasGeneratedAgg = df.queryExecution.sparkPlan
.collect { case _: aggregate.TungstenAggregate => true }
.nonEmpty
if (!hasGeneratedAgg) {
@@ -792,11 +792,11 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
test("SPARK-11111 null-safe join should not use cartesian product") {
val df = sql("select count(*) from testData a join testData b on (a.key <=> b.key)")
- val cp = df.queryExecution.executedPlan.collect {
+ val cp = df.queryExecution.sparkPlan.collect {
case cp: CartesianProduct => cp
}
assert(cp.isEmpty, "should not use CartesianProduct for null-safe join")
- val smj = df.queryExecution.executedPlan.collect {
+ val smj = df.queryExecution.sparkPlan.collect {
case smj: SortMergeJoin => smj
}
assert(smj.size > 0, "should use SortMergeJoin")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/BenchmarkWholeStageCodegen.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/BenchmarkWholeStageCodegen.scala
new file mode 100644
index 0000000000..788b04fcf8
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/BenchmarkWholeStageCodegen.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.util.Benchmark
+
+/**
+ * Benchmark to measure whole stage codegen performance.
+ * To run this:
+ * build/sbt "sql/test-only *BenchmarkWholeStageCodegen"
+ */
+class BenchmarkWholeStageCodegen extends SparkFunSuite {
+
+  /**
+   * Registers the same range/filter/count query twice on a [[Benchmark]], once with
+   * `spark.sql.codegen.wholeStage` set to "false" and once with it set to "true",
+   * and then runs the benchmark.
+   *
+   * @param values upper bound passed to `sqlContext.range`; also handed to the
+   *               [[Benchmark]] constructor as its second argument
+   */
+  def testWholeStage(values: Int): Unit = {
+    val sparkConf = new SparkConf().setMaster("local[1]").setAppName("benchmark")
+    val sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate(sparkConf))
+
+    // The measured query. The flag is (re)set inside every invocation, so each case
+    // always runs under its own setting regardless of what ran before it.
+    def countOddIds(wholeStage: Boolean): Unit = {
+      sqlContext.setConf("spark.sql.codegen.wholeStage", wholeStage.toString)
+      sqlContext.range(values).filter("(id & 1) = 1").count()
+    }
+
+    val benchmark = new Benchmark("Single Int Column Scan", values)
+    benchmark.addCase("Without whole stage codegen") { _ => countOddIds(wholeStage = false) }
+    benchmark.addCase("With whole stage codegen") { _ => countOddIds(wholeStage = true) }
+
+    /*
+      Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz
+      Single Int Column Scan:      Avg Time(ms)    Avg Rate(M/s)  Relative Rate
+      -------------------------------------------------------------------------
+      Without whole stage codegen       6725.52            31.18         1.00 X
+      With whole stage codegen          2233.05            93.91         3.01 X
+    */
+    benchmark.run()
+  }
+
+  // Registered with `ignore`, so the benchmark is skipped unless this is changed to `test`.
+  ignore("benchmark") {
+    testWholeStage(1024 * 1024 * 200)
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
index 03a1b8e11d..49feeaf17d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -94,7 +94,7 @@ class PlannerSuite extends SharedSQLContext {
"""
|SELECT l.a, l.b
|FROM testData2 l JOIN (SELECT * FROM testLimit LIMIT 1) r ON (l.a = r.key)
- """.stripMargin).queryExecution.executedPlan
+ """.stripMargin).queryExecution.sparkPlan
val broadcastHashJoins = planned.collect { case join: BroadcastHashJoin => join }
val sortMergeJoins = planned.collect { case join: SortMergeJoin => join }
@@ -147,7 +147,7 @@ class PlannerSuite extends SharedSQLContext {
val a = testData.as("a")
val b = sqlContext.table("tiny").as("b")
- val planned = a.join(b, $"a.key" === $"b.key").queryExecution.executedPlan
+ val planned = a.join(b, $"a.key" === $"b.key").queryExecution.sparkPlan
val broadcastHashJoins = planned.collect { case join: BroadcastHashJoin => join }
val sortMergeJoins = planned.collect { case join: SortMergeJoin => join }
@@ -168,7 +168,7 @@ class PlannerSuite extends SharedSQLContext {
sqlContext.registerDataFrameAsTable(df, "testPushed")
withTempTable("testPushed") {
- val exp = sql("select * from testPushed where key = 15").queryExecution.executedPlan
+ val exp = sql("select * from testPushed where key = 15").queryExecution.sparkPlan
assert(exp.toString.contains("PushedFilters: [EqualTo(key,15)]"))
}
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
new file mode 100644
index 0000000000..c54fc6ba2d
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.test.SharedSQLContext
+
+class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext {
+
+  test("range/filter should be combined") {
+    import org.apache.spark.sql.catalyst.expressions.{EqualTo, Literal}
+
+    // Planning a range + filter + projection must produce a WholeStageCodegen node.
+    val df = sqlContext.range(10).filter("id = 1").selectExpr("id + 1")
+    val plan = df.queryExecution.executedPlan
+    assert(plan.find(_.isInstanceOf[WholeStageCodegen]).isDefined)
+
+    // Build a real equality predicate on the input plan's first output attribute.
+    // Writing `'a == 1` would use Scala's universal `==` on a Symbol and an Int, which
+    // is simply `false`; the DSL implicits would then turn that Boolean into a
+    // constant-false filter and both plans would trivially agree on an empty result.
+    // The attribute is taken from `p.output` rather than by name so that it is already
+    // resolved when it is placed inside the WholeStageCodegen node.
+    def firstColumnIsOne(p: SparkPlan): EqualTo = EqualTo(p.output.head, Literal(1L))
+
+    // The hand-built whole-stage plan must return the same rows as the plain Filter.
+    checkThatPlansAgree(
+      sqlContext.range(100),
+      (p: SparkPlan) =>
+        WholeStageCodegen(Filter(firstColumnIsOne(p), InputAdapter(p)), Seq()),
+      (p: SparkPlan) => Filter(firstColumnIsOne(p), p),
+      sortAnswers = false
+    )
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala
index 25afed25c8..6e21d5a061 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala
@@ -31,7 +31,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext {
setupTestData()
test("simple columnar query") {
- val plan = sqlContext.executePlan(testData.logicalPlan).executedPlan
+ val plan = sqlContext.executePlan(testData.logicalPlan).sparkPlan
val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None)
checkAnswer(scan, testData.collect().toSeq)
@@ -48,7 +48,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext {
}
test("projection") {
- val plan = sqlContext.executePlan(testData.select('value, 'key).logicalPlan).executedPlan
+ val plan = sqlContext.executePlan(testData.select('value, 'key).logicalPlan).sparkPlan
val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None)
checkAnswer(scan, testData.collect().map {
@@ -57,7 +57,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext {
}
test("SPARK-1436 regression: in-memory columns must be able to be accessed multiple times") {
- val plan = sqlContext.executePlan(testData.logicalPlan).executedPlan
+ val plan = sqlContext.executePlan(testData.logicalPlan).sparkPlan
val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None)
checkAnswer(scan, testData.collect().toSeq)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala
index d762f7bfe9..647a7e9a4e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala
@@ -114,7 +114,7 @@ class PartitionBatchPruningSuite extends SparkFunSuite with SharedSQLContext {
df.collect().map(_(0)).toArray
}
- val (readPartitions, readBatches) = df.queryExecution.executedPlan.collect {
+ val (readPartitions, readBatches) = df.queryExecution.sparkPlan.collect {
case in: InMemoryColumnarTableScan => (in.readPartitions.value, in.readBatches.value)
}.head
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala
index 58581d71e1..aee8e84db5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala
@@ -62,7 +62,7 @@ class BroadcastJoinSuite extends QueryTest with BeforeAndAfterAll {
// Comparison at the end is for broadcast left semi join
val joinExpression = df1("key") === df2("key") && df1("value") > df2("value")
val df3 = df1.join(broadcast(df2), joinExpression, joinType)
- val plan = df3.queryExecution.executedPlan
+ val plan = df3.queryExecution.sparkPlan
assert(plan.collect { case p: T => p }.size === 1)
plan.executeCollect()
}