[SPARK-12796] [SQL] Whole stage codegen

This is the initial work for whole stage codegen, it support Projection/Filter/Range, we will continue work on this to support more physical operators. A micro benchmark show that a query with range, filter and projection could be 3X faster then before. It's turned on by default. For a tree that have at least two chained plans, a WholeStageCodegen will be inserted into it, for example, the following plan ``` Limit 10 +- Project [(id#5L + 1) AS (id + 1)#6L] +- Filter ((id#5L & 1) = 1) +- Range 0, 1, 4, 10, [id#5L] ``` will be translated into ``` Limit 10 +- WholeStageCodegen +- Project [(id#1L + 1) AS (id + 1)#2L] +- Filter ((id#1L & 1) = 1) +- Range 0, 1, 4, 10, [id#1L] ``` Here is the call graph to generate Java source for A and B (A support codegen, but B does not): ``` * WholeStageCodegen Plan A FakeInput Plan B * ========================================================================= * * -> execute() * | * doExecute() --------> produce() * | * doProduce() -------> produce() * | * doProduce() ---> execute() * | * consume() * doConsume() ------------| * | * doConsume() <----- consume() ``` A SparkPlan that support codegen need to implement doProduce() and doConsume(): ``` def doProduce(ctx: CodegenContext): (RDD[InternalRow], String) def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String ``` Author: Davies Liu <davies@databricks.com> Closes #10735 from davies/whole2.
author: Davies Liu <davies@databricks.com> 2016-01-16 10:29:27 -0800
committer: Davies Liu <davies.liu@gmail.com> 2016-01-16 10:29:27 -0800
commit: 3c0d2365d57fc49ac9bf0d7cc9bd2ef633fb5fb6 (patch)
tree: 2e05f1fbb9eec2c870081b00b9e60505db07e1b9 /sql/core
parent: 86972fa52152d2149b88ba75be048a6986006285 (diff)
download: spark-3c0d2365d57fc49ac9bf0d7cc9bd2ef633fb5fb6.tar.gz
spark-3c0d2365d57fc49ac9bf0d7cc9bd2ef633fb5fb6.tar.bz2
spark-3c0d2365d57fc49ac9bf0d7cc9bd2ef633fb5fb6.zip
18 files changed, 604 insertions, 29 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 3422d0ead4..95e5fbb119 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql
 
 import java.io.CharArrayWriter
-import java.util.Properties
 
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
@@ -39,12 +38,10 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, FileRelation, LogicalRDD, Queryable, QueryExecution, SQLExecution}
 import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation}
 import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
-import org.apache.spark.sql.sources.HadoopFsRelation
 import org.apache.spark.sql.types._
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.Utils
 
-
 private[sql] object DataFrame {
   def apply(sqlContext: SQLContext, logicalPlan: LogicalPlan): DataFrame = {
     new DataFrame(sqlContext, logicalPlan)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 4e3662724c..4c1eb0b30b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -489,6 +489,13 @@ private[spark] object SQLConf {
     isPublic = false,
     doc = "This flag should be set to true to enable support for SQL2011 reserved keywords.")
 
+  val WHOLESTAGE_CODEGEN_ENABLED = booleanConf("spark.sql.codegen.wholeStage",
+    defaultValue = Some(true),
+    doc = "When true, the whole stage (of multiple operators) will be compiled into single java" +
+      " method",
+    isPublic = false)
+
+
   object Deprecated {
     val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
     val EXTERNAL_SORT = "spark.sql.planner.externalSort"
@@ -561,6 +568,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with ParserCon
 
   private[spark] def nativeView: Boolean = getConf(NATIVE_VIEW)
 
+  private[spark] def wholeStageEnabled: Boolean = getConf(WHOLESTAGE_CODEGEN_ENABLED)
+
   def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE)
 
   private[spark] def subexpressionEliminationEnabled: Boolean =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index a0939adb6d..18ddffe1be 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -904,7 +904,8 @@ class SQLContext private[sql](
   @transient
   protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] {
     val batches = Seq(
-      Batch("Add exchange", Once, EnsureRequirements(self))
+      Batch("Add exchange", Once, EnsureRequirements(self)),
+      Batch("Whole stage codegen", Once, CollapseCodegenStages(self))
     )
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/BufferedRowIterator.java b/sql/core/src/main/scala/org/apache/spark/sql/execution/BufferedRowIterator.java
new file mode 100644
index 0000000000..b1bbb1da10
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/BufferedRowIterator.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution;
+
+import scala.collection.Iterator;
+
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
+
+/**
+ * An iterator interface used to pull the output from generated function for multiple operators
+ * (whole stage codegen).
+ *
+ * TODO: replaced it by batched columnar format.
+ */
+public class BufferedRowIterator {
+  protected InternalRow currentRow;
+  protected Iterator<InternalRow> input;
+  // used when there is no column in output
+  protected UnsafeRow unsafeRow = new UnsafeRow(0);
+
+  public boolean hasNext() {
+    if (currentRow == null) {
+      processNext();
+    }
+    return currentRow != null;
+  }
+
+  public InternalRow next() {
+    InternalRow r = currentRow;
+    currentRow = null;
+    return r;
+  }
+
+  public void setInput(Iterator<InternalRow> iter) {
+    input = iter;
+  }
+
+  /**
+   * Processes the input until have a row as output (currentRow).
+   *
+   * After it's called, if currentRow is still null, it means no more rows left.
+   */
+  protected void processNext() {
+    if (input.hasNext()) {
+      currentRow = input.next();
+    }
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index 2355de3d05..75101ea0fc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -97,7 +97,6 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
   /** Specifies sort order for each partition requirements on the input data for this operator. */
   def requiredChildOrdering: Seq[Seq[SortOrder]] = Seq.fill(children.size)(Nil)
 
-
   /**
    * Returns the result of this query as an RDD[InternalRow] by delegating to doExecute
    * after adding query plan information to created RDDs for visualization.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala
new file mode 100644
index 0000000000..c15fabab80
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala
@@ -0,0 +1,299 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, BoundReference, Expression, LeafExpression}
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.catalyst.rules.Rule
+
+/**
+  * An interface for those physical operators that support codegen.
+  */
+trait CodegenSupport extends SparkPlan {
+
+  /**
+    * Whether this SparkPlan support whole stage codegen or not.
+    */
+  def supportCodegen: Boolean = true
+
+  /**
+    * Which SparkPlan is calling produce() of this one. It's itself for the first SparkPlan.
+    */
+  private var parent: CodegenSupport = null
+
+  /**
+    * Returns an input RDD of InternalRow and Java source code to process them.
+    */
+  def produce(ctx: CodegenContext, parent: CodegenSupport): (RDD[InternalRow], String) = {
+    this.parent = parent
+    doProduce(ctx)
+  }
+
+  /**
+    * Generate the Java source code to process, should be overrided by subclass to support codegen.
+    *
+    * doProduce() usually generate the framework, for example, aggregation could generate this:
+    *
+    *   if (!initialized) {
+    *     # create a hash map, then build the aggregation hash map
+    *     # call child.produce()
+    *     initialized = true;
+    *   }
+    *   while (hashmap.hasNext()) {
+    *     row = hashmap.next();
+    *     # build the aggregation results
+    *     # create varialbles for results
+    *     # call consume(), wich will call parent.doConsume()
+    *   }
+    */
+  protected def doProduce(ctx: CodegenContext): (RDD[InternalRow], String)
+
+  /**
+    * Consume the columns generated from current SparkPlan, call it's parent or create an iterator.
+    */
+  protected def consume(ctx: CodegenContext, columns: Seq[ExprCode]): String = {
+    assert(columns.length == output.length)
+    parent.doConsume(ctx, this, columns)
+  }
+
+
+  /**
+    * Generate the Java source code to process the rows from child SparkPlan.
+    *
+    * This should be override by subclass to support codegen.
+    *
+    * For example, Filter will generate the code like this:
+    *
+    *   # code to evaluate the predicate expression, result is isNull1 and value2
+    *   if (isNull1 || value2) {
+    *     # call consume(), which will call parent.doConsume()
+    *   }
+    */
+  def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String
+}
+
+
+/**
+  * InputAdapter is used to hide a SparkPlan from a subtree that support codegen.
+  *
+  * This is the leaf node of a tree with WholeStageCodegen, is used to generate code that consumes
+  * an RDD iterator of InternalRow.
+  */
+case class InputAdapter(child: SparkPlan) extends LeafNode with CodegenSupport {
+
+  override def output: Seq[Attribute] = child.output
+
+  override def supportCodegen: Boolean = true
+
+  override def doProduce(ctx: CodegenContext): (RDD[InternalRow], String) = {
+    val exprs = output.zipWithIndex.map(x => new BoundReference(x._2, x._1.dataType, true))
+    val row = ctx.freshName("row")
+    ctx.INPUT_ROW = row
+    ctx.currentVars = null
+    val columns = exprs.map(_.gen(ctx))
+    val code = s"""
+       |  while (input.hasNext()) {
+       |   InternalRow $row = (InternalRow) input.next();
+       |   ${columns.map(_.code).mkString("\n")}
+       |   ${consume(ctx, columns)}
+       | }
+     """.stripMargin
+    (child.execute(), code)
+  }
+
+  def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String = {
+    throw new UnsupportedOperationException
+  }
+
+  override def doExecute(): RDD[InternalRow] = {
+    throw new UnsupportedOperationException
+  }
+
+  override def simpleString: String = "INPUT"
+}
+
+/**
+  * WholeStageCodegen compile a subtree of plans that support codegen together into single Java
+  * function.
+  *
+  * Here is the call graph of to generate Java source (plan A support codegen, but plan B does not):
+  *
+  *   WholeStageCodegen       Plan A               FakeInput        Plan B
+  * =========================================================================
+  *
+  * -> execute()
+  *     |
+  *  doExecute() -------->   produce()
+  *                             |
+  *                          doProduce()  -------> produce()
+  *                                                   |
+  *                                                doProduce() ---> execute()
+  *                                                   |
+  *                                                consume()
+  *                          doConsume()  ------------|
+  *                             |
+  *  doConsume()  <-----    consume()
+  *
+  * SparkPlan A should override doProduce() and doConsume().
+  *
+  * doCodeGen() will create a CodeGenContext, which will hold a list of variables for input,
+  * used to generated code for BoundReference.
+  */
+case class WholeStageCodegen(plan: CodegenSupport, children: Seq[SparkPlan])
+  extends SparkPlan with CodegenSupport {
+
+  override def output: Seq[Attribute] = plan.output
+
+  override def doExecute(): RDD[InternalRow] = {
+    val ctx = new CodegenContext
+    val (rdd, code) = plan.produce(ctx, this)
+    val references = ctx.references.toArray
+    val source = s"""
+      public Object generate(Object[] references) {
+       return new GeneratedIterator(references);
+      }
+
+      class GeneratedIterator extends org.apache.spark.sql.execution.BufferedRowIterator {
+
+       private Object[] references;
+       ${ctx.declareMutableStates()}
+
+       public GeneratedIterator(Object[] references) {
+         this.references = references;
+         ${ctx.initMutableStates()}
+       }
+
+       protected void processNext() {
+         $code
+       }
+      }
+     """
+    // try to compile, helpful for debug
+    // println(s"${CodeFormatter.format(source)}")
+    CodeGenerator.compile(source)
+
+    rdd.mapPartitions { iter =>
+      val clazz = CodeGenerator.compile(source)
+      val buffer = clazz.generate(references).asInstanceOf[BufferedRowIterator]
+      buffer.setInput(iter)
+      new Iterator[InternalRow] {
+        override def hasNext: Boolean = buffer.hasNext
+        override def next: InternalRow = buffer.next()
+      }
+    }
+  }
+
+  override def doProduce(ctx: CodegenContext): (RDD[InternalRow], String) = {
+    throw new UnsupportedOperationException
+  }
+
+  override def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String = {
+    if (input.nonEmpty) {
+      val colExprs = output.zipWithIndex.map { case (attr, i) =>
+        BoundReference(i, attr.dataType, attr.nullable)
+      }
+      // generate the code to create a UnsafeRow
+      ctx.currentVars = input
+      val code = GenerateUnsafeProjection.createCode(ctx, colExprs, false)
+      s"""
+         | ${code.code.trim}
+         | currentRow = ${code.value};
+         | return;
+     """.stripMargin
+    } else {
+      // There is no columns
+      s"""
+         | currentRow = unsafeRow;
+         | return;
+       """.stripMargin
+    }
+  }
+
+  override def generateTreeString(
+      depth: Int,
+      lastChildren: Seq[Boolean],
+      builder: StringBuilder): StringBuilder = {
+    if (depth > 0) {
+      lastChildren.init.foreach { isLast =>
+        val prefixFragment = if (isLast) "   " else ":  "
+        builder.append(prefixFragment)
+      }
+
+      val branch = if (lastChildren.last) "+- " else ":- "
+      builder.append(branch)
+    }
+
+    builder.append(simpleString)
+    builder.append("\n")
+
+    plan.generateTreeString(depth + 1, lastChildren :+children.isEmpty :+ true, builder)
+    if (children.nonEmpty) {
+      children.init.foreach(_.generateTreeString(depth + 1, lastChildren :+ false, builder))
+      children.last.generateTreeString(depth + 1, lastChildren :+ true, builder)
+    }
+
+    builder
+  }
+
+  override def simpleString: String = "WholeStageCodegen"
+}
+
+
+/**
+  * Find the chained plans that support codegen, collapse them together as WholeStageCodegen.
+  */
+private[sql] case class CollapseCodegenStages(sqlContext: SQLContext) extends Rule[SparkPlan] {
+
+  private def supportCodegen(plan: SparkPlan): Boolean = plan match {
+    case plan: CodegenSupport if plan.supportCodegen =>
+      // Non-leaf with CodegenFallback does not work with whole stage codegen
+      val willFallback = plan.expressions.exists(
+        _.find(e => e.isInstanceOf[CodegenFallback] && !e.isInstanceOf[LeafExpression]).isDefined
+      )
+      // the generated code will be huge if there are too many columns
+      val haveManyColumns = plan.output.length > 200
+      !willFallback && !haveManyColumns
+    case _ => false
+  }
+
+  def apply(plan: SparkPlan): SparkPlan = {
+    if (sqlContext.conf.wholeStageEnabled) {
+      plan.transform {
+        case plan: CodegenSupport if supportCodegen(plan) &&
+          // Whole stage codegen is only useful when there are at least two levels of operators that
+          // support it (save at least one projection/iterator).
+          plan.children.exists(supportCodegen) =>
+
+          var inputs = ArrayBuffer[SparkPlan]()
+          val combined = plan.transform {
+            case p if !supportCodegen(p) =>
+              inputs += p
+              InputAdapter(p)
+          }.asInstanceOf[CodegenSupport]
+          WholeStageCodegen(combined, inputs)
+      }
+    } else {
+      plan
+    }
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index 92c9a56131..9e2e0357c6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -22,19 +22,37 @@ import org.apache.spark.rdd.{PartitionwiseSampledRDD, RDD, ShuffledRDD}
 import org.apache.spark.shuffle.sort.SortShuffleManager
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, ExpressionCanonicalizer}
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.execution.metric.SQLMetrics
 import org.apache.spark.sql.types.LongType
 import org.apache.spark.util.MutablePair
 import org.apache.spark.util.random.PoissonSampler
 
-case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryNode {
+case class Project(projectList: Seq[NamedExpression], child: SparkPlan)
+  extends UnaryNode with CodegenSupport {
 
   override private[sql] lazy val metrics = Map(
     "numRows" -> SQLMetrics.createLongMetric(sparkContext, "number of rows"))
 
   override def output: Seq[Attribute] = projectList.map(_.toAttribute)
 
+  protected override def doProduce(ctx: CodegenContext): (RDD[InternalRow], String) = {
+    child.asInstanceOf[CodegenSupport].produce(ctx, this)
+  }
+
+  override def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String = {
+    val exprs = projectList.map(x =>
+      ExpressionCanonicalizer.execute(BindReferences.bindReference(x, child.output)))
+    ctx.currentVars = input
+    val output = exprs.map(_.gen(ctx))
+    s"""
+       | ${output.map(_.code).mkString("\n")}
+       |
+       | ${consume(ctx, output)}
+     """.stripMargin
+  }
+
   protected override def doExecute(): RDD[InternalRow] = {
     val numRows = longMetric("numRows")
     child.execute().mapPartitionsInternal { iter =>
@@ -51,13 +69,30 @@ case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends
 }
 
 
-case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode {
+case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode with CodegenSupport {
   override def output: Seq[Attribute] = child.output
 
   private[sql] override lazy val metrics = Map(
     "numInputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of input rows"),
     "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"))
 
+  protected override def doProduce(ctx: CodegenContext): (RDD[InternalRow], String) = {
+    child.asInstanceOf[CodegenSupport].produce(ctx, this)
+  }
+
+  override def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String = {
+    val expr = ExpressionCanonicalizer.execute(
+      BindReferences.bindReference(condition, child.output))
+    ctx.currentVars = input
+    val eval = expr.gen(ctx)
+    s"""
+       | ${eval.code}
+       | if (!${eval.isNull} && ${eval.value}) {
+       |   ${consume(ctx, ctx.currentVars)}
+       | }
+     """.stripMargin
+  }
+
   protected override def doExecute(): RDD[InternalRow] = {
     val numInputRows = longMetric("numInputRows")
     val numOutputRows = longMetric("numOutputRows")
@@ -116,7 +151,80 @@ case class Range(
     numSlices: Int,
     numElements: BigInt,
     output: Seq[Attribute])
-  extends LeafNode {
+  extends LeafNode with CodegenSupport {
+
+  protected override def doProduce(ctx: CodegenContext): (RDD[InternalRow], String) = {
+    val initTerm = ctx.freshName("range_initRange")
+    ctx.addMutableState("boolean", initTerm, s"$initTerm = false;")
+    val partitionEnd = ctx.freshName("range_partitionEnd")
+    ctx.addMutableState("long", partitionEnd, s"$partitionEnd = 0L;")
+    val number = ctx.freshName("range_number")
+    ctx.addMutableState("long", number, s"$number = 0L;")
+    val overflow = ctx.freshName("range_overflow")
+    ctx.addMutableState("boolean", overflow, s"$overflow = false;")
+
+    val value = ctx.freshName("range_value")
+    val ev = ExprCode("", "false", value)
+    val BigInt = classOf[java.math.BigInteger].getName
+    val checkEnd = if (step > 0) {
+      s"$number < $partitionEnd"
+    } else {
+      s"$number > $partitionEnd"
+    }
+
+    val rdd = sqlContext.sparkContext.parallelize(0 until numSlices, numSlices)
+      .map(i => InternalRow(i))
+
+    val code = s"""
+      | // initialize Range
+      | if (!$initTerm) {
+      |   $initTerm = true;
+      |   if (input.hasNext()) {
+      |     $BigInt index = $BigInt.valueOf(((InternalRow) input.next()).getInt(0));
+      |     $BigInt numSlice = $BigInt.valueOf(${numSlices}L);
+      |     $BigInt numElement = $BigInt.valueOf(${numElements.toLong}L);
+      |     $BigInt step = $BigInt.valueOf(${step}L);
+      |     $BigInt start = $BigInt.valueOf(${start}L);
+      |
+      |     $BigInt st = index.multiply(numElement).divide(numSlice).multiply(step).add(start);
+      |     if (st.compareTo($BigInt.valueOf(Long.MAX_VALUE)) > 0) {
+      |       $number = Long.MAX_VALUE;
+      |     } else if (st.compareTo($BigInt.valueOf(Long.MIN_VALUE)) < 0) {
+      |       $number = Long.MIN_VALUE;
+      |     } else {
+      |       $number = st.longValue();
+      |     }
+      |
+      |     $BigInt end = index.add($BigInt.ONE).multiply(numElement).divide(numSlice)
+      |       .multiply(step).add(start);
+      |     if (end.compareTo($BigInt.valueOf(Long.MAX_VALUE)) > 0) {
+      |       $partitionEnd = Long.MAX_VALUE;
+      |     } else if (end.compareTo($BigInt.valueOf(Long.MIN_VALUE)) < 0) {
+      |       $partitionEnd = Long.MIN_VALUE;
+      |     } else {
+      |       $partitionEnd = end.longValue();
+      |     }
+      |   } else {
+      |     return;
+      |   }
+      | }
+      |
+      | while (!$overflow && $checkEnd) {
+      |  long $value = $number;
+      |  $number += ${step}L;
+      |  if ($number < $value ^ ${step}L < 0) {
+      |    $overflow = true;
+      |  }
+      |  ${consume(ctx, Seq(ev))}
+      | }
+     """.stripMargin
+
+    (rdd, code)
+  }
+
+  def doConsume(ctx: CodegenContext, child: SparkPlan, input: Seq[ExprCode]): String = {
+    throw new UnsupportedOperationException
+  }
 
   protected override def doExecute(): RDD[InternalRow] = {
     sqlContext
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala
index 7888e34e8a..72eb1f6cf0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala
@@ -143,14 +143,14 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera
         private DataType[] columnTypes = null;
         private int[] columnIndexes = null;
 
-        ${declareMutableStates(ctx)}
+        ${ctx.declareMutableStates()}
 
         public SpecificColumnarIterator() {
           this.nativeOrder = ByteOrder.nativeOrder();
           this.buffers = new byte[${columnTypes.length}][];
           this.mutableRow = new MutableUnsafeRow(rowWriter);
 
-          ${initMutableStates(ctx)}
+          ${ctx.initMutableStates()}
         }
 
         public void initialize(Iterator input, DataType[] columnTypes, int[] columnIndexes) {
@@ -190,6 +190,6 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera
 
     logDebug(s"Generated ColumnarIterator: ${CodeFormatter.format(code)}")
 
-    compile(code).generate(Array.empty).asInstanceOf[ColumnarIterator]
+    CodeGenerator.compile(code).generate(Array.empty).asInstanceOf[ColumnarIterator]
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
index 89b9a68768..e8d0678989 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
@@ -36,12 +36,12 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext
   import testImplicits._
 
   def rddIdOf(tableName: String): Int = {
-    val executedPlan = sqlContext.table(tableName).queryExecution.executedPlan
-    executedPlan.collect {
+    val plan = sqlContext.table(tableName).queryExecution.sparkPlan
+    plan.collect {
       case InMemoryColumnarTableScan(_, _, relation) =>
         relation.cachedColumnBuffers.id
       case _ =>
-        fail(s"Table $tableName is not cached\n" + executedPlan)
+        fail(s"Table $tableName is not cached\n" + plan)
     }.head
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index eb4efcd1d4..b349bb6dc9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -629,7 +629,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext {
     }
 
     def checkNumProjects(df: DataFrame, expectedNumProjects: Int): Unit = {
-      val projects = df.queryExecution.executedPlan.collect {
+      val projects = df.queryExecution.sparkPlan.collect {
         case tungstenProject: Project => tungstenProject
       }
       assert(projects.size === expectedNumProjects)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
index 39a65413bd..c17be8ace9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
@@ -123,15 +123,15 @@ class DataFrameJoinSuite extends QueryTest with SharedSQLContext {
     val df2 = Seq((1, "1"), (2, "2")).toDF("key", "value")
 
     // equijoin - should be converted into broadcast join
-    val plan1 = df1.join(broadcast(df2), "key").queryExecution.executedPlan
+    val plan1 = df1.join(broadcast(df2), "key").queryExecution.sparkPlan
     assert(plan1.collect { case p: BroadcastHashJoin => p }.size === 1)
 
     // no join key -- should not be a broadcast join
-    val plan2 = df1.join(broadcast(df2)).queryExecution.executedPlan
+    val plan2 = df1.join(broadcast(df2)).queryExecution.sparkPlan
     assert(plan2.collect { case p: BroadcastHashJoin => p }.size === 0)
 
     // planner should not crash without a join
-    broadcast(df1).queryExecution.executedPlan
+    broadcast(df1).queryExecution.sparkPlan
 
     // SPARK-12275: no physical plan for BroadcastHint in some condition
     withTempPath { path =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 75e81b9c91..bdb9421cc1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -247,7 +247,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
   private def testCodeGen(sqlText: String, expectedResults: Seq[Row]): Unit = {
     val df = sql(sqlText)
     // First, check if we have GeneratedAggregate.
-    val hasGeneratedAgg = df.queryExecution.executedPlan
+    val hasGeneratedAgg = df.queryExecution.sparkPlan
       .collect { case _: aggregate.TungstenAggregate => true }
       .nonEmpty
     if (!hasGeneratedAgg) {
@@ -792,11 +792,11 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
 
   test("SPARK-11111 null-safe join should not use cartesian product") {
     val df = sql("select count(*) from testData a join testData b on (a.key <=> b.key)")
-    val cp = df.queryExecution.executedPlan.collect {
+    val cp = df.queryExecution.sparkPlan.collect {
       case cp: CartesianProduct => cp
     }
     assert(cp.isEmpty, "should not use CartesianProduct for null-safe join")
-    val smj = df.queryExecution.executedPlan.collect {
+    val smj = df.queryExecution.sparkPlan.collect {
       case smj: SortMergeJoin => smj
     }
     assert(smj.size > 0, "should use SortMergeJoin")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/BenchmarkWholeStageCodegen.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/BenchmarkWholeStageCodegen.scala
new file mode 100644
index 0000000000..788b04fcf8
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/BenchmarkWholeStageCodegen.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.util.Benchmark
+
+/**
+  * Benchmark to measure whole stage codegen performance.
+  * To run this:
+  *  build/sbt "sql/test-only *BenchmarkWholeStageCodegen"
+  */
+class BenchmarkWholeStageCodegen extends SparkFunSuite {
+  def testWholeStage(values: Int): Unit = {
+    val conf = new SparkConf().setMaster("local[1]").setAppName("benchmark")
+    val sc = SparkContext.getOrCreate(conf)
+    val sqlContext = SQLContext.getOrCreate(sc)
+
+    val benchmark = new Benchmark("Single Int Column Scan", values)
+
+    benchmark.addCase("Without whole stage codegen") { iter =>
+      sqlContext.setConf("spark.sql.codegen.wholeStage", "false")
+      sqlContext.range(values).filter("(id & 1) = 1").count()
+    }
+
+    benchmark.addCase("With whole stage codegen") { iter =>
+      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
+      sqlContext.range(values).filter("(id & 1) = 1").count()
+    }
+
+    /*
+      Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz
+      Single Int Column Scan:      Avg Time(ms)    Avg Rate(M/s)  Relative Rate
+      -------------------------------------------------------------------------
+      Without whole stage codegen       6725.52            31.18         1.00 X
+      With whole stage codegen          2233.05            93.91         3.01 X
+    */
+    benchmark.run()
+  }
+
+  ignore("benchmark") {
+    testWholeStage(1024 * 1024 * 200)
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
index 03a1b8e11d..49feeaf17d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -94,7 +94,7 @@ class PlannerSuite extends SharedSQLContext {
           """
             |SELECT l.a, l.b
             |FROM testData2 l JOIN (SELECT * FROM testLimit LIMIT 1) r ON (l.a = r.key)
-          """.stripMargin).queryExecution.executedPlan
+          """.stripMargin).queryExecution.sparkPlan
 
         val broadcastHashJoins = planned.collect { case join: BroadcastHashJoin => join }
         val sortMergeJoins = planned.collect { case join: SortMergeJoin => join }
@@ -147,7 +147,7 @@ class PlannerSuite extends SharedSQLContext {
 
         val a = testData.as("a")
         val b = sqlContext.table("tiny").as("b")
-        val planned = a.join(b, $"a.key" === $"b.key").queryExecution.executedPlan
+        val planned = a.join(b, $"a.key" === $"b.key").queryExecution.sparkPlan
 
         val broadcastHashJoins = planned.collect { case join: BroadcastHashJoin => join }
         val sortMergeJoins = planned.collect { case join: SortMergeJoin => join }
@@ -168,7 +168,7 @@ class PlannerSuite extends SharedSQLContext {
       sqlContext.registerDataFrameAsTable(df, "testPushed")
 
       withTempTable("testPushed") {
-        val exp = sql("select * from testPushed where key = 15").queryExecution.executedPlan
+        val exp = sql("select * from testPushed where key = 15").queryExecution.sparkPlan
         assert(exp.toString.contains("PushedFilters: [EqualTo(key,15)]"))
       }
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
new file mode 100644
index 0000000000..c54fc6ba2d
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.test.SharedSQLContext
+
+class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext {
+
+  test("range/filter should be combined") {
+    val df = sqlContext.range(10).filter("id = 1").selectExpr("id + 1")
+    val plan = df.queryExecution.executedPlan
+    assert(plan.find(_.isInstanceOf[WholeStageCodegen]).isDefined)
+
+    checkThatPlansAgree(
+      sqlContext.range(100),
+      (p: SparkPlan) =>
+        WholeStageCodegen(Filter('a == 1, InputAdapter(p)), Seq()),
+      (p: SparkPlan) => Filter('a == 1, p),
+      sortAnswers = false
+    )
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala
index 25afed25c8..6e21d5a061 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala
@@ -31,7 +31,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext {
   setupTestData()
 
   test("simple columnar query") {
-    val plan = sqlContext.executePlan(testData.logicalPlan).executedPlan
+    val plan = sqlContext.executePlan(testData.logicalPlan).sparkPlan
     val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None)
 
     checkAnswer(scan, testData.collect().toSeq)
@@ -48,7 +48,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext {
   }
 
   test("projection") {
-    val plan = sqlContext.executePlan(testData.select('value, 'key).logicalPlan).executedPlan
+    val plan = sqlContext.executePlan(testData.select('value, 'key).logicalPlan).sparkPlan
     val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None)
 
     checkAnswer(scan, testData.collect().map {
@@ -57,7 +57,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext {
   }
 
   test("SPARK-1436 regression: in-memory columns must be able to be accessed multiple times") {
-    val plan = sqlContext.executePlan(testData.logicalPlan).executedPlan
+    val plan = sqlContext.executePlan(testData.logicalPlan).sparkPlan
     val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None)
 
     checkAnswer(scan, testData.collect().toSeq)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala
index d762f7bfe9..647a7e9a4e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/PartitionBatchPruningSuite.scala
@@ -114,7 +114,7 @@ class PartitionBatchPruningSuite extends SparkFunSuite with SharedSQLContext {
         df.collect().map(_(0)).toArray
       }
 
-      val (readPartitions, readBatches) = df.queryExecution.executedPlan.collect {
+      val (readPartitions, readBatches) = df.queryExecution.sparkPlan.collect {
         case in: InMemoryColumnarTableScan => (in.readPartitions.value, in.readBatches.value)
       }.head
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala
index 58581d71e1..aee8e84db5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala
@@ -62,7 +62,7 @@ class BroadcastJoinSuite extends QueryTest with BeforeAndAfterAll {
       // Comparison at the end is for broadcast left semi join
       val joinExpression = df1("key") === df2("key") && df1("value") > df2("value")
       val df3 = df1.join(broadcast(df2), joinExpression, joinType)
-      val plan = df3.queryExecution.executedPlan
+      val plan = df3.queryExecution.sparkPlan
       assert(plan.collect { case p: T => p }.size === 1)
       plan.executeCollect()
     }
author	Davies Liu <davies@databricks.com>	2016-01-16 10:29:27 -0800
committer	Davies Liu <davies.liu@gmail.com>	2016-01-16 10:29:27 -0800
commit	3c0d2365d57fc49ac9bf0d7cc9bd2ef633fb5fb6 (patch)
tree	2e05f1fbb9eec2c870081b00b9e60505db07e1b9 /sql/core
parent	86972fa52152d2149b88ba75be048a6986006285 (diff)
download	spark-3c0d2365d57fc49ac9bf0d7cc9bd2ef633fb5fb6.tar.gz spark-3c0d2365d57fc49ac9bf0d7cc9bd2ef633fb5fb6.tar.bz2 spark-3c0d2365d57fc49ac9bf0d7cc9bd2ef633fb5fb6.zip