diff options
author | Dongjoon Hyun <dongjoon@apache.org> | 2016-04-01 22:45:52 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2016-04-01 22:45:52 -0700 |
commit | fa1af0aff7bde9bbf7bfa6a3ac74699734c2fd8a (patch) | |
tree | 1b0e52e2617c8021960cb5aa3beba5fff6f999c2 | |
parent | 877dc712e66db69cb320e10ba5edebca401591e3 (diff) | |
download | spark-fa1af0aff7bde9bbf7bfa6a3ac74699734c2fd8a.tar.gz spark-fa1af0aff7bde9bbf7bfa6a3ac74699734c2fd8a.tar.bz2 spark-fa1af0aff7bde9bbf7bfa6a3ac74699734c2fd8a.zip |
[SPARK-14251][SQL] Add SQL command for printing out generated code for debugging
## What changes were proposed in this pull request?
This PR implements the `EXPLAIN CODEGEN` SQL command, which returns the generated code in the same way as `debugCodegen`. In `spark-shell`, we no longer need to import the `debug` module. In `spark-sql`, we can now use this SQL command.
**Before**
```
scala> import org.apache.spark.sql.execution.debug._
scala> sql("select 'a' as a group by 1").debugCodegen()
Found 2 WholeStageCodegen subtrees.
== Subtree 1 / 2 ==
...
Generated code:
...
== Subtree 2 / 2 ==
...
Generated code:
...
```
**After**
```
scala> sql("explain extended codegen select 'a' as a group by 1").collect().foreach(println)
[Found 2 WholeStageCodegen subtrees.]
[== Subtree 1 / 2 ==]
...
[]
[Generated code:]
...
[]
[== Subtree 2 / 2 ==]
...
[]
[Generated code:]
...
```
## How was this patch tested?
Passes the Jenkins tests (including new test cases).
Author: Dongjoon Hyun <dongjoon@apache.org>
Closes #12099 from dongjoon-hyun/SPARK-14251.
7 files changed, 67 insertions, 31 deletions
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index d1747b9915..f34bb061e4 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -584,7 +584,7 @@ frameBound explainOption - : LOGICAL | FORMATTED | EXTENDED + : LOGICAL | FORMATTED | EXTENDED | CODEGEN ; transactionMode @@ -633,7 +633,7 @@ nonReserved | DELIMITED | FIELDS | TERMINATED | COLLECTION | ITEMS | KEYS | ESCAPED | LINES | SEPARATED | EXTENDED | REFRESH | CLEAR | CACHE | UNCACHE | LAZY | TEMPORARY | OPTIONS | GROUPING | CUBE | ROLLUP - | EXPLAIN | FORMAT | LOGICAL | FORMATTED + | EXPLAIN | FORMAT | LOGICAL | FORMATTED | CODEGEN | TABLESAMPLE | USE | TO | BUCKET | PERCENTLIT | OUT | OF | SET | VIEW | REPLACE @@ -724,6 +724,7 @@ DESCRIBE: 'DESCRIBE'; EXPLAIN: 'EXPLAIN'; FORMAT: 'FORMAT'; LOGICAL: 'LOGICAL'; +CODEGEN: 'CODEGEN'; CAST: 'CAST'; SHOW: 'SHOW'; TABLES: 'TABLES'; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 7efe98dd18..ff3ab7746c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -136,7 +136,8 @@ class SparkSqlAstBuilder extends AstBuilder { // Create the explain comment. 
val statement = plan(ctx.statement) if (isExplainableStatement(statement)) { - ExplainCommand(statement, extended = options.exists(_.EXTENDED != null)) + ExplainCommand(statement, extended = options.exists(_.EXTENDED != null), + codegen = options.exists(_.CODEGEN != null)) } else { ExplainCommand(OneRowRelation) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala index f90d8717ca..4bc62cdc4a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala @@ -28,10 +28,10 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ - /** * A logical command that is executed for its side-effects. `RunnableCommand`s are * wrapped in `ExecutedCommand` during execution. @@ -237,15 +237,22 @@ case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()), - extended: Boolean = false) + extended: Boolean = false, + codegen: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sqlContext: SQLContext): Seq[Row] = try { // TODO in Hive, the "extended" ExplainCommand prints the AST as well, and detailed properties. 
val queryExecution = sqlContext.executePlan(logicalPlan) - val outputString = if (extended) queryExecution.toString else queryExecution.simpleString - + val outputString = + if (codegen) { + codegenString(queryExecution.executedPlan) + } else if (extended) { + queryExecution.toString + } else { + queryExecution.simpleString + } outputString.split("\n").map(Row(_)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index 3a174ed94c..7b0c8ebdfa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -48,6 +48,25 @@ package object debug { // scalastyle:on println } + def codegenString(plan: SparkPlan): String = { + val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegen]() + plan transform { + case s: WholeStageCodegen => + codegenSubtrees += s + s + case s => s + } + var output = s"Found ${codegenSubtrees.size} WholeStageCodegen subtrees.\n" + for ((s, i) <- codegenSubtrees.toSeq.zipWithIndex) { + output += s"== Subtree ${i + 1} / ${codegenSubtrees.size} ==\n" + output += s + output += "\nGenerated code:\n" + val (_, source) = s.doCodeGen() + output += s"${CodeFormatter.format(source)}\n" + } + output + } + /** * Augments [[SQLContext]] with debug methods. */ @@ -81,28 +100,7 @@ package object debug { * WholeStageCodegen subtree). */ def debugCodegen(): Unit = { - debugPrint(debugCodegenString()) - } - - /** Visible for testing. 
*/ - def debugCodegenString(): String = { - val plan = query.queryExecution.executedPlan - val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegen]() - plan transform { - case s: WholeStageCodegen => - codegenSubtrees += s - s - case s => s - } - var output = s"Found ${codegenSubtrees.size} WholeStageCodegen subtrees.\n" - for ((s, i) <- codegenSubtrees.toSeq.zipWithIndex) { - output += s"== Subtree ${i + 1} / ${codegenSubtrees.size} ==\n" - output += s - output += "\nGenerated code:\n" - val (_, source) = s.doCodeGen() - output += s"${CodeFormatter.format(source)}\n" - } - output + debugPrint(codegenString(query.queryExecution.executedPlan)) } } @@ -123,6 +121,7 @@ package object debug { /** * A collection of metrics for each column of output. + * * @param elementTypes the actual runtime types for the output. Useful when there are bugs * causing the wrong data to be projected. */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala index 979265e274..c0fce4b96a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala @@ -27,7 +27,7 @@ class DebuggingSuite extends SparkFunSuite with SharedSQLContext { } test("debugCodegen") { - val res = sqlContext.range(10).groupBy("id").count().debugCodegenString() + val res = codegenString(sqlContext.range(10).groupBy("id").count().queryExecution.executedPlan) assert(res.contains("Subtree 1 / 2")) assert(res.contains("Subtree 2 / 2")) assert(res.contains("Object[]")) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index cd26a68f35..64d1341a47 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -24,7 +24,6 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.execution.datasources.{BucketSpec, DataSource, LogicalRelation} import org.apache.spark.sql.hive.HiveContext diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala index b7ef5d1db7..c45d49d6c0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala @@ -101,4 +101,33 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto "Physical Plan should not contain Subquery since it's eliminated by optimizer") } } + + test("EXPLAIN CODEGEN command") { + checkExistence(sql("EXPLAIN CODEGEN SELECT 1"), true, + "WholeStageCodegen", + "Generated code:", + "/* 001 */ public Object generate(Object[] references) {", + "/* 002 */ return new GeneratedIterator(references);", + "/* 003 */ }" + ) + + checkExistence(sql("EXPLAIN CODEGEN SELECT 1"), false, + "== Physical Plan ==" + ) + + checkExistence(sql("EXPLAIN EXTENDED CODEGEN SELECT 1"), true, + "WholeStageCodegen", + "Generated code:", + "/* 001 */ public Object generate(Object[] references) {", + "/* 002 */ return new GeneratedIterator(references);", + "/* 003 */ }" + ) + + checkExistence(sql("EXPLAIN EXTENDED CODEGEN SELECT 1"), false, + "== Parsed Logical Plan ==", + "== Analyzed Logical Plan ==", + "== Optimized Logical Plan ==", + "== Physical Plan ==" + ) + } } |