aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorYin Huai <huai@cse.ohio-state.edu>2014-08-08 11:23:58 -0700
committerMichael Armbrust <michael@databricks.com>2014-08-08 11:23:58 -0700
commit45d8f4deab50ae069ecde2201bd486d464a4501e (patch)
tree2e7119af6159a9f33b0d60a1cc4fca7aa7babbf4 /sql
parentc874723fa844b49f057bb2434a12228b2f717e99 (diff)
downloadspark-45d8f4deab50ae069ecde2201bd486d464a4501e.tar.gz
spark-45d8f4deab50ae069ecde2201bd486d464a4501e.tar.bz2
spark-45d8f4deab50ae069ecde2201bd486d464a4501e.zip
[SPARK-2919] [SQL] Basic support for analyze command in HiveQl
The command we will support is ``` ANALYZE TABLE tablename COMPUTE STATISTICS noscan ``` Other cases shown in https://cwiki.apache.org/confluence/display/Hive/StatsDev#StatsDev-ExistingTables will still be treated as Hive native commands. JIRA: https://issues.apache.org/jira/browse/SPARK-2919 Author: Yin Huai <huai@cse.ohio-state.edu> Closes #1848 from yhuai/sqlAnalyze and squashes the following commits: 0b79d36 [Yin Huai] Typo and format. c59d94b [Yin Huai] Support "ANALYZE TABLE tableName COMPUTE STATISTICS noscan".
Diffstat (limited to 'sql')
-rw-r--r--sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala21
-rw-r--r--sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala2
-rw-r--r--sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala (renamed from sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DropTable.scala)26
-rw-r--r--sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala45
4 files changed, 89 insertions(+), 5 deletions(-)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index bc2fefafd5..05b2f5f6cd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -46,6 +46,8 @@ private[hive] case class AddFile(filePath: String) extends Command
private[hive] case class DropTable(tableName: String, ifExists: Boolean) extends Command
+private[hive] case class AnalyzeTable(tableName: String) extends Command
+
/** Provides a mapping from HiveQL statements to catalyst logical plans and expression trees. */
private[hive] object HiveQl {
protected val nativeCommands = Seq(
@@ -74,7 +76,6 @@ private[hive] object HiveQl {
"TOK_CREATEFUNCTION",
"TOK_DROPFUNCTION",
- "TOK_ANALYZE",
"TOK_ALTERDATABASE_PROPERTIES",
"TOK_ALTERINDEX_PROPERTIES",
"TOK_ALTERINDEX_REBUILD",
@@ -92,7 +93,6 @@ private[hive] object HiveQl {
"TOK_ALTERTABLE_SKEWED",
"TOK_ALTERTABLE_TOUCH",
"TOK_ALTERTABLE_UNARCHIVE",
- "TOK_ANALYZE",
"TOK_CREATEDATABASE",
"TOK_CREATEFUNCTION",
"TOK_CREATEINDEX",
@@ -239,7 +239,6 @@ private[hive] object HiveQl {
ShellCommand(sql.drop(1))
} else {
val tree = getAst(sql)
-
if (nativeCommands contains tree.getText) {
NativeCommand(sql)
} else {
@@ -387,6 +386,22 @@ private[hive] object HiveQl {
ifExists) =>
val tableName = tableNameParts.map { case Token(p, Nil) => p }.mkString(".")
DropTable(tableName, ifExists.nonEmpty)
+ // Support "ANALYZE TABLE tableName COMPUTE STATISTICS noscan"
+ case Token("TOK_ANALYZE",
+ Token("TOK_TAB", Token("TOK_TABNAME", tableNameParts) :: partitionSpec) ::
+ isNoscan) =>
+ // Reference:
+ // https://cwiki.apache.org/confluence/display/Hive/StatsDev#StatsDev-ExistingTables
+ if (partitionSpec.nonEmpty) {
+ // Analyze partitions will be treated as a Hive native command.
+ NativePlaceholder
+ } else if (isNoscan.isEmpty) {
+ // If users do not specify "noscan", it will be treated as a Hive native command.
+ NativePlaceholder
+ } else {
+ val tableName = tableNameParts.map { case Token(p, Nil) => p }.mkString(".")
+ AnalyzeTable(tableName)
+ }
// Just fake explain for any of the native commands.
case Token("TOK_EXPLAIN", explainArgs)
if noExplainCommands.contains(explainArgs.head.getText) =>
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index 2175c5f383..85d2496a34 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -83,6 +83,8 @@ private[hive] trait HiveStrategies {
case DropTable(tableName, ifExists) => execution.DropTable(tableName, ifExists) :: Nil
+ case AnalyzeTable(tableName) => execution.AnalyzeTable(tableName) :: Nil
+
case describe: logical.DescribeCommand =>
val resolvedTable = context.executePlan(describe.table).analyzed
resolvedTable match {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DropTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
index 9cd0c86c6c..2985169da0 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DropTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
@@ -25,6 +25,32 @@ import org.apache.spark.sql.hive.HiveContext
/**
* :: DeveloperApi ::
+ * Analyzes the given table in the current database to generate statistics, which will be
+ * used in query optimizations.
+ *
+ * Right now, it only supports Hive tables and it only updates the size of a Hive table
+ * in the Hive metastore.
+ */
+@DeveloperApi
+case class AnalyzeTable(tableName: String) extends LeafNode with Command {
+
+ def hiveContext = sqlContext.asInstanceOf[HiveContext]
+
+ def output = Seq.empty
+
+ override protected[sql] lazy val sideEffectResult = {
+ hiveContext.analyze(tableName)
+ Seq.empty[Any]
+ }
+
+ override def execute(): RDD[Row] = {
+ sideEffectResult
+ sparkContext.emptyRDD[Row]
+ }
+}
+
+/**
+ * :: DeveloperApi ::
* Drops a table from the metastore and removes it if it is cached.
*/
@DeveloperApi
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index bf5931bbf9..7c82964b5e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -19,13 +19,54 @@ package org.apache.spark.sql.hive
import scala.reflect.ClassTag
+
import org.apache.spark.sql.{SQLConf, QueryTest}
+import org.apache.spark.sql.catalyst.plans.logical.NativeCommand
import org.apache.spark.sql.execution.{BroadcastHashJoin, ShuffledHashJoin}
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
class StatisticsSuite extends QueryTest {
+ test("parse analyze commands") {
+ def assertAnalyzeCommand(analyzeCommand: String, c: Class[_]) {
+ val parsed = HiveQl.parseSql(analyzeCommand)
+ val operators = parsed.collect {
+ case a: AnalyzeTable => a
+ case o => o
+ }
+
+ assert(operators.size === 1)
+ if (operators(0).getClass() != c) {
+ fail(
+ s"""$analyzeCommand expected command: $c, but got ${operators(0)}
+ |parsed command:
+ |$parsed
+ """.stripMargin)
+ }
+ }
+
+ assertAnalyzeCommand(
+ "ANALYZE TABLE Table1 COMPUTE STATISTICS",
+ classOf[NativeCommand])
+ assertAnalyzeCommand(
+ "ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS",
+ classOf[NativeCommand])
+ assertAnalyzeCommand(
+ "ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS noscan",
+ classOf[NativeCommand])
+ assertAnalyzeCommand(
+ "ANALYZE TABLE Table1 PARTITION(ds, hr) COMPUTE STATISTICS",
+ classOf[NativeCommand])
+ assertAnalyzeCommand(
+ "ANALYZE TABLE Table1 PARTITION(ds, hr) COMPUTE STATISTICS noscan",
+ classOf[NativeCommand])
+
+ assertAnalyzeCommand(
+ "ANALYZE TABLE Table1 COMPUTE STATISTICS nOscAn",
+ classOf[AnalyzeTable])
+ }
+
test("analyze MetastoreRelations") {
def queryTotalSize(tableName: String): BigInt =
catalog.lookupRelation(None, tableName).statistics.sizeInBytes
@@ -37,7 +78,7 @@ class StatisticsSuite extends QueryTest {
assert(queryTotalSize("analyzeTable") === defaultSizeInBytes)
- analyze("analyzeTable")
+ sql("ANALYZE TABLE analyzeTable COMPUTE STATISTICS noscan")
assert(queryTotalSize("analyzeTable") === BigInt(11624))
@@ -66,7 +107,7 @@ class StatisticsSuite extends QueryTest {
assert(queryTotalSize("analyzeTable_part") === defaultSizeInBytes)
- analyze("analyzeTable_part")
+ sql("ANALYZE TABLE analyzeTable_part COMPUTE STATISTICS noscan")
assert(queryTotalSize("analyzeTable_part") === BigInt(17436))