aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorYin Huai <huai@cse.ohio-state.edu>2014-08-08 11:23:58 -0700
committerMichael Armbrust <michael@databricks.com>2014-08-08 11:23:58 -0700
commit45d8f4deab50ae069ecde2201bd486d464a4501e (patch)
tree2e7119af6159a9f33b0d60a1cc4fca7aa7babbf4 /sql
parentc874723fa844b49f057bb2434a12228b2f717e99 (diff)
downloadspark-45d8f4deab50ae069ecde2201bd486d464a4501e.tar.gz
spark-45d8f4deab50ae069ecde2201bd486d464a4501e.tar.bz2
spark-45d8f4deab50ae069ecde2201bd486d464a4501e.zip
[SPARK-2919] [SQL] Basic support for analyze command in HiveQl
The command we will support is ``` ANALYZE TABLE tablename COMPUTE STATISTICS noscan ``` Other cases shown in https://cwiki.apache.org/confluence/display/Hive/StatsDev#StatsDev-ExistingTables will still be treated as Hive native commands. JIRA: https://issues.apache.org/jira/browse/SPARK-2919 Author: Yin Huai <huai@cse.ohio-state.edu> Closes #1848 from yhuai/sqlAnalyze and squashes the following commits: 0b79d36 [Yin Huai] Typo and format. c59d94b [Yin Huai] Support "ANALYZE TABLE tableName COMPUTE STATISTICS noscan".
Diffstat (limited to 'sql')
-rw-r--r--sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala21
-rw-r--r--sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala2
-rw-r--r--sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala (renamed from sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DropTable.scala)26
-rw-r--r--sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala45
4 files changed, 89 insertions(+), 5 deletions(-)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index bc2fefafd5..05b2f5f6cd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -46,6 +46,8 @@ private[hive] case class AddFile(filePath: String) extends Command
private[hive] case class DropTable(tableName: String, ifExists: Boolean) extends Command
+private[hive] case class AnalyzeTable(tableName: String) extends Command
+
/** Provides a mapping from HiveQL statements to catalyst logical plans and expression trees. */
private[hive] object HiveQl {
protected val nativeCommands = Seq(
@@ -74,7 +76,6 @@ private[hive] object HiveQl {
"TOK_CREATEFUNCTION",
"TOK_DROPFUNCTION",
- "TOK_ANALYZE",
"TOK_ALTERDATABASE_PROPERTIES",
"TOK_ALTERINDEX_PROPERTIES",
"TOK_ALTERINDEX_REBUILD",
@@ -92,7 +93,6 @@ private[hive] object HiveQl {
"TOK_ALTERTABLE_SKEWED",
"TOK_ALTERTABLE_TOUCH",
"TOK_ALTERTABLE_UNARCHIVE",
- "TOK_ANALYZE",
"TOK_CREATEDATABASE",
"TOK_CREATEFUNCTION",
"TOK_CREATEINDEX",
@@ -239,7 +239,6 @@ private[hive] object HiveQl {
ShellCommand(sql.drop(1))
} else {
val tree = getAst(sql)
-
if (nativeCommands contains tree.getText) {
NativeCommand(sql)
} else {
@@ -387,6 +386,22 @@ private[hive] object HiveQl {
ifExists) =>
val tableName = tableNameParts.map { case Token(p, Nil) => p }.mkString(".")
DropTable(tableName, ifExists.nonEmpty)
+ // Support "ANALYZE TABLE tableName COMPUTE STATISTICS noscan"
+ case Token("TOK_ANALYZE",
+ Token("TOK_TAB", Token("TOK_TABNAME", tableNameParts) :: partitionSpec) ::
+ isNoscan) =>
+ // Reference:
+ // https://cwiki.apache.org/confluence/display/Hive/StatsDev#StatsDev-ExistingTables
+ if (partitionSpec.nonEmpty) {
+ // Analyze partitions will be treated as a Hive native command.
+ NativePlaceholder
+ } else if (isNoscan.isEmpty) {
+ // If users do not specify "noscan", it will be treated as a Hive native command.
+ NativePlaceholder
+ } else {
+ val tableName = tableNameParts.map { case Token(p, Nil) => p }.mkString(".")
+ AnalyzeTable(tableName)
+ }
// Just fake explain for any of the native commands.
case Token("TOK_EXPLAIN", explainArgs)
if noExplainCommands.contains(explainArgs.head.getText) =>
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index 2175c5f383..85d2496a34 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -83,6 +83,8 @@ private[hive] trait HiveStrategies {
case DropTable(tableName, ifExists) => execution.DropTable(tableName, ifExists) :: Nil
+ case AnalyzeTable(tableName) => execution.AnalyzeTable(tableName) :: Nil
+
case describe: logical.DescribeCommand =>
val resolvedTable = context.executePlan(describe.table).analyzed
resolvedTable match {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DropTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
index 9cd0c86c6c..2985169da0 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DropTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
@@ -25,6 +25,32 @@ import org.apache.spark.sql.hive.HiveContext
/**
* :: DeveloperApi ::
+ * Analyzes the given table in the current database to generate statistics, which will be
+ * used in query optimizations.
+ *
+ * Right now, it only supports Hive tables and it only updates the size of a Hive table
+ * in the Hive metastore.
+ */
+@DeveloperApi
+case class AnalyzeTable(tableName: String) extends LeafNode with Command {
+
+ def hiveContext = sqlContext.asInstanceOf[HiveContext]
+
+ def output = Seq.empty
+
+ override protected[sql] lazy val sideEffectResult = {
+ hiveContext.analyze(tableName)
+ Seq.empty[Any]
+ }
+
+ override def execute(): RDD[Row] = {
+ sideEffectResult
+ sparkContext.emptyRDD[Row]
+ }
+}
+
+/**
+ * :: DeveloperApi ::
* Drops a table from the metastore and removes it if it is cached.
*/
@DeveloperApi
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index bf5931bbf9..7c82964b5e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -19,13 +19,54 @@ package org.apache.spark.sql.hive
import scala.reflect.ClassTag
+
import org.apache.spark.sql.{SQLConf, QueryTest}
+import org.apache.spark.sql.catalyst.plans.logical.NativeCommand
import org.apache.spark.sql.execution.{BroadcastHashJoin, ShuffledHashJoin}
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
class StatisticsSuite extends QueryTest {
+ test("parse analyze commands") {
+ def assertAnalyzeCommand(analyzeCommand: String, c: Class[_]) {
+ val parsed = HiveQl.parseSql(analyzeCommand)
+ val operators = parsed.collect {
+ case a: AnalyzeTable => a
+ case o => o
+ }
+
+ assert(operators.size === 1)
+ if (operators(0).getClass() != c) {
+ fail(
+ s"""$analyzeCommand expected command: $c, but got ${operators(0)}
+ |parsed command:
+ |$parsed
+ """.stripMargin)
+ }
+ }
+
+ assertAnalyzeCommand(
+ "ANALYZE TABLE Table1 COMPUTE STATISTICS",
+ classOf[NativeCommand])
+ assertAnalyzeCommand(
+ "ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS",
+ classOf[NativeCommand])
+ assertAnalyzeCommand(
+ "ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS noscan",
+ classOf[NativeCommand])
+ assertAnalyzeCommand(
+ "ANALYZE TABLE Table1 PARTITION(ds, hr) COMPUTE STATISTICS",
+ classOf[NativeCommand])
+ assertAnalyzeCommand(
+ "ANALYZE TABLE Table1 PARTITION(ds, hr) COMPUTE STATISTICS noscan",
+ classOf[NativeCommand])
+
+ assertAnalyzeCommand(
+ "ANALYZE TABLE Table1 COMPUTE STATISTICS nOscAn",
+ classOf[AnalyzeTable])
+ }
+
test("analyze MetastoreRelations") {
def queryTotalSize(tableName: String): BigInt =
catalog.lookupRelation(None, tableName).statistics.sizeInBytes
@@ -37,7 +78,7 @@ class StatisticsSuite extends QueryTest {
assert(queryTotalSize("analyzeTable") === defaultSizeInBytes)
- analyze("analyzeTable")
+ sql("ANALYZE TABLE analyzeTable COMPUTE STATISTICS noscan")
assert(queryTotalSize("analyzeTable") === BigInt(11624))
@@ -66,7 +107,7 @@ class StatisticsSuite extends QueryTest {
assert(queryTotalSize("analyzeTable_part") === defaultSizeInBytes)
- analyze("analyzeTable_part")
+ sql("ANALYZE TABLE analyzeTable_part COMPUTE STATISTICS noscan")
assert(queryTotalSize("analyzeTable_part") === BigInt(17436))