[SPARK-17072][SQL] support table-level statistics generation and storing into/loading from metastore

## What changes were proposed in this pull request? 1. Support generation table-level statistics for - hive tables in HiveExternalCatalog - data source tables in HiveExternalCatalog - data source tables in InMemoryCatalog. 2. Add a property "catalogStats" in CatalogTable to hold statistics in Spark side. 3. Put logics of statistics transformation between Spark and Hive in HiveClientImpl. 4. Extend Statistics class by adding rowCount (will add estimatedSize when we have column stats). ## How was this patch tested? add unit tests Author: wangzhenhua <wangzhenhua@huawei.com> Author: Zhenhua Wang <wangzhenhua@huawei.com> Closes #14712 from wzhfy/tableStats.
author: wangzhenhua <wangzhenhua@huawei.com> 2016-09-05 17:32:31 +0200
committer: Herman van Hovell <hvanhovell@databricks.com> 2016-09-05 17:32:31 +0200
commit: 6d86403d8b252776effcddd71338b4d21a224f9b (patch)
tree: 2fa41086f7d6c9dacb86d3e34d9c8f6f0b4fdcab
parent: 3ccb23e445711ea5d9059eb6de7c490c8fc9d112 (diff)
download: spark-6d86403d8b252776effcddd71338b4d21a224f9b.tar.gz
spark-6d86403d8b252776effcddd71338b4d21a224f9b.tar.bz2
spark-6d86403d8b252776effcddd71338b4d21a224f9b.zip
16 files changed, 363 insertions, 108 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 8408d765d4..79231ee9e3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -22,7 +22,7 @@ import java.util.Date
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
 import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan}
+import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.catalyst.util.quoteIdentifier
 import org.apache.spark.sql.types.StructType
 
@@ -130,6 +130,7 @@ case class CatalogTable(
     createTime: Long = System.currentTimeMillis,
     lastAccessTime: Long = -1,
     properties: Map[String, String] = Map.empty,
+    stats: Option[Statistics] = None,
     viewOriginalText: Option[String] = None,
     viewText: Option[String] = None,
     comment: Option[String] = None,
@@ -190,6 +191,7 @@ case class CatalogTable(
         viewText.map("View: " + _).getOrElse(""),
         comment.map("Comment: " + _).getOrElse(""),
         if (properties.nonEmpty) s"Properties: $tableProperties" else "",
+        if (stats.isDefined) s"Statistics: ${stats.get}" else "",
         s"$storage")
 
     output.filter(_.nonEmpty).mkString("CatalogTable(\n\t", "\n\t", ")")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala
index 6e6cc6962c..58fa537a18 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala
@@ -31,6 +31,19 @@ package org.apache.spark.sql.catalyst.plans.logical
  *
  * @param sizeInBytes Physical size in bytes. For leaf operators this defaults to 1, otherwise it
  *                    defaults to the product of children's `sizeInBytes`.
+ * @param rowCount Estimated number of rows.
  * @param isBroadcastable If true, output is small enough to be used in a broadcast join.
  */
-case class Statistics(sizeInBytes: BigInt, isBroadcastable: Boolean = false)
+case class Statistics(
+    sizeInBytes: BigInt,
+    rowCount: Option[BigInt] = None,
+    isBroadcastable: Boolean = false) {
+  override def toString: String = {
+    val output =
+      Seq(s"sizeInBytes=$sizeInBytes",
+        if (rowCount.isDefined) s"rowCount=${rowCount.get}" else "",
+        s"isBroadcastable=$isBroadcastable"
+      )
+    output.filter(_.nonEmpty).mkString("Statistics(", ", ", ")")
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/SQLBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/SQLBuilder.scala
index dde91b0a86..6f821f80cc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/SQLBuilder.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/SQLBuilder.scala
@@ -590,8 +590,12 @@ class SQLBuilder private (
 
   object ExtractSQLTable {
     def unapply(plan: LogicalPlan): Option[SQLTable] = plan match {
-      case l @ LogicalRelation(_, _, Some(TableIdentifier(table, Some(database)))) =>
-        Some(SQLTable(database, table, l.output.map(_.withQualifier(None))))
+      case l @ LogicalRelation(_, _, Some(catalogTable))
+          if catalogTable.identifier.database.isDefined =>
+        Some(SQLTable(
+          catalogTable.identifier.database.get,
+          catalogTable.identifier.table,
+          l.output.map(_.withQualifier(None))))
 
       case relation: CatalogRelation =>
         val m = relation.catalogTable
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
index fc078da07d..7ba1a9ff22 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
@@ -99,9 +99,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {
       ctx.identifier.getText.toLowerCase == "noscan") {
       AnalyzeTableCommand(visitTableIdentifier(ctx.tableIdentifier).toString)
     } else {
-      // Always just run the no scan analyze. We should fix this and implement full analyze
-      // command in the future.
-      AnalyzeTableCommand(visitTableIdentifier(ctx.tableIdentifier).toString)
+      AnalyzeTableCommand(visitTableIdentifier(ctx.tableIdentifier).toString, noscan = false)
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
index a469d4da86..15687ddd72 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
@@ -21,19 +21,18 @@ import scala.util.control.NonFatal
 
 import org.apache.hadoop.fs.{FileSystem, Path}
 
-import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
+import org.apache.spark.sql.{AnalysisException, Dataset, Row, SparkSession}
 import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
 import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, CatalogTable}
+import org.apache.spark.sql.catalyst.plans.logical.Statistics
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 
 
 /**
  * Analyzes the given table in the current database to generate statistics, which will be
  * used in query optimizations.
- *
- * Right now, it only supports Hive tables and it only updates the size of a Hive table
- * in the Hive metastore.
  */
-case class AnalyzeTableCommand(tableName: String) extends RunnableCommand {
+case class AnalyzeTableCommand(tableName: String, noscan: Boolean = true) extends RunnableCommand {
 
   override def run(sparkSession: SparkSession): Seq[Row] = {
     val sessionState = sparkSession.sessionState
@@ -71,8 +70,6 @@ case class AnalyzeTableCommand(tableName: String) extends RunnableCommand {
           size
         }
 
-        val tableParameters = catalogTable.properties
-        val oldTotalSize = tableParameters.get("totalSize").map(_.toLong).getOrElse(0L)
         val newTotalSize =
           catalogTable.storage.locationUri.map { p =>
             val path = new Path(p)
@@ -88,24 +85,47 @@ case class AnalyzeTableCommand(tableName: String) extends RunnableCommand {
             }
           }.getOrElse(0L)
 
-        // Update the Hive metastore if the total size of the table is different than the size
-        // recorded in the Hive metastore.
-        // This logic is based on org.apache.hadoop.hive.ql.exec.StatsTask.aggregateStats().
-        if (newTotalSize > 0 && newTotalSize != oldTotalSize) {
-          sessionState.catalog.alterTable(
-            catalogTable.copy(
-              properties = relation.catalogTable.properties +
-                (AnalyzeTableCommand.TOTAL_SIZE_FIELD -> newTotalSize.toString)))
-        }
+        updateTableStats(catalogTable, newTotalSize)
+
+      // data source tables have been converted into LogicalRelations
+      case logicalRel: LogicalRelation if logicalRel.catalogTable.isDefined =>
+        updateTableStats(logicalRel.catalogTable.get, logicalRel.relation.sizeInBytes)
 
       case otherRelation =>
-        throw new AnalysisException(s"ANALYZE TABLE is only supported for Hive tables, " +
-          s"but '${tableIdent.unquotedString}' is a ${otherRelation.nodeName}.")
+        throw new AnalysisException(s"ANALYZE TABLE is not supported for " +
+          s"${otherRelation.nodeName}.")
     }
+
+    def updateTableStats(catalogTable: CatalogTable, newTotalSize: Long): Unit = {
+      val oldTotalSize = catalogTable.stats.map(_.sizeInBytes.toLong).getOrElse(0L)
+      val oldRowCount = catalogTable.stats.flatMap(_.rowCount.map(_.toLong)).getOrElse(-1L)
+      var newStats: Option[Statistics] = None
+      if (newTotalSize > 0 && newTotalSize != oldTotalSize) {
+        newStats = Some(Statistics(sizeInBytes = newTotalSize))
+      }
+      // We only set rowCount when noscan is false, because otherwise:
+      // 1. when total size is not changed, we don't need to alter the table;
+      // 2. when total size is changed, `oldRowCount` becomes invalid.
+      // This is to make sure that we only record the right statistics.
+      if (!noscan) {
+        val newRowCount = Dataset.ofRows(sparkSession, relation).count()
+        if (newRowCount >= 0 && newRowCount != oldRowCount) {
+          newStats = if (newStats.isDefined) {
+            newStats.map(_.copy(rowCount = Some(BigInt(newRowCount))))
+          } else {
+            Some(Statistics(sizeInBytes = oldTotalSize, rowCount = Some(BigInt(newRowCount))))
+          }
+        }
+      }
+      // Update the metastore if the above statistics of the table are different from those
+      // recorded in the metastore.
+      if (newStats.isDefined) {
+        sessionState.catalog.alterTable(catalogTable.copy(stats = newStats))
+        // Refresh the cached data source table in the catalog.
+        sessionState.catalog.refreshTable(tableIdent)
+      }
+    }
+
     Seq.empty[Row]
   }
 }
-
-object AnalyzeTableCommand {
-  val TOTAL_SIZE_FIELD = "totalSize"
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 8286467e96..c8ad5b3034 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -209,7 +209,7 @@ class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan]
 
     LogicalRelation(
       dataSource.resolveRelation(),
-      metastoreTableIdentifier = Some(table.identifier))
+      catalogTable = Some(table))
   }
 
   override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
@@ -366,7 +366,8 @@ object DataSourceStrategy extends Strategy with Logging {
       val scan = RowDataSourceScanExec(
         projects.map(_.toAttribute),
         scanBuilder(requestedColumns, candidatePredicates, pushedFilters),
-        relation.relation, UnknownPartitioning(0), metadata, relation.metastoreTableIdentifier)
+        relation.relation, UnknownPartitioning(0), metadata,
+        relation.catalogTable.map(_.identifier))
       filterCondition.map(execution.FilterExec(_, scan)).getOrElse(scan)
     } else {
       // Don't request columns that are only referenced by pushed filters.
@@ -376,7 +377,8 @@ object DataSourceStrategy extends Strategy with Logging {
       val scan = RowDataSourceScanExec(
         requestedColumns,
         scanBuilder(requestedColumns, candidatePredicates, pushedFilters),
-        relation.relation, UnknownPartitioning(0), metadata, relation.metastoreTableIdentifier)
+        relation.relation, UnknownPartitioning(0), metadata,
+        relation.catalogTable.map(_.identifier))
       execution.ProjectExec(
         projects, filterCondition.map(execution.FilterExec(_, scan)).getOrElse(scan))
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala
index 8b36caf6f1..55ca4f1106 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala
@@ -112,7 +112,7 @@ object FileSourceStrategy extends Strategy with Logging {
           outputSchema,
           partitionKeyFilters.toSeq,
           pushedDownFilters,
-          table)
+          table.map(_.identifier))
 
       val afterScanFilter = afterScanFilters.toSeq.reduceOption(expressions.And)
       val withFilter = afterScanFilter.map(execution.FilterExec(_, scan)).getOrElse(scan)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
index 2a8e147011..d9562fd32e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
@@ -16,8 +16,8 @@
  */
 package org.apache.spark.sql.execution.datasources
 
-import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
+import org.apache.spark.sql.catalyst.catalog.CatalogTable
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
 import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.sources.BaseRelation
@@ -33,7 +33,7 @@ import org.apache.spark.util.Utils
 case class LogicalRelation(
     relation: BaseRelation,
     expectedOutputAttributes: Option[Seq[Attribute]] = None,
-    metastoreTableIdentifier: Option[TableIdentifier] = None)
+    catalogTable: Option[CatalogTable] = None)
   extends LeafNode with MultiInstanceRelation {
 
   override val output: Seq[AttributeReference] = {
@@ -72,9 +72,10 @@ case class LogicalRelation(
   // expId can be different but the relation is still the same.
   override lazy val cleanArgs: Seq[Any] = Seq(relation)
 
-  @transient override lazy val statistics: Statistics = Statistics(
-    sizeInBytes = BigInt(relation.sizeInBytes)
-  )
+  @transient override lazy val statistics: Statistics = {
+    catalogTable.flatMap(_.stats.map(_.copy(sizeInBytes = relation.sizeInBytes))).getOrElse(
+      Statistics(sizeInBytes = relation.sizeInBytes))
+  }
 
   /** Used to lookup original attribute capitalization */
   val attributeMap: AttributeMap[AttributeReference] = AttributeMap(output.map(o => (o, o)))
@@ -89,7 +90,7 @@ case class LogicalRelation(
     LogicalRelation(
       relation,
       expectedOutputAttributes.map(_.map(_.newInstance())),
-      metastoreTableIdentifier).asInstanceOf[this.type]
+      catalogTable).asInstanceOf[this.type]
   }
 
   override def refresh(): Unit = relation match {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
index ae77e4cb96..5b96206ba8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
@@ -252,11 +252,11 @@ case class PreprocessTableInsertion(conf: SQLConf) extends Rule[LogicalPlan] {
         case relation: CatalogRelation =>
           val metadata = relation.catalogTable
           preprocess(i, metadata.identifier.quotedString, metadata.partitionColumnNames)
-        case LogicalRelation(h: HadoopFsRelation, _, identifier) =>
-          val tblName = identifier.map(_.quotedString).getOrElse("unknown")
+        case LogicalRelation(h: HadoopFsRelation, _, catalogTable) =>
+          val tblName = catalogTable.map(_.identifier.quotedString).getOrElse("unknown")
           preprocess(i, tblName, h.partitionSchema.map(_.name))
-        case LogicalRelation(_: InsertableRelation, _, identifier) =>
-          val tblName = identifier.map(_.quotedString).getOrElse("unknown")
+        case LogicalRelation(_: InsertableRelation, _, catalogTable) =>
+          val tblName = catalogTable.map(_.identifier.quotedString).getOrElse("unknown")
           preprocess(i, tblName, Nil)
         case other => i
       }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
index ab27381c06..8fdbd0f2c6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
@@ -192,7 +192,7 @@ private[sql] class SessionState(sparkSession: SparkSession) {
    * Right now, it only supports catalog tables and it only updates the size of a catalog table
    * in the external catalog.
    */
-  def analyze(tableName: String): Unit = {
-    AnalyzeTableCommand(tableName).run(sparkSession)
+  def analyze(tableName: String, noscan: Boolean = true): Unit = {
+    AnalyzeTableCommand(tableName, noscan).run(sparkSession)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala
index 2c81cbf15f..264a2ffbeb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql
 
 import org.apache.spark.sql.catalyst.plans.logical.{GlobalLimit, Join, LocalLimit}
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types._
 
@@ -75,4 +76,29 @@ class StatisticsSuite extends QueryTest with SharedSQLContext {
     }
   }
 
+  test("test table-level statistics for data source table created in InMemoryCatalog") {
+    def checkTableStats(tableName: String, expectedRowCount: Option[BigInt]): Unit = {
+      val df = sql(s"SELECT * FROM $tableName")
+      val relations = df.queryExecution.analyzed.collect { case rel: LogicalRelation =>
+        assert(rel.catalogTable.isDefined)
+        assert(rel.catalogTable.get.stats.flatMap(_.rowCount) === expectedRowCount)
+        rel
+      }
+      assert(relations.size === 1)
+    }
+
+    val tableName = "tbl"
+    withTable(tableName) {
+      sql(s"CREATE TABLE $tableName(i INT, j STRING) USING parquet")
+      Seq(1 -> "a", 2 -> "b").toDF("i", "j").write.mode("overwrite").insertInto("tbl")
+
+      // noscan won't count the number of rows
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan")
+      checkTableStats(tableName, expectedRowCount = None)
+
+      // without noscan, we count the number of rows
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      checkTableStats(tableName, expectedRowCount = Some(2))
+    }
+  }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 1fe7f4d41d..2e127ef562 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -32,6 +32,7 @@ import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException
 import org.apache.spark.sql.catalyst.catalog._
+import org.apache.spark.sql.catalyst.plans.logical.Statistics
 import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.execution.datasources.CaseInsensitiveMap
 import org.apache.spark.sql.hive.client.HiveClient
@@ -102,11 +103,13 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
    * metastore.
    */
   private def verifyTableProperties(table: CatalogTable): Unit = {
-    val datasourceKeys = table.properties.keys.filter(_.startsWith(DATASOURCE_PREFIX))
-    if (datasourceKeys.nonEmpty) {
+    val invalidKeys = table.properties.keys.filter { key =>
+      key.startsWith(DATASOURCE_PREFIX) || key.startsWith(STATISTICS_PREFIX)
+    }
+    if (invalidKeys.nonEmpty) {
       throw new AnalysisException(s"Cannot persistent ${table.qualifiedName} into hive metastore " +
-        s"as table property keys may not start with '$DATASOURCE_PREFIX': " +
-        datasourceKeys.mkString("[", ", ", "]"))
+        s"as table property keys may not start with '$DATASOURCE_PREFIX' or '$STATISTICS_PREFIX':" +
+        s" ${invalidKeys.mkString("[", ", ", "]")}")
     }
   }
 
@@ -388,21 +391,34 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     requireTableExists(db, tableDefinition.identifier.table)
     verifyTableProperties(tableDefinition)
 
-    if (DDLUtils.isDatasourceTable(tableDefinition)) {
-      val oldDef = client.getTable(db, tableDefinition.identifier.table)
+    // convert table statistics to properties so that we can persist them through hive api
+    val withStatsProps = if (tableDefinition.stats.isDefined) {
+      val stats = tableDefinition.stats.get
+      var statsProperties: Map[String, String] =
+        Map(STATISTICS_TOTAL_SIZE -> stats.sizeInBytes.toString())
+      if (stats.rowCount.isDefined) {
+        statsProperties += (STATISTICS_NUM_ROWS -> stats.rowCount.get.toString())
+      }
+      tableDefinition.copy(properties = tableDefinition.properties ++ statsProperties)
+    } else {
+      tableDefinition
+    }
+
+    if (DDLUtils.isDatasourceTable(withStatsProps)) {
+      val oldDef = client.getTable(db, withStatsProps.identifier.table)
       // Sets the `schema`, `partitionColumnNames` and `bucketSpec` from the old table definition,
       // to retain the spark specific format if it is. Also add old data source properties to table
       // properties, to retain the data source table format.
       val oldDataSourceProps = oldDef.properties.filter(_._1.startsWith(DATASOURCE_PREFIX))
-      val newDef = tableDefinition.copy(
+      val newDef = withStatsProps.copy(
         schema = oldDef.schema,
         partitionColumnNames = oldDef.partitionColumnNames,
         bucketSpec = oldDef.bucketSpec,
-        properties = oldDataSourceProps ++ tableDefinition.properties)
+        properties = oldDataSourceProps ++ withStatsProps.properties)
 
       client.alterTable(newDef)
     } else {
-      client.alterTable(tableDefinition)
+      client.alterTable(withStatsProps)
     }
   }
 
@@ -422,7 +438,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
    * properties, and filter out these special entries from table properties.
    */
   private def restoreTableMetadata(table: CatalogTable): CatalogTable = {
-    if (table.tableType == VIEW) {
+    val catalogTable = if (table.tableType == VIEW) {
       table
     } else {
       getProviderFromTableProperties(table).map { provider =>
@@ -452,6 +468,19 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
         table.copy(provider = Some("hive"))
       }
     }
+    // construct Spark's statistics from information in Hive metastore
+    if (catalogTable.properties.contains(STATISTICS_TOTAL_SIZE)) {
+      val totalSize = BigInt(catalogTable.properties.get(STATISTICS_TOTAL_SIZE).get)
+      // TODO: we will compute "estimatedSize" when we have column stats:
+      // average size of row * number of rows
+      catalogTable.copy(
+        properties = removeStatsProperties(catalogTable),
+        stats = Some(Statistics(
+          sizeInBytes = totalSize,
+          rowCount = catalogTable.properties.get(STATISTICS_NUM_ROWS).map(BigInt(_)))))
+    } else {
+      catalogTable
+    }
   }
 
   override def tableExists(db: String, table: String): Boolean = withClient {
@@ -657,6 +686,14 @@ object HiveExternalCatalog {
   val DATASOURCE_SCHEMA_BUCKETCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "bucketCol."
   val DATASOURCE_SCHEMA_SORTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "sortCol."
 
+  val STATISTICS_PREFIX = "spark.sql.statistics."
+  val STATISTICS_TOTAL_SIZE = STATISTICS_PREFIX + "totalSize"
+  val STATISTICS_NUM_ROWS = STATISTICS_PREFIX + "numRows"
+
+  def removeStatsProperties(metadata: CatalogTable): Map[String, String] = {
+    metadata.properties.filterNot { case (key, _) => key.startsWith(STATISTICS_PREFIX) }
+  }
+
   def getProviderFromTableProperties(metadata: CatalogTable): Option[String] = {
     metadata.properties.get(DATASOURCE_PROVIDER)
   }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index ff82c7f7af..d31a8d643a 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -82,7 +82,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
 
         LogicalRelation(
           dataSource.resolveRelation(checkPathExist = true),
-          metastoreTableIdentifier = Some(TableIdentifier(in.name, Some(in.database))))
+          catalogTable = Some(table))
       }
     }
 
@@ -257,10 +257,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
           fileFormat = defaultSource,
           options = options)
 
-        val created = LogicalRelation(
-          relation,
-          metastoreTableIdentifier =
-            Some(TableIdentifier(tableIdentifier.name, Some(tableIdentifier.database))))
+        val created = LogicalRelation(relation, catalogTable = Some(metastoreRelation.catalogTable))
         cachedDataSourceTables.put(tableIdentifier, created)
         created
       }
@@ -286,8 +283,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
               bucketSpec = bucketSpec,
               options = options,
               className = fileType).resolveRelation(),
-              metastoreTableIdentifier =
-                Some(TableIdentifier(tableIdentifier.name, Some(tableIdentifier.database))))
+              catalogTable = Some(metastoreRelation.catalogTable))
 
 
         cachedDataSourceTables.put(tableIdentifier, created)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala
index a90da98811..0bfdc137fa 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala
@@ -108,39 +108,41 @@ private[hive] case class MetastoreRelation(
     new HiveTable(tTable)
   }
 
-  @transient override lazy val statistics: Statistics = Statistics(
-    sizeInBytes = {
-      val totalSize = hiveQlTable.getParameters.get(StatsSetupConst.TOTAL_SIZE)
-      val rawDataSize = hiveQlTable.getParameters.get(StatsSetupConst.RAW_DATA_SIZE)
-      // TODO: check if this estimate is valid for tables after partition pruning.
-      // NOTE: getting `totalSize` directly from params is kind of hacky, but this should be
-      // relatively cheap if parameters for the table are populated into the metastore.
-      // Besides `totalSize`, there are also `numFiles`, `numRows`, `rawDataSize` keys
-      // (see StatsSetupConst in Hive) that we can look at in the future.
-      BigInt(
-        // When table is external,`totalSize` is always zero, which will influence join strategy
-        // so when `totalSize` is zero, use `rawDataSize` instead
-        // if the size is still less than zero, we try to get the file size from HDFS.
-        // given this is only needed for optimization, if the HDFS call fails we return the default.
-        if (totalSize != null && totalSize.toLong > 0L) {
-          totalSize.toLong
-        } else if (rawDataSize != null && rawDataSize.toLong > 0) {
-          rawDataSize.toLong
-        } else if (sparkSession.sessionState.conf.fallBackToHdfsForStatsEnabled) {
-          try {
-            val hadoopConf = sparkSession.sessionState.newHadoopConf()
-            val fs: FileSystem = hiveQlTable.getPath.getFileSystem(hadoopConf)
-            fs.getContentSummary(hiveQlTable.getPath).getLength
-          } catch {
-            case e: IOException =>
-              logWarning("Failed to get table size from hdfs.", e)
-              sparkSession.sessionState.conf.defaultSizeInBytes
-          }
-        } else {
-          sparkSession.sessionState.conf.defaultSizeInBytes
-        })
-    }
-  )
+  @transient override lazy val statistics: Statistics = {
+    catalogTable.stats.getOrElse(Statistics(
+      sizeInBytes = {
+        val totalSize = hiveQlTable.getParameters.get(StatsSetupConst.TOTAL_SIZE)
+        val rawDataSize = hiveQlTable.getParameters.get(StatsSetupConst.RAW_DATA_SIZE)
+        // TODO: check if this estimate is valid for tables after partition pruning.
+        // NOTE: getting `totalSize` directly from params is kind of hacky, but this should be
+        // relatively cheap if parameters for the table are populated into the metastore.
+        // Besides `totalSize`, there are also `numFiles`, `numRows`, `rawDataSize` keys
+        // (see StatsSetupConst in Hive) that we can look at in the future.
+        BigInt(
+          // When table is external,`totalSize` is always zero, which will influence join strategy
+          // so when `totalSize` is zero, use `rawDataSize` instead
+          // when `rawDataSize` is also zero, use `HiveExternalCatalog.STATISTICS_TOTAL_SIZE`,
+          // which is generated by analyze command.
+          if (totalSize != null && totalSize.toLong > 0L) {
+            totalSize.toLong
+          } else if (rawDataSize != null && rawDataSize.toLong > 0) {
+            rawDataSize.toLong
+          } else if (sparkSession.sessionState.conf.fallBackToHdfsForStatsEnabled) {
+            try {
+              val hadoopConf = sparkSession.sessionState.newHadoopConf()
+              val fs: FileSystem = hiveQlTable.getPath.getFileSystem(hadoopConf)
+              fs.getContentSummary(hiveQlTable.getPath).getLength
+            } catch {
+              case e: IOException =>
+                logWarning("Failed to get table size from hdfs.", e)
+                sparkSession.sessionState.conf.defaultSizeInBytes
+            }
+          } else {
+            sparkSession.sessionState.conf.defaultSizeInBytes
+          })
+      }
+    ))
+  }
 
   // When metastore partition pruning is turned off, we cache the list of all partitions to
   // mimic the behavior of Spark < 1.5
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index b275ab17a9..33ed675754 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -23,11 +23,14 @@ import scala.reflect.ClassTag
 
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
 import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.execution.command.AnalyzeTableCommand
+import org.apache.spark.sql.catalyst.plans.logical.Statistics
+import org.apache.spark.sql.execution.command.{AnalyzeTableCommand, DDLUtils}
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.execution.joins._
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
+import org.apache.spark.sql.types.StructType
 
 class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils {
 
@@ -168,6 +171,154 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
       TableIdentifier("tempTable"), ignoreIfNotExists = true, purge = false)
   }
 
+  private def checkMetastoreRelationStats(
+      tableName: String,
+      expectedStats: Option[Statistics]): Unit = {
+    val df = sql(s"SELECT * FROM $tableName")
+    val relations = df.queryExecution.analyzed.collect { case rel: MetastoreRelation =>
+      expectedStats match {
+        case Some(es) =>
+          assert(rel.catalogTable.stats.isDefined)
+          val stats = rel.catalogTable.stats.get
+          assert(stats.sizeInBytes === es.sizeInBytes)
+          assert(stats.rowCount === es.rowCount)
+        case None =>
+          assert(rel.catalogTable.stats.isEmpty)
+      }
+      rel
+    }
+    assert(relations.size === 1)
+  }
+
+  test("test table-level statistics for hive tables created in HiveExternalCatalog") {
+    val textTable = "textTable"
+    withTable(textTable) {
+      // Currently Spark's statistics are self-contained, we don't have statistics until we use
+      // the `ANALYZE TABLE` command.
+      sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
+      checkMetastoreRelationStats(textTable, expectedStats = None)
+      sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
+      checkMetastoreRelationStats(textTable, expectedStats = None)
+
+      // noscan won't count the number of rows
+      sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
+      checkMetastoreRelationStats(textTable, expectedStats =
+        Some(Statistics(sizeInBytes = 5812, rowCount = None)))
+
+      // without noscan, we count the number of rows
+      sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
+      checkMetastoreRelationStats(textTable, expectedStats =
+          Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
+    }
+  }
+
+  test("test elimination of the influences of the old stats") {
+    val textTable = "textTable"
+    withTable(textTable) {
+      sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
+      sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
+      sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
+      checkMetastoreRelationStats(textTable, expectedStats =
+        Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
+
+      sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
+      // when the total size is not changed, the old row count is kept
+      checkMetastoreRelationStats(textTable, expectedStats =
+        Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
+
+      sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
+      sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
+      // update total size and remove the old and invalid row count
+      checkMetastoreRelationStats(textTable, expectedStats =
+        Some(Statistics(sizeInBytes = 11624, rowCount = None)))
+    }
+  }
+
+  private def checkLogicalRelationStats(
+      tableName: String,
+      expectedStats: Option[Statistics]): Unit = {
+    val df = sql(s"SELECT * FROM $tableName")
+    val relations = df.queryExecution.analyzed.collect { case rel: LogicalRelation =>
+      assert(rel.catalogTable.isDefined)
+      expectedStats match {
+        case Some(es) =>
+          assert(rel.catalogTable.get.stats.isDefined)
+          val stats = rel.catalogTable.get.stats.get
+          assert(stats.sizeInBytes === es.sizeInBytes)
+          assert(stats.rowCount === es.rowCount)
+        case None =>
+          assert(rel.catalogTable.get.stats.isEmpty)
+      }
+      rel
+    }
+    assert(relations.size === 1)
+  }
+
+  test("test statistics of LogicalRelation converted from MetastoreRelation") {
+    val parquetTable = "parquetTable"
+    val orcTable = "orcTable"
+    withTable(parquetTable, orcTable) {
+      sql(s"CREATE TABLE $parquetTable (key STRING, value STRING) STORED AS PARQUET")
+      sql(s"CREATE TABLE $orcTable (key STRING, value STRING) STORED AS ORC")
+      sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
+      sql(s"INSERT INTO TABLE $orcTable SELECT * FROM src")
+
+      // the default value for `spark.sql.hive.convertMetastoreParquet` is true, here we just set it
+      // for robustness
+      withSQLConf("spark.sql.hive.convertMetastoreParquet" -> "true") {
+        checkLogicalRelationStats(parquetTable, expectedStats = None)
+        sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
+        checkLogicalRelationStats(parquetTable, expectedStats =
+          Some(Statistics(sizeInBytes = 4236, rowCount = Some(500))))
+      }
+      withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "true") {
+        checkLogicalRelationStats(orcTable, expectedStats = None)
+        sql(s"ANALYZE TABLE $orcTable COMPUTE STATISTICS")
+        checkLogicalRelationStats(orcTable, expectedStats =
+          Some(Statistics(sizeInBytes = 3023, rowCount = Some(500))))
+      }
+    }
+  }
+
+  test("test table-level statistics for data source table created in HiveExternalCatalog") {
+    val parquetTable = "parquetTable"
+    withTable(parquetTable) {
+      sql(s"CREATE TABLE $parquetTable (key STRING, value STRING) USING PARQUET")
+      val catalogTable = spark.sessionState.catalog.getTableMetadata(TableIdentifier(parquetTable))
+      assert(DDLUtils.isDatasourceTable(catalogTable))
+
+      sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
+      checkLogicalRelationStats(parquetTable, expectedStats = None)
+
+      // noscan won't count the number of rows
+      sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
+      checkLogicalRelationStats(parquetTable, expectedStats =
+        Some(Statistics(sizeInBytes = 4236, rowCount = None)))
+
+      sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
+      sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
+      checkLogicalRelationStats(parquetTable, expectedStats =
+        Some(Statistics(sizeInBytes = 8472, rowCount = None)))
+
+      // without noscan, we count the number of rows
+      sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
+      checkLogicalRelationStats(parquetTable, expectedStats =
+        Some(Statistics(sizeInBytes = 8472, rowCount = Some(1000))))
+    }
+  }
+
+  test("statistics collection of a table with zero column") {
+    val table_no_cols = "table_no_cols"
+    withTable(table_no_cols) {
+      val rddNoCols = sparkContext.parallelize(1 to 10).map(_ => Row.empty)
+      val dfNoCols = spark.createDataFrame(rddNoCols, StructType(Seq.empty))
+      dfNoCols.write.format("json").saveAsTable(table_no_cols)
+      sql(s"ANALYZE TABLE $table_no_cols COMPUTE STATISTICS")
+      checkLogicalRelationStats(table_no_cols, expectedStats =
+        Some(Statistics(sizeInBytes = 30, rowCount = Some(10))))
+    }
+  }
+
   test("estimates the size of a test MetastoreRelation") {
     val df = sql("""SELECT * FROM src""")
     val sizes = df.queryExecution.analyzed.collect { case mr: MetastoreRelation =>
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
index eff32805bf..3cba5b2a09 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -1026,26 +1026,29 @@ class HiveDDLSuite
     }
   }
 
-  test("datasource table property keys are not allowed") {
+  test("datasource and statistics table property keys are not allowed") {
     import org.apache.spark.sql.hive.HiveExternalCatalog.DATASOURCE_PREFIX
+    import org.apache.spark.sql.hive.HiveExternalCatalog.STATISTICS_PREFIX
 
     withTable("tbl") {
       sql("CREATE TABLE tbl(a INT) STORED AS parquet")
 
-      val e = intercept[AnalysisException] {
-        sql(s"ALTER TABLE tbl SET TBLPROPERTIES ('${DATASOURCE_PREFIX}foo' = 'loser')")
-      }
-      assert(e.getMessage.contains(DATASOURCE_PREFIX + "foo"))
+      Seq(DATASOURCE_PREFIX, STATISTICS_PREFIX).foreach { forbiddenPrefix =>
+        val e = intercept[AnalysisException] {
+          sql(s"ALTER TABLE tbl SET TBLPROPERTIES ('${forbiddenPrefix}foo' = 'loser')")
+        }
+        assert(e.getMessage.contains(forbiddenPrefix + "foo"))
 
-      val e2 = intercept[AnalysisException] {
-        sql(s"ALTER TABLE tbl UNSET TBLPROPERTIES ('${DATASOURCE_PREFIX}foo')")
-      }
-      assert(e2.getMessage.contains(DATASOURCE_PREFIX + "foo"))
+        val e2 = intercept[AnalysisException] {
+          sql(s"ALTER TABLE tbl UNSET TBLPROPERTIES ('${forbiddenPrefix}foo')")
+        }
+        assert(e2.getMessage.contains(forbiddenPrefix + "foo"))
 
-      val e3 = intercept[AnalysisException] {
-        sql(s"CREATE TABLE tbl TBLPROPERTIES ('${DATASOURCE_PREFIX}foo'='anything')")
+        val e3 = intercept[AnalysisException] {
+          sql(s"CREATE TABLE tbl TBLPROPERTIES ('${forbiddenPrefix}foo'='anything')")
+        }
+        assert(e3.getMessage.contains(forbiddenPrefix + "foo"))
       }
-      assert(e3.getMessage.contains(DATASOURCE_PREFIX + "foo"))
     }
   }
 }
author	wangzhenhua <wangzhenhua@huawei.com>	2016-09-05 17:32:31 +0200
committer	Herman van Hovell <hvanhovell@databricks.com>	2016-09-05 17:32:31 +0200
commit	6d86403d8b252776effcddd71338b4d21a224f9b (patch)
tree	2fa41086f7d6c9dacb86d3e34d9c8f6f0b4fdcab
parent	3ccb23e445711ea5d9059eb6de7c490c8fc9d112 (diff)
download	spark-6d86403d8b252776effcddd71338b4d21a224f9b.tar.gz spark-6d86403d8b252776effcddd71338b4d21a224f9b.tar.bz2 spark-6d86403d8b252776effcddd71338b4d21a224f9b.zip