[SPARK-17072][SQL] support table-level statistics generation and storing into/loading from metastore

## What changes were proposed in this pull request? 1. Support generation of table-level statistics for - Hive tables in HiveExternalCatalog - data source tables in HiveExternalCatalog - data source tables in InMemoryCatalog. 2. Add a property "catalogStats" in CatalogTable to hold statistics on the Spark side. 3. Put the logic for transforming statistics between Spark and Hive in HiveClientImpl. 4. Extend the Statistics class by adding rowCount (estimatedSize will be added when we have column stats). ## How was this patch tested? Added unit tests. Author: wangzhenhua <wangzhenhua@huawei.com> Author: Zhenhua Wang <wangzhenhua@huawei.com> Closes #14712 from wzhfy/tableStats.
author: wangzhenhua <wangzhenhua@huawei.com> 2016-09-05 17:32:31 +0200
committer: Herman van Hovell <hvanhovell@databricks.com> 2016-09-05 17:32:31 +0200
commit: 6d86403d8b252776effcddd71338b4d21a224f9b (patch)
tree: 2fa41086f7d6c9dacb86d3e34d9c8f6f0b4fdcab /sql/core/src/test/scala
parent: 3ccb23e445711ea5d9059eb6de7c490c8fc9d112 (diff)
download: spark-6d86403d8b252776effcddd71338b4d21a224f9b.tar.gz
spark-6d86403d8b252776effcddd71338b4d21a224f9b.tar.bz2
spark-6d86403d8b252776effcddd71338b4d21a224f9b.zip
1 files changed, 26 insertions, 0 deletions
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala
index 2c81cbf15f..264a2ffbeb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql
 
 import org.apache.spark.sql.catalyst.plans.logical.{GlobalLimit, Join, LocalLimit}
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types._
 
@@ -75,4 +76,29 @@ class StatisticsSuite extends QueryTest with SharedSQLContext {
     }
   }
 
+  test("test table-level statistics for data source table created in InMemoryCatalog") {
+    // Verifies that SELECT * over `tableName` analyzes to exactly one LogicalRelation whose
+    // catalog table is defined and whose stats yield `expectedRowCount` (None = no row count).
+    def checkTableStats(tableName: String, expectedRowCount: Option[BigInt]): Unit = {
+      val plan = sql(s"SELECT * FROM $tableName").queryExecution.analyzed
+      val catalogTables = plan.collect { case rel: LogicalRelation => rel.catalogTable }
+      assert(catalogTables.size === 1)
+      assert(catalogTables.head.isDefined)
+      assert(catalogTables.head.flatMap(_.stats).flatMap(_.rowCount) === expectedRowCount)
+    }
+
+    val tableName = "tbl"
+    withTable(tableName) {
+      sql(s"CREATE TABLE $tableName(i INT, j STRING) USING parquet")
+      Seq(1 -> "a", 2 -> "b").toDF("i", "j").write.mode("overwrite").insertInto(tableName)
+
+      // with noscan, no row count is expected in the table stats
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan")
+      checkTableStats(tableName, expectedRowCount = None)
+
+      // without noscan, the row count must match the two rows inserted above
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      checkTableStats(tableName, expectedRowCount = Some(2))
+    }
+  }
 }
author	wangzhenhua <wangzhenhua@huawei.com>	2016-09-05 17:32:31 +0200
committer	Herman van Hovell <hvanhovell@databricks.com>	2016-09-05 17:32:31 +0200
commit	6d86403d8b252776effcddd71338b4d21a224f9b (patch)
tree	2fa41086f7d6c9dacb86d3e34d9c8f6f0b4fdcab /sql/core/src/test/scala
parent	3ccb23e445711ea5d9059eb6de7c490c8fc9d112 (diff)
download	spark-6d86403d8b252776effcddd71338b4d21a224f9b.tar.gz spark-6d86403d8b252776effcddd71338b4d21a224f9b.tar.bz2 spark-6d86403d8b252776effcddd71338b4d21a224f9b.zip