path: root/sql/hive
author    gatorsmile <gatorsmile@gmail.com>    2016-09-07 08:13:12 +0800
committer Wenchen Fan <wenchen@databricks.com>    2016-09-07 08:13:12 +0800
commit    a40657bfd375bd27d65204bb42ed0cbd7bd1ebf2 (patch)
tree      868d4e48f4edca4c09a49ea633876696e96b1b77 /sql/hive
parent    c07cbb3534a57834b9b78e1572d40fb2af930f5f (diff)
[SPARK-17408][TEST] Flaky test: org.apache.spark.sql.hive.StatisticsSuite
### What changes were proposed in this pull request?

https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/64956/testReport/junit/org.apache.spark.sql.hive/StatisticsSuite/test_statistics_of_LogicalRelation_converted_from_MetastoreRelation/

```
org.apache.spark.sql.hive.StatisticsSuite.test statistics of LogicalRelation converted from MetastoreRelation

Failing for the past 1 build (Since Failed #64956)
Took 1.4 sec.

Error Message

org.scalatest.exceptions.TestFailedException: 6871 did not equal 4236

Stacktrace

sbt.ForkMain$ForkError: org.scalatest.exceptions.TestFailedException: 6871 did not equal 4236
    at org.scalatest.Assertions$class.newAssertionFailedException(Assertions.scala:500)
```

As the failure above shows, the exact `sizeInBytes` of a table can differ between builds, so this fix no longer checks its exact value. Instead, we check that it is larger than zero and compare the values fetched at different points in the test (for example, before and after an `INSERT`). In addition, we combine `checkMetastoreRelationStats` and `checkLogicalRelationStats` into a single checking function, `checkStats`.

### How was this patch tested?

N/A

Author: gatorsmile <gatorsmile@gmail.com>

Closes #14978 from gatorsmile/spark17408.
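To make the change easier to follow before reading the diff, here is a minimal, self-contained sketch of the assertion style the patch adopts. `Stats`, `check`, and the byte values are hypothetical stand-ins for Spark's `Statistics` class and the new `checkStats` helper, not the suite's real code:

```scala
// Sketch of the patch's idea: assert properties of the statistics that are
// stable across environments, instead of pinning exact byte counts.
object StatsCheckSketch extends App {
  case class Stats(sizeInBytes: BigInt, rowCount: Option[BigInt])

  // Like the new checkStats: require that stats exist and the size is
  // positive, rather than equal to a hard-coded value such as 4236.
  def check(
      stats: Option[Stats],
      hasSizeInBytes: Boolean,
      expectedRowCounts: Option[Int]): Unit = {
    if (hasSizeInBytes || expectedRowCounts.nonEmpty) {
      assert(stats.isDefined)
      assert(stats.get.sizeInBytes > 0)
      assert(stats.get.rowCount == expectedRowCounts.map(BigInt(_)))
    } else {
      assert(stats.isEmpty)
    }
  }

  // Relative comparisons replace exact sizes: after a second INSERT the
  // size must grow, whatever the absolute numbers are on this machine.
  val beforeInsert = Stats(sizeInBytes = BigInt(5812), rowCount = Some(BigInt(500)))
  val afterInsert  = Stats(sizeInBytes = BigInt(11624), rowCount = None)
  check(Some(beforeInsert), hasSizeInBytes = true, expectedRowCounts = Some(500))
  check(Some(afterInsert), hasSizeInBytes = true, expectedRowCounts = None)
  assert(afterInsert.sizeInBytes > beforeInsert.sizeInBytes) // grew after INSERT
}
```

The same pattern appears throughout the diff below: `checkStats` returns the fetched `Option[Statistics]`, and the tests compare the returned values across `ANALYZE TABLE` and `INSERT` operations.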
Diffstat (limited to 'sql/hive')
-rw-r--r--    sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala | 141
1 file changed, 80 insertions(+), 61 deletions(-)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index 33ed675754..9956706929 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -171,23 +171,37 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
TableIdentifier("tempTable"), ignoreIfNotExists = true, purge = false)
}
- private def checkMetastoreRelationStats(
+ private def checkStats(
+ stats: Option[Statistics],
+ hasSizeInBytes: Boolean,
+ expectedRowCounts: Option[Int]): Unit = {
+ if (hasSizeInBytes || expectedRowCounts.nonEmpty) {
+ assert(stats.isDefined)
+ assert(stats.get.sizeInBytes > 0)
+ assert(stats.get.rowCount === expectedRowCounts)
+ } else {
+ assert(stats.isEmpty)
+ }
+ }
+
+ private def checkStats(
tableName: String,
- expectedStats: Option[Statistics]): Unit = {
+ isDataSourceTable: Boolean,
+ hasSizeInBytes: Boolean,
+ expectedRowCounts: Option[Int]): Option[Statistics] = {
val df = sql(s"SELECT * FROM $tableName")
- val relations = df.queryExecution.analyzed.collect { case rel: MetastoreRelation =>
- expectedStats match {
- case Some(es) =>
- assert(rel.catalogTable.stats.isDefined)
- val stats = rel.catalogTable.stats.get
- assert(stats.sizeInBytes === es.sizeInBytes)
- assert(stats.rowCount === es.rowCount)
- case None =>
- assert(rel.catalogTable.stats.isEmpty)
- }
- rel
+ val stats = df.queryExecution.analyzed.collect {
+ case rel: MetastoreRelation =>
+ checkStats(rel.catalogTable.stats, hasSizeInBytes, expectedRowCounts)
+ assert(!isDataSourceTable, "Expected a data source table, but got a Hive serde table")
+ rel.catalogTable.stats
+ case rel: LogicalRelation =>
+ checkStats(rel.catalogTable.get.stats, hasSizeInBytes, expectedRowCounts)
+ assert(isDataSourceTable, "Expected a Hive serde table, but got a data source table")
+ rel.catalogTable.get.stats
}
- assert(relations.size === 1)
+ assert(stats.size == 1)
+ stats.head
}
test("test table-level statistics for hive tables created in HiveExternalCatalog") {
@@ -196,19 +210,28 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
// Currently Spark's statistics are self-contained, we don't have statistics until we use
// the `ANALYZE TABLE` command.
sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
- checkMetastoreRelationStats(textTable, expectedStats = None)
+ checkStats(
+ textTable,
+ isDataSourceTable = false,
+ hasSizeInBytes = false,
+ expectedRowCounts = None)
sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
- checkMetastoreRelationStats(textTable, expectedStats = None)
+ checkStats(
+ textTable,
+ isDataSourceTable = false,
+ hasSizeInBytes = false,
+ expectedRowCounts = None)
// noscan won't count the number of rows
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
- checkMetastoreRelationStats(textTable, expectedStats =
- Some(Statistics(sizeInBytes = 5812, rowCount = None)))
+ val fetchedStats1 = checkStats(
+ textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = None)
// without noscan, we count the number of rows
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
- checkMetastoreRelationStats(textTable, expectedStats =
- Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
+ val fetchedStats2 = checkStats(
+ textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = Some(500))
+ assert(fetchedStats1.get.sizeInBytes == fetchedStats2.get.sizeInBytes)
}
}
@@ -218,40 +241,22 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
- checkMetastoreRelationStats(textTable, expectedStats =
- Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
+ val fetchedStats1 = checkStats(
+ textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = Some(500))
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
// when the total size is not changed, the old row count is kept
- checkMetastoreRelationStats(textTable, expectedStats =
- Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
+ val fetchedStats2 = checkStats(
+ textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = Some(500))
+ assert(fetchedStats1 == fetchedStats2)
sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
// update total size and remove the old and invalid row count
- checkMetastoreRelationStats(textTable, expectedStats =
- Some(Statistics(sizeInBytes = 11624, rowCount = None)))
- }
- }
-
- private def checkLogicalRelationStats(
- tableName: String,
- expectedStats: Option[Statistics]): Unit = {
- val df = sql(s"SELECT * FROM $tableName")
- val relations = df.queryExecution.analyzed.collect { case rel: LogicalRelation =>
- assert(rel.catalogTable.isDefined)
- expectedStats match {
- case Some(es) =>
- assert(rel.catalogTable.get.stats.isDefined)
- val stats = rel.catalogTable.get.stats.get
- assert(stats.sizeInBytes === es.sizeInBytes)
- assert(stats.rowCount === es.rowCount)
- case None =>
- assert(rel.catalogTable.get.stats.isEmpty)
- }
- rel
+ val fetchedStats3 = checkStats(
+ textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = None)
+ assert(fetchedStats3.get.sizeInBytes > fetchedStats2.get.sizeInBytes)
}
- assert(relations.size === 1)
}
test("test statistics of LogicalRelation converted from MetastoreRelation") {
@@ -266,16 +271,21 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
// the default value for `spark.sql.hive.convertMetastoreParquet` is true, here we just set it
// for robustness
withSQLConf("spark.sql.hive.convertMetastoreParquet" -> "true") {
- checkLogicalRelationStats(parquetTable, expectedStats = None)
+ checkStats(
+ parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None)
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
- checkLogicalRelationStats(parquetTable, expectedStats =
- Some(Statistics(sizeInBytes = 4236, rowCount = Some(500))))
+ checkStats(
+ parquetTable,
+ isDataSourceTable = true,
+ hasSizeInBytes = true,
+ expectedRowCounts = Some(500))
}
withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "true") {
- checkLogicalRelationStats(orcTable, expectedStats = None)
+ checkStats(
+ orcTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None)
sql(s"ANALYZE TABLE $orcTable COMPUTE STATISTICS")
- checkLogicalRelationStats(orcTable, expectedStats =
- Some(Statistics(sizeInBytes = 3023, rowCount = Some(500))))
+ checkStats(
+ orcTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = Some(500))
}
}
}
@@ -288,22 +298,28 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
assert(DDLUtils.isDatasourceTable(catalogTable))
sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
- checkLogicalRelationStats(parquetTable, expectedStats = None)
+ checkStats(
+ parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None)
// noscan won't count the number of rows
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
- checkLogicalRelationStats(parquetTable, expectedStats =
- Some(Statistics(sizeInBytes = 4236, rowCount = None)))
+ val fetchedStats1 = checkStats(
+ parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None)
sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
- checkLogicalRelationStats(parquetTable, expectedStats =
- Some(Statistics(sizeInBytes = 8472, rowCount = None)))
+ val fetchedStats2 = checkStats(
+ parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None)
+ assert(fetchedStats2.get.sizeInBytes > fetchedStats1.get.sizeInBytes)
// without noscan, we count the number of rows
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
- checkLogicalRelationStats(parquetTable, expectedStats =
- Some(Statistics(sizeInBytes = 8472, rowCount = Some(1000))))
+ val fetchedStats3 = checkStats(
+ parquetTable,
+ isDataSourceTable = true,
+ hasSizeInBytes = true,
+ expectedRowCounts = Some(1000))
+ assert(fetchedStats3.get.sizeInBytes == fetchedStats2.get.sizeInBytes)
}
}
@@ -314,8 +330,11 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
val dfNoCols = spark.createDataFrame(rddNoCols, StructType(Seq.empty))
dfNoCols.write.format("json").saveAsTable(table_no_cols)
sql(s"ANALYZE TABLE $table_no_cols COMPUTE STATISTICS")
- checkLogicalRelationStats(table_no_cols, expectedStats =
- Some(Statistics(sizeInBytes = 30, rowCount = Some(10))))
+ checkStats(
+ table_no_cols,
+ isDataSourceTable = true,
+ hasSizeInBytes = true,
+ expectedRowCounts = Some(10))
}
}