diff options
author | gatorsmile <gatorsmile@gmail.com> | 2016-09-07 08:13:12 +0800 |
---|---|---|
committer | Wenchen Fan <wenchen@databricks.com> | 2016-09-07 08:13:12 +0800 |
commit | a40657bfd375bd27d65204bb42ed0cbd7bd1ebf2 (patch) | |
tree | 868d4e48f4edca4c09a49ea633876696e96b1b77 /sql/hive/src/test | |
parent | c07cbb3534a57834b9b78e1572d40fb2af930f5f (diff) | |
download | spark-a40657bfd375bd27d65204bb42ed0cbd7bd1ebf2.tar.gz spark-a40657bfd375bd27d65204bb42ed0cbd7bd1ebf2.tar.bz2 spark-a40657bfd375bd27d65204bb42ed0cbd7bd1ebf2.zip |
[SPARK-17408][TEST] Flaky test: org.apache.spark.sql.hive.StatisticsSuite
### What changes were proposed in this pull request?
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/64956/testReport/junit/org.apache.spark.sql.hive/StatisticsSuite/test_statistics_of_LogicalRelation_converted_from_MetastoreRelation/
```
org.apache.spark.sql.hive.StatisticsSuite.test statistics of LogicalRelation converted from MetastoreRelation
Failing for the past 1 build (Since Failed#64956 )
Took 1.4 sec.
Error Message
org.scalatest.exceptions.TestFailedException: 6871 did not equal 4236
Stacktrace
sbt.ForkMain$ForkError: org.scalatest.exceptions.TestFailedException: 6871 did not equal 4236
at org.scalatest.Assertions$class.newAssertionFailedException(Assertions.scala:500)
```
This fix no longer checks the exact value of `sizeInBytes`. Instead, we check that it is larger than zero and compare the values fetched at different steps of the test against each other.
In addition, we combine `checkMetastoreRelationStats` and `checkLogicalRelationStats` into a single checking function, `checkStats`.
### How was this patch tested?
N/A
Author: gatorsmile <gatorsmile@gmail.com>
Closes #14978 from gatorsmile/spark17408.
Diffstat (limited to 'sql/hive/src/test')
-rw-r--r-- | sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala | 141 |
1 files changed, 80 insertions, 61 deletions
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 33ed675754..9956706929 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -171,23 +171,37 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils TableIdentifier("tempTable"), ignoreIfNotExists = true, purge = false) } - private def checkMetastoreRelationStats( + private def checkStats( + stats: Option[Statistics], + hasSizeInBytes: Boolean, + expectedRowCounts: Option[Int]): Unit = { + if (hasSizeInBytes || expectedRowCounts.nonEmpty) { + assert(stats.isDefined) + assert(stats.get.sizeInBytes > 0) + assert(stats.get.rowCount === expectedRowCounts) + } else { + assert(stats.isEmpty) + } + } + + private def checkStats( tableName: String, - expectedStats: Option[Statistics]): Unit = { + isDataSourceTable: Boolean, + hasSizeInBytes: Boolean, + expectedRowCounts: Option[Int]): Option[Statistics] = { val df = sql(s"SELECT * FROM $tableName") - val relations = df.queryExecution.analyzed.collect { case rel: MetastoreRelation => - expectedStats match { - case Some(es) => - assert(rel.catalogTable.stats.isDefined) - val stats = rel.catalogTable.stats.get - assert(stats.sizeInBytes === es.sizeInBytes) - assert(stats.rowCount === es.rowCount) - case None => - assert(rel.catalogTable.stats.isEmpty) - } - rel + val stats = df.queryExecution.analyzed.collect { + case rel: MetastoreRelation => + checkStats(rel.catalogTable.stats, hasSizeInBytes, expectedRowCounts) + assert(!isDataSourceTable, "Expected a data source table, but got a Hive serde table") + rel.catalogTable.stats + case rel: LogicalRelation => + checkStats(rel.catalogTable.get.stats, hasSizeInBytes, expectedRowCounts) + assert(isDataSourceTable, "Expected a Hive serde table, but got a data source table") + 
rel.catalogTable.get.stats } - assert(relations.size === 1) + assert(stats.size == 1) + stats.head } test("test table-level statistics for hive tables created in HiveExternalCatalog") { @@ -196,19 +210,28 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils // Currently Spark's statistics are self-contained, we don't have statistics until we use // the `ANALYZE TABLE` command. sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE") - checkMetastoreRelationStats(textTable, expectedStats = None) + checkStats( + textTable, + isDataSourceTable = false, + hasSizeInBytes = false, + expectedRowCounts = None) sql(s"INSERT INTO TABLE $textTable SELECT * FROM src") - checkMetastoreRelationStats(textTable, expectedStats = None) + checkStats( + textTable, + isDataSourceTable = false, + hasSizeInBytes = false, + expectedRowCounts = None) // noscan won't count the number of rows sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan") - checkMetastoreRelationStats(textTable, expectedStats = - Some(Statistics(sizeInBytes = 5812, rowCount = None))) + val fetchedStats1 = checkStats( + textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = None) // without noscan, we count the number of rows sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS") - checkMetastoreRelationStats(textTable, expectedStats = - Some(Statistics(sizeInBytes = 5812, rowCount = Some(500)))) + val fetchedStats2 = checkStats( + textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = Some(500)) + assert(fetchedStats1.get.sizeInBytes == fetchedStats2.get.sizeInBytes) } } @@ -218,40 +241,22 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE") sql(s"INSERT INTO TABLE $textTable SELECT * FROM src") sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS") - checkMetastoreRelationStats(textTable, expectedStats = - 
Some(Statistics(sizeInBytes = 5812, rowCount = Some(500)))) + val fetchedStats1 = checkStats( + textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = Some(500)) sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan") // when the total size is not changed, the old row count is kept - checkMetastoreRelationStats(textTable, expectedStats = - Some(Statistics(sizeInBytes = 5812, rowCount = Some(500)))) + val fetchedStats2 = checkStats( + textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = Some(500)) + assert(fetchedStats1 == fetchedStats2) sql(s"INSERT INTO TABLE $textTable SELECT * FROM src") sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan") // update total size and remove the old and invalid row count - checkMetastoreRelationStats(textTable, expectedStats = - Some(Statistics(sizeInBytes = 11624, rowCount = None))) - } - } - - private def checkLogicalRelationStats( - tableName: String, - expectedStats: Option[Statistics]): Unit = { - val df = sql(s"SELECT * FROM $tableName") - val relations = df.queryExecution.analyzed.collect { case rel: LogicalRelation => - assert(rel.catalogTable.isDefined) - expectedStats match { - case Some(es) => - assert(rel.catalogTable.get.stats.isDefined) - val stats = rel.catalogTable.get.stats.get - assert(stats.sizeInBytes === es.sizeInBytes) - assert(stats.rowCount === es.rowCount) - case None => - assert(rel.catalogTable.get.stats.isEmpty) - } - rel + val fetchedStats3 = checkStats( + textTable, isDataSourceTable = false, hasSizeInBytes = true, expectedRowCounts = None) + assert(fetchedStats3.get.sizeInBytes > fetchedStats2.get.sizeInBytes) } - assert(relations.size === 1) } test("test statistics of LogicalRelation converted from MetastoreRelation") { @@ -266,16 +271,21 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils // the default value for `spark.sql.hive.convertMetastoreParquet` is true, here we just set it // for robustness 
withSQLConf("spark.sql.hive.convertMetastoreParquet" -> "true") { - checkLogicalRelationStats(parquetTable, expectedStats = None) + checkStats( + parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None) sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS") - checkLogicalRelationStats(parquetTable, expectedStats = - Some(Statistics(sizeInBytes = 4236, rowCount = Some(500)))) + checkStats( + parquetTable, + isDataSourceTable = true, + hasSizeInBytes = true, + expectedRowCounts = Some(500)) } withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "true") { - checkLogicalRelationStats(orcTable, expectedStats = None) + checkStats( + orcTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None) sql(s"ANALYZE TABLE $orcTable COMPUTE STATISTICS") - checkLogicalRelationStats(orcTable, expectedStats = - Some(Statistics(sizeInBytes = 3023, rowCount = Some(500)))) + checkStats( + orcTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = Some(500)) } } } @@ -288,22 +298,28 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils assert(DDLUtils.isDatasourceTable(catalogTable)) sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src") - checkLogicalRelationStats(parquetTable, expectedStats = None) + checkStats( + parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None) // noscan won't count the number of rows sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan") - checkLogicalRelationStats(parquetTable, expectedStats = - Some(Statistics(sizeInBytes = 4236, rowCount = None))) + val fetchedStats1 = checkStats( + parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None) sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src") sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan") - checkLogicalRelationStats(parquetTable, expectedStats = - Some(Statistics(sizeInBytes = 8472, rowCount = None))) + val 
fetchedStats2 = checkStats( + parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None) + assert(fetchedStats2.get.sizeInBytes > fetchedStats1.get.sizeInBytes) // without noscan, we count the number of rows sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS") - checkLogicalRelationStats(parquetTable, expectedStats = - Some(Statistics(sizeInBytes = 8472, rowCount = Some(1000)))) + val fetchedStats3 = checkStats( + parquetTable, + isDataSourceTable = true, + hasSizeInBytes = true, + expectedRowCounts = Some(1000)) + assert(fetchedStats3.get.sizeInBytes == fetchedStats2.get.sizeInBytes) } } @@ -314,8 +330,11 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils val dfNoCols = spark.createDataFrame(rddNoCols, StructType(Seq.empty)) dfNoCols.write.format("json").saveAsTable(table_no_cols) sql(s"ANALYZE TABLE $table_no_cols COMPUTE STATISTICS") - checkLogicalRelationStats(table_no_cols, expectedStats = - Some(Statistics(sizeInBytes = 30, rowCount = Some(10)))) + checkStats( + table_no_cols, + isDataSourceTable = true, + hasSizeInBytes = true, + expectedRowCounts = Some(10)) } } |