From 09829be621f0f9bb5076abb3d832925624699fa9 Mon Sep 17 00:00:00 2001
From: Xiao Li
Date: Wed, 8 Mar 2017 23:12:10 -0800
Subject: [SPARK-19235][SQL][TESTS] Enable Test Cases in DDLSuite with Hive Metastore

### What changes were proposed in this pull request?

So far, the test cases in DDLSuite only verify the behavior of InMemoryCatalog; they do not cover the scenarios that use HiveExternalCatalog. Thus, we need to improve the existing test suite so that these cases also run against the Hive metastore. While porting these test cases, a bug in `SET LOCATION` was found: `path` is not set when the location is changed.

This PR makes the following changes:

- `DDLSuite` becomes an abstract class. Both `InMemoryCatalogedDDLSuite` and `HiveCatalogedDDLSuite` extend it (see the sketch at the end of this description). `InMemoryCatalogedDDLSuite` uses `InMemoryCatalog`, while `HiveCatalogedDDLSuite` uses `HiveExternalCatalog`.
- `InMemoryCatalogedDDLSuite` contains all the existing test cases in `DDLSuite`.
- `HiveCatalogedDDLSuite` contains a subset of `DDLSuite`. The following test cases are excluded:

1. The following test cases only make sense for `InMemoryCatalog`:
```
test("desc table for parquet data source table using in-memory catalog")
test("create a managed Hive source table")
test("create an external Hive source table")
test("Create Hive Table As Select")
```

2. The following test cases cannot be ported yet, because the table provider cannot be altered when using the Hive metastore. Future PRs should improve these test cases so that altering the table provider is not needed:
```
test("alter table: set location (datasource table)")
test("alter table: set properties (datasource table)")
test("alter table: unset properties (datasource table)")
test("alter table: set serde (datasource table)")
test("alter table: set serde partition (datasource table)")
test("alter table: change column (datasource table)")
test("alter table: add partition (datasource table)")
test("alter table: drop partition (datasource table)")
test("alter table: rename partition (datasource table)")
test("drop table - data source table")
```

**TODO**: in future PRs, we need to remove `HiveDDLSuite` and move its test cases to `DDLSuite`, `InMemoryCatalogedDDLSuite`, or `HiveCatalogedDDLSuite`.

### How was this patch tested?
N/A

Author: Xiao Li
Author: gatorsmile

Closes #16592 from gatorsmile/refactorDDLSuite.
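For reference, here is a minimal sketch of the suite hierarchy described above. It is illustrative only: the class names and the `generateTable`/`normalizeCatalogTable` hooks come from this patch, but the `SharedSQLContext` mixin, the single package declaration, and the `???` bodies are simplifying assumptions; in the actual patch `DDLSuite` lives in the sql/core tests and `HiveCatalogedDDLSuite` in the sql/hive tests, as shown in the diff below.

```
package org.apache.spark.sql.execution.command

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils}

// Shared suite: the DDL test cases are written once here, against these hooks.
abstract class DDLSuite extends QueryTest with SQLTestUtils {
  // Each concrete suite decides how a table is created for the shared tests.
  protected def generateTable(catalog: SessionCatalog, name: TableIdentifier): CatalogTable

  // Catalog-specific cleanup of CatalogTable metadata before comparisons;
  // the in-memory catalog needs no normalization by default.
  protected def normalizeCatalogTable(table: CatalogTable): CatalogTable = table
}

// Runs the shared tests against InMemoryCatalog (the default catalog of the test session).
class InMemoryCatalogedDDLSuite extends DDLSuite with SharedSQLContext {
  protected override def generateTable(
      catalog: SessionCatalog,
      name: TableIdentifier): CatalogTable = ??? // builds a data source table in the real suite
}

// Runs the same shared tests against HiveExternalCatalog via the Hive test singleton;
// its real generateTable and normalizeCatalogTable are in the diff below.
class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton {
  protected override def generateTable(
      catalog: SessionCatalog,
      name: TableIdentifier): CatalogTable = ??? // builds a Hive SerDe table in the real suite
}
```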
---
 .../spark/sql/hive/execution/HiveDDLSuite.scala | 157 ++++++++++-----
 1 file changed, 75 insertions(+), 82 deletions(-)

(limited to 'sql/hive')

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
index 10d929a4a0..fce055048d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -27,16 +27,88 @@ import org.scalatest.BeforeAndAfterEach
 import org.apache.spark.SparkException
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode}
 import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, TableAlreadyExistsException}
-import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogTable, CatalogTableType, CatalogUtils, ExternalCatalogUtils}
+import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.execution.command.DDLUtils
+import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils}
 import org.apache.spark.sql.hive.HiveExternalCatalog
 import org.apache.spark.sql.hive.orc.OrcFileOperator
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
 import org.apache.spark.sql.test.SQLTestUtils
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.{MetadataBuilder, StructType}
+
+// TODO(gatorsmile): combine HiveCatalogedDDLSuite and HiveDDLSuite
+class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeAndAfterEach {
+  override def afterEach(): Unit = {
+    try {
+      // drop all databases, tables and functions after each test
+      spark.sessionState.catalog.reset()
+    } finally {
+      super.afterEach()
+    }
+  }
+
+  protected override def generateTable(
+      catalog: SessionCatalog,
+      name: TableIdentifier): CatalogTable = {
+    val storage =
+      CatalogStorageFormat(
+        locationUri = Some(catalog.defaultTablePath(name)),
+        inputFormat = Some("org.apache.hadoop.mapred.SequenceFileInputFormat"),
+        outputFormat = Some("org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat"),
+        serde = Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"),
+        compressed = false,
+        properties = Map("serialization.format" -> "1"))
+    val metadata = new MetadataBuilder()
+      .putString("key", "value")
+      .build()
+    CatalogTable(
+      identifier = name,
+      tableType = CatalogTableType.EXTERNAL,
+      storage = storage,
+      schema = new StructType()
+        .add("col1", "int", nullable = true, metadata = metadata)
+        .add("col2", "string")
+        .add("a", "int")
+        .add("b", "int"),
+      provider = Some("hive"),
+      partitionColumnNames = Seq("a", "b"),
+      createTime = 0L,
+      tracksPartitionsInCatalog = true)
+  }
+
+  protected override def normalizeCatalogTable(table: CatalogTable): CatalogTable = {
+    val nondeterministicProps = Set(
+      "CreateTime",
+      "transient_lastDdlTime",
+      "grantTime",
+      "lastUpdateTime",
+      "last_modified_by",
+      "last_modified_time",
+      "Owner:",
+      "COLUMN_STATS_ACCURATE",
+      // The following are hive specific schema parameters which we do not need to match exactly.
+      "numFiles",
+      "numRows",
+      "rawDataSize",
+      "totalSize",
+      "totalNumberFiles",
+      "maxFileSize",
+      "minFileSize"
+    )
+
+    table.copy(
+      createTime = 0L,
+      lastAccessTime = 0L,
+      owner = "",
+      properties = table.properties.filterKeys(!nondeterministicProps.contains(_)),
+      // View texts are checked separately
+      viewText = None
+    )
+  }
+
+}
 
 class HiveDDLSuite
   extends QueryTest with SQLTestUtils with TestHiveSingleton with BeforeAndAfterEach {
@@ -1719,61 +1791,6 @@ class HiveDDLSuite
     }
   }
 
-  Seq("a b", "a:b", "a%b").foreach { specialChars =>
-    test(s"datasource table: location uri contains $specialChars") {
-      withTable("t", "t1") {
-        withTempDir { dir =>
-          val loc = new File(dir, specialChars)
-          loc.mkdir()
-          spark.sql(
-            s"""
-               |CREATE TABLE t(a string)
-               |USING parquet
-               |LOCATION '$loc'
-             """.stripMargin)
-
-          val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
-          assert(table.location == new Path(loc.getAbsolutePath).toUri)
-          assert(new Path(table.location).toString.contains(specialChars))
-
-          assert(loc.listFiles().isEmpty)
-          spark.sql("INSERT INTO TABLE t SELECT 1")
-          assert(loc.listFiles().length >= 1)
-          checkAnswer(spark.table("t"), Row("1") :: Nil)
-        }
-
-        withTempDir { dir =>
-          val loc = new File(dir, specialChars)
-          loc.mkdir()
-          spark.sql(
-            s"""
-               |CREATE TABLE t1(a string, b string)
-               |USING parquet
-               |PARTITIONED BY(b)
-               |LOCATION '$loc'
-             """.stripMargin)
-
-          val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t1"))
-          assert(table.location == new Path(loc.getAbsolutePath).toUri)
-          assert(new Path(table.location).toString.contains(specialChars))
-
-          assert(loc.listFiles().isEmpty)
-          spark.sql("INSERT INTO TABLE t1 PARTITION(b=2) SELECT 1")
-          val partFile = new File(loc, "b=2")
-          assert(partFile.listFiles().length >= 1)
-          checkAnswer(spark.table("t1"), Row("1", "2") :: Nil)
-
-          spark.sql("INSERT INTO TABLE t1 PARTITION(b='2017-03-03 12:13%3A14') SELECT 1")
-          val partFile1 = new File(loc, "b=2017-03-03 12:13%3A14")
-          assert(!partFile1.exists())
-          val partFile2 = new File(loc, "b=2017-03-03 12%3A13%253A14")
-          assert(partFile2.listFiles().length >= 1)
-          checkAnswer(spark.table("t1"), Row("1", "2") :: Row("1", "2017-03-03 12:13%3A14") :: Nil)
-        }
-      }
-    }
-  }
-
   Seq("a b", "a:b", "a%b").foreach { specialChars =>
     test(s"hive table: location uri contains $specialChars") {
       withTable("t") {
@@ -1848,28 +1865,4 @@ class HiveDDLSuite
       }
     }
   }
-
-  Seq("a b", "a:b", "a%b").foreach { specialChars =>
-    test(s"location uri contains $specialChars for database") {
-      try {
-        withTable("t") {
-          withTempDir { dir =>
-            val loc = new File(dir, specialChars)
-            spark.sql(s"CREATE DATABASE tmpdb LOCATION '$loc'")
-            spark.sql("USE tmpdb")
-
-            Seq(1).toDF("a").write.saveAsTable("t")
-            val tblloc = new File(loc, "t")
-            val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
-            val tblPath = new Path(tblloc.getAbsolutePath)
-            val fs = tblPath.getFileSystem(spark.sessionState.newHadoopConf())
-            assert(table.location == makeQualifiedPath(tblloc.getAbsolutePath))
-            assert(tblloc.listFiles().nonEmpty)
-          }
-        }
-      } finally {
-        spark.sql("DROP DATABASE IF EXISTS tmpdb")
-      }
-    }
-  }
 }
--
cgit v1.2.3