author     gatorsmile <gatorsmile@gmail.com>      2017-02-06 13:30:07 +0800
committer  Wenchen Fan <wenchen@databricks.com>   2017-02-06 13:30:07 +0800
commit     65b10ffb3883cfed5b182db20b55a52ee0d89cba (patch)
tree       848b84a6c16d373a65f7d6a11dc2ccf8daf87857 /sql/hive/src/test/scala/org
parent     317fa7508143271bc694afbb425af49378f04cac (diff)
[SPARK-19279][SQL] Infer Schema for Hive Serde Tables and Block Creating a Hive Table With an Empty Schema
### What changes were proposed in this pull request?

So far, we allow users to create a table with an empty schema: `CREATE TABLE tab1`. Allowing this could break many code paths. Thus, we should follow Hive and block it.

For Hive serde tables, some serde libraries require a user-specified schema and record it in the metastore. To get the list of such serdes, we need to check `hive.serdes.using.metastore.for.schema`, whose default value covers:

- org.apache.hadoop.hive.ql.io.orc.OrcSerde
- org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
- org.apache.hadoop.hive.serde2.dynamic_type.DynamicSerDe
- org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe
- org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe
- org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe

### How was this patch tested?

Added test cases for both Hive serde and data source tables.

Author: gatorsmile <gatorsmile@gmail.com>

Closes #16636 from gatorsmile/fixEmptyTableSchema.
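As a rough sketch of the blocked behavior, the following hypothetical spark-shell session illustrates the two failure modes exercised by the new test cases below (the table names and data path are illustrative, not taken from the patch; the expected messages come from the tests):

```scala
import org.apache.spark.sql.AnalysisException

// Blocked: no user-specified schema, and the default Hive serde
// (LazySimpleSerDe) is on the hive.serdes.using.metastore.for.schema list,
// so no schema inference is attempted. `spark` is the shell's session.
try {
  spark.sql("CREATE TABLE tab1 USING hive")
} catch {
  case e: AnalysisException =>
    // Expected: "Unable to infer the schema. The schema specification is
    // required to create the table `default`.`tab1`"
    println(e.getMessage)
}

// Blocked for the same reason even when data exists at the location,
// because the default text serde must record its schema in the metastore.
try {
  spark.sql("CREATE TABLE tab2 LOCATION '/tmp/some_json_data'")
} catch {
  case e: AnalysisException => println(e.getMessage)
}
```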
Diffstat (limited to 'sql/hive/src/test/scala/org')
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala      78
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala     84
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala   23
3 files changed, 102 insertions, 83 deletions
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
index b20c10c6a3..43b6bf5fee 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
@@ -68,82 +68,4 @@ class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
sql("DROP TABLE IF EXISTS createAndInsertTest")
}
}
-
- test("SPARK-13709: reading partitioned Avro table with nested schema") {
- withTempDir { dir =>
- val path = dir.toURI.toString
- val tableName = "spark_13709"
- val tempTableName = "spark_13709_temp"
-
- new File(dir.getAbsolutePath, tableName).mkdir()
- new File(dir.getAbsolutePath, tempTableName).mkdir()
-
- val avroSchema =
- """{
- | "name": "test_record",
- | "type": "record",
- | "fields": [ {
- | "name": "f0",
- | "type": "int"
- | }, {
- | "name": "f1",
- | "type": {
- | "type": "record",
- | "name": "inner",
- | "fields": [ {
- | "name": "f10",
- | "type": "int"
- | }, {
- | "name": "f11",
- | "type": "double"
- | } ]
- | }
- | } ]
- |}
- """.stripMargin
-
- withTable(tableName, tempTableName) {
- // Creates the external partitioned Avro table to be tested.
- sql(
- s"""CREATE EXTERNAL TABLE $tableName
- |PARTITIONED BY (ds STRING)
- |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
- |STORED AS
- | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
- | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
- |LOCATION '$path/$tableName'
- |TBLPROPERTIES ('avro.schema.literal' = '$avroSchema')
- """.stripMargin
- )
-
- // Creates a temporary Avro table used to prepare the testing Avro file.
- sql(
- s"""CREATE EXTERNAL TABLE $tempTableName
- |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
- |STORED AS
- | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
- | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
- |LOCATION '$path/$tempTableName'
- |TBLPROPERTIES ('avro.schema.literal' = '$avroSchema')
- """.stripMargin
- )
-
- // Generates Avro data.
- sql(s"INSERT OVERWRITE TABLE $tempTableName SELECT 1, STRUCT(2, 2.5)")
-
- // Adds generated Avro data as a new partition to the testing table.
- sql(s"ALTER TABLE $tableName ADD PARTITION (ds = 'foo') LOCATION '$path/$tempTableName'")
-
- // The following query fails before SPARK-13709 is fixed. This is because when reading data
- // from table partitions, the Avro deserializer needs the Avro schema, which is defined in
- // table property "avro.schema.literal". However, we only initialize the deserializer using
- // partition properties, which don't include the wanted property entry. Merging the two
- // sets of properties solves the problem.
- checkAnswer(
- sql(s"SELECT * FROM $tableName"),
- Row(1, Row(2, 2.5D), "foo")
- )
- }
- }
- }
}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
index 28b5bfd581..ca39c7e845 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
@@ -24,9 +24,8 @@ import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
import org.apache.hadoop.mapred.TextInputFormat
-import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
-import org.apache.spark.sql.{AnalysisException, Row}
+import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchPermanentFunctionException}
import org.apache.spark.sql.catalyst.catalog._
@@ -47,7 +46,7 @@ import org.apache.spark.util.{MutableURLClassLoader, Utils}
* is not fully tested.
*/
@ExtendedHiveTest
-class VersionsSuite extends SparkFunSuite with SQLTestUtils with TestHiveSingleton with Logging {
+class VersionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton with Logging {
private val clientBuilder = new HiveClientBuilder
import clientBuilder.buildClient
@@ -571,6 +570,85 @@ class VersionsSuite extends SparkFunSuite with SQLTestUtils with TestHiveSingleton with Logging {
}
}
+
+ test(s"$version: SPARK-13709: reading partitioned Avro table with nested schema") {
+ withTempDir { dir =>
+ val path = dir.toURI.toString
+ val tableName = "spark_13709"
+ val tempTableName = "spark_13709_temp"
+
+ new File(dir.getAbsolutePath, tableName).mkdir()
+ new File(dir.getAbsolutePath, tempTableName).mkdir()
+
+ val avroSchema =
+ """{
+ | "name": "test_record",
+ | "type": "record",
+ | "fields": [ {
+ | "name": "f0",
+ | "type": "int"
+ | }, {
+ | "name": "f1",
+ | "type": {
+ | "type": "record",
+ | "name": "inner",
+ | "fields": [ {
+ | "name": "f10",
+ | "type": "int"
+ | }, {
+ | "name": "f11",
+ | "type": "double"
+ | } ]
+ | }
+ | } ]
+ |}
+ """.stripMargin
+
+ withTable(tableName, tempTableName) {
+ // Creates the external partitioned Avro table to be tested.
+ sql(
+ s"""CREATE EXTERNAL TABLE $tableName
+ |PARTITIONED BY (ds STRING)
+ |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+ |STORED AS
+ | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+ | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+ |LOCATION '$path/$tableName'
+ |TBLPROPERTIES ('avro.schema.literal' = '$avroSchema')
+ """.stripMargin
+ )
+
+ // Creates a temporary Avro table used to prepare the testing Avro file.
+ sql(
+ s"""CREATE EXTERNAL TABLE $tempTableName
+ |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+ |STORED AS
+ | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+ | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+ |LOCATION '$path/$tempTableName'
+ |TBLPROPERTIES ('avro.schema.literal' = '$avroSchema')
+ """.stripMargin
+ )
+
+ // Generates Avro data.
+ sql(s"INSERT OVERWRITE TABLE $tempTableName SELECT 1, STRUCT(2, 2.5)")
+
+ // Adds generated Avro data as a new partition to the testing table.
+ sql(s"ALTER TABLE $tableName ADD PARTITION (ds = 'foo') LOCATION '$path/$tempTableName'")
+
+ // The following query fails before SPARK-13709 is fixed. This is because when reading
+ // data from table partitions, the Avro deserializer needs the Avro schema, which is
+ // defined in table property "avro.schema.literal". However, we only initialize the
+ // deserializer using partition properties, which don't include the wanted property
+ // entry. Merging the two sets of properties solves the problem.
+ checkAnswer(
+ sql(s"SELECT * FROM $tableName"),
+ Row(1, Row(2, 2.5D), "foo")
+ )
+ }
+ }
+ }
+
// TODO: add more tests.
}
}
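The comment in the relocated test describes the SPARK-13709 fix: table-level serde properties such as `avro.schema.literal` must be merged with partition-level properties before the deserializer is initialized. A minimal sketch of that merging idea (the method name and placement are illustrative, not the actual code in Spark's Hive read path):

```scala
import java.util.Properties
import scala.collection.JavaConverters._

// Merge table-level serde properties into partition-level ones so that
// properties such as "avro.schema.literal", which are recorded only on the
// table, are visible when deserializing individual partitions.
def mergedProperties(tableProps: Properties, partProps: Properties): Properties = {
  val merged = new Properties()
  tableProps.stringPropertyNames().asScala.foreach { k =>
    merged.setProperty(k, tableProps.getProperty(k))
  }
  // Partition-level values take precedence where both sides define a key.
  partProps.stringPropertyNames().asScala.foreach { k =>
    merged.setProperty(k, partProps.getProperty(k))
  }
  merged
}
```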
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
index 58be079d01..9d9f3a620d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -79,6 +79,25 @@ class HiveDDLSuite
}
}
+ test("create a hive table without schema") {
+ import testImplicits._
+ withTempPath { tempDir =>
+ withTable("tab1", "tab2") {
+ (("a", "b") :: Nil).toDF().write.json(tempDir.getCanonicalPath)
+
+ var e = intercept[AnalysisException] { sql("CREATE TABLE tab1 USING hive") }.getMessage
+ assert(e.contains("Unable to infer the schema. The schema specification is required to " +
+ "create the table `default`.`tab1`"))
+
+ e = intercept[AnalysisException] {
+ sql(s"CREATE TABLE tab2 location '${tempDir.getCanonicalPath}'")
+ }.getMessage
+ assert(e.contains("Unable to infer the schema. The schema specification is required to " +
+ "create the table `default`.`tab2`"))
+ }
+ }
+ }
+
test("drop external tables in default database") {
withTempDir { tmpDir =>
val tabName = "tab1"
@@ -199,7 +218,7 @@ class HiveDDLSuite
val e = intercept[AnalysisException] {
sql("CREATE TABLE tbl(a int) PARTITIONED BY (a string)")
}
- assert(e.message == "Found duplicate column(s) in table definition of `tbl`: a")
+ assert(e.message == "Found duplicate column(s) in table definition of `default`.`tbl`: a")
}
test("add/drop partition with location - managed table") {
@@ -1192,7 +1211,7 @@ class HiveDDLSuite
assert(e2.getMessage.contains(forbiddenPrefix + "foo"))
val e3 = intercept[AnalysisException] {
- sql(s"CREATE TABLE tbl TBLPROPERTIES ('${forbiddenPrefix}foo'='anything')")
+ sql(s"CREATE TABLE tbl (a INT) TBLPROPERTIES ('${forbiddenPrefix}foo'='anything')")
}
assert(e3.getMessage.contains(forbiddenPrefix + "foo"))
}
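The two assertion updates above reflect that error messages now qualify the table name with its database. A quick hypothetical spark-shell check of the duplicate-column case (table name illustrative):

```scala
import scala.util.Try

// Creating a table whose partition column duplicates a data column should
// fail with a message naming `default`.`tbl`, per the updated assertion.
val msg = Try(spark.sql("CREATE TABLE tbl(a INT) PARTITIONED BY (a STRING)"))
  .failed.map(_.getMessage).getOrElse("no error")
println(msg)  // Found duplicate column(s) in table definition of `default`.`tbl`: a
```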