author     gatorsmile <gatorsmile@gmail.com>      2017-02-06 13:30:07 +0800
committer  Wenchen Fan <wenchen@databricks.com>   2017-02-06 13:30:07 +0800
commit     65b10ffb3883cfed5b182db20b55a52ee0d89cba (patch)
tree       848b84a6c16d373a65f7d6a11dc2ccf8daf87857 /sql/hive/src/test/scala/org
parent     317fa7508143271bc694afbb425af49378f04cac (diff)
[SPARK-19279][SQL] Infer Schema for Hive Serde Tables and Block Creating a Hive Table With an Empty Schema
### What changes were proposed in this pull request?

So far, we allow users to create a table with an empty schema: `CREATE TABLE tab1`. Allowing this could break many code paths. Thus, we should follow Hive and block it.

For Hive serde tables, some serde libraries require a user-specified schema and record it in the metastore. To get the list of such serdes, we need to check `hive.serdes.using.metastore.for.schema`, whose default value covers:

- org.apache.hadoop.hive.ql.io.orc.OrcSerde
- org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
- org.apache.hadoop.hive.serde2.dynamic_type.DynamicSerDe
- org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe
- org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe
- org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
- org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe

### How was this patch tested?

Added test cases for both Hive serde and data source tables.

Author: gatorsmile <gatorsmile@gmail.com>

Closes #16636 from gatorsmile/fixEmptyTableSchema.
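As a rough sketch of the blocked behavior, the following hypothetical spark-shell session illustrates the two failure modes exercised by the new test cases below (the table names and data path are illustrative, not taken from the patch; the expected messages come from the tests):

```scala
import org.apache.spark.sql.AnalysisException

// Blocked: no user-specified schema, and the default Hive serde
// (LazySimpleSerDe) is on the hive.serdes.using.metastore.for.schema list,
// so no schema inference is attempted. `spark` is the shell's session.
try {
  spark.sql("CREATE TABLE tab1 USING hive")
} catch {
  case e: AnalysisException =>
    // Expected: "Unable to infer the schema. The schema specification is
    // required to create the table `default`.`tab1`"
    println(e.getMessage)
}

// Blocked for the same reason even when data exists at the location,
// because the default text serde must record its schema in the metastore.
try {
  spark.sql("CREATE TABLE tab2 LOCATION '/tmp/some_json_data'")
} catch {
  case e: AnalysisException => println(e.getMessage)
}
```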
Diffstat (limited to 'sql/hive/src/test/scala/org')
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala      78
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala     84
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala   23
3 files changed, 102 insertions, 83 deletions
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
index b20c10c6a3..43b6bf5fee 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
@@ -68,82 +68,4 @@ class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
sql("DROP TABLE IF EXISTS createAndInsertTest")
}
}
-
- test("SPARK-13709: reading partitioned Avro table with nested schema") {
- withTempDir { dir =>
- val path = dir.toURI.toString
- val tableName = "spark_13709"
- val tempTableName = "spark_13709_temp"
-
- new File(dir.getAbsolutePath, tableName).mkdir()
- new File(dir.getAbsolutePath, tempTableName).mkdir()
-
- val avroSchema =
- """{
- | "name": "test_record",
- | "type": "record",
- | "fields": [ {
- | "name": "f0",
- | "type": "int"
- | }, {
- | "name": "f1",
- | "type": {
- | "type": "record",
- | "name": "inner",
- | "fields": [ {
- | "name": "f10",
- | "type": "int"
- | }, {
- | "name": "f11",
- | "type": "double"
- | } ]
- | }
- | } ]
- |}
- """.stripMargin
-
- withTable(tableName, tempTableName) {
- // Creates the external partitioned Avro table to be tested.
- sql(
- s"""CREATE EXTERNAL TABLE $tableName
- |PARTITIONED BY (ds STRING)
- |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
- |STORED AS
- | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
- | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
- |LOCATION '$path/$tableName'
- |TBLPROPERTIES ('avro.schema.literal' = '$avroSchema')
- """.stripMargin
- )
-
- // Creates a temporary Avro table used to prepare the testing Avro file.
- sql(
- s"""CREATE EXTERNAL TABLE $tempTableName
- |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
- |STORED AS
- | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
- | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
- |LOCATION '$path/$tempTableName'
- |TBLPROPERTIES ('avro.schema.literal' = '$avroSchema')
- """.stripMargin
- )
-
- // Generates Avro data.
- sql(s"INSERT OVERWRITE TABLE $tempTableName SELECT 1, STRUCT(2, 2.5)")
-
- // Adds generated Avro data as a new partition to the testing table.
- sql(s"ALTER TABLE $tableName ADD PARTITION (ds = 'foo') LOCATION '$path/$tempTableName'")
-
- // The following query fails before SPARK-13709 is fixed. This is because when reading data
- // from table partitions, the Avro deserializer needs the Avro schema, which is defined in
- // table property "avro.schema.literal". However, we only initialize the deserializer using
- // partition properties, which don't include the wanted property entry. Merging the two
- // sets of properties solves the problem.
- checkAnswer(
- sql(s"SELECT * FROM $tableName"),
- Row(1, Row(2, 2.5D), "foo")
- )
- }
- }
- }
}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
index 28b5bfd581..ca39c7e845 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
@@ -24,9 +24,8 @@ import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
import org.apache.hadoop.mapred.TextInputFormat
-import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
-import org.apache.spark.sql.{AnalysisException, Row}
+import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchPermanentFunctionException}
import org.apache.spark.sql.catalyst.catalog._
@@ -47,7 +46,7 @@ import org.apache.spark.util.{MutableURLClassLoader, Utils}
* is not fully tested.
*/
@ExtendedHiveTest
-class VersionsSuite extends SparkFunSuite with SQLTestUtils with TestHiveSingleton with Logging {
+class VersionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton with Logging {
private val clientBuilder = new HiveClientBuilder
import clientBuilder.buildClient
@@ -571,6 +570,85 @@ class VersionsSuite extends SparkFunSuite with SQLTestUtils with TestHiveSingleton with Logging {
}
}
+
+ test(s"$version: SPARK-13709: reading partitioned Avro table with nested schema") {
+ withTempDir { dir =>
+ val path = dir.toURI.toString
+ val tableName = "spark_13709"
+ val tempTableName = "spark_13709_temp"
+
+ new File(dir.getAbsolutePath, tableName).mkdir()
+ new File(dir.getAbsolutePath, tempTableName).mkdir()
+
+ val avroSchema =
+ """{
+ | "name": "test_record",
+ | "type": "record",
+ | "fields": [ {
+ | "name": "f0",
+ | "type": "int"
+ | }, {
+ | "name": "f1",
+ | "type": {
+ | "type": "record",
+ | "name": "inner",
+ | "fields": [ {
+ | "name": "f10",
+ | "type": "int"
+ | }, {
+ | "name": "f11",
+ | "type": "double"
+ | } ]
+ | }
+ | } ]
+ |}
+ """.stripMargin
+
+ withTable(tableName, tempTableName) {
+ // Creates the external partitioned Avro table to be tested.
+ sql(
+ s"""CREATE EXTERNAL TABLE $tableName
+ |PARTITIONED BY (ds STRING)
+ |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+ |STORED AS
+ | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+ | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+ |LOCATION '$path/$tableName'
+ |TBLPROPERTIES ('avro.schema.literal' = '$avroSchema')
+ """.stripMargin
+ )
+
+ // Creates a temporary Avro table used to prepare the testing Avro file.
+ sql(
+ s"""CREATE EXTERNAL TABLE $tempTableName
+ |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+ |STORED AS
+ | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+ | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+ |LOCATION '$path/$tempTableName'
+ |TBLPROPERTIES ('avro.schema.literal' = '$avroSchema')
+ """.stripMargin
+ )
+
+ // Generates Avro data.
+ sql(s"INSERT OVERWRITE TABLE $tempTableName SELECT 1, STRUCT(2, 2.5)")
+
+ // Adds generated Avro data as a new partition to the testing table.
+ sql(s"ALTER TABLE $tableName ADD PARTITION (ds = 'foo') LOCATION '$path/$tempTableName'")
+
+ // The following query fails before SPARK-13709 is fixed. This is because when reading
+ // data from table partitions, the Avro deserializer needs the Avro schema, which is
+ // defined in table property "avro.schema.literal". However, we only initialize the
+ // deserializer using partition properties, which don't include the wanted property
+ // entry. Merging the two sets of properties solves the problem.
+ checkAnswer(
+ sql(s"SELECT * FROM $tableName"),
+ Row(1, Row(2, 2.5D), "foo")
+ )
+ }
+ }
+ }
+
// TODO: add more tests.
}
}
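The comment in the relocated test describes the SPARK-13709 fix: table-level serde properties such as `avro.schema.literal` must be merged with partition-level properties before the deserializer is initialized. A minimal sketch of that merging idea (the method name and placement are illustrative, not the actual code in Spark's Hive read path):

```scala
import java.util.Properties
import scala.collection.JavaConverters._

// Merge table-level serde properties into partition-level ones so that
// properties such as "avro.schema.literal", which are recorded only on the
// table, are visible when deserializing individual partitions.
def mergedProperties(tableProps: Properties, partProps: Properties): Properties = {
  val merged = new Properties()
  tableProps.stringPropertyNames().asScala.foreach { k =>
    merged.setProperty(k, tableProps.getProperty(k))
  }
  // Partition-level values take precedence where both sides define a key.
  partProps.stringPropertyNames().asScala.foreach { k =>
    merged.setProperty(k, partProps.getProperty(k))
  }
  merged
}
```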
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
index 58be079d01..9d9f3a620d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -79,6 +79,25 @@ class HiveDDLSuite
}
}
+ test("create a hive table without schema") {
+ import testImplicits._
+ withTempPath { tempDir =>
+ withTable("tab1", "tab2") {
+ (("a", "b") :: Nil).toDF().write.json(tempDir.getCanonicalPath)
+
+ var e = intercept[AnalysisException] { sql("CREATE TABLE tab1 USING hive") }.getMessage
+ assert(e.contains("Unable to infer the schema. The schema specification is required to " +
+ "create the table `default`.`tab1`"))
+
+ e = intercept[AnalysisException] {
+ sql(s"CREATE TABLE tab2 location '${tempDir.getCanonicalPath}'")
+ }.getMessage
+ assert(e.contains("Unable to infer the schema. The schema specification is required to " +
+ "create the table `default`.`tab2`"))
+ }
+ }
+ }
+
test("drop external tables in default database") {
withTempDir { tmpDir =>
val tabName = "tab1"
@@ -199,7 +218,7 @@ class HiveDDLSuite
val e = intercept[AnalysisException] {
sql("CREATE TABLE tbl(a int) PARTITIONED BY (a string)")
}
- assert(e.message == "Found duplicate column(s) in table definition of `tbl`: a")
+ assert(e.message == "Found duplicate column(s) in table definition of `default`.`tbl`: a")
}
test("add/drop partition with location - managed table") {
@@ -1192,7 +1211,7 @@ class HiveDDLSuite
assert(e2.getMessage.contains(forbiddenPrefix + "foo"))
val e3 = intercept[AnalysisException] {
- sql(s"CREATE TABLE tbl TBLPROPERTIES ('${forbiddenPrefix}foo'='anything')")
+ sql(s"CREATE TABLE tbl (a INT) TBLPROPERTIES ('${forbiddenPrefix}foo'='anything')")
}
assert(e3.getMessage.contains(forbiddenPrefix + "foo"))
}
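The two assertion updates above reflect that error messages now qualify the table name with its database. A quick hypothetical spark-shell check of the duplicate-column case (table name illustrative):

```scala
import scala.util.Try

// Creating a table whose partition column duplicates a data column should
// fail with a message naming `default`.`tbl`, per the updated assertion.
val msg = Try(spark.sql("CREATE TABLE tbl(a INT) PARTITIONED BY (a STRING)"))
  .failed.map(_.getMessage).getOrElse("no error")
println(msg)  // Found duplicate column(s) in table definition of `default`.`tbl`: a
```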