author    Wenchen Fan <wenchen@databricks.com>  2017-01-21 13:57:50 +0800
committer Wenchen Fan <wenchen@databricks.com>  2017-01-21 13:57:50 +0800
commit    3c2ba9fcc493504c9e7d3caf0b93256ca299cbfe (patch)
tree      6ed86e1626f75b5672c9ba451db059dcde664a55 /sql/hive
parent    f174cdc7478d0b81f9cfa896284a5ec4c6bb952d (diff)
[SPARK-19305][SQL] partitioned table should always put partition columns at the end of table schema
## What changes were proposed in this pull request?

For data source tables, we always reorder the specified table schema, or the output of the query in CTAS, to put partition columns at the end. e.g. `CREATE TABLE t(a int, b int, c int, d int) USING parquet PARTITIONED BY (d, b)` will create a table with schema `<a, c, d, b>`.

Hive serde tables did not have this problem before, because their CREATE TABLE syntax specifies the data schema and the partition schema separately. However, after unifying the CREATE TABLE syntax, Hive serde tables also need this reordering. This PR puts the reorder logic in an analyzer rule, which works for both data source tables and Hive serde tables (see the sketch below).

## How was this patch tested?

New regression test.

Author: Wenchen Fan <wenchen@databricks.com>

Closes #16655 from cloud-fan/schema.
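As an illustration of the reordering described above, here is a minimal self-contained sketch; `reorderSchema` is a hypothetical helper for exposition, not the actual analyzer rule added by this patch:

```scala
// Hypothetical sketch: move the partition columns to the end of the schema,
// keeping them in the order they were listed in PARTITIONED BY.
def reorderSchema(schema: Seq[String], partitionCols: Seq[String]): Seq[String] = {
  val dataCols = schema.filterNot(partitionCols.contains)
  dataCols ++ partitionCols
}

// reorderSchema(Seq("a", "b", "c", "d"), Seq("d", "b"))
// returns Seq("a", "c", "d", "b")
```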
Diffstat (limited to 'sql/hive')
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala | 30
1 file changed, 30 insertions(+), 0 deletions(-)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
index edef30823b..7f58603d32 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -1384,4 +1384,34 @@ class HiveDDLSuite
       assert(e2.message.contains("Hive data source can only be used with tables"))
     }
   }
+
+  test("partitioned table should always put partition columns at the end of table schema") {
+    def getTableColumns(tblName: String): Seq[String] = {
+      spark.sessionState.catalog.getTableMetadata(TableIdentifier(tblName)).schema.map(_.name)
+    }
+
+    withTable("t", "t1", "t2", "t3", "t4") {
+      sql("CREATE TABLE t(a int, b int, c int, d int) USING parquet PARTITIONED BY (d, b)")
+      assert(getTableColumns("t") == Seq("a", "c", "d", "b"))
+
+      sql("CREATE TABLE t1 USING parquet PARTITIONED BY (d, b) AS SELECT 1 a, 1 b, 1 c, 1 d")
+      assert(getTableColumns("t1") == Seq("a", "c", "d", "b"))
+
+      Seq((1, 1, 1, 1)).toDF("a", "b", "c", "d").write.partitionBy("d", "b").saveAsTable("t2")
+      assert(getTableColumns("t2") == Seq("a", "c", "d", "b"))
+
+      withTempPath { path =>
+        val dataPath = new File(new File(path, "d=1"), "b=1").getCanonicalPath
+        Seq(1 -> 1).toDF("a", "c").write.save(dataPath)
+
+        sql(s"CREATE TABLE t3 USING parquet LOCATION '${path.getCanonicalPath}'")
+        assert(getTableColumns("t3") == Seq("a", "c", "d", "b"))
+      }
+
+      sql("CREATE TABLE t4(a int, b int, c int, d int) USING hive PARTITIONED BY (d, b)")
+      assert(getTableColumns("t4") == Seq("a", "c", "d", "b"))
+
+      // TODO: add test for creating partitioned hive serde table as select, once we support it.
+    }
+  }
 }