[SPARK-19611][SQL] Preserve metastore field order when merging inferred schema

## What changes were proposed in this pull request?

The `HiveMetastoreCatalog.mergeWithMetastoreSchema()` method added in #16944 may not preserve the same field order as the metastore schema in some cases, which can cause queries to fail. This change ensures that the metastore field order is preserved.

## How was this patch tested?

A test ensuring that metastore order is preserved was added to `HiveSchemaInferenceSuite`. The particular failure use case from #16944 was tested manually as well.

Author: Budde <budde@amazon.com>

Closes #17249 from budde/PreserveMetastoreFieldOrder.
author: Budde <budde@amazon.com> 2017-03-10 15:18:37 -0800
committer: Wenchen Fan <wenchen@databricks.com> 2017-03-10 15:18:37 -0800
commit: bc30351404d8bc610cbae65fdc12ca613e7735c6 (patch)
tree: fa740da8d1644fea3a3bf6c9b37dc5d9573e9302
parent: 8f0490e22b4c7f1fdf381c70c5894d46b7f7e6fb (diff)
download: spark-bc30351404d8bc610cbae65fdc12ca613e7735c6.tar.gz
spark-bc30351404d8bc610cbae65fdc12ca613e7735c6.tar.bz2
spark-bc30351404d8bc610cbae65fdc12ca613e7735c6.zip
2 files changed, 22 insertions, 4 deletions
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 056af49559..9f0d1ceb28 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -356,13 +356,10 @@ private[hive] object HiveMetastoreCatalog {
       .filterKeys(!inferredSchema.map(_.name.toLowerCase).contains(_))
       .values
       .filter(_.nullable)
-
     // Merge missing nullable fields to inferred schema and build a case-insensitive field map.
     val inferredFields = StructType(inferredSchema ++ missingNullables)
       .map(f => f.name.toLowerCase -> f).toMap
-    StructType(metastoreFields.map { case(name, field) =>
-      field.copy(name = inferredFields(name).name)
-    }.toSeq)
+    StructType(metastoreSchema.map(f => f.copy(name = inferredFields(f.name).name)))
   } catch {
     case NonFatal(_) =>
       val msg = s"""Detected conflicting schemas when merging the schema obtained from the Hive
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala
index 7895580381..e48ce2304d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala
@@ -293,6 +293,27 @@ class HiveSchemaInferenceSuite
           StructField("firstField", StringType, nullable = true),
           StructField("secondField", StringType, nullable = true))))
     }.getMessage.contains("Detected conflicting schemas"))
+
+    // Schema merge should maintain metastore order.
+    assertResult(
+      StructType(Seq(
+        StructField("first_field", StringType, nullable = true),
+        StructField("second_field", StringType, nullable = true),
+        StructField("third_field", StringType, nullable = true),
+        StructField("fourth_field", StringType, nullable = true),
+        StructField("fifth_field", StringType, nullable = true)))) {
+      HiveMetastoreCatalog.mergeWithMetastoreSchema(
+        StructType(Seq(
+          StructField("first_field", StringType, nullable = true),
+          StructField("second_field", StringType, nullable = true),
+          StructField("third_field", StringType, nullable = true),
+          StructField("fourth_field", StringType, nullable = true),
+          StructField("fifth_field", StringType, nullable = true))),
+        StructType(Seq(
+          StructField("fifth_field", StringType, nullable = true),
+          StructField("third_field", StringType, nullable = true),
+          StructField("second_field", StringType, nullable = true))))
+    }
   }
 }
author	Budde <budde@amazon.com>	2017-03-10 15:18:37 -0800
committer	Wenchen Fan <wenchen@databricks.com>	2017-03-10 15:18:37 -0800
commit	bc30351404d8bc610cbae65fdc12ca613e7735c6 (patch)
tree	fa740da8d1644fea3a3bf6c9b37dc5d9573e9302
parent	8f0490e22b4c7f1fdf381c70c5894d46b7f7e6fb (diff)
download	spark-bc30351404d8bc610cbae65fdc12ca613e7735c6.tar.gz spark-bc30351404d8bc610cbae65fdc12ca613e7735c6.tar.bz2 spark-bc30351404d8bc610cbae65fdc12ca613e7735c6.zip