aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Budde <budde@amazon.com> 2017-03-10 15:18:37 -0800
committer Wenchen Fan <wenchen@databricks.com> 2017-03-10 15:18:37 -0800
commitbc30351404d8bc610cbae65fdc12ca613e7735c6 (patch)
treefa740da8d1644fea3a3bf6c9b37dc5d9573e9302
parent8f0490e22b4c7f1fdf381c70c5894d46b7f7e6fb (diff)
downloadspark-bc30351404d8bc610cbae65fdc12ca613e7735c6.tar.gz
spark-bc30351404d8bc610cbae65fdc12ca613e7735c6.tar.bz2
spark-bc30351404d8bc610cbae65fdc12ca613e7735c6.zip
[SPARK-19611][SQL] Preserve metastore field order when merging inferred schema
## What changes were proposed in this pull request? The ```HiveMetastoreCatalog.mergeWithMetastoreSchema()``` method added in #16944 may not preserve the same field order as the metastore schema in some cases, which can cause queries to fail. This change ensures that the metastore field order is preserved. ## How was this patch tested? A test for ensuring that metastore order is preserved was added to ```HiveSchemaInferenceSuite```. The particular failure use case from #16944 was tested manually as well. Author: Budde <budde@amazon.com> Closes #17249 from budde/PreserveMetastoreFieldOrder.
-rw-r--r-- sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala 5
-rw-r--r-- sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala 21
2 files changed, 22 insertions, 4 deletions
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 056af49559..9f0d1ceb28 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -356,13 +356,10 @@ private[hive] object HiveMetastoreCatalog {
.filterKeys(!inferredSchema.map(_.name.toLowerCase).contains(_))
.values
.filter(_.nullable)
-
// Merge missing nullable fields to inferred schema and build a case-insensitive field map.
val inferredFields = StructType(inferredSchema ++ missingNullables)
.map(f => f.name.toLowerCase -> f).toMap
- StructType(metastoreFields.map { case(name, field) =>
- field.copy(name = inferredFields(name).name)
- }.toSeq)
+ StructType(metastoreSchema.map(f => f.copy(name = inferredFields(f.name).name)))
} catch {
case NonFatal(_) =>
val msg = s"""Detected conflicting schemas when merging the schema obtained from the Hive
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala
index 7895580381..e48ce2304d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala
@@ -293,6 +293,27 @@ class HiveSchemaInferenceSuite
StructField("firstField", StringType, nullable = true),
StructField("secondField", StringType, nullable = true))))
}.getMessage.contains("Detected conflicting schemas"))
+
+ // Schema merge should maintain metastore order.
+ assertResult(
+ StructType(Seq(
+ StructField("first_field", StringType, nullable = true),
+ StructField("second_field", StringType, nullable = true),
+ StructField("third_field", StringType, nullable = true),
+ StructField("fourth_field", StringType, nullable = true),
+ StructField("fifth_field", StringType, nullable = true)))) {
+ HiveMetastoreCatalog.mergeWithMetastoreSchema(
+ StructType(Seq(
+ StructField("first_field", StringType, nullable = true),
+ StructField("second_field", StringType, nullable = true),
+ StructField("third_field", StringType, nullable = true),
+ StructField("fourth_field", StringType, nullable = true),
+ StructField("fifth_field", StringType, nullable = true))),
+ StructType(Seq(
+ StructField("fifth_field", StringType, nullable = true),
+ StructField("third_field", StringType, nullable = true),
+ StructField("second_field", StringType, nullable = true))))
+ }
}
}