author     Yash Datta <Yash.Datta@guavus.com>        2015-03-26 21:13:38 +0800
committer  Cheng Lian <lian@databricks.com>          2015-03-26 21:13:38 +0800
commit     1c05027a143d1b0bf3df192984e6cac752b1e926 (patch)
tree       f9ac6606175278fa2ef3896613f1fcbf654c97a1 /sql
parent     0c88ce5416d7687bc806a7655e17009ad5823d30 (diff)
[SQL][SPARK-6471]: Metastore schema should only be a subset of parquet schema to support dropping of columns using replace columns
Currently, in the ParquetRelation2 implementation, an error is thrown when the merged Parquet schema is not exactly the same as the metastore schema. To support cases like dropping a column with the REPLACE COLUMNS command, we can relax this restriction so that the query still works when the metastore schema is a subset of the merged Parquet schema.

Author: Yash Datta <Yash.Datta@guavus.com>

Closes #5141 from saucam/replace_col and squashes the following commits:

e858d5b [Yash Datta] SPARK-6471: Fix test cases, add a new test case for metastore schema to be subset of parquet schema
5f2f467 [Yash Datta] SPARK-6471: Metastore schema should only be a subset of parquet schema to support dropping of columns using replace columns
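The relaxed merge mirrors the test case added below: when the metastore schema (for example, after REPLACE COLUMNS dropped a column) has fewer fields than the Parquet files on disk, the merge now succeeds instead of tripping the size assertion. A minimal sketch, assuming Spark 1.3-era APIs; note that ParquetRelation2 is private[sql], so this only compiles from inside the org.apache.spark.sql.parquet package:

import org.apache.spark.sql.types._

// Metastore schema after a column was dropped via REPLACE COLUMNS:
// it now has fewer fields than the Parquet files on disk.
val metastoreSchema = StructType(Seq(
  StructField("uppercase", DoubleType, nullable = false)))

val parquetSchema = StructType(Seq(
  StructField("lowerCase", BinaryType),
  StructField("UPPERCase", IntegerType, nullable = true)))

// Before this patch the size mismatch tripped an assertion; now the merge
// keeps the Metastore data types but adopts the Parquet field names.
val merged = ParquetRelation2.mergeMetastoreParquetSchema(metastoreSchema, parquetSchema)
// merged == StructType(Seq(StructField("UPPERCase", DoubleType, nullable = false)))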
Diffstat (limited to 'sql')
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala         |  5
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala |  18
2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 410600b052..3516cfe680 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -758,12 +758,13 @@ private[sql] object ParquetRelation2 extends Logging {
|${parquetSchema.prettyJson}
""".stripMargin
- assert(metastoreSchema.size == parquetSchema.size, schemaConflictMessage)
+ assert(metastoreSchema.size <= parquetSchema.size, schemaConflictMessage)
val ordinalMap = metastoreSchema.zipWithIndex.map {
case (field, index) => field.name.toLowerCase -> index
}.toMap
- val reorderedParquetSchema = parquetSchema.sortBy(f => ordinalMap(f.name.toLowerCase))
+ val reorderedParquetSchema = parquetSchema.sortBy(f =>
+ ordinalMap.getOrElse(f.name.toLowerCase, metastoreSchema.size + 1))
StructType(metastoreSchema.zip(reorderedParquetSchema).map {
// Uses Parquet field names but retains Metastore data types.
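The getOrElse fallback above sorts any Parquet-only field past the end of the metastore ordinals, so the subsequent zip with metastoreSchema silently drops it. A rough illustration of that ordering using plain Scala collections (the field names are just the ones from the test case):

val ordinalMap = Map("uppercase" -> 0)                 // metastore knows only "uppercase"
val parquetFieldNames = Seq("lowerCase", "UPPERCase")  // the Parquet file still has both
val reordered = parquetFieldNames.sortBy(name =>
  ordinalMap.getOrElse(name.toLowerCase, ordinalMap.size + 1))
// reordered == Seq("UPPERCase", "lowerCase"); zipping with the one-field
// metastore schema keeps only "UPPERCase".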
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
index 321832cd43..8462f9bb2d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
@@ -212,8 +212,11 @@ class ParquetSchemaSuite extends FunSuite with ParquetTest {
StructField("UPPERCase", IntegerType, nullable = true))))
}
- // Conflicting field count
- assert(intercept[Throwable] {
+ // MetaStore schema is subset of parquet schema
+ assertResult(
+ StructType(Seq(
+ StructField("UPPERCase", DoubleType, nullable = false)))) {
+
ParquetRelation2.mergeMetastoreParquetSchema(
StructType(Seq(
StructField("uppercase", DoubleType, nullable = false))),
@@ -221,6 +224,17 @@ class ParquetSchemaSuite extends FunSuite with ParquetTest {
StructType(Seq(
StructField("lowerCase", BinaryType),
StructField("UPPERCase", IntegerType, nullable = true))))
+ }
+
+ // Conflicting field count
+ assert(intercept[Throwable] {
+ ParquetRelation2.mergeMetastoreParquetSchema(
+ StructType(Seq(
+ StructField("uppercase", DoubleType, nullable = false),
+ StructField("lowerCase", BinaryType))),
+
+ StructType(Seq(
+ StructField("UPPERCase", IntegerType, nullable = true))))
}.getMessage.contains("detected conflicting schemas"))
// Conflicting field names