about | summary | refs | log | tree | commit | diff
path: root/sql
diff options
context:
space:
mode:
author: Cheng Lian <lian@databricks.com> 2015-09-05 17:50:12 +0800
committer: Cheng Lian <lian@databricks.com> 2015-09-05 17:50:12 +0800
commit: bca8c072bd710beda6cfac1533a67f32f579b134 (patch)
tree: fc44f81bf8ad07d3ef2783c0b1a67bae722dd31b /sql
parent: 7a4f326c00fb33c384b4fb927310d687ec063329 (diff)
download: spark-bca8c072bd710beda6cfac1533a67f32f579b134.tar.gz
spark-bca8c072bd710beda6cfac1533a67f32f579b134.tar.bz2
spark-bca8c072bd710beda6cfac1533a67f32f579b134.zip
[SPARK-10434] [SQL] Fixes Parquet schema of arrays that may contain null
To keep full compatibility of Parquet write path with Spark 1.4, we should rename the innermost field name of arrays that may contain null from "array_element" to "array". Please refer to [SPARK-10434] [1] for more details.

[1]: https://issues.apache.org/jira/browse/SPARK-10434

Author: Cheng Lian <lian@databricks.com>

Closes #8586 from liancheng/spark-10434/fix-parquet-array-type.
Diffstat (limited to 'sql')
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala | 13
-rw-r--r-- sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala | 6
2 files changed, 10 insertions, 9 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala
index a21ab1dbb2..2d237da81c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala
@@ -426,13 +426,14 @@ private[parquet] class CatalystSchemaConverter(
// ArrayType and MapType (for Spark versions <= 1.4.x)
// ===================================================
- // Spark 1.4.x and prior versions convert ArrayType with nullable elements into a 3-level
- // LIST structure. This behavior mimics parquet-hive (1.6.0rc3). Note that this case is
- // covered by the backwards-compatibility rules implemented in `isElementType()`.
+ // Spark 1.4.x and prior versions convert `ArrayType` with nullable elements into a 3-level
+ // `LIST` structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro
+ // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level element
+ // field name "array" is borrowed from parquet-avro.
case ArrayType(elementType, nullable @ true) if !followParquetFormatSpec =>
// <list-repetition> group <name> (LIST) {
// optional group bag {
- // repeated <element-type> element;
+ // repeated <element-type> array;
// }
// }
ConversionPatterns.listType(
@@ -441,8 +442,8 @@ private[parquet] class CatalystSchemaConverter(
Types
.buildGroup(REPEATED)
// "array_element" is the name chosen by parquet-hive (1.7.0 and prior version)
- .addField(convertField(StructField("array_element", elementType, nullable)))
- .named(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME))
+ .addField(convertField(StructField("array", elementType, nullable)))
+ .named("bag"))
// Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level
// LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
index 28c59a4abd..5331d7c035 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
@@ -197,7 +197,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
|message root {
| optional group _1 (LIST) {
| repeated group bag {
- | optional int32 array_element;
+ | optional int32 array;
| }
| }
|}
@@ -266,7 +266,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
| optional binary _1 (UTF8);
| optional group _2 (LIST) {
| repeated group bag {
- | optional group array_element {
+ | optional group array {
| required int32 _1;
| required double _2;
| }
@@ -645,7 +645,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
"""message root {
| optional group f1 (LIST) {
| repeated group bag {
- | optional int32 array_element;
+ | optional int32 array;
| }
| }
|}