author    Kevin Yu <qyu@us.ibm.com>          2016-06-09 09:50:09 -0700
committer Wenchen Fan <wenchen@databricks.com>  2016-06-09 09:50:09 -0700
commit    99386fe3989f758844de14b2c28eccfdf8163221 (patch)
tree      bc78843acc3dbe4ccc352e7313cdc151a6e61778
parent    147c020823080c60b495f7950629d8134bf895db (diff)
[SPARK-15804][SQL] Include metadata in the toStructType
## What changes were proposed in this pull request?

The helper function `toStructType` in the `AttributeSeq` class does not include the metadata when it builds each `StructField`, which causes the problem reported in https://issues.apache.org/jira/browse/SPARK-15804?jql=project%20%3D%20SPARK when Spark writes a DataFrame that carries metadata to the Parquet data source. On that code path, Spark writes the DataFrame through `InsertIntoHadoopFsRelationCommand`, builds the `WriteRelation` container, and calls the helper function `toStructType` to create the `StructType` that holds the `StructField`s. The metadata must be included there; otherwise, the user-provided metadata is lost.

## How was this patch tested?

Added a test case in `ParquetQuerySuite.scala`.

Author: Kevin Yu <qyu@us.ibm.com>

Closes #13555 from kevinyu98/spark-15804.
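For readers unfamiliar with column metadata, the sketch below shows how a user attaches metadata to a column with `MetadataBuilder` and `Column.as(alias, metadata)`, which is the kind of DataFrame that triggered the bug. It is a minimal, self-contained example; the object name, app name, and the metadata key/value are illustrative, not from this commit.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.MetadataBuilder

object ColumnMetadataSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("md-sketch").getOrCreate()
    import spark.implicits._

    // Attach user metadata to column `a` via Column.as(alias, metadata).
    val md = new MetadataBuilder().putString("comment", "customer id").build()
    val df = Seq((1, "abc"), (2, "hello")).toDF("a", "b")
    val dfWithMeta = df.select($"a".as("a", md), $"b")

    // The metadata is visible on the DataFrame schema. Before this fix it was
    // dropped when the plan's attributes were converted back to a StructType
    // on the Parquet write path.
    assert(dfWithMeta.schema("a").metadata.getString("comment") == "customer id")
    spark.stop()
  }
}
```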
-rw-r--r--  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala              |  2
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala | 15
2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
index 81f5bb4a65..a6125c61e5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
@@ -91,7 +91,7 @@ package object expressions {
   implicit class AttributeSeq(val attrs: Seq[Attribute]) extends Serializable {
     /** Creates a StructType with a schema matching this `Seq[Attribute]`. */
     def toStructType: StructType = {
-      StructType(attrs.map(a => StructField(a.name, a.dataType, a.nullable)))
+      StructType(attrs.map(a => StructField(a.name, a.dataType, a.nullable, a.metadata)))
     }
 
     // It's possible that `attrs` is a linked list, which can lead to bad O(n^2) loops when
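The one-line fix works because `StructField`'s fourth constructor parameter defaults to `Metadata.empty`, so the old three-argument call silently discarded whatever metadata the attribute carried. A minimal sketch of the difference (the field name and metadata key are illustrative):

```scala
import org.apache.spark.sql.types._

val md = new MetadataBuilder().putString("key", "value").build()

// StructField(name, dataType, nullable = true, metadata = Metadata.empty):
// omitting the fourth argument replaces any attribute metadata with
// Metadata.empty, which is exactly what the pre-fix code did.
val dropped   = StructField("b", StringType, nullable = true)
val preserved = StructField("b", StringType, nullable = true, md)

assert(dropped.metadata == Metadata.empty)
assert(preserved.metadata.getString("key") == "value")
```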
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
index 78b97f6995..ea57f71c50 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
@@ -625,6 +625,21 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
       }
     }
   }
+
+  test("SPARK-15804: write out the metadata to parquet file") {
+    val df = Seq((1, "abc"), (2, "hello")).toDF("a", "b")
+    val md = new MetadataBuilder().putString("key", "value").build()
+    val dfWithmeta = df.select('a, 'b.as("b", md))
+
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+      dfWithmeta.write.parquet(path)
+
+      readParquetFile(path) { df =>
+        assert(df.schema.last.metadata.getString("key") == "value")
+      }
+    }
+  }
 }
 
 object TestingUDT {