about summary refs log tree commit diff
path: root/sql
diff options
context:
space:
mode:
author	Liang-Chi Hsieh <viirya@gmail.com>	2015-02-27 11:06:47 +0800
committer	Cheng Lian <lian@databricks.com>	2015-02-27 11:06:47 +0800
commit4ad5153f5449319a7e82c9013ccff4494ab58ef1 (patch)
tree69f283b5d5a098c8681a3d9b2829549b5e562b05 /sql
parent18f2098433e0bfef9497bacd601fdf098ed03eab (diff)
downloadspark-4ad5153f5449319a7e82c9013ccff4494ab58ef1.tar.gz
spark-4ad5153f5449319a7e82c9013ccff4494ab58ef1.tar.bz2
spark-4ad5153f5449319a7e82c9013ccff4494ab58ef1.zip
[SPARK-6037][SQL] Avoiding duplicate Parquet schema merging
`FilteringParquetRowInputFormat` manually merges Parquet schemas before computing splits. However, this is duplicated work because the schemas are already merged in `ParquetRelation2`. We don't need to re-merge them at `InputFormat`. Author: Liang-Chi Hsieh <viirya@gmail.com> Closes #4786 from viirya/dup_parquet_schemas_merge and squashes the following commits: ef78a5a [Liang-Chi Hsieh] Avoiding duplicate Parquet schema merging.
Diffstat (limited to 'sql')
-rw-r--r--	sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala	| 23
1 file changed, 7 insertions, 16 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
index 9061d3f5fe..4e4f647767 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -434,22 +434,13 @@ private[parquet] class FilteringParquetRowInputFormat
return splits
}
- Option(globalMetaData.getKeyValueMetaData.get(RowReadSupport.SPARK_METADATA_KEY)).foreach {
- schemas =>
- val mergedSchema = schemas
- .map(DataType.fromJson(_).asInstanceOf[StructType])
- .reduce(_ merge _)
- .json
-
- val mergedMetadata = globalMetaData
- .getKeyValueMetaData
- .updated(RowReadSupport.SPARK_METADATA_KEY, setAsJavaSet(Set(mergedSchema)))
-
- globalMetaData = new GlobalMetaData(
- globalMetaData.getSchema,
- mergedMetadata,
- globalMetaData.getCreatedBy)
- }
+ val metadata = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA)
+ val mergedMetadata = globalMetaData
+ .getKeyValueMetaData
+ .updated(RowReadSupport.SPARK_METADATA_KEY, setAsJavaSet(Set(metadata)))
+
+ globalMetaData = new GlobalMetaData(globalMetaData.getSchema,
+ mergedMetadata, globalMetaData.getCreatedBy)
val readContext = getReadSupport(configuration).init(
new InitContext(configuration,