From 339905578790fa37fcad9684b859b443313a5aa2 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 17 May 2015 15:42:21 +0800 Subject: [SPARK-7447] [SQL] Don't re-merge Parquet schema when the relation is deserialized JIRA: https://issues.apache.org/jira/browse/SPARK-7447 `MetadataCache` in `ParquetRelation2` is annotated as `transient`. When `ParquetRelation2` is deserialized, we ask `MetadataCache` to refresh and perform schema merging again. It is time-consuming especially for very many parquet files. With the new `FSBasedParquetRelation`, although `MetadataCache` is not `transient` now, `MetadataCache.refresh()` still performs schema merging again when the relation is deserialized. Author: Liang-Chi Hsieh Closes #6012 from viirya/without_remerge_schema and squashes the following commits: 2663957 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into without_remerge_schema 6ac7d93 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into without_remerge_schema b0fc09b [Liang-Chi Hsieh] Don't generate and merge parquetSchema multiple times. --- .../org/apache/spark/sql/parquet/newParquet.scala | 32 ++++++++++++---------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 946062f6ea..bcbdb1ebd2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -340,7 +340,7 @@ private[sql] class ParquetRelation2( // Schema of the actual Parquet files, without partition columns discovered from partition // directory paths. - var dataSchema: StructType = _ + var dataSchema: StructType = null // Schema of the whole table, including partition columns. var schema: StructType = _ @@ -379,19 +379,23 @@ private[sql] class ParquetRelation2( f -> new Footer(f.getPath, parquetMetadata) }.seq.toMap - dataSchema = { - val dataSchema0 = - maybeDataSchema - .orElse(readSchema()) - .orElse(maybeMetastoreSchema) - .getOrElse(sys.error("Failed to get the schema.")) - - // If this Parquet relation is converted from a Hive Metastore table, must reconcile case - // case insensitivity issue and possible schema mismatch (probably caused by schema - // evolution). - maybeMetastoreSchema - .map(ParquetRelation2.mergeMetastoreParquetSchema(_, dataSchema0)) - .getOrElse(dataSchema0) + // If we already get the schema, don't need to re-compute it since the schema merging is + // time-consuming. + if (dataSchema == null) { + dataSchema = { + val dataSchema0 = + maybeDataSchema + .orElse(readSchema()) + .orElse(maybeMetastoreSchema) + .getOrElse(sys.error("Failed to get the schema.")) + + // If this Parquet relation is converted from a Hive Metastore table, must reconcile case + // case insensitivity issue and possible schema mismatch (probably caused by schema + // evolution). + maybeMetastoreSchema + .map(ParquetRelation2.mergeMetastoreParquetSchema(_, dataSchema0)) + .getOrElse(dataSchema0) + } } } -- cgit v1.2.3