diff options
author | Cheng Lian <lian@databricks.com> | 2016-06-02 16:16:27 -0700 |
---|---|---|
committer | Cheng Lian <lian@databricks.com> | 2016-06-02 16:16:27 -0700 |
commit | 431542765785304edb76a19885fbc5f9b8ae7d64 (patch) | |
tree | ef19e4c29d63eb81484fef8a65ccad5905315758 /sql/hive | |
parent | 72353311d3a37cb523c5bdd8072ffdff99af9749 (diff) | |
download | spark-431542765785304edb76a19885fbc5f9b8ae7d64.tar.gz spark-431542765785304edb76a19885fbc5f9b8ae7d64.tar.bz2 spark-431542765785304edb76a19885fbc5f9b8ae7d64.zip |
[SPARK-15719][SQL] Disables writing Parquet summary files by default
## What changes were proposed in this pull request?
This PR disables writing Parquet summary files by default (i.e., when Hadoop configuration "parquet.enable.summary-metadata" is not set).
Please refer to [SPARK-15719][1] for more details.
## How was this patch tested?
New test case added in `ParquetQuerySuite` to check no summary files are written by default.
[1]: https://issues.apache.org/jira/browse/SPARK-15719
Author: Cheng Lian <lian@databricks.com>
Closes #13455 from liancheng/spark-15719-disable-parquet-summary-files.
Diffstat (limited to 'sql/hive')
-rw-r--r-- | sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala | 29 |
1 file changed, 16 insertions, 13 deletions
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala
index f9a1d16d90..8aa018d0a9 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala
@@ -21,6 +21,7 @@ import java.io.File

 import com.google.common.io.Files
 import org.apache.hadoop.fs.Path
+import org.apache.parquet.hadoop.ParquetOutputFormat

 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql._
@@ -124,23 +125,25 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
   }

   test("SPARK-8604: Parquet data source should write summary file while doing appending") {
-    withTempPath { dir =>
-      val path = dir.getCanonicalPath
-      val df = spark.range(0, 5).toDF()
-      df.write.mode(SaveMode.Overwrite).parquet(path)
+    withSQLConf(ParquetOutputFormat.ENABLE_JOB_SUMMARY -> "true") {
+      withTempPath { dir =>
+        val path = dir.getCanonicalPath
+        val df = spark.range(0, 5).toDF()
+        df.write.mode(SaveMode.Overwrite).parquet(path)

-      val summaryPath = new Path(path, "_metadata")
-      val commonSummaryPath = new Path(path, "_common_metadata")
+        val summaryPath = new Path(path, "_metadata")
+        val commonSummaryPath = new Path(path, "_common_metadata")

-      val fs = summaryPath.getFileSystem(spark.sessionState.newHadoopConf())
-      fs.delete(summaryPath, true)
-      fs.delete(commonSummaryPath, true)
+        val fs = summaryPath.getFileSystem(spark.sessionState.newHadoopConf())
+        fs.delete(summaryPath, true)
+        fs.delete(commonSummaryPath, true)

-      df.write.mode(SaveMode.Append).parquet(path)
-      checkAnswer(spark.read.parquet(path), df.union(df))
+        df.write.mode(SaveMode.Append).parquet(path)
+        checkAnswer(spark.read.parquet(path), df.union(df))

-      assert(fs.exists(summaryPath))
-      assert(fs.exists(commonSummaryPath))
+        assert(fs.exists(summaryPath))
+        assert(fs.exists(commonSummaryPath))
+      }
     }
   }