diff options
author | Cheng Lian <lian@databricks.com> | 2016-06-02 16:16:27 -0700 |
---|---|---|
committer | Cheng Lian <lian@databricks.com> | 2016-06-02 16:16:27 -0700 |
commit | 431542765785304edb76a19885fbc5f9b8ae7d64 (patch) | |
tree | ef19e4c29d63eb81484fef8a65ccad5905315758 /sql/hive | |
parent | 72353311d3a37cb523c5bdd8072ffdff99af9749 (diff) | |
download | spark-431542765785304edb76a19885fbc5f9b8ae7d64.tar.gz spark-431542765785304edb76a19885fbc5f9b8ae7d64.tar.bz2 spark-431542765785304edb76a19885fbc5f9b8ae7d64.zip |
[SPARK-15719][SQL] Disables writing Parquet summary files by default
## What changes were proposed in this pull request?
This PR disables writing Parquet summary files by default (i.e., when Hadoop configuration "parquet.enable.summary-metadata" is not set).
Please refer to [SPARK-15719][1] for more details.
## How was this patch tested?
New test case added in `ParquetQuerySuite` to check no summary files are written by default.
[1]: https://issues.apache.org/jira/browse/SPARK-15719
Author: Cheng Lian <lian@databricks.com>
Closes #13455 from liancheng/spark-15719-disable-parquet-summary-files.
Diffstat (limited to 'sql/hive')
-rw-r--r-- | sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala | 29 |
1 file changed, 16 insertions, 13 deletions
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala
index f9a1d16d90..8aa018d0a9 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala
@@ -21,6 +21,7 @@ import java.io.File

 import com.google.common.io.Files
 import org.apache.hadoop.fs.Path
+import org.apache.parquet.hadoop.ParquetOutputFormat

 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql._
@@ -124,23 +125,25 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
   }

   test("SPARK-8604: Parquet data source should write summary file while doing appending") {
-    withTempPath { dir =>
-      val path = dir.getCanonicalPath
-      val df = spark.range(0, 5).toDF()
-      df.write.mode(SaveMode.Overwrite).parquet(path)
+    withSQLConf(ParquetOutputFormat.ENABLE_JOB_SUMMARY -> "true") {
+      withTempPath { dir =>
+        val path = dir.getCanonicalPath
+        val df = spark.range(0, 5).toDF()
+        df.write.mode(SaveMode.Overwrite).parquet(path)

-      val summaryPath = new Path(path, "_metadata")
-      val commonSummaryPath = new Path(path, "_common_metadata")
+        val summaryPath = new Path(path, "_metadata")
+        val commonSummaryPath = new Path(path, "_common_metadata")

-      val fs = summaryPath.getFileSystem(spark.sessionState.newHadoopConf())
-      fs.delete(summaryPath, true)
-      fs.delete(commonSummaryPath, true)
+        val fs = summaryPath.getFileSystem(spark.sessionState.newHadoopConf())
+        fs.delete(summaryPath, true)
+        fs.delete(commonSummaryPath, true)

-      df.write.mode(SaveMode.Append).parquet(path)
-      checkAnswer(spark.read.parquet(path), df.union(df))
+        df.write.mode(SaveMode.Append).parquet(path)
+        checkAnswer(spark.read.parquet(path), df.union(df))

-      assert(fs.exists(summaryPath))
-      assert(fs.exists(commonSummaryPath))
+        assert(fs.exists(summaryPath))
+        assert(fs.exists(commonSummaryPath))
+      }
     }
   }