author     Yin Huai <yhuai@databricks.com>      2015-02-27 01:01:32 +0800
committer  Cheng Lian <lian@databricks.com>     2015-02-27 01:01:32 +0800
commit     192e42a2933eb283e12bfdfb46e2ef895228af4a
tree       514f7bad2ebbb1b761996cc64b6a537ea19f1b75 /sql/hive
parent     f02394d06473889d0d7897c4583239e6e136ff46
[SPARK-6016][SQL] Cannot read the parquet table after overwriting the existing table when spark.sql.parquet.cacheMetadata=true
Please see JIRA (https://issues.apache.org/jira/browse/SPARK-6016) for details of the bug.
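In short: with spark.sql.parquet.cacheMetadata=true (the default), overwriting a Parquet table in place with a different number of files left stale footers in the metadata cache, and the next read merged stale and fresh footers. A minimal reproduction sketch against the Spark 1.3-era API, simplified from the regression test below; the HiveContext value `hiveContext` and the table name are assumptions:

    // Assumes an existing HiveContext `hiveContext` with the default
    // spark.sql.parquet.cacheMetadata=true.
    import org.apache.spark.sql.SaveMode
    import hiveContext._

    // First write: two partitions, so the table directory holds two Parquet
    // files; their footers end up in the metadata cache on the first read.
    val df1 = jsonRDD(sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i}"""), 2))
    df1.saveAsTable("spark_6016_repro", "parquet", SaveMode.Overwrite)
    sql("select * from spark_6016_repro").collect()

    // Overwrite with four partitions, so the directory now holds four files.
    // Before this patch, two stale cached footers were merged with the two
    // freshly read ones, and this read failed.
    val df2 = jsonRDD(sparkContext.parallelize((1 to 10).map(i => s"""{"b":$i}"""), 4))
    df2.saveAsTable("spark_6016_repro", "parquet", SaveMode.Overwrite)
    sql("select * from spark_6016_repro").collect()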
Author: Yin Huai <yhuai@databricks.com>
Closes #4775 from yhuai/parquetFooterCache and squashes the following commits:
78787b1 [Yin Huai] Remove footerCache in FilteringParquetRowInputFormat.
dff6fba [Yin Huai] Failed unit test.
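For context on 78787b1: the fix removes the footer cache outright instead of trying to invalidate it. The sketch below is a simplification of the pre-fix failure mode, not the actual Spark source; the names and signatures are illustrative only:

    // Illustrative sketch only; FilteringParquetRowInputFormat's real cache
    // differed in detail, but the failure mode was the same.
    import scala.collection.mutable
    import org.apache.hadoop.fs.FileStatus
    import parquet.hadoop.Footer

    object FooterCacheSketch {
      // Hadoop's FileStatus equality is path-based, so a new file written at a
      // reused path matches the old entry and returns a stale footer.
      private val footerCache = mutable.HashMap.empty[FileStatus, Footer]

      def footersFor(statuses: Seq[FileStatus])(readFooter: FileStatus => Footer): Seq[Footer] =
        statuses.map(s => footerCache.getOrElseUpdate(s, readFooter(s)))
    }

After an overwrite that changes the file count, such a cache returns a mix of stale and fresh footers, and merging their metadata fails; dropping the cache and reading footers from the file system on every query avoids the inconsistency at the cost of a little extra I/O.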
Diffstat (limited to 'sql/hive')
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala  27
1 file changed, 27 insertions(+), 0 deletions(-)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala
index 80fd5cda20..6a9d9daf67 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala
@@ -29,6 +29,7 @@ import org.apache.spark.sql.hive.execution.{InsertIntoHiveTable, HiveTableScan}
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.sources.{InsertIntoDataSource, LogicalRelation}
+import org.apache.spark.sql.SaveMode
 
 // The data where the partitioning key exists only in the directory structure.
 case class ParquetData(intField: Int, stringField: String)
@@ -409,6 +410,32 @@ class ParquetSourceSuiteBase extends ParquetPartitioningTest {
       )
     """)
   }
+
+  test("SPARK-6016 make sure to use the latest footers") {
+    sql("drop table if exists spark_6016_fix")
+
+    // Create a DataFrame with two partitions; the resulting table will contain two parquet files.
+    val df1 = jsonRDD(sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i}"""), 2))
+    df1.saveAsTable("spark_6016_fix", "parquet", SaveMode.Overwrite)
+    checkAnswer(
+      sql("select * from spark_6016_fix"),
+      (1 to 10).map(i => Row(i))
+    )
+
+    // Create a DataFrame with four partitions; the resulting table will contain four parquet files.
+    val df2 = jsonRDD(sparkContext.parallelize((1 to 10).map(i => s"""{"b":$i}"""), 4))
+    df2.saveAsTable("spark_6016_fix", "parquet", SaveMode.Overwrite)
+    // Before the fix of SPARK-6016, the two now-outdated footers of df1's files were still
+    // cached. Since the new table has four parquet files, fresh footers were read for only
+    // two of them, and merging the metadata of all four footers (two outdated and two fresh)
+    // caused an error.
+    checkAnswer(
+      sql("select * from spark_6016_fix"),
+      (1 to 10).map(i => Row(i))
+    )
+
+    sql("drop table spark_6016_fix")
+  }
 }
 
 class ParquetDataSourceOnSourceSuite extends ParquetSourceSuiteBase {