[SPARK-15454][SQL] Filter out files starting with _

## What changes were proposed in this pull request? Many other systems (e.g. Impala) use _xxx names for staging files, and Spark should not be reading those files. ## How was this patch tested? Added a unit test case. Author: Reynold Xin <rxin@databricks.com> Closes #13227 from rxin/SPARK-15454.
author: Reynold Xin <rxin@databricks.com> 2016-05-20 14:49:54 -0700
committer: Reynold Xin <rxin@databricks.com> 2016-05-20 14:49:54 -0700
commit: dcac8e6f49918a809fb3f2b8bf666582c479a6eb (patch)
tree: df15d1745f63c87bb6130955e0cf66a4fb91368f /sql
parent: 0e70fd61b4bc92bd744fc44dd3cbe91443207c72 (diff)
download: spark-dcac8e6f49918a809fb3f2b8bf666582c479a6eb.tar.gz
spark-dcac8e6f49918a809fb3f2b8bf666582c479a6eb.tar.bz2
spark-dcac8e6f49918a809fb3f2b8bf666582c479a6eb.zip
2 files changed, 16 insertions, 5 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
index 88125a2b4d..e0e569bca4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
@@ -341,11 +341,11 @@ private[sql] object HadoopFsRelation extends Logging {
 
   /** Checks if we should filter out this path name. */
   def shouldFilterOut(pathName: String): Boolean = {
-    // TODO: We should try to filter out all files/dirs starting with "." or "_".
-    // The only reason that we are not doing it now is that Parquet needs to find those
-    // metadata files from leaf files returned by this methods. We should refactor
-    // this logic to not mix metadata files with data files.
-    pathName == "_SUCCESS" || pathName == "_temporary" || pathName.startsWith(".")
+    // We filter everything that starts with _ and ., except _common_metadata and _metadata
+    // because Parquet needs to find those metadata files from leaf files returned by this method.
+    // We should refactor this logic to not mix metadata files with data files.
+    (pathName.startsWith("_") || pathName.startsWith(".")) &&
+      !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
   }
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala
index 89d57653ad..3c68dc8bb9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala
@@ -39,4 +39,15 @@ class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {
       assert(df.queryExecution.logical.statistics.sizeInBytes === BigInt(totalSize))
     }
   }
+
+  test("file filtering") {
+    assert(!HadoopFsRelation.shouldFilterOut("abcd"))
+    assert(HadoopFsRelation.shouldFilterOut(".ab"))
+    assert(HadoopFsRelation.shouldFilterOut("_cd"))
+
+    assert(!HadoopFsRelation.shouldFilterOut("_metadata"))
+    assert(!HadoopFsRelation.shouldFilterOut("_common_metadata"))
+    assert(HadoopFsRelation.shouldFilterOut("_ab_metadata"))
+    assert(HadoopFsRelation.shouldFilterOut("_cd_common_metadata"))
+  }
 }
author	Reynold Xin <rxin@databricks.com>	2016-05-20 14:49:54 -0700
committer	Reynold Xin <rxin@databricks.com>	2016-05-20 14:49:54 -0700
commit	dcac8e6f49918a809fb3f2b8bf666582c479a6eb (patch)
tree	df15d1745f63c87bb6130955e0cf66a4fb91368f /sql
parent	0e70fd61b4bc92bd744fc44dd3cbe91443207c72 (diff)
download	spark-dcac8e6f49918a809fb3f2b8bf666582c479a6eb.tar.gz spark-dcac8e6f49918a809fb3f2b8bf666582c479a6eb.tar.bz2 spark-dcac8e6f49918a809fb3f2b8bf666582c479a6eb.zip