aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
author Reynold Xin <rxin@databricks.com> 2016-05-20 14:49:54 -0700
committer Reynold Xin <rxin@databricks.com> 2016-05-20 14:49:54 -0700
commit dcac8e6f49918a809fb3f2b8bf666582c479a6eb (patch)
tree df15d1745f63c87bb6130955e0cf66a4fb91368f /sql
parent 0e70fd61b4bc92bd744fc44dd3cbe91443207c72 (diff)
download spark-dcac8e6f49918a809fb3f2b8bf666582c479a6eb.tar.gz
spark-dcac8e6f49918a809fb3f2b8bf666582c479a6eb.tar.bz2
spark-dcac8e6f49918a809fb3f2b8bf666582c479a6eb.zip
[SPARK-15454][SQL] Filter out files starting with _
## What changes were proposed in this pull request? Many other systems (e.g. Impala) use _xxx names for staging files, and Spark should not read those files. ## How was this patch tested? Added a unit test case. Author: Reynold Xin <rxin@databricks.com> Closes #13227 from rxin/SPARK-15454.
Diffstat (limited to 'sql')
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala 10
-rw-r--r-- sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala 11
2 files changed, 16 insertions, 5 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
index 88125a2b4d..e0e569bca4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala
@@ -341,11 +341,11 @@ private[sql] object HadoopFsRelation extends Logging {
/** Checks if we should filter out this path name. */
def shouldFilterOut(pathName: String): Boolean = {
- // TODO: We should try to filter out all files/dirs starting with "." or "_".
- // The only reason that we are not doing it now is that Parquet needs to find those
- // metadata files from leaf files returned by this methods. We should refactor
- // this logic to not mix metadata files with data files.
- pathName == "_SUCCESS" || pathName == "_temporary" || pathName.startsWith(".")
+ // We filter everything that starts with _ and ., except _common_metadata and _metadata
+ // because Parquet needs to find those metadata files from leaf files returned by this method.
+ // We should refactor this logic to not mix metadata files with data files.
+ (pathName.startsWith("_") || pathName.startsWith(".")) &&
+ !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
}
/**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala
index 89d57653ad..3c68dc8bb9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala
@@ -39,4 +39,15 @@ class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {
assert(df.queryExecution.logical.statistics.sizeInBytes === BigInt(totalSize))
}
}
+
+ test("file filtering") {
+ assert(!HadoopFsRelation.shouldFilterOut("abcd"))
+ assert(HadoopFsRelation.shouldFilterOut(".ab"))
+ assert(HadoopFsRelation.shouldFilterOut("_cd"))
+
+ assert(!HadoopFsRelation.shouldFilterOut("_metadata"))
+ assert(!HadoopFsRelation.shouldFilterOut("_common_metadata"))
+ assert(HadoopFsRelation.shouldFilterOut("_ab_metadata"))
+ assert(HadoopFsRelation.shouldFilterOut("_cd_common_metadata"))
+ }
}