[SPARK-17613] S3A base paths with no '/' at the end return empty DataFrames

## What changes were proposed in this pull request? Consider you have a bucket as `s3a://some-bucket` and under it you have files: ``` s3a://some-bucket/file1.parquet s3a://some-bucket/file2.parquet ``` Getting the parent path of `s3a://some-bucket/file1.parquet` yields `s3a://some-bucket/` and the ListingFileCatalog uses this as the key in the hash map. When catalog.allFiles is called, we use `s3a://some-bucket` (no slash at the end) to get the list of files, and we're left with an empty list! This PR fixes this by adding a `/` at the end of the `URI` iff the given `Path` doesn't have a parent, i.e. is the root. This is a no-op if the path already had a `/` at the end, and is handled through the Hadoop Path, path merging semantics. ## How was this patch tested? Unit test in `FileCatalogSuite`. Author: Burak Yavuz <brkyvz@gmail.com> Closes #15169 from brkyvz/SPARK-17613.
author: Burak Yavuz <brkyvz@gmail.com> 2016-09-22 13:05:41 -0700
committer: Josh Rosen <joshrosen@databricks.com> 2016-09-22 13:05:41 -0700
commit: 85d609cf25c1da2df3cd4f5d5aeaf3cbcf0d674c (patch)
tree: daa705aed500801fbf0d3487a67ef951bbbdf23d /sql/core/src/test
parent: 9f24a17c59b1130d97efa7d313c06577f7344338 (diff)
download: spark-85d609cf25c1da2df3cd4f5d5aeaf3cbcf0d674c.tar.gz
spark-85d609cf25c1da2df3cd4f5d5aeaf3cbcf0d674c.tar.bz2
spark-85d609cf25c1da2df3cd4f5d5aeaf3cbcf0d674c.zip
1 files changed, 44 insertions, 1 deletions
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala
index 5c8d3226e9..fa3abd0098 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileCatalogSuite.scala
@@ -18,10 +18,12 @@
 package org.apache.spark.sql.execution.datasources
 
 import java.io.File
+import java.net.URI
 
+import scala.collection.mutable
 import scala.language.reflectiveCalls
 
-import org.apache.hadoop.fs.Path
+import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem}
 
 import org.apache.spark.sql.catalyst.util._
 import org.apache.spark.sql.test.SharedSQLContext
@@ -78,4 +80,45 @@ class FileCatalogSuite extends SharedSQLContext {
       assert(catalog1.listLeafFiles(catalog1.paths).isEmpty)
     }
   }
+
+  test("SPARK-17613 - PartitioningAwareFileCatalog: base path w/o '/' at end") {
+    class MockCatalog(
+      override val paths: Seq[Path]) extends PartitioningAwareFileCatalog(spark, Map.empty, None) {
+
+      override def refresh(): Unit = {}
+
+      override def leafFiles: mutable.LinkedHashMap[Path, FileStatus] = mutable.LinkedHashMap(
+        new Path("mockFs://some-bucket/file1.json") -> new FileStatus()
+      )
+
+      override def leafDirToChildrenFiles: Map[Path, Array[FileStatus]] = Map(
+        new Path("mockFs://some-bucket/") -> Array(new FileStatus())
+      )
+
+      override def partitionSpec(): PartitionSpec = {
+        PartitionSpec.emptySpec
+      }
+    }
+
+    withSQLConf(
+        "fs.mockFs.impl" -> classOf[FakeParentPathFileSystem].getName,
+        "fs.mockFs.impl.disable.cache" -> "true") {
+      val pathWithSlash = new Path("mockFs://some-bucket/")
+      assert(pathWithSlash.getParent === null)
+      val pathWithoutSlash = new Path("mockFs://some-bucket")
+      assert(pathWithoutSlash.getParent === null)
+      val catalog1 = new MockCatalog(Seq(pathWithSlash))
+      val catalog2 = new MockCatalog(Seq(pathWithoutSlash))
+      assert(catalog1.allFiles().nonEmpty)
+      assert(catalog2.allFiles().nonEmpty)
+    }
+  }
+}
+
+class FakeParentPathFileSystem extends RawLocalFileSystem {
+  override def getScheme: String = "mockFs"
+
+  override def getUri: URI = {
+    URI.create("mockFs://some-bucket")
+  }
 }
author	Burak Yavuz <brkyvz@gmail.com>	2016-09-22 13:05:41 -0700
committer	Josh Rosen <joshrosen@databricks.com>	2016-09-22 13:05:41 -0700
commit	85d609cf25c1da2df3cd4f5d5aeaf3cbcf0d674c (patch)
tree	daa705aed500801fbf0d3487a67ef951bbbdf23d /sql/core/src/test
parent	9f24a17c59b1130d97efa7d313c06577f7344338 (diff)
download	spark-85d609cf25c1da2df3cd4f5d5aeaf3cbcf0d674c.tar.gz spark-85d609cf25c1da2df3cd4f5d5aeaf3cbcf0d674c.tar.bz2 spark-85d609cf25c1da2df3cd4f5d5aeaf3cbcf0d674c.zip