author     Dongjoon Hyun <dongjoon@apache.org>    2016-10-20 09:53:12 +0100
committer  Sean Owen <sowen@cloudera.com>         2016-10-20 09:53:12 +0100
commit     986a3b8b5bedb1d64e2cf7c95bfdf5505f3e8c69 (patch)
tree       8e62fcff9c68e0833ec066c80ae00631a3bed3db /sql/hive
parent     c2c107abad8b462218d33c70b946e840663228a1 (diff)
download   spark-986a3b8b5bedb1d64e2cf7c95bfdf5505f3e8c69.tar.gz
           spark-986a3b8b5bedb1d64e2cf7c95bfdf5505f3e8c69.tar.bz2
           spark-986a3b8b5bedb1d64e2cf7c95bfdf5505f3e8c69.zip
[SPARK-17796][SQL] Support wildcard character in filename for LOAD DATA LOCAL INPATH
## What changes were proposed in this pull request?

Currently, Spark 2.0 raises an `input path does not exist` AnalysisException if the file name contains '*'. This is misleading, since the error occurs even when matching files exist. Wildcards were also a supported feature in Spark 1.6.2. This PR adds support for wildcard characters in the filename for the `LOAD DATA LOCAL INPATH` SQL command, as in Spark 1.6.2.

**Reported Error Scenario**

```scala
scala> sql("CREATE TABLE t(a string)")
res0: org.apache.spark.sql.DataFrame = []

scala> sql("LOAD DATA LOCAL INPATH '/tmp/x*' INTO TABLE t")
org.apache.spark.sql.AnalysisException: LOAD DATA input path does not exist: /tmp/x*;
```

## How was this patch tested?

Passes the Jenkins tests with a new test case.

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #15376 from dongjoon-hyun/SPARK-17796.
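The diffstat below is limited to `sql/hive`, so only the new test appears; the command-side change itself lives elsewhere in the tree. As a rough illustration of the behaviour the test asserts, here is a minimal Scala sketch of filename-only wildcard expansion against the local filesystem. The helper name `expandLocalWildcard` and the use of `IllegalArgumentException` are illustrative assumptions, not the actual Spark code, which raises `AnalysisException` with the messages checked in the test.

```scala
import java.io.File

// Hypothetical sketch (not the actual Spark implementation): expand a local
// LOAD DATA path whose *filename* component may contain '*' wildcards.
// A wildcard in a directory segment (e.g. '/tmp/dir*/part*') is rejected,
// matching the second error message asserted in the new test.
def expandLocalWildcard(inpath: String): Seq[File] = {
  val file = new File(inpath)
  val parent = file.getParentFile
  if (parent == null || parent.getPath.contains("*")) {
    throw new IllegalArgumentException(
      "LOAD DATA input path allows only filename wildcard")
  }
  // Translate the glob in the filename into a regex: '*' becomes '.*',
  // everything else is treated literally via \Q...\E quoting.
  val pattern = ("\\Q" + file.getName + "\\E").replace("*", "\\E.*\\Q").r
  val matched = Option(parent.listFiles())   // null if the directory is missing
    .getOrElse(Array.empty[File])
    .filter(f => pattern.pattern.matcher(f.getName).matches())
  if (matched.isEmpty) {
    throw new IllegalArgumentException(
      s"LOAD DATA input path does not exist: $inpath")
  }
  matched.toSeq
}
```

For the directory layout the test creates, `expandLocalWildcard(s"$dir/*part-r*")` would return the three `part-r-0000N` files, while a wildcard in the directory portion or a non-existent folder would fail with the corresponding message.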
Diffstat (limited to 'sql/hive')
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala | 30
1 file changed, 30 insertions(+), 0 deletions(-)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index e26b6b57ef..495b4f874a 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -17,11 +17,14 @@
package org.apache.spark.sql.hive.execution
+import java.io.{File, PrintWriter}
+import java.nio.charset.StandardCharsets
import java.sql.{Date, Timestamp}
import scala.sys.process.{Process, ProcessLogger}
import scala.util.Try
+import com.google.common.io.Files
import org.apache.hadoop.fs.Path
import org.apache.spark.sql._
@@ -1917,6 +1920,33 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
}
}
+ test("SPARK-17796 Support wildcard character in filename for LOAD DATA LOCAL INPATH") {
+ withTempDir { dir =>
+ for (i <- 1 to 3) {
+ Files.write(s"$i", new File(s"$dir/part-r-0000$i"), StandardCharsets.UTF_8)
+ }
+ for (i <- 5 to 7) {
+ Files.write(s"$i", new File(s"$dir/part-s-0000$i"), StandardCharsets.UTF_8)
+ }
+
+ withTable("load_t") {
+ sql("CREATE TABLE load_t (a STRING)")
+ sql(s"LOAD DATA LOCAL INPATH '$dir/*part-r*' INTO TABLE load_t")
+ checkAnswer(sql("SELECT * FROM load_t"), Seq(Row("1"), Row("2"), Row("3")))
+
+ val m = intercept[AnalysisException] {
+ sql("LOAD DATA LOCAL INPATH '/non-exist-folder/*part*' INTO TABLE load_t")
+ }.getMessage
+ assert(m.contains("LOAD DATA input path does not exist"))
+
+ val m2 = intercept[AnalysisException] {
+ sql(s"LOAD DATA LOCAL INPATH '$dir*/*part*' INTO TABLE load_t")
+ }.getMessage
+ assert(m2.contains("LOAD DATA input path allows only filename wildcard"))
+ }
+ }
+ }
+
def testCommandAvailable(command: String): Boolean = {
val attempt = Try(Process(command).run(ProcessLogger(_ => ())).exitValue())
attempt.isSuccess && attempt.get == 0