author     Jeff Zhang <zjffdu@apache.org>    2017-03-09 11:44:34 -0800
committer  Holden Karau <holden@us.ibm.com>  2017-03-09 11:44:34 -0800
commit     cabe1df8606e7e5b9e6efb106045deb3f39f5f13 (patch)
tree       f46f3bd4a2d85abe2b1b12632dfd7b27f0da226e /sql
parent     30b18e69361746b4d656474374d8b486bb48a19e (diff)
[SPARK-12334][SQL][PYSPARK] Support read from multiple input paths for orc file in DataFrameReader.orc
Besides the issue in the Spark API, this also fixes two minor issues in PySpark:
- support reading from multiple input paths for orc
- support reading from multiple input paths for text

Author: Jeff Zhang <zjffdu@apache.org>

Closes #10307 from zjffdu/SPARK-12334.
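In practice, the change means a single reader call can take several input directories at once. A minimal sketch of the Scala usage, assuming a local SparkSession and placeholder directories (/data/orc/day1 and the other paths are illustrative, not part of this patch):

import org.apache.spark.sql.SparkSession

object MultiPathReadExample {
  def main(args: Array[String]): Unit = {
    // Local session for illustration only; appName and master are placeholders.
    // ORC lived in the Hive module when this patch landed, hence Hive support.
    val spark = SparkSession.builder()
      .appName("multi-path-read")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // Both readers take varargs paths; the directories are placeholders.
    val orcDf  = spark.read.orc("/data/orc/day1", "/data/orc/day2")
    val textDf = spark.read.text("/data/logs/day1", "/data/logs/day2")
    println(s"orc rows: ${orcDf.count()}, text rows: ${textDf.count()}")

    spark.stop()
  }
}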
Diffstat (limited to 'sql')
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala        | 6
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala | 9
2 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index a5e38e25b1..4f4cc93117 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -262,7 +262,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads a JSON file and returns the results as a `DataFrame`.
+ * Loads JSON files and returns the results as a `DataFrame`.
*
* <a href="http://jsonlines.org/">JSON Lines</a> (newline-delimited JSON) is supported by
* default. For JSON (one record per file), set the `wholeFile` option to true.
@@ -438,7 +438,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads a CSV file and returns the result as a `DataFrame`.
+ * Loads CSV files and returns the result as a `DataFrame`.
*
* This function will go through the input once to determine the input schema if `inferSchema`
* is enabled. To avoid going through the entire data once, disable `inferSchema` option or
@@ -549,7 +549,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads an ORC file and returns the result as a `DataFrame`.
+ * Loads ORC files and returns the result as a `DataFrame`.
*
* @param paths input paths
* @since 2.0.0
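For reference, the varargs `paths` parameter documented above makes the dedicated ORC reader interchangeable with the generic source API, since `load` also accepts multiple paths. A minimal sketch, assuming an active SparkSession named `spark` and placeholder directories:

import org.apache.spark.sql.{DataFrame, SparkSession}

// Hypothetical helper; /tmp/orc/a and /tmp/orc/b are placeholder paths.
def readTwoWays(spark: SparkSession): (DataFrame, DataFrame) = {
  // Dedicated ORC reader with varargs paths, as documented above.
  val viaOrc  = spark.read.orc("/tmp/orc/a", "/tmp/orc/b")
  // Generic source API; load also takes varargs paths.
  val viaLoad = spark.read.format("orc").load("/tmp/orc/a", "/tmp/orc/b")
  (viaOrc, viaLoad)
}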
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index 38a5477796..5d8ba9d7c8 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -33,6 +33,7 @@ import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.hive.test.TestHive.implicits._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{IntegerType, StructType}
+import org.apache.spark.util.Utils
case class AllDataTypesWithNonPrimitiveType(
stringField: String,
@@ -611,4 +612,12 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
}
}
}
+
+ test("read from multiple orc input paths") {
+ val path1 = Utils.createTempDir()
+ val path2 = Utils.createTempDir()
+ makeOrcFile((1 to 10).map(Tuple1.apply), path1)
+ makeOrcFile((1 to 10).map(Tuple1.apply), path2)
+ assertResult(20)(read.orc(path1.getCanonicalPath, path2.getCanonicalPath).count())
+ }
}
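The test writes the same ten rows into two temporary directories and asserts that reading both paths in one call yields twenty rows. The same check can be reproduced outside the suite; a standalone sketch, replacing the suite's makeOrcFile helper and the private Utils.createTempDir with plain DataFrame writes and java.nio.file.Files:

import java.nio.file.Files

import org.apache.spark.sql.SparkSession

object MultiPathOrcCheck {
  def main(args: Array[String]): Unit = {
    // Hive support is needed for the ORC source in Spark versions of this era.
    val spark = SparkSession.builder()
      .appName("orc-multi-path-check")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    import spark.implicits._

    val dir1 = Files.createTempDirectory("orc1").toFile
    val dir2 = Files.createTempDirectory("orc2").toFile

    // Write ten single-column rows into each directory as ORC.
    (1 to 10).toDF("c").write.mode("overwrite").orc(dir1.getCanonicalPath)
    (1 to 10).toDF("c").write.mode("overwrite").orc(dir2.getCanonicalPath)

    // Reading both paths in one call should see all twenty rows.
    val total = spark.read.orc(dir1.getCanonicalPath, dir2.getCanonicalPath).count()
    assert(total == 20, s"expected 20 rows, got $total")

    spark.stop()
  }
}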