aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSean Owen <sowen@cloudera.com>2015-06-29 17:21:35 -0700
committerAndrew Or <andrew@databricks.com>2015-06-29 17:21:47 -0700
commitb2684557fa0d2ec14b7529324443c8154d81c348 (patch)
treef52e443b76aacdb2e2a2b5179d31936e46e1cc03
parentcdfa388dd0c9e10be24184be30e4d0a73207fe62 (diff)
downloadspark-b2684557fa0d2ec14b7529324443c8154d81c348.tar.gz
spark-b2684557fa0d2ec14b7529324443c8154d81c348.tar.bz2
spark-b2684557fa0d2ec14b7529324443c8154d81c348.zip
[SPARK-8437] [DOCS] Using directory path without wildcard for filename slow for large number of files with wholeTextFiles and binaryFiles
Note that 'dir/*' can be more efficient in some Hadoop FS implementations than 'dir/' Author: Sean Owen <sowen@cloudera.com> Closes #7036 from srowen/SPARK-8437 and squashes the following commits: 0e813ae [Sean Owen] Note that 'dir/*' can be more efficient in some Hadoop FS implementations than 'dir/' (cherry picked from commit 5d30eae56051c563a8427f330b09ef66db0a0d21) Signed-off-by: Andrew Or <andrew@databricks.com>
-rw-r--r--core/src/main/scala/org/apache/spark/SparkContext.scala8
1 files changed, 6 insertions, 2 deletions
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index b4c0d4c2f5..f8af710de7 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -824,6 +824,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
* }}}
*
* @note Small files are preferred, large file is also allowable, but may cause bad performance.
+ * @note On some filesystems, `.../path/*` can be a more efficient way to read all files in a
+ * directory than `.../path/` or `.../path`.
*
* @param minPartitions A suggestion value of the minimal splitting number for input data.
*/
@@ -871,9 +873,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
* (a-hdfs-path/part-nnnnn, its content)
* }}}
*
- * @param minPartitions A suggestion value of the minimal splitting number for input data.
- *
* @note Small files are preferred; very large files may cause bad performance.
+ * @note On some filesystems, `.../path/*` can be a more efficient way to read all files in a
+ * directory than `.../path/` or `.../path`.
+ *
+ * @param minPartitions A suggestion value of the minimal splitting number for input data.
*/
@Experimental
def binaryFiles(