diff options
author | Sean Owen <sowen@cloudera.com> | 2015-06-29 17:21:35 -0700 |
---|---|---|
committer | Andrew Or <andrew@databricks.com> | 2015-06-29 17:21:35 -0700 |
commit | 5d30eae56051c563a8427f330b09ef66db0a0d21 (patch) | |
tree | 6b2661afd698cb7d0092e499466c08ddc19e6e40 | |
parent | fbf75738feddebb352d5cedf503b573105d4b7a7 (diff) | |
download | spark-5d30eae56051c563a8427f330b09ef66db0a0d21.tar.gz spark-5d30eae56051c563a8427f330b09ef66db0a0d21.tar.bz2 spark-5d30eae56051c563a8427f330b09ef66db0a0d21.zip |
[SPARK-8437] [DOCS] Using directory path without wildcard for filename slow for large number of files with wholeTextFiles and binaryFiles
Note that 'dir/*' can be more efficient in some Hadoop FS implementations than 'dir/'
Author: Sean Owen <sowen@cloudera.com>
Closes #7036 from srowen/SPARK-8437 and squashes the following commits:
0e813ae [Sean Owen] Note that 'dir/*' can be more efficient in some Hadoop FS implementations than 'dir/'
-rw-r--r-- | core/src/main/scala/org/apache/spark/SparkContext.scala | 8 |
1 files changed, 6 insertions, 2 deletions
```diff
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index b3c3bf3746..cb7e24c374 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -831,6 +831,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * }}}
    *
    * @note Small files are preferred, large file is also allowable, but may cause bad performance.
+   * @note On some filesystems, `.../path/*` can be a more efficient way to read all files in a directory
+   *       rather than `.../path/` or `.../path`
    *
    * @param minPartitions A suggestion value of the minimal splitting number for input data.
    */
@@ -878,9 +880,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    *   (a-hdfs-path/part-nnnnn, its content)
    * }}}
    *
-   * @param minPartitions A suggestion value of the minimal splitting number for input data.
-   *
    * @note Small files are preferred; very large files may cause bad performance.
+   * @note On some filesystems, `.../path/*` can be a more efficient way to read all files in a directory
+   *       rather than `.../path/` or `.../path`
+   *
+   * @param minPartitions A suggestion value of the minimal splitting number for input data.
    */
   @Experimental
   def binaryFiles(
```