diff options
-rw-r--r-- | core/src/main/scala/org/apache/spark/SparkContext.scala | 4 |
-rw-r--r-- | docs/programming-guide.md | 2 |
2 files changed, 5 insertions, 1 deletions
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 02c009cdb5..bd3f454485 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -851,6 +851,8 @@ class SparkContext(config: SparkConf) extends Logging { * @note Small files are preferred, large file is also allowable, but may cause bad performance. * @note On some filesystems, `.../path/*` can be a more efficient way to read all files * in a directory rather than `.../path/` or `.../path` + * @note Partitioning is determined by data locality. This may result in too few partitions + * by default. * * @param path Directory to the input data files, the path can be comma separated paths as the * list of inputs. @@ -900,6 +902,8 @@ class SparkContext(config: SparkConf) extends Logging { * @note Small files are preferred; very large files may cause bad performance. * @note On some filesystems, `.../path/*` can be a more efficient way to read all files * in a directory rather than `.../path/` or `.../path` + * @note Partitioning is determined by data locality. This may result in too few partitions + * by default. * * @param path Directory to the input data files, the path can be comma separated paths as the * list of inputs. diff --git a/docs/programming-guide.md b/docs/programming-guide.md index 353730c28f..a4017b5b97 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -347,7 +347,7 @@ Some notes on reading files with Spark: Apart from text files, Spark's Scala API also supports several other data formats: -* `SparkContext.wholeTextFiles` lets you read a directory containing multiple small text files, and returns each of them as (filename, content) pairs. This is in contrast with `textFile`, which would return one record per line in each file. 
+* `SparkContext.wholeTextFiles` lets you read a directory containing multiple small text files, and returns each of them as (filename, content) pairs. This is in contrast with `textFile`, which would return one record per line in each file. Partitioning is determined by data locality which, in some cases, may result in too few partitions. For those cases, `wholeTextFiles` provides an optional second argument for controlling the minimal number of partitions. * For [SequenceFiles](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/SequenceFileInputFormat.html), use SparkContext's `sequenceFile[K, V]` method where `K` and `V` are the types of key and values in the file. These should be subclasses of Hadoop's [Writable](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/io/Writable.html) interface, like [IntWritable](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/io/IntWritable.html) and [Text](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/io/Text.html). In addition, Spark allows you to specify native types for a few common Writables; for example, `sequenceFile[Int, String]` will automatically read IntWritables and Texts.