about summary refs log tree commit diff
path: root/python/pyspark/context.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/pyspark/context.py')
-rw-r--r-- python/pyspark/context.py | 16
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 84bc0a3b7c..3ab98e262d 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -331,12 +331,16 @@ class SparkContext(object):
return RDD(self._jsc.objectFile(name, minPartitions), self,
BatchedSerializer(PickleSerializer()))
- def textFile(self, name, minPartitions=None):
+ def textFile(self, name, minPartitions=None, use_unicode=True):
"""
Read a text file from HDFS, a local file system (available on all
nodes), or any Hadoop-supported file system URI, and return it as an
RDD of Strings.
+ If use_unicode is False, the strings will be kept as `str` (encoding
+ as `utf-8`), which is faster and smaller than unicode. (Added in
+ Spark 1.2)
+
>>> path = os.path.join(tempdir, "sample-text.txt")
>>> with open(path, "w") as testFile:
... testFile.write("Hello world!")
@@ -346,9 +350,9 @@ class SparkContext(object):
"""
minPartitions = minPartitions or min(self.defaultParallelism, 2)
return RDD(self._jsc.textFile(name, minPartitions), self,
- UTF8Deserializer())
+ UTF8Deserializer(use_unicode))
- def wholeTextFiles(self, path, minPartitions=None):
+ def wholeTextFiles(self, path, minPartitions=None, use_unicode=True):
"""
Read a directory of text files from HDFS, a local file system
(available on all nodes), or any Hadoop-supported file system
@@ -356,6 +360,10 @@ class SparkContext(object):
key-value pair, where the key is the path of each file, the
value is the content of each file.
+ If use_unicode is False, the strings will be kept as `str` (encoding
+ as `utf-8`), which is faster and smaller than unicode. (Added in
+ Spark 1.2)
+
For example, if you have the following files::
hdfs://a-hdfs-path/part-00000
@@ -386,7 +394,7 @@ class SparkContext(object):
"""
minPartitions = minPartitions or self.defaultMinPartitions
return RDD(self._jsc.wholeTextFiles(path, minPartitions), self,
- PairDeserializer(UTF8Deserializer(), UTF8Deserializer()))
+ PairDeserializer(UTF8Deserializer(use_unicode), UTF8Deserializer(use_unicode)))
def _dictToJavaMap(self, d):
jm = self._jvm.java.util.HashMap()