aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/rdd.py
diff options
context:
space:
mode:
author Davies Liu <davies.liu@gmail.com> 2014-08-18 13:58:35 -0700
committer Josh Rosen <joshrosen@apache.org> 2014-08-18 13:58:35 -0700
commit d1d0ee41c27f1d07fed0c5d56ba26c723cc3dc26 (patch)
tree 3ac09d9c32b47167ffe6e56e91eb05995c654c1e /python/pyspark/rdd.py
parent 3a5962f0f5acea5cbfd3cf1e3ed16e03b3bec37a (diff)
download spark-d1d0ee41c27f1d07fed0c5d56ba26c723cc3dc26.tar.gz
spark-d1d0ee41c27f1d07fed0c5d56ba26c723cc3dc26.tar.bz2
spark-d1d0ee41c27f1d07fed0c5d56ba26c723cc3dc26.zip
[SPARK-3103] [PySpark] fix saveAsTextFile() with utf-8
Bugfix: saveAsTextFile() raised an exception when it tried to encode non-ASCII byte strings (str) as UTF-8. It should encode only unicode objects as "utf-8". Author: Davies Liu <davies.liu@gmail.com> Closes #2018 from davies/fix_utf8 and squashes the following commits: 4db7967 [Davies Liu] fix saveAsTextFile() with utf-8
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r-- python/pyspark/rdd.py 4
1 files changed, 3 insertions, 1 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 240381e5ba..c708b69cc1 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1191,7 +1191,9 @@ class RDD(object):
for x in iterator:
if not isinstance(x, basestring):
x = unicode(x)
- yield x.encode("utf-8")
+ if isinstance(x, unicode):
+ x = x.encode("utf-8")
+ yield x
keyed = self.mapPartitionsWithIndex(func)
keyed._bypass_serializer = True
keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)