about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- python/pyspark/rdd.py 4
-rw-r--r-- python/pyspark/tests.py 9
2 files changed, 12 insertions, 1 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 240381e5ba..c708b69cc1 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1191,7 +1191,9 @@ class RDD(object):
for x in iterator:
if not isinstance(x, basestring):
x = unicode(x)
- yield x.encode("utf-8")
+ if isinstance(x, unicode):
+ x = x.encode("utf-8")
+ yield x
keyed = self.mapPartitionsWithIndex(func)
keyed._bypass_serializer = True
keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index f1fece998c..69d543d9d0 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -256,6 +256,15 @@ class TestRDDFunctions(PySparkTestCase):
raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))
+ def test_save_as_textfile_with_utf8(self):
+ x = u"\u00A1Hola, mundo!"
+ data = self.sc.parallelize([x.encode("utf-8")])
+ tempFile = tempfile.NamedTemporaryFile(delete=True)
+ tempFile.close()
+ data.saveAsTextFile(tempFile.name)
+ raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
+ self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))
+
def test_transforming_cartesian_result(self):
# Regression test for SPARK-1034
rdd1 = self.sc.parallelize([1, 2])