diff options
author | Davies Liu <davies.liu@gmail.com> | 2014-08-18 13:58:35 -0700 |
---|---|---|
committer | Josh Rosen <joshrosen@apache.org> | 2014-08-18 13:58:35 -0700 |
commit | d1d0ee41c27f1d07fed0c5d56ba26c723cc3dc26 (patch) | |
tree | 3ac09d9c32b47167ffe6e56e91eb05995c654c1e /python/pyspark/tests.py | |
parent | 3a5962f0f5acea5cbfd3cf1e3ed16e03b3bec37a (diff) | |
download | spark-d1d0ee41c27f1d07fed0c5d56ba26c723cc3dc26.tar.gz spark-d1d0ee41c27f1d07fed0c5d56ba26c723cc3dc26.tar.bz2 spark-d1d0ee41c27f1d07fed0c5d56ba26c723cc3dc26.zip |
[SPARK-3103] [PySpark] fix saveAsTextFile() with utf-8
Bugfix: saveAsTextFile() raises an exception when it tries to encode non-ASCII strings into unicode. It should only encode unicode objects as "utf-8".
Author: Davies Liu <davies.liu@gmail.com>
Closes #2018 from davies/fix_utf8 and squashes the following commits:
4db7967 [Davies Liu] fix saveAsTextFile() with utf-8
Diffstat (limited to 'python/pyspark/tests.py')
-rw-r--r-- | python/pyspark/tests.py | 9 |
1 files changed, 9 insertions, 0 deletions
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index f1fece998c..69d543d9d0 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -256,6 +256,15 @@ class TestRDDFunctions(PySparkTestCase): raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) + def test_save_as_textfile_with_utf8(self): + x = u"\u00A1Hola, mundo!" + data = self.sc.parallelize([x.encode("utf-8")]) + tempFile = tempfile.NamedTemporaryFile(delete=True) + tempFile.close() + data.saveAsTextFile(tempFile.name) + raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) + self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) + def test_transforming_cartesian_result(self): # Regression test for SPARK-1034 rdd1 = self.sc.parallelize([1, 2]) |