From 3787f514d9a8e45d2c257b4696e30bc1a1935748 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@apache.org>
Date: Thu, 28 Nov 2013 23:44:56 -0800
Subject: Fix UnicodeEncodeError in PySpark saveAsTextFile().

Fixes SPARK-970.
---
 python/pyspark/tests.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'python/pyspark/tests.py')

diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 621e1cb58c..3987642bf4 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -19,6 +19,8 @@
 Unit tests for PySpark; additional tests are implemented as doctests in
 individual modules.
 """
+from fileinput import input
+from glob import glob
 import os
 import shutil
 import sys
@@ -138,6 +140,19 @@ class TestAddFile(PySparkTestCase):
         self.assertEqual("Hello World from inside a package!", UserClass().hello())
 
 
+class TestRDDFunctions(PySparkTestCase):
+
+    def test_save_as_textfile_with_unicode(self):
+        # Regression test for SPARK-970 (UnicodeEncodeError in saveAsTextFile()).
+        x = u"\u00A1Hola, mundo!"
+        data = self.sc.parallelize([x])
+        tempFile = NamedTemporaryFile(delete=True)
+        tempFile.close()  # delete=True removes the file; its name becomes the output dir
+        data.saveAsTextFile(tempFile.name)
+        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
+        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))
+
+
 class TestIO(PySparkTestCase):
 
     def test_stdout_redirection(self):
-- 
cgit v1.2.3