From 3787f514d9a8e45d2c257b4696e30bc1a1935748 Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Thu, 28 Nov 2013 23:44:56 -0800
Subject: Fix UnicodeEncodeError in PySpark saveAsTextFile().

Fixes SPARK-970.
---
 python/pyspark/rdd.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'python/pyspark/rdd.py')

diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 957f3f89c0..d8da02072c 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -605,7 +605,10 @@ class RDD(object):
         '0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n'
         """
         def func(split, iterator):
-            return (str(x).encode("utf-8") for x in iterator)
+            for x in iterator:
+                if not isinstance(x, basestring):
+                    x = unicode(x)
+                yield x.encode("utf-8")
         keyed = PipelinedRDD(self, func)
         keyed._bypass_serializer = True
         keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)
-- 
cgit v1.2.3