diff options
author | Ahir Reddy <ahirreddy@gmail.com> | 2014-08-14 10:48:52 -0700 |
---|---|---|
committer | Michael Armbrust <michael@databricks.com> | 2014-08-14 10:49:01 -0700 |
commit | 850abaa36043104e5f09bf2754d1ae3f9ce86e3d (patch) | |
tree | 480956f21b0789e1905f104a25d71772339db01f /python/pyspark/sql.py | |
parent | de501e169f24e4573747aec85b7651c98633c028 (diff) | |
download | spark-850abaa36043104e5f09bf2754d1ae3f9ce86e3d.tar.gz spark-850abaa36043104e5f09bf2754d1ae3f9ce86e3d.tar.bz2 spark-850abaa36043104e5f09bf2754d1ae3f9ce86e3d.zip |
[SQL] Python JsonRDD UTF8 Encoding Fix
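Only encode `unicode` objects to UTF-8; byte strings (`str`) are now passed through unchanged. Calling `.encode("utf-8")` on a Python 2 `str` first decodes it implicitly as ASCII, which fails on non-ASCII bytes that are already encoded.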
Author: Ahir Reddy <ahirreddy@gmail.com>
Closes #1914 from ahirreddy/json-rdd-unicode-fix1 and squashes the following commits:
ca4e9ba [Ahir Reddy] Encoding Fix
(cherry picked from commit fde692b361773110c262abe219e7c8128bd76419)
Signed-off-by: Michael Armbrust <michael@databricks.com>
Diffstat (limited to 'python/pyspark/sql.py')
-rw-r--r-- | python/pyspark/sql.py | 4 |
1 files changed, 3 insertions, 1 deletions
```diff
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index 46540ca3f1..95086a2258 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -1267,7 +1267,9 @@ class SQLContext:
             for x in iterator:
                 if not isinstance(x, basestring):
                     x = unicode(x)
-                yield x.encode("utf-8")
+                if isinstance(x, unicode):
+                    x = x.encode("utf-8")
+                yield x
         keyed = rdd.mapPartitions(func)
         keyed._bypass_serializer = True
         jrdd = keyed._jrdd.map(self._jvm.BytesToString())
```