aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark
diff options
context:
space:
mode:
author: Ahir Reddy <ahirreddy@gmail.com> 2014-08-14 10:48:52 -0700
committer: Michael Armbrust <michael@databricks.com> 2014-08-14 10:48:52 -0700
commit: fde692b361773110c262abe219e7c8128bd76419 (patch)
tree: 63e255c8b3f414c9a3ff47733cabab2f7126e915 /python/pyspark
parent: add75d4831fdc35712bf8b737574ea0bc677c37c (diff)
download: spark-fde692b361773110c262abe219e7c8128bd76419.tar.gz
spark-fde692b361773110c262abe219e7c8128bd76419.tar.bz2
spark-fde692b361773110c262abe219e7c8128bd76419.zip
[SQL] Python JsonRDD UTF8 Encoding Fix
Only encode unicode objects to UTF-8, and not strings.

Author: Ahir Reddy <ahirreddy@gmail.com>

Closes #1914 from ahirreddy/json-rdd-unicode-fix1 and squashes the following commits:

ca4e9ba [Ahir Reddy] Encoding Fix
Diffstat (limited to 'python/pyspark')
-rw-r--r-- python/pyspark/sql.py 4
1 file changed, 3 insertions, 1 deletion
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index 46540ca3f1..95086a2258 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -1267,7 +1267,9 @@ class SQLContext:
for x in iterator:
if not isinstance(x, basestring):
x = unicode(x)
- yield x.encode("utf-8")
+ if isinstance(x, unicode):
+ x = x.encode("utf-8")
+ yield x
keyed = rdd.mapPartitions(func)
keyed._bypass_serializer = True
jrdd = keyed._jrdd.map(self._jvm.BytesToString())