aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/sql.py
diff options
context:
space:
mode:
author: Ahir Reddy <ahirreddy@gmail.com>, 2014-08-14 10:48:52 -0700
committer: Michael Armbrust <michael@databricks.com>, 2014-08-14 10:49:01 -0700
commit: 850abaa36043104e5f09bf2754d1ae3f9ce86e3d (patch)
tree: 480956f21b0789e1905f104a25d71772339db01f /python/pyspark/sql.py
parent: de501e169f24e4573747aec85b7651c98633c028 (diff)
download: spark-850abaa36043104e5f09bf2754d1ae3f9ce86e3d.tar.gz
spark-850abaa36043104e5f09bf2754d1ae3f9ce86e3d.tar.bz2
spark-850abaa36043104e5f09bf2754d1ae3f9ce86e3d.zip
[SQL] Python JsonRDD UTF8 Encoding Fix
Only encode unicode objects to UTF-8, and not strings.

Author: Ahir Reddy <ahirreddy@gmail.com>

Closes #1914 from ahirreddy/json-rdd-unicode-fix1 and squashes the following commits:

ca4e9ba [Ahir Reddy] Encoding Fix

(cherry picked from commit fde692b361773110c262abe219e7c8128bd76419)
Signed-off-by: Michael Armbrust <michael@databricks.com>
Diffstat (limited to 'python/pyspark/sql.py')
-rw-r--r-- python/pyspark/sql.py 4
1 file changed, 3 insertions, 1 deletion
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index 46540ca3f1..95086a2258 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -1267,7 +1267,9 @@ class SQLContext:
for x in iterator:
if not isinstance(x, basestring):
x = unicode(x)
- yield x.encode("utf-8")
+ if isinstance(x, unicode):
+ x = x.encode("utf-8")
+ yield x
keyed = rdd.mapPartitions(func)
keyed._bypass_serializer = True
jrdd = keyed._jrdd.map(self._jvm.BytesToString())