SPARK-1421. Make MLlib work on Python 2.6

The reason it wasn't working was passing a bytearray to stream.write(), which is not supported in Python 2.6 but is in 2.7. (This array came from NumPy when we converted data to send it over to Java). Now we just convert those bytearrays to strings of bytes, which preserves nonprintable characters as well. Author: Matei Zaharia <matei@databricks.com> Closes #335 from mateiz/mllib-python-2.6 and squashes the following commits: f26c59f [Matei Zaharia] Update docs to no longer say we need Python 2.7 a84d6af [Matei Zaharia] SPARK-1421. Make MLlib work on Python 2.6
author: Matei Zaharia <matei@databricks.com> 2014-04-05 20:52:05 -0700
committer: Matei Zaharia <matei@databricks.com> 2014-04-05 20:52:05 -0700
commit: 0b855167818b9afd2d2aa9f617b9861d77b2425d (patch)
tree: c0f4258b8b6ec6ffcdc805a069e77bbfe1cab84f /python/pyspark/serializers.py
parent: 890d63bd4e16296ac70e151b3754727ea42b583c (diff)
download: spark-0b855167818b9afd2d2aa9f617b9861d77b2425d.tar.gz
spark-0b855167818b9afd2d2aa9f617b9861d77b2425d.tar.bz2
spark-0b855167818b9afd2d2aa9f617b9861d77b2425d.zip
1 files changed, 10 insertions, 1 deletions
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
index 4d802924df..b253807974 100644
--- a/python/pyspark/serializers.py
+++ b/python/pyspark/serializers.py
@@ -64,6 +64,7 @@ import cPickle
 from itertools import chain, izip, product
 import marshal
 import struct
+import sys
 from pyspark import cloudpickle
 
 
@@ -113,6 +114,11 @@ class FramedSerializer(Serializer):
     where C{length} is a 32-bit integer and data is C{length} bytes.
     """
 
+    def __init__(self):
+        # On Python 2.6, we can't write bytearrays to streams, so we need to convert them
+        # to strings first. Check if the version number is that old.
+        self._only_write_strings = sys.version_info[0:2] <= (2, 6)
+
     def dump_stream(self, iterator, stream):
         for obj in iterator:
             self._write_with_length(obj, stream)
@@ -127,7 +133,10 @@ class FramedSerializer(Serializer):
     def _write_with_length(self, obj, stream):
         serialized = self.dumps(obj)
         write_int(len(serialized), stream)
-        stream.write(serialized)
+        if self._only_write_strings:
+            stream.write(str(serialized))
+        else:
+            stream.write(serialized)
 
     def _read_with_length(self, stream):
         length = read_int(stream)
author	Matei Zaharia <matei@databricks.com>	2014-04-05 20:52:05 -0700
committer	Matei Zaharia <matei@databricks.com>	2014-04-05 20:52:05 -0700
commit	0b855167818b9afd2d2aa9f617b9861d77b2425d (patch)
tree	c0f4258b8b6ec6ffcdc805a069e77bbfe1cab84f /python/pyspark/serializers.py
parent	890d63bd4e16296ac70e151b3754727ea42b583c (diff)
download	spark-0b855167818b9afd2d2aa9f617b9861d77b2425d.tar.gz spark-0b855167818b9afd2d2aa9f617b9861d77b2425d.tar.bz2 spark-0b855167818b9afd2d2aa9f617b9861d77b2425d.zip