diff options
author | Josh Rosen <joshrosen@apache.org> | 2014-08-11 11:54:09 -0700 |
---|---|---|
committer | Josh Rosen <joshrosen@apache.org> | 2014-08-11 11:54:09 -0700 |
commit | db06a81fb7a413faa3fe0f8c35918f70454cb05d (patch) | |
tree | 7d6b7e1766aadf0a875d5577ee349b73c817ceef /python/pyspark | |
parent | ba28a8fcbc3ba432e7ea4d6f0b535450a6ec96c6 (diff) | |
download | spark-db06a81fb7a413faa3fe0f8c35918f70454cb05d.tar.gz spark-db06a81fb7a413faa3fe0f8c35918f70454cb05d.tar.bz2 spark-db06a81fb7a413faa3fe0f8c35918f70454cb05d.zip |
[PySpark] [SPARK-2954] [SPARK-2948] [SPARK-2910] [SPARK-2101] Python 2.6 Fixes
- Modify python/run-tests to test with Python 2.6
- Use unittest2 when running on Python 2.6.
- Fix issue with namedtuple.
- Skip TestOutputFormat.test_newhadoop on Python 2.6 until SPARK-2951 is fixed.
- Fix MLlib _deserialize_double on Python 2.6.
Closes #1868. Closes #1042.
Author: Josh Rosen <joshrosen@apache.org>
Closes #1874 from JoshRosen/python2.6 and squashes the following commits:
983d259 [Josh Rosen] [SPARK-2954] Fix MLlib _deserialize_double on Python 2.6.
5d18fd7 [Josh Rosen] [SPARK-2948] [SPARK-2910] [SPARK-2101] Python 2.6 fixes
Diffstat (limited to 'python/pyspark')
-rw-r--r-- | python/pyspark/mllib/_common.py | 11 | ||||
-rw-r--r-- | python/pyspark/mllib/tests.py | 7 | ||||
-rw-r--r-- | python/pyspark/serializers.py | 4 | ||||
-rw-r--r-- | python/pyspark/tests.py | 13 |
4 files changed, 28 insertions, 7 deletions
diff --git a/python/pyspark/mllib/_common.py b/python/pyspark/mllib/_common.py index db341da85f..bb60d3d0c8 100644 --- a/python/pyspark/mllib/_common.py +++ b/python/pyspark/mllib/_common.py @@ -16,6 +16,7 @@ # import struct +import sys import numpy from numpy import ndarray, float64, int64, int32, array_equal, array from pyspark import SparkContext, RDD @@ -78,6 +79,14 @@ DENSE_MATRIX_MAGIC = 3 LABELED_POINT_MAGIC = 4 +# Workaround for SPARK-2954: before Python 2.7, struct.unpack couldn't unpack bytearray()s. +if sys.version_info[:2] <= (2, 6): + def _unpack(fmt, string): + return struct.unpack(fmt, buffer(string)) +else: + _unpack = struct.unpack + + def _deserialize_numpy_array(shape, ba, offset, dtype=float64): """ Deserialize a numpy array of the given type from an offset in @@ -191,7 +200,7 @@ def _deserialize_double(ba, offset=0): raise TypeError("_deserialize_double called on a %s; wanted bytearray" % type(ba)) if len(ba) - offset != 8: raise TypeError("_deserialize_double called on a %d-byte array; wanted 8 bytes." % nb) - return struct.unpack("d", ba[offset:])[0] + return _unpack("d", ba[offset:])[0] def _deserialize_double_vector(ba, offset=0): diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 6f3ec8ac94..8a851bd35c 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -19,8 +19,13 @@ Fuller unit tests for Python MLlib. """ +import sys from numpy import array, array_equal -import unittest + +if sys.version_info[:2] <= (2, 6): + import unittest2 as unittest +else: + import unittest from pyspark.mllib._common import _convert_vector, _serialize_double_vector, \ _deserialize_double_vector, _dot, _squared_distance diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index b35558db3e..df90cafb24 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -314,8 +314,8 @@ def _hijack_namedtuple(): _old_namedtuple = _copy_func(collections.namedtuple) - def namedtuple(name, fields, verbose=False, rename=False): - cls = _old_namedtuple(name, fields, verbose, rename) + def namedtuple(*args, **kwargs): + cls = _old_namedtuple(*args, **kwargs) return _hack_namedtuple(cls) # replace namedtuple with new one diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 88a61176e5..22b51110ed 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -29,9 +29,14 @@ import subprocess import sys import tempfile import time -import unittest import zipfile +if sys.version_info[:2] <= (2, 6): + import unittest2 as unittest +else: + import unittest + + from pyspark.context import SparkContext from pyspark.files import SparkFiles from pyspark.serializers import read_int @@ -605,6 +610,7 @@ class TestOutputFormat(PySparkTestCase): conf=input_conf).collect()) self.assertEqual(old_dataset, dict_data) + @unittest.skipIf(sys.version_info[:2] <= (2, 6), "Skipped on 2.6 until SPARK-2951 is fixed") def test_newhadoop(self): basepath = self.tempdir.name # use custom ArrayWritable types and converters to handle arrays @@ -905,8 +911,9 @@ class TestSparkSubmit(unittest.TestCase): pattern = re.compile(r'^ *\|', re.MULTILINE) content = re.sub(pattern, '', content.strip()) path = os.path.join(self.programDir, name + ".zip") - with zipfile.ZipFile(path, 'w') as zip: - zip.writestr(name, content) + zip = zipfile.ZipFile(path, 'w') + zip.writestr(name, content) + zip.close() return path def test_single_script(self): |