diff options
author | Davies Liu <davies.liu@gmail.com> | 2014-08-04 12:13:41 -0700 |
---|---|---|
committer | Josh Rosen <joshrosen@apache.org> | 2014-08-04 12:13:41 -0700 |
commit | 59f84a9531f7974a053fd4963ce9afd88273ea4c (patch) | |
tree | f2f72b2b2a041624ff4462e777f19874cd477b41 /python/pyspark/tests.py | |
parent | e053c55819363fab7068bb9165e3379f0c2f570c (diff) | |
download | spark-59f84a9531f7974a053fd4963ce9afd88273ea4c.tar.gz spark-59f84a9531f7974a053fd4963ce9afd88273ea4c.tar.bz2 spark-59f84a9531f7974a053fd4963ce9afd88273ea4c.zip |
[SPARK-1687] [PySpark] pickable namedtuple
Add an hook to replace original namedtuple with an pickable one, then namedtuple could be used in RDDs.
PS: pyspark should be import BEFORE "from collections import namedtuple"
Author: Davies Liu <davies.liu@gmail.com>
Closes #1623 from davies/namedtuple and squashes the following commits:
045dad8 [Davies Liu] remove unrelated code changes
4132f32 [Davies Liu] address comment
55b1c1a [Davies Liu] fix tests
61f86eb [Davies Liu] replace all the reference of namedtuple to new hacked one
98df6c6 [Davies Liu] Merge branch 'master' of github.com:apache/spark into namedtuple
f7b1bde [Davies Liu] add hack for CloudPickleSerializer
0c5c849 [Davies Liu] Merge branch 'master' of github.com:apache/spark into namedtuple
21991e6 [Davies Liu] hack namedtuple in __main__ module, make it picklable.
93b03b8 [Davies Liu] pickable namedtuple
Diffstat (limited to 'python/pyspark/tests.py')
-rw-r--r-- | python/pyspark/tests.py | 19 |
1 files changed, 19 insertions, 0 deletions
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index acc3c30371..4ac94ba729 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -112,6 +112,17 @@ class TestMerger(unittest.TestCase): m._cleanup() +class SerializationTestCase(unittest.TestCase): + + def test_namedtuple(self): + from collections import namedtuple + from cPickle import dumps, loads + P = namedtuple("P", "x y") + p1 = P(1, 3) + p2 = loads(dumps(p1, 2)) + self.assertEquals(p1, p2) + + class PySparkTestCase(unittest.TestCase): def setUp(self): @@ -298,6 +309,14 @@ class TestRDDFunctions(PySparkTestCase): self.assertEqual([1], rdd.map(itemgetter(1)).collect()) self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect()) + def test_namedtuple_in_rdd(self): + from collections import namedtuple + Person = namedtuple("Person", "id firstName lastName") + jon = Person(1, "Jon", "Doe") + jane = Person(2, "Jane", "Doe") + theDoes = self.sc.parallelize([jon, jane]) + self.assertEquals([jon, jane], theDoes.collect()) + class TestIO(PySparkTestCase): |