about summary refs log tree commit diff
path: root/python/pyspark/tests.py
diff options
context:
space:
mode:
authorDavies Liu <davies.liu@gmail.com>2014-09-27 12:21:37 -0700
committerMichael Armbrust <michael@databricks.com>2014-09-27 12:21:37 -0700
commit0d8cdf0ede908f6c488a075170f1563815009e29 (patch)
treea166fcfd95d7f66f0b9c54e5f9b0866eb95959ee /python/pyspark/tests.py
parentf0c7e19550d46f81a0a3ff272bbf66ce4bafead6 (diff)
downloadspark-0d8cdf0ede908f6c488a075170f1563815009e29.tar.gz
spark-0d8cdf0ede908f6c488a075170f1563815009e29.tar.bz2
spark-0d8cdf0ede908f6c488a075170f1563815009e29.zip
[SPARK-3681] [SQL] [PySpark] fix serialization of List and Map in SchemaRDD
Currently, the schema of an object in ArrayType or MapType is attached lazily. This gives better performance but introduces issues during serialization or when accessing nested objects. This patch applies the schema to objects of ArrayType or MapType immediately when accessing them; this is a little slower, but much more robust.

Author: Davies Liu <davies.liu@gmail.com>

Closes #2526 from davies/nested and squashes the following commits:

2399ae5 [Davies Liu] fix serialization of List and Map in SchemaRDD
Diffstat (limited to 'python/pyspark/tests.py')
-rw-r--r--python/pyspark/tests.py21
1 files changed, 21 insertions, 0 deletions
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index d1bb2033b7..29df754c6f 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -698,6 +698,27 @@ class TestSQL(PySparkTestCase):
srdd3 = self.sqlCtx.applySchema(rdd, srdd.schema())
self.assertEqual(10, srdd3.count())
+ def test_serialize_nested_array_and_map(self):
+ d = [Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})]
+ rdd = self.sc.parallelize(d)
+ srdd = self.sqlCtx.inferSchema(rdd)
+ row = srdd.first()
+ self.assertEqual(1, len(row.l))
+ self.assertEqual(1, row.l[0].a)
+ self.assertEqual("2", row.d["key"].d)
+
+ l = srdd.map(lambda x: x.l).first()
+ self.assertEqual(1, len(l))
+ self.assertEqual('s', l[0].b)
+
+ d = srdd.map(lambda x: x.d).first()
+ self.assertEqual(1, len(d))
+ self.assertEqual(1.0, d["key"].c)
+
+ row = srdd.map(lambda x: x.d["key"]).first()
+ self.assertEqual(1.0, row.c)
+ self.assertEqual("2", row.d)
+
class TestIO(PySparkTestCase):