diff options
author | Yin Huai <huai@cse.ohio-state.edu> | 2014-07-07 18:37:38 -0700 |
---|---|---|
committer | Michael Armbrust <michael@databricks.com> | 2014-07-07 18:37:38 -0700 |
commit | 4352a2fdaa64efee7158eabef65703460ff284ec (patch) | |
tree | 2c4a946d15c8f3584d4cad6c14a9562187733256 /python/pyspark/sql.py | |
parent | f0496ee10847db921a028a34f70385f9b740b3f3 (diff) | |
download | spark-4352a2fdaa64efee7158eabef65703460ff284ec.tar.gz spark-4352a2fdaa64efee7158eabef65703460ff284ec.tar.bz2 spark-4352a2fdaa64efee7158eabef65703460ff284ec.zip |
[SPARK-2376][SQL] Selecting list values inside nested JSON objects raises java.lang.IllegalArgumentException
JIRA: https://issues.apache.org/jira/browse/SPARK-2376
Author: Yin Huai <huai@cse.ohio-state.edu>
Closes #1320 from yhuai/SPARK-2376 and squashes the following commits:
0107417 [Yin Huai] Merge remote-tracking branch 'upstream/master' into SPARK-2376
480803d [Yin Huai] Correctly handling JSON arrays in PySpark.
Diffstat (limited to 'python/pyspark/sql.py')
-rw-r--r-- | python/pyspark/sql.py | 24 |
1 file changed, 14 insertions, 10 deletions
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 5051c82da3..ffe177576f 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -152,10 +152,12 @@ class SQLContext: >>> ofn.close() >>> srdd = sqlCtx.jsonFile(jsonFile) >>> sqlCtx.registerRDDAsTable(srdd, "table1") - >>> srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2, field3 as f3 from table1") - >>> srdd2.collect() == [{"f1": 1, "f2": "row1", "f3":{"field4":11}}, - ... {"f1": 2, "f2": "row2", "f3":{"field4":22}}, - ... {"f1": 3, "f2": "row3", "f3":{"field4":33}}] + >>> srdd2 = sqlCtx.sql( + ... "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1") + >>> srdd2.collect() == [ + ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None}, + ... {"f1":2, "f2":None, "f3":{"field4":22, "field5": [10, 11]}, "f4":[{"field7": "row2"}]}, + ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}] True """ jschema_rdd = self._ssql_ctx.jsonFile(path) @@ -167,10 +169,12 @@ class SQLContext: >>> srdd = sqlCtx.jsonRDD(json) >>> sqlCtx.registerRDDAsTable(srdd, "table1") - >>> srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2, field3 as f3 from table1") - >>> srdd2.collect() == [{"f1": 1, "f2": "row1", "f3":{"field4":11}}, - ... {"f1": 2, "f2": "row2", "f3":{"field4":22}}, - ... {"f1": 3, "f2": "row3", "f3":{"field4":33}}] + >>> srdd2 = sqlCtx.sql( + ... "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1") + >>> srdd2.collect() == [ + ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None}, + ... {"f1":2, "f2":None, "f3":{"field4":22, "field5": [10, 11]}, "f4":[{"field7": "row2"}]}, + ... 
{"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}] True """ def func(split, iterator): @@ -492,8 +496,8 @@ def _test(): globs['rdd'] = sc.parallelize([{"field1" : 1, "field2" : "row1"}, {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}]) jsonStrings = ['{"field1": 1, "field2": "row1", "field3":{"field4":11}}', - '{"field1" : 2, "field2": "row2", "field3":{"field4":22}}', - '{"field1" : 3, "field2": "row3", "field3":{"field4":33}}'] + '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]}, "field6":[{"field7": "row2"}]}', + '{"field1" : null, "field2": "row3", "field3":{"field4":33, "field5": []}}'] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) globs['nestedRdd1'] = sc.parallelize([ |