[SPARK-7133] [SQL] Implement struct, array, and map field accessor

It's the first step: generalize UnresolvedGetField to support all map, struct, and array types. TODO: add `apply` in Scala and `__getitem__` in Python, and unify the `getItem` and `getField` methods into one single API (or should we keep them for compatibility?). Author: Wenchen Fan <cloud0fan@outlook.com>. Closes #5744 from cloud-fan/generalize and squashes the following commits: 715c589 [Wenchen Fan] address comments; 7ea5b31 [Wenchen Fan] fix python test; 4f0833a [Wenchen Fan] add python test; f515d69 [Wenchen Fan] add apply method and test cases; 8df6199 [Wenchen Fan] fix python test; 239730c [Wenchen Fan] fix test compile; 2a70526 [Wenchen Fan] use _bin_op in dataframe.py; 6bf72bc [Wenchen Fan] address comments; 3f880c3 [Wenchen Fan] add java doc; ab35ab5 [Wenchen Fan] fix python test; b5961a9 [Wenchen Fan] fix style; c9d85f5 [Wenchen Fan] generalize UnresolvedGetField to support all map, struct, and array.
author: Wenchen Fan <cloud0fan@outlook.com> 2015-05-08 11:49:38 -0700
committer: Michael Armbrust <michael@databricks.com> 2015-05-08 11:49:38 -0700
commit: 2d05f325dc3c70349bd17ed399897f22d967c687 (patch)
tree: 80c39fe01722882e02c9a4e0be9c35a74c082b78 /python/pyspark
parent: a1ec08f7edc8d956afcfbb92d10b26b7619486e8 (diff)
download: spark-2d05f325dc3c70349bd17ed399897f22d967c687.tar.gz
spark-2d05f325dc3c70349bd17ed399897f22d967c687.tar.bz2
spark-2d05f325dc3c70349bd17ed399897f22d967c687.zip
2 files changed, 19 insertions, 12 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index cee804f5cc..a9697999e8 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1275,7 +1275,7 @@ class Column(object):
 
     # container operators
     __contains__ = _bin_op("contains")
-    __getitem__ = _bin_op("getItem")
+    __getitem__ = _bin_op("apply")
 
     # bitwise operators
     bitwiseOR = _bin_op("bitwiseOR")
@@ -1308,19 +1308,19 @@ class Column(object):
         >>> from pyspark.sql import Row
         >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
         >>> df.select(df.r.getField("b")).show()
-        +---+
-        |r.b|
-        +---+
-        |  b|
-        +---+
+        +----+
+        |r[b]|
+        +----+
+        |   b|
+        +----+
         >>> df.select(df.r.a).show()
-        +---+
-        |r.a|
-        +---+
-        |  1|
-        +---+
+        +----+
+        |r[a]|
+        +----+
+        |   1|
+        +----+
         """
-        return Column(self._jc.getField(name))
+        return self[name]
 
     def __getattr__(self, item):
         if item.startswith("__"):
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 45dfedce22..7e63f4d646 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -519,6 +519,13 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertEqual("v", df.select(df.d["k"]).first()[0])
         self.assertEqual("v", df.select(df.d.getItem("k")).first()[0])
 
+    def test_field_accessor(self):
+        df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF()
+        self.assertEqual(1, df.select(df.l[0]).first()[0])
+        self.assertEqual(1, df.select(df.r["a"]).first()[0])
+        self.assertEqual("b", df.select(df.r["b"]).first()[0])
+        self.assertEqual("v", df.select(df.d["k"]).first()[0])
+
     def test_infer_long_type(self):
         longrow = [Row(f1='a', f2=100000000000000)]
         df = self.sc.parallelize(longrow).toDF()
author	Wenchen Fan <cloud0fan@outlook.com>	2015-05-08 11:49:38 -0700
committer	Michael Armbrust <michael@databricks.com>	2015-05-08 11:49:38 -0700
commit	2d05f325dc3c70349bd17ed399897f22d967c687 (patch)
tree	80c39fe01722882e02c9a4e0be9c35a74c082b78 /python/pyspark
parent	a1ec08f7edc8d956afcfbb92d10b26b7619486e8 (diff)
download	spark-2d05f325dc3c70349bd17ed399897f22d967c687.tar.gz spark-2d05f325dc3c70349bd17ed399897f22d967c687.tar.bz2 spark-2d05f325dc3c70349bd17ed399897f22d967c687.zip