diff options
author | Wenchen Fan <cloud0fan@outlook.com> | 2015-05-08 11:49:38 -0700 |
---|---|---|
committer | Michael Armbrust <michael@databricks.com> | 2015-05-08 11:49:38 -0700 |
commit | 2d05f325dc3c70349bd17ed399897f22d967c687 (patch) | |
tree | 80c39fe01722882e02c9a4e0be9c35a74c082b78 /python/pyspark | |
parent | a1ec08f7edc8d956afcfbb92d10b26b7619486e8 (diff) | |
download | spark-2d05f325dc3c70349bd17ed399897f22d967c687.tar.gz spark-2d05f325dc3c70349bd17ed399897f22d967c687.tar.bz2 spark-2d05f325dc3c70349bd17ed399897f22d967c687.zip |
[SPARK-7133] [SQL] Implement struct, array, and map field accessor
This is the first step: generalize UnresolvedGetField to support field access on map, struct, and array types.
TODO: add `apply` in Scala and `__getitem__` in Python, and unify the `getItem` and `getField` methods into a single API (or should we keep them for compatibility?).
Author: Wenchen Fan <cloud0fan@outlook.com>
Closes #5744 from cloud-fan/generalize and squashes the following commits:
715c589 [Wenchen Fan] address comments
7ea5b31 [Wenchen Fan] fix python test
4f0833a [Wenchen Fan] add python test
f515d69 [Wenchen Fan] add apply method and test cases
8df6199 [Wenchen Fan] fix python test
239730c [Wenchen Fan] fix test compile
2a70526 [Wenchen Fan] use _bin_op in dataframe.py
6bf72bc [Wenchen Fan] address comments
3f880c3 [Wenchen Fan] add java doc
ab35ab5 [Wenchen Fan] fix python test
b5961a9 [Wenchen Fan] fix style
c9d85f5 [Wenchen Fan] generalize UnresolvedGetField to support all map, struct, and array
Diffstat (limited to 'python/pyspark')
-rw-r--r-- | python/pyspark/sql/dataframe.py | 24 | ||||
-rw-r--r-- | python/pyspark/sql/tests.py | 7 |
2 files changed, 19 insertions, 12 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index cee804f5cc..a9697999e8 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1275,7 +1275,7 @@ class Column(object): # container operators __contains__ = _bin_op("contains") - __getitem__ = _bin_op("getItem") + __getitem__ = _bin_op("apply") # bitwise operators bitwiseOR = _bin_op("bitwiseOR") @@ -1308,19 +1308,19 @@ class Column(object): >>> from pyspark.sql import Row >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF() >>> df.select(df.r.getField("b")).show() - +---+ - |r.b| - +---+ - | b| - +---+ + +----+ + |r[b]| + +----+ + | b| + +----+ >>> df.select(df.r.a).show() - +---+ - |r.a| - +---+ - | 1| - +---+ + +----+ + |r[a]| + +----+ + | 1| + +----+ """ - return Column(self._jc.getField(name)) + return self[name] def __getattr__(self, item): if item.startswith("__"): diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 45dfedce22..7e63f4d646 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -519,6 +519,13 @@ class SQLTests(ReusedPySparkTestCase): self.assertEqual("v", df.select(df.d["k"]).first()[0]) self.assertEqual("v", df.select(df.d.getItem("k")).first()[0]) + def test_field_accessor(self): + df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF() + self.assertEqual(1, df.select(df.l[0]).first()[0]) + self.assertEqual(1, df.select(df.r["a"]).first()[0]) + self.assertEqual("b", df.select(df.r["b"]).first()[0]) + self.assertEqual("v", df.select(df.d["k"]).first()[0]) + def test_infer_long_type(self): longrow = [Row(f1='a', f2=100000000000000)] df = self.sc.parallelize(longrow).toDF() |