author    Reynold Xin <rxin@databricks.com>  2015-07-01 21:14:13 -0700
committer Reynold Xin <rxin@databricks.com>  2015-07-01 21:14:13 -0700
commit    9fd13d5613b6d16a78d97d4798f085b56107d343 (patch)
tree      9687bc3c9da9a72e5ae3814972f5a72c0bb7181f /python/pyspark
parent    3a342dedc04799948bf6da69843bd1a91202ffe5 (diff)
[SPARK-8770][SQL] Create BinaryOperator abstract class.
Our current BinaryExpression abstract class is not for generic binary expressions, i.e. it requires left/right children to have the same type. However, due to its name, contributors build new binary expressions that don't have that assumption (e.g. Sha) and still extend BinaryExpression. This patch creates a new BinaryOperator abstract class and updates the analyzer to only apply the type casting rule there. This patch also adds the notion of "prettyName" to expressions, which defines the user-facing name for the expression.

Author: Reynold Xin <rxin@databricks.com>

Closes #7174 from rxin/binary-opterator and squashes the following commits:

f31900d [Reynold Xin] [SPARK-8770][SQL] Create BinaryOperator abstract class.
fceb216 [Reynold Xin] Merge branch 'master' of github.com:apache/spark into binary-opterator
d8518cf [Reynold Xin] Updated Python tests.
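To make the distinction concrete, here is a minimal sketch in plain Python of the split the commit message describes. The real change lives in Spark's Scala expression tree and analyzer; none of the class or function names below are actual pyspark or Catalyst APIs, they only illustrate the idea.

# A minimal sketch, assuming simplified stand-in classes; this models the
# BinaryExpression/BinaryOperator split in plain Python, not Spark's Scala code.

class Expression(object):
    dtype = None

    def pretty_name(self):
        # "prettyName": the user-facing name, e.g. "avg" rather than "AVG".
        return type(self).__name__.lower()

class Literal(Expression):
    def __init__(self, value, dtype):
        self.value, self.dtype = value, dtype

class Cast(Expression):
    def __init__(self, child, dtype):
        self.child, self.dtype = child, dtype

class BinaryExpression(Expression):
    """Any two-child expression; children may differ in type
    (e.g. a Sha-like hash of two arguments)."""
    def __init__(self, left, right):
        self.left, self.right = left, right

class BinaryOperator(BinaryExpression):
    """A binary expression whose children must share one type; the
    analyzer's implicit type-casting rule applies only to this subclass."""

class Add(BinaryOperator):
    pass

def coerce_types(expr):
    # Sketch of the analyzer rule: insert a Cast for BinaryOperator children
    # only, leaving generic BinaryExpression subclasses untouched. (Real
    # Spark picks a common wider type; casting to the left child's type
    # here is a simplification.)
    if isinstance(expr, BinaryOperator) and expr.left.dtype != expr.right.dtype:
        expr.right = Cast(expr.right, expr.left.dtype)
    return expr

expr = coerce_types(Add(Literal(1, "int"), Literal(2.0, "double")))
print(type(expr.right).__name__, expr.pretty_name())  # prints: Cast add

The lowercase pretty_name is what the doctest updates below reflect: aggregate columns now render as avg(age), min(age), and so on, instead of AVG(age).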
Diffstat (limited to 'python/pyspark')
-rw-r--r--  python/pyspark/sql/dataframe.py  | 10
-rw-r--r--  python/pyspark/sql/functions.py  |  4
-rw-r--r--  python/pyspark/sql/group.py      | 24
3 files changed, 19 insertions, 19 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 273a40dd52..1e9c657cf8 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -802,11 +802,11 @@ class DataFrame(object):
Each element should be a column name (string) or an expression (:class:`Column`).
>>> df.groupBy().avg().collect()
- [Row(AVG(age)=3.5)]
+ [Row(avg(age)=3.5)]
>>> df.groupBy('name').agg({'age': 'mean'}).collect()
- [Row(name=u'Alice', AVG(age)=2.0), Row(name=u'Bob', AVG(age)=5.0)]
+ [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)]
>>> df.groupBy(df.name).avg().collect()
- [Row(name=u'Alice', AVG(age)=2.0), Row(name=u'Bob', AVG(age)=5.0)]
+ [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)]
>>> df.groupBy(['name', df.age]).count().collect()
[Row(name=u'Bob', age=5, count=1), Row(name=u'Alice', age=2, count=1)]
"""
@@ -864,10 +864,10 @@ class DataFrame(object):
(shorthand for ``df.groupBy.agg()``).
>>> df.agg({"age": "max"}).collect()
- [Row(MAX(age)=5)]
+ [Row(max(age)=5)]
>>> from pyspark.sql import functions as F
>>> df.agg(F.min(df.age)).collect()
- [Row(MIN(age)=2)]
+ [Row(min(age)=2)]
"""
return self.groupBy().agg(*exprs)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 4e2be88e9e..f9a15d4a66 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -266,7 +266,7 @@ def coalesce(*cols):
>>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()
+-------------+
- |Coalesce(a,b)|
+ |coalesce(a,b)|
+-------------+
| null|
| 1|
@@ -275,7 +275,7 @@ def coalesce(*cols):
>>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()
+----+----+---------------+
- | a| b|Coalesce(a,0.0)|
+ | a| b|coalesce(a,0.0)|
+----+----+---------------+
|null|null| 0.0|
| 1|null| 1.0|
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index 5a37a673ee..04594d5a83 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -75,11 +75,11 @@ class GroupedData(object):
>>> gdf = df.groupBy(df.name)
>>> gdf.agg({"*": "count"}).collect()
- [Row(name=u'Alice', COUNT(1)=1), Row(name=u'Bob', COUNT(1)=1)]
+ [Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)]
>>> from pyspark.sql import functions as F
>>> gdf.agg(F.min(df.age)).collect()
- [Row(name=u'Alice', MIN(age)=2), Row(name=u'Bob', MIN(age)=5)]
+ [Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)]
"""
assert exprs, "exprs should not be empty"
if len(exprs) == 1 and isinstance(exprs[0], dict):
@@ -110,9 +110,9 @@ class GroupedData(object):
:param cols: list of column names (string). Non-numeric columns are ignored.
>>> df.groupBy().mean('age').collect()
- [Row(AVG(age)=3.5)]
+ [Row(avg(age)=3.5)]
>>> df3.groupBy().mean('age', 'height').collect()
- [Row(AVG(age)=3.5, AVG(height)=82.5)]
+ [Row(avg(age)=3.5, avg(height)=82.5)]
"""
@df_varargs_api
@@ -125,9 +125,9 @@ class GroupedData(object):
:param cols: list of column names (string). Non-numeric columns are ignored.
>>> df.groupBy().avg('age').collect()
- [Row(AVG(age)=3.5)]
+ [Row(avg(age)=3.5)]
>>> df3.groupBy().avg('age', 'height').collect()
- [Row(AVG(age)=3.5, AVG(height)=82.5)]
+ [Row(avg(age)=3.5, avg(height)=82.5)]
"""
@df_varargs_api
@@ -136,9 +136,9 @@ class GroupedData(object):
"""Computes the max value for each numeric columns for each group.
>>> df.groupBy().max('age').collect()
- [Row(MAX(age)=5)]
+ [Row(max(age)=5)]
>>> df3.groupBy().max('age', 'height').collect()
- [Row(MAX(age)=5, MAX(height)=85)]
+ [Row(max(age)=5, max(height)=85)]
"""
@df_varargs_api
@@ -149,9 +149,9 @@ class GroupedData(object):
:param cols: list of column names (string). Non-numeric columns are ignored.
>>> df.groupBy().min('age').collect()
- [Row(MIN(age)=2)]
+ [Row(min(age)=2)]
>>> df3.groupBy().min('age', 'height').collect()
- [Row(MIN(age)=2, MIN(height)=80)]
+ [Row(min(age)=2, min(height)=80)]
"""
@df_varargs_api
@@ -162,9 +162,9 @@ class GroupedData(object):
:param cols: list of column names (string). Non-numeric columns are ignored.
>>> df.groupBy().sum('age').collect()
- [Row(SUM(age)=7)]
+ [Row(sum(age)=7)]
>>> df3.groupBy().sum('age', 'height').collect()
- [Row(SUM(age)=7, SUM(height)=165)]
+ [Row(sum(age)=7, sum(height)=165)]
"""