author    Reynold Xin <rxin@databricks.com>  2015-02-04 18:35:51 -0800
committer Reynold Xin <rxin@databricks.com>  2015-02-04 18:35:51 -0800
commit    1fbd124b1bd6159086d8e88b139ce0817af02322 (patch)
tree      1124f3c60011e2ac5b0e3d867f32bb9c0277d41c /python
parent    9a7ce70eabc0ccaa036e142fc97bf0d37faa0b63 (diff)
[SPARK-5605][SQL][DF] Allow using String to specify column name in DSL aggregate functions
Author: Reynold Xin <rxin@databricks.com>

Closes #4376 from rxin/SPARK-5605 and squashes the following commits:

c55f5fa [Reynold Xin] Added a Python test.
f4b8dbb [Reynold Xin] [SPARK-5605][SQL][DF] Allow using String to specify column name in DSL aggregate functions.
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/sql.py | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
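In practice, the patch lets the Python DSL aggregate functions take plain column-name strings in place of Column objects. A minimal sketch of the two equivalent forms, based on the hypothetical df used in the doctests below (columns age and name):

    from pyspark.sql import Dsl

    # Before the patch, only Column objects were accepted:
    df.agg(Dsl.countDistinct(df.age, df.name).alias('c')).collect()
    # After it, bare column-name strings work as well:
    df.agg(Dsl.countDistinct("age", "name").alias('c')).collect()
    # Both return [Row(c=2)] on the doctest data.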
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index 5b56b36bdc..417db34d67 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -23,7 +23,7 @@ public classes of Spark SQL:
- L{DataFrame}
A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In
addition to normal RDD operations, DataFrames also support SQL.
- - L{GroupedDataFrame}
+ - L{GroupedData}
- L{Column}
Column is a DataFrame with a single column.
- L{Row}
@@ -62,7 +62,7 @@ __all__ = [
"StringType", "BinaryType", "BooleanType", "DateType", "TimestampType", "DecimalType",
"DoubleType", "FloatType", "ByteType", "IntegerType", "LongType",
"ShortType", "ArrayType", "MapType", "StructField", "StructType",
- "SQLContext", "HiveContext", "DataFrame", "GroupedDataFrame", "Column", "Row", "Dsl",
+ "SQLContext", "HiveContext", "DataFrame", "GroupedData", "Column", "Row", "Dsl",
"SchemaRDD"]
@@ -2231,7 +2231,7 @@ class DataFrame(object):
def groupBy(self, *cols):
""" Group the :class:`DataFrame` using the specified columns,
- so we can run aggregation on them. See :class:`GroupedDataFrame`
+ so we can run aggregation on them. See :class:`GroupedData`
for all the available aggregate functions.
>>> df.groupBy().avg().collect()
@@ -2244,7 +2244,7 @@ class DataFrame(object):
jcols = ListConverter().convert([_to_java_column(c) for c in cols],
self._sc._gateway._gateway_client)
jdf = self._jdf.groupBy(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols))
- return GroupedDataFrame(jdf, self.sql_ctx)
+ return GroupedData(jdf, self.sql_ctx)
def agg(self, *exprs):
""" Aggregate on the entire :class:`DataFrame` without groups
@@ -2308,7 +2308,7 @@ def dfapi(f):
return _api
-class GroupedDataFrame(object):
+class GroupedData(object):
"""
A set of methods for aggregations on a :class:`DataFrame`,
@@ -2638,6 +2638,9 @@ class Dsl(object):
>>> from pyspark.sql import Dsl
>>> df.agg(Dsl.countDistinct(df.age, df.name).alias('c')).collect()
[Row(c=2)]
+
+ >>> df.agg(Dsl.countDistinct("age", "name").alias('c')).collect()
+ [Row(c=2)]
"""
sc = SparkContext._active_spark_context
jcols = ListConverter().convert([_to_java_column(c) for c in cols],
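Each DSL function funnels its arguments through _to_java_column before handing them to the JVM, which is where the new string handling lives. A hedged sketch of the conversion's plausible shape (not the verbatim Spark source; the _create_column_from_name helper is assumed for illustration):

    def _to_java_column(col):
        # Accept either a Column wrapper or a bare column-name string.
        if isinstance(col, Column):
            return col._jc                    # unwrap the existing JVM Column
        return _create_column_from_name(col)  # assumed helper: JVM Column from a name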