diff options
author | Reynold Xin <rxin@databricks.com> | 2015-02-04 18:35:51 -0800 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-02-04 18:35:51 -0800 |
commit | 1fbd124b1bd6159086d8e88b139ce0817af02322 (patch) | |
tree | 1124f3c60011e2ac5b0e3d867f32bb9c0277d41c /python/pyspark/sql.py | |
parent | 9a7ce70eabc0ccaa036e142fc97bf0d37faa0b63 (diff) | |
download | spark-1fbd124b1bd6159086d8e88b139ce0817af02322.tar.gz spark-1fbd124b1bd6159086d8e88b139ce0817af02322.tar.bz2 spark-1fbd124b1bd6159086d8e88b139ce0817af02322.zip |
[SPARK-5605][SQL][DF] Allow using String to specify column name in DSL aggregate functions
Author: Reynold Xin <rxin@databricks.com>
Closes #4376 from rxin/SPARK-5605 and squashes the following commits:
c55f5fa [Reynold Xin] Added a Python test.
f4b8dbb [Reynold Xin] [SPARK-5605][SQL][DF] Allow using String to specify column name in DSL aggregate functions.
Diffstat (limited to 'python/pyspark/sql.py')
-rw-r--r-- | python/pyspark/sql.py | 13 |
1 file changed, 8 insertions, 5 deletions
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 5b56b36bdc..417db34d67 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -23,7 +23,7 @@ public classes of Spark SQL: - L{DataFrame} A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In addition to normal RDD operations, DataFrames also support SQL. - - L{GroupedDataFrame} + - L{GroupedData} - L{Column} Column is a DataFrame with a single column. - L{Row} @@ -62,7 +62,7 @@ __all__ = [ "StringType", "BinaryType", "BooleanType", "DateType", "TimestampType", "DecimalType", "DoubleType", "FloatType", "ByteType", "IntegerType", "LongType", "ShortType", "ArrayType", "MapType", "StructField", "StructType", - "SQLContext", "HiveContext", "DataFrame", "GroupedDataFrame", "Column", "Row", "Dsl", + "SQLContext", "HiveContext", "DataFrame", "GroupedData", "Column", "Row", "Dsl", "SchemaRDD"] @@ -2231,7 +2231,7 @@ class DataFrame(object): def groupBy(self, *cols): """ Group the :class:`DataFrame` using the specified columns, - so we can run aggregation on them. See :class:`GroupedDataFrame` + so we can run aggregation on them. See :class:`GroupedData` for all the available aggregate functions. 
>>> df.groupBy().avg().collect() @@ -2244,7 +2244,7 @@ class DataFrame(object): jcols = ListConverter().convert([_to_java_column(c) for c in cols], self._sc._gateway._gateway_client) jdf = self._jdf.groupBy(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols)) - return GroupedDataFrame(jdf, self.sql_ctx) + return GroupedData(jdf, self.sql_ctx) def agg(self, *exprs): """ Aggregate on the entire :class:`DataFrame` without groups @@ -2308,7 +2308,7 @@ def dfapi(f): return _api -class GroupedDataFrame(object): +class GroupedData(object): """ A set of methods for aggregations on a :class:`DataFrame`, @@ -2638,6 +2638,9 @@ class Dsl(object): >>> from pyspark.sql import Dsl >>> df.agg(Dsl.countDistinct(df.age, df.name).alias('c')).collect() [Row(c=2)] + + >>> df.agg(Dsl.countDistinct("age", "name").alias('c')).collect() + [Row(c=2)] """ sc = SparkContext._active_spark_context jcols = ListConverter().convert([_to_java_column(c) for c in cols], |