diff options
author | Reynold Xin <rxin@databricks.com> | 2015-02-04 18:35:51 -0800 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-02-04 18:35:51 -0800 |
commit | 1fbd124b1bd6159086d8e88b139ce0817af02322 (patch) | |
tree | 1124f3c60011e2ac5b0e3d867f32bb9c0277d41c /python/pyspark/sql.py | |
parent | 9a7ce70eabc0ccaa036e142fc97bf0d37faa0b63 (diff) | |
download | spark-1fbd124b1bd6159086d8e88b139ce0817af02322.tar.gz spark-1fbd124b1bd6159086d8e88b139ce0817af02322.tar.bz2 spark-1fbd124b1bd6159086d8e88b139ce0817af02322.zip |
[SPARK-5605][SQL][DF] Allow using String to specify column name in DSL aggregate functions
Author: Reynold Xin <rxin@databricks.com>
Closes #4376 from rxin/SPARK-5605 and squashes the following commits:
c55f5fa [Reynold Xin] Added a Python test.
f4b8dbb [Reynold Xin] [SPARK-5605][SQL][DF] Allow using String to specify column name in DSL aggregate functions.
Diffstat (limited to 'python/pyspark/sql.py')
-rw-r--r-- | python/pyspark/sql.py | 13 |
1 file changed, 8 insertions, 5 deletions
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 5b56b36bdc..417db34d67 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -23,7 +23,7 @@ public classes of Spark SQL: - L{DataFrame} A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In addition to normal RDD operations, DataFrames also support SQL. - - L{GroupedDataFrame} + - L{GroupedData} - L{Column} Column is a DataFrame with a single column. - L{Row} @@ -62,7 +62,7 @@ __all__ = [ "StringType", "BinaryType", "BooleanType", "DateType", "TimestampType", "DecimalType", "DoubleType", "FloatType", "ByteType", "IntegerType", "LongType", "ShortType", "ArrayType", "MapType", "StructField", "StructType", - "SQLContext", "HiveContext", "DataFrame", "GroupedDataFrame", "Column", "Row", "Dsl", + "SQLContext", "HiveContext", "DataFrame", "GroupedData", "Column", "Row", "Dsl", "SchemaRDD"] @@ -2231,7 +2231,7 @@ class DataFrame(object): def groupBy(self, *cols): """ Group the :class:`DataFrame` using the specified columns, - so we can run aggregation on them. See :class:`GroupedDataFrame` + so we can run aggregation on them. See :class:`GroupedData` for all the available aggregate functions. 
>>> df.groupBy().avg().collect() @@ -2244,7 +2244,7 @@ class DataFrame(object): jcols = ListConverter().convert([_to_java_column(c) for c in cols], self._sc._gateway._gateway_client) jdf = self._jdf.groupBy(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols)) - return GroupedDataFrame(jdf, self.sql_ctx) + return GroupedData(jdf, self.sql_ctx) def agg(self, *exprs): """ Aggregate on the entire :class:`DataFrame` without groups @@ -2308,7 +2308,7 @@ def dfapi(f): return _api -class GroupedDataFrame(object): +class GroupedData(object): """ A set of methods for aggregations on a :class:`DataFrame`, @@ -2638,6 +2638,9 @@ class Dsl(object): >>> from pyspark.sql import Dsl >>> df.agg(Dsl.countDistinct(df.age, df.name).alias('c')).collect() [Row(c=2)] + + >>> df.agg(Dsl.countDistinct("age", "name").alias('c')).collect() + [Row(c=2)] """ sc = SparkContext._active_spark_context jcols = ListConverter().convert([_to_java_column(c) for c in cols], |