diff options
-rw-r--r-- | python/pyspark/sql/group.py | 88 | ||||
-rw-r--r-- | sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala | 146 | ||||
-rw-r--r-- | sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java | 1 |
3 files changed, 28 insertions, 207 deletions
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 946b53e71c..71c0bccc5e 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -167,94 +167,6 @@ class GroupedData(object): [Row(sum(age)=7, sum(height)=165)] """ - @df_varargs_api - @since(1.6) - def stddev(self, *cols): - """Compute the sample standard deviation for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().stddev('age', 'height').collect() - [Row(STDDEV(age)=2.12..., STDDEV(height)=3.53...)] - """ - - @df_varargs_api - @since(1.6) - def stddev_samp(self, *cols): - """Compute the sample standard deviation for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().stddev_samp('age', 'height').collect() - [Row(STDDEV_SAMP(age)=2.12..., STDDEV_SAMP(height)=3.53...)] - """ - - @df_varargs_api - @since(1.6) - def stddev_pop(self, *cols): - """Compute the population standard deviation for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().stddev_pop('age', 'height').collect() - [Row(STDDEV_POP(age)=1.5, STDDEV_POP(height)=2.5)] - """ - - @df_varargs_api - @since(1.6) - def variance(self, *cols): - """Compute the sample variance for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().variance('age', 'height').collect() - [Row(VARIANCE(age)=2.25, VARIANCE(height)=6.25)] - """ - - @df_varargs_api - @since(1.6) - def var_pop(self, *cols): - """Compute the sample variance for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().var_pop('age', 'height').collect() - [Row(VAR_POP(age)=2.25, VAR_POP(height)=6.25)] - """ - - @df_varargs_api - @since(1.6) - def var_samp(self, *cols): - """Compute the sample variance for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().var_samp('age', 'height').collect() - [Row(VAR_SAMP(age)=4.5, VAR_SAMP(height)=12.5)] - """ - - @df_varargs_api - @since(1.6) - def skewness(self, *cols): - """Compute the skewness for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().skewness('age', 'height').collect() - [Row(SKEWNESS(age)=0.0, SKEWNESS(height)=0.0)] - """ - - @df_varargs_api - @since(1.6) - def kurtosis(self, *cols): - """Compute the kurtosis for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().kurtosis('age', 'height').collect() - [Row(KURTOSIS(age)=-2.0, KURTOSIS(height)=-2.0)] - """ - def _test(): import doctest diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala index dc96384a4d..c2b2a4013d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala @@ -26,42 +26,14 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{Rollup, Cube, Aggregate} import org.apache.spark.sql.types.NumericType -/** - * Companion object for GroupedData - */ -private[sql] object GroupedData { - def apply( - df: DataFrame, - groupingExprs: Seq[Expression], - groupType: GroupType): GroupedData = { - new GroupedData(df, groupingExprs, groupType: GroupType) - } - - /** - * The Grouping Type - */ - private[sql] trait GroupType - - /** - * To indicate it's the GroupBy - */ - private[sql] object GroupByType extends GroupType - - /** - * To indicate it's the CUBE - */ - private[sql] object CubeType extends GroupType - - /** - * To indicate it's the ROLLUP - */ - private[sql] object RollupType extends GroupType -} /** * :: Experimental :: * A set of methods for aggregations on a [[DataFrame]], created by [[DataFrame.groupBy]]. * + * The main method is the agg function, which has multiple variants. This class also contains + * convenience some first order statistics such as mean, sum for convenience. + * * @since 1.3.0 */ @Experimental @@ -124,7 +96,7 @@ class GroupedData protected[sql]( case "avg" | "average" | "mean" => Average case "max" => Max case "min" => Min - case "stddev" => Stddev + case "stddev" | "std" => Stddev case "stddev_pop" => StddevPop case "stddev_samp" => StddevSamp case "variance" => Variance @@ -256,30 +228,6 @@ class GroupedData protected[sql]( } /** - * Compute the skewness for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the skewness values for them. - * - * @since 1.6.0 - */ - @scala.annotation.varargs - def skewness(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Skewness) - } - - /** - * Compute the kurtosis for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the kurtosis values for them. - * - * @since 1.6.0 - */ - @scala.annotation.varargs - def kurtosis(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Kurtosis) - } - - /** * Compute the max value for each numeric columns for each group. * The resulting [[DataFrame]] will also contain the grouping columns. * When specified columns are given, only compute the max values for them. @@ -316,86 +264,48 @@ class GroupedData protected[sql]( } /** - * Compute the sample standard deviation for each numeric columns for each group. + * Compute the sum for each numeric columns for each group. * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the stddev for them. + * When specified columns are given, only compute the sum for them. * - * @since 1.6.0 + * @since 1.3.0 */ @scala.annotation.varargs - def stddev(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Stddev) + def sum(colNames: String*): DataFrame = { + aggregateNumericColumns(colNames : _*)(Sum) } +} - /** - * Compute the population standard deviation for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the stddev for them. - * - * @since 1.6.0 - */ - @scala.annotation.varargs - def stddev_pop(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(StddevPop) - } - /** - * Compute the sample standard deviation for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the stddev for them. - * - * @since 1.6.0 - */ - @scala.annotation.varargs - def stddev_samp(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(StddevSamp) +/** + * Companion object for GroupedData. + */ +private[sql] object GroupedData { + + def apply( + df: DataFrame, + groupingExprs: Seq[Expression], + groupType: GroupType): GroupedData = { + new GroupedData(df, groupingExprs, groupType: GroupType) } /** - * Compute the sum for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the sum for them. - * - * @since 1.3.0 + * The Grouping Type */ - @scala.annotation.varargs - def sum(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Sum) - } + private[sql] trait GroupType /** - * Compute the sample variance for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the variance for them. - * - * @since 1.6.0 + * To indicate it's the GroupBy */ - @scala.annotation.varargs - def variance(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Variance) - } + private[sql] object GroupByType extends GroupType /** - * Compute the population variance for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the variance for them. - * - * @since 1.6.0 + * To indicate it's the CUBE */ - @scala.annotation.varargs - def var_pop(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(VariancePop) - } + private[sql] object CubeType extends GroupType /** - * Compute the sample variance for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the variance for them. - * - * @since 1.6.0 + * To indicate it's the ROLLUP */ - @scala.annotation.varargs - def var_samp(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(VarianceSamp) - } + private[sql] object RollupType extends GroupType } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index a1a3fdbb48..49f516e86d 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -91,7 +91,6 @@ public class JavaDataFrameSuite { df.groupBy().mean("key"); df.groupBy().max("key"); df.groupBy().min("key"); - df.groupBy().stddev("key"); df.groupBy().sum("key"); // Varargs in column expressions |