aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--python/pyspark/sql/group.py88
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala146
-rw-r--r--sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java1
3 files changed, 28 insertions, 207 deletions
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index 946b53e71c..71c0bccc5e 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -167,94 +167,6 @@ class GroupedData(object):
[Row(sum(age)=7, sum(height)=165)]
"""
- @df_varargs_api
- @since(1.6)
- def stddev(self, *cols):
- """Compute the sample standard deviation for each numeric columns for each group.
-
- :param cols: list of column names (string). Non-numeric columns are ignored.
-
- >>> df3.groupBy().stddev('age', 'height').collect()
- [Row(STDDEV(age)=2.12..., STDDEV(height)=3.53...)]
- """
-
- @df_varargs_api
- @since(1.6)
- def stddev_samp(self, *cols):
- """Compute the sample standard deviation for each numeric columns for each group.
-
- :param cols: list of column names (string). Non-numeric columns are ignored.
-
- >>> df3.groupBy().stddev_samp('age', 'height').collect()
- [Row(STDDEV_SAMP(age)=2.12..., STDDEV_SAMP(height)=3.53...)]
- """
-
- @df_varargs_api
- @since(1.6)
- def stddev_pop(self, *cols):
- """Compute the population standard deviation for each numeric columns for each group.
-
- :param cols: list of column names (string). Non-numeric columns are ignored.
-
- >>> df3.groupBy().stddev_pop('age', 'height').collect()
- [Row(STDDEV_POP(age)=1.5, STDDEV_POP(height)=2.5)]
- """
-
- @df_varargs_api
- @since(1.6)
- def variance(self, *cols):
- """Compute the sample variance for each numeric columns for each group.
-
- :param cols: list of column names (string). Non-numeric columns are ignored.
-
- >>> df3.groupBy().variance('age', 'height').collect()
- [Row(VARIANCE(age)=2.25, VARIANCE(height)=6.25)]
- """
-
- @df_varargs_api
- @since(1.6)
- def var_pop(self, *cols):
- """Compute the sample variance for each numeric columns for each group.
-
- :param cols: list of column names (string). Non-numeric columns are ignored.
-
- >>> df3.groupBy().var_pop('age', 'height').collect()
- [Row(VAR_POP(age)=2.25, VAR_POP(height)=6.25)]
- """
-
- @df_varargs_api
- @since(1.6)
- def var_samp(self, *cols):
- """Compute the sample variance for each numeric columns for each group.
-
- :param cols: list of column names (string). Non-numeric columns are ignored.
-
- >>> df3.groupBy().var_samp('age', 'height').collect()
- [Row(VAR_SAMP(age)=4.5, VAR_SAMP(height)=12.5)]
- """
-
- @df_varargs_api
- @since(1.6)
- def skewness(self, *cols):
- """Compute the skewness for each numeric columns for each group.
-
- :param cols: list of column names (string). Non-numeric columns are ignored.
-
- >>> df3.groupBy().skewness('age', 'height').collect()
- [Row(SKEWNESS(age)=0.0, SKEWNESS(height)=0.0)]
- """
-
- @df_varargs_api
- @since(1.6)
- def kurtosis(self, *cols):
- """Compute the kurtosis for each numeric columns for each group.
-
- :param cols: list of column names (string). Non-numeric columns are ignored.
-
- >>> df3.groupBy().kurtosis('age', 'height').collect()
- [Row(KURTOSIS(age)=-2.0, KURTOSIS(height)=-2.0)]
- """
-
def _test():
import doctest
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
index dc96384a4d..c2b2a4013d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
@@ -26,42 +26,14 @@ import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{Rollup, Cube, Aggregate}
import org.apache.spark.sql.types.NumericType
-/**
- * Companion object for GroupedData
- */
-private[sql] object GroupedData {
- def apply(
- df: DataFrame,
- groupingExprs: Seq[Expression],
- groupType: GroupType): GroupedData = {
- new GroupedData(df, groupingExprs, groupType: GroupType)
- }
-
- /**
- * The Grouping Type
- */
- private[sql] trait GroupType
-
- /**
- * To indicate it's the GroupBy
- */
- private[sql] object GroupByType extends GroupType
-
- /**
- * To indicate it's the CUBE
- */
- private[sql] object CubeType extends GroupType
-
- /**
- * To indicate it's the ROLLUP
- */
- private[sql] object RollupType extends GroupType
-}
/**
* :: Experimental ::
* A set of methods for aggregations on a [[DataFrame]], created by [[DataFrame.groupBy]].
*
+ * The main method is the agg function, which has multiple variants. This class also contains
+ * some convenience first-order statistics such as mean and sum.
+ *
* @since 1.3.0
*/
@Experimental
@@ -124,7 +96,7 @@ class GroupedData protected[sql](
case "avg" | "average" | "mean" => Average
case "max" => Max
case "min" => Min
- case "stddev" => Stddev
+ case "stddev" | "std" => Stddev
case "stddev_pop" => StddevPop
case "stddev_samp" => StddevSamp
case "variance" => Variance
@@ -256,30 +228,6 @@ class GroupedData protected[sql](
}
/**
- * Compute the skewness for each numeric columns for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
- * When specified columns are given, only compute the skewness values for them.
- *
- * @since 1.6.0
- */
- @scala.annotation.varargs
- def skewness(colNames: String*): DataFrame = {
- aggregateNumericColumns(colNames : _*)(Skewness)
- }
-
- /**
- * Compute the kurtosis for each numeric columns for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
- * When specified columns are given, only compute the kurtosis values for them.
- *
- * @since 1.6.0
- */
- @scala.annotation.varargs
- def kurtosis(colNames: String*): DataFrame = {
- aggregateNumericColumns(colNames : _*)(Kurtosis)
- }
-
- /**
* Compute the max value for each numeric columns for each group.
* The resulting [[DataFrame]] will also contain the grouping columns.
* When specified columns are given, only compute the max values for them.
@@ -316,86 +264,48 @@ class GroupedData protected[sql](
}
/**
- * Compute the sample standard deviation for each numeric columns for each group.
+ * Compute the sum for each numeric columns for each group.
* The resulting [[DataFrame]] will also contain the grouping columns.
- * When specified columns are given, only compute the stddev for them.
+ * When specified columns are given, only compute the sum for them.
*
- * @since 1.6.0
+ * @since 1.3.0
*/
@scala.annotation.varargs
- def stddev(colNames: String*): DataFrame = {
- aggregateNumericColumns(colNames : _*)(Stddev)
+ def sum(colNames: String*): DataFrame = {
+ aggregateNumericColumns(colNames : _*)(Sum)
}
+}
- /**
- * Compute the population standard deviation for each numeric columns for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
- * When specified columns are given, only compute the stddev for them.
- *
- * @since 1.6.0
- */
- @scala.annotation.varargs
- def stddev_pop(colNames: String*): DataFrame = {
- aggregateNumericColumns(colNames : _*)(StddevPop)
- }
- /**
- * Compute the sample standard deviation for each numeric columns for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
- * When specified columns are given, only compute the stddev for them.
- *
- * @since 1.6.0
- */
- @scala.annotation.varargs
- def stddev_samp(colNames: String*): DataFrame = {
- aggregateNumericColumns(colNames : _*)(StddevSamp)
+/**
+ * Companion object for GroupedData.
+ */
+private[sql] object GroupedData {
+
+ def apply(
+ df: DataFrame,
+ groupingExprs: Seq[Expression],
+ groupType: GroupType): GroupedData = {
+ new GroupedData(df, groupingExprs, groupType: GroupType)
}
/**
- * Compute the sum for each numeric columns for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
- * When specified columns are given, only compute the sum for them.
- *
- * @since 1.3.0
+ * The Grouping Type
*/
- @scala.annotation.varargs
- def sum(colNames: String*): DataFrame = {
- aggregateNumericColumns(colNames : _*)(Sum)
- }
+ private[sql] trait GroupType
/**
- * Compute the sample variance for each numeric columns for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
- * When specified columns are given, only compute the variance for them.
- *
- * @since 1.6.0
+ * To indicate it's the GroupBy
*/
- @scala.annotation.varargs
- def variance(colNames: String*): DataFrame = {
- aggregateNumericColumns(colNames : _*)(Variance)
- }
+ private[sql] object GroupByType extends GroupType
/**
- * Compute the population variance for each numeric columns for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
- * When specified columns are given, only compute the variance for them.
- *
- * @since 1.6.0
+ * To indicate it's the CUBE
*/
- @scala.annotation.varargs
- def var_pop(colNames: String*): DataFrame = {
- aggregateNumericColumns(colNames : _*)(VariancePop)
- }
+ private[sql] object CubeType extends GroupType
/**
- * Compute the sample variance for each numeric columns for each group.
- * The resulting [[DataFrame]] will also contain the grouping columns.
- * When specified columns are given, only compute the variance for them.
- *
- * @since 1.6.0
+ * To indicate it's the ROLLUP
*/
- @scala.annotation.varargs
- def var_samp(colNames: String*): DataFrame = {
- aggregateNumericColumns(colNames : _*)(VarianceSamp)
- }
+ private[sql] object RollupType extends GroupType
}
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
index a1a3fdbb48..49f516e86d 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
@@ -91,7 +91,6 @@ public class JavaDataFrameSuite {
df.groupBy().mean("key");
df.groupBy().max("key");
df.groupBy().min("key");
- df.groupBy().stddev("key");
df.groupBy().sum("key");
// Varargs in column expressions