From 142df4834bc33dc7b84b626c6ee3508ab1abe015 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 8 Jul 2016 14:36:50 -0700 Subject: [SPARK-16429][SQL] Include `StringType` columns in `describe()` ## What changes were proposed in this pull request? Currently, Spark's `describe` supports `StringType` columns when they are passed explicitly. However, `describe()` called without arguments returns a dataset covering only the numeric columns. This PR includes `StringType` columns in `describe()`, i.e. `describe` without arguments, as well. **Background** ```scala scala> spark.read.json("examples/src/main/resources/people.json").describe("age", "name").show() +-------+------------------+-------+ |summary| age| name| +-------+------------------+-------+ | count| 2| 3| | mean| 24.5| null| | stddev|7.7781745930520225| null| | min| 19| Andy| | max| 30|Michael| +-------+------------------+-------+ ``` **Before** ```scala scala> spark.read.json("examples/src/main/resources/people.json").describe().show() +-------+------------------+ |summary| age| +-------+------------------+ | count| 2| | mean| 24.5| | stddev|7.7781745930520225| | min| 19| | max| 30| +-------+------------------+ ``` **After** ```scala scala> spark.read.json("examples/src/main/resources/people.json").describe().show() +-------+------------------+-------+ |summary| age| name| +-------+------------------+-------+ | count| 2| 3| | mean| 24.5| null| | stddev|7.7781745930520225| null| | min| 19| Andy| | max| 30|Michael| +-------+------------------+-------+ ``` ## How was this patch tested? Pass the Jenkins tests with an updated test case. Author: Dongjoon Hyun Closes #14095 from dongjoon-hyun/SPARK-16429. 
--- python/pyspark/sql/dataframe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'python') diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index dd670a9b3d..ab41e88620 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -751,15 +751,15 @@ class DataFrame(object): @since("1.3.1") def describe(self, *cols): - """Computes statistics for numeric columns. + """Computes statistics for numeric and string columns. This include count, mean, stddev, min, and max. If no columns are - given, this function computes statistics for all numerical columns. + given, this function computes statistics for all numerical or string columns. .. note:: This function is meant for exploratory data analysis, as we make no \ guarantee about the backward compatibility of the schema of the resulting DataFrame. - >>> df.describe().show() + >>> df.describe(['age']).show() +-------+------------------+ |summary| age| +-------+------------------+ @@ -769,7 +769,7 @@ class DataFrame(object): | min| 2| | max| 5| +-------+------------------+ - >>> df.describe(['age', 'name']).show() + >>> df.describe().show() +-------+------------------+-----+ |summary| age| name| +-------+------------------+-----+ -- cgit v1.2.3