author    Dongjoon Hyun <dongjoon@apache.org>    2016-07-08 14:36:50 -0700
committer Reynold Xin <rxin@databricks.com>      2016-07-08 14:36:50 -0700
commit    142df4834bc33dc7b84b626c6ee3508ab1abe015 (patch)
tree      04eab461749ee26103eec7869e4f91eefd4d1b44 /sql
parent    67e085ef6dd62774095f3187844c091db1a6a72c (diff)
[SPARK-16429][SQL] Include `StringType` columns in `describe()`
## What changes were proposed in this pull request?

Currently, Spark's `describe` supports `StringType` columns when they are named explicitly. However, the no-argument `describe()` returns a Dataset covering only the numeric columns. This PR includes `StringType` columns in the no-argument `describe()` as well.

**Background**

```scala
scala> spark.read.json("examples/src/main/resources/people.json").describe("age", "name").show()
+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+
```

**Before**

```scala
scala> spark.read.json("examples/src/main/resources/people.json").describe().show()
+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|                 2|
|   mean|              24.5|
| stddev|7.7781745930520225|
|    min|                19|
|    max|                30|
+-------+------------------+
```

**After**

```scala
scala> spark.read.json("examples/src/main/resources/people.json").describe().show()
+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+
```

## How was this patch tested?

Pass the Jenkins with an updated test case.

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #14095 from dongjoon-hyun/SPARK-16429.
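The core of the change is a new private helper, `aggregatableColumns`, which keeps only the schema fields whose type is numeric or string (see the diff below). Here is a minimal standalone sketch of that filter; the `aggregatableFieldNames` name and the plain `StructType` input are assumptions for illustration, and the real helper additionally resolves each column through the session's analyzer:

```scala
import org.apache.spark.sql.types._

// Hypothetical helper mirroring the patch's filter: keep only fields
// whose data type is numeric or string, and return their names.
def aggregatableFieldNames(schema: StructType): Seq[String] =
  schema.fields
    .filter(f => f.dataType.isInstanceOf[NumericType] || f.dataType == StringType)
    .map(_.name)

// Example: "age" (numeric) and "name" (string) survive; "tags" does not.
val schema = StructType(Seq(
  StructField("age", LongType),
  StructField("name", StringType),
  StructField("tags", ArrayType(StringType))))
assert(aggregatableFieldNames(schema) == Seq("age", "name"))
```

With this filter in place, the no-argument `describe()` path simply swaps `numericColumns` for the broader column set, as the one-line change in `Dataset.scala` shows.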
Diffstat (limited to 'sql')
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala        | 16
-rw-r--r-- sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala | 36
2 files changed, 31 insertions(+), 21 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index ededf7f4fe..ed4ccdb4c8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -228,6 +228,15 @@ class Dataset[T] private[sql](
}
}
+ private def aggregatableColumns: Seq[Expression] = {
+ schema.fields
+ .filter(f => f.dataType.isInstanceOf[NumericType] || f.dataType.isInstanceOf[StringType])
+ .map { n =>
+ queryExecution.analyzed.resolveQuoted(n.name, sparkSession.sessionState.analyzer.resolver)
+ .get
+ }
+ }
+
/**
* Compose the string representing rows for output
*
@@ -1886,8 +1895,9 @@ class Dataset[T] private[sql](
}
/**
- * Computes statistics for numeric columns, including count, mean, stddev, min, and max.
- * If no columns are given, this function computes statistics for all numerical columns.
+ * Computes statistics for numeric and string columns, including count, mean, stddev, min, and
+ * max. If no columns are given, this function computes statistics for all numerical or string
+ * columns.
*
* This function is meant for exploratory data analysis, as we make no guarantee about the
* backward compatibility of the schema of the resulting Dataset. If you want to
@@ -1920,7 +1930,7 @@ class Dataset[T] private[sql](
"max" -> ((child: Expression) => Max(child).toAggregateExpression()))
val outputCols =
- (if (cols.isEmpty) numericColumns.map(usePrettyExpression(_).sql) else cols).toList
+ (if (cols.isEmpty) aggregatableColumns.map(usePrettyExpression(_).sql) else cols).toList
val ret: Seq[Row] = if (outputCols.nonEmpty) {
val aggExprs = statistics.flatMap { case (_, colToAgg) =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 9d53be8e2b..905da554f1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -651,44 +651,44 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
("Amy", 24, 180)).toDF("name", "age", "height")
val describeResult = Seq(
- Row("count", "4", "4"),
- Row("mean", "33.0", "178.0"),
- Row("stddev", "19.148542155126762", "11.547005383792516"),
- Row("min", "16", "164"),
- Row("max", "60", "192"))
+ Row("count", "4", "4", "4"),
+ Row("mean", null, "33.0", "178.0"),
+ Row("stddev", null, "19.148542155126762", "11.547005383792516"),
+ Row("min", "Alice", "16", "164"),
+ Row("max", "David", "60", "192"))
val emptyDescribeResult = Seq(
- Row("count", "0", "0"),
- Row("mean", null, null),
- Row("stddev", null, null),
- Row("min", null, null),
- Row("max", null, null))
+ Row("count", "0", "0", "0"),
+ Row("mean", null, null, null),
+ Row("stddev", null, null, null),
+ Row("min", null, null, null),
+ Row("max", null, null, null))
def getSchemaAsSeq(df: DataFrame): Seq[String] = df.schema.map(_.name)
- val describeTwoCols = describeTestData.describe("age", "height")
- assert(getSchemaAsSeq(describeTwoCols) === Seq("summary", "age", "height"))
+ val describeTwoCols = describeTestData.describe("name", "age", "height")
+ assert(getSchemaAsSeq(describeTwoCols) === Seq("summary", "name", "age", "height"))
checkAnswer(describeTwoCols, describeResult)
// All aggregate value should have been cast to string
describeTwoCols.collect().foreach { row =>
- assert(row.get(1).isInstanceOf[String], "expected string but found " + row.get(1).getClass)
assert(row.get(2).isInstanceOf[String], "expected string but found " + row.get(2).getClass)
+ assert(row.get(3).isInstanceOf[String], "expected string but found " + row.get(3).getClass)
}
val describeAllCols = describeTestData.describe()
- assert(getSchemaAsSeq(describeAllCols) === Seq("summary", "age", "height"))
+ assert(getSchemaAsSeq(describeAllCols) === Seq("summary", "name", "age", "height"))
checkAnswer(describeAllCols, describeResult)
val describeOneCol = describeTestData.describe("age")
assert(getSchemaAsSeq(describeOneCol) === Seq("summary", "age"))
- checkAnswer(describeOneCol, describeResult.map { case Row(s, d, _) => Row(s, d)} )
+ checkAnswer(describeOneCol, describeResult.map { case Row(s, _, d, _) => Row(s, d)} )
val describeNoCol = describeTestData.select("name").describe()
- assert(getSchemaAsSeq(describeNoCol) === Seq("summary"))
- checkAnswer(describeNoCol, describeResult.map { case Row(s, _, _) => Row(s)} )
+ assert(getSchemaAsSeq(describeNoCol) === Seq("summary", "name"))
+ checkAnswer(describeNoCol, describeResult.map { case Row(s, n, _, _) => Row(s, n)} )
val emptyDescription = describeTestData.limit(0).describe()
- assert(getSchemaAsSeq(emptyDescription) === Seq("summary", "age", "height"))
+ assert(getSchemaAsSeq(emptyDescription) === Seq("summary", "name", "age", "height"))
checkAnswer(emptyDescription, emptyDescribeResult)
}
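As the updated suite shows, a string-only projection now produces a `name` column rather than a `summary`-only frame. A quick `spark-shell` sanity check one could run against the patched build (the two-row data set here is illustrative, not the suite's fixture):

```scala
scala> Seq(("Alice", 16), ("David", 60)).toDF("name", "age").select("name").describe().show()
+-------+-----+
|summary| name|
+-------+-----+
|  count|    2|
|   mean| null|
| stddev| null|
|    min|Alice|
|    max|David|
+-------+-----+
```

Note that `mean` and `stddev` are `null` for string columns, matching the behavior already established for the explicit `describe("age", "name")` call.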