author    Cheng Lian <lian@databricks.com>    2016-05-26 16:23:48 -0700
committer Cheng Lian <lian@databricks.com>    2016-05-26 16:23:48 -0700
commit    e7082caeb4a53c1ee172d136894eece1ac880f65 (patch)
tree      0e72422a3b1df5e4c26595984ee2af78df2204df /sql
parent    fe6de16f781ff659b34e0ddda427d371d3d94536 (diff)
[SPARK-15550][SQL] Dataset.show() should show contents of nested products as rows
## What changes were proposed in this pull request?

This PR addresses two related issues:

1. `Dataset.showString()` should show case classes/Java beans at all levels as rows, while the current master code only handles top-level ones.
2. `Dataset.showString()` should show the full contents produced by the underlying query plan.

A Dataset is only a view of the underlying query plan. Columns not referred to by the encoder are still reachable using methods like `Dataset.col`, so it probably makes more sense to show the full contents of the query plan.

## How was this patch tested?

Two new test cases are added in `DatasetSuite` to check the `.showString()` output.

Author: Cheng Lian <lian@databricks.com>

Closes #13331 from liancheng/spark-15550-ds-show.
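For illustration only (not part of this patch): a minimal, hypothetical sketch of the behavior described above. The object and case class names are made up here, and a local `SparkSession` is assumed.

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical top-level case classes used only for this sketch.
case class Inner(a: String, b: Int)
case class Outer(f: Inner)

object ShowNestedExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("show-nested-example")
      .getOrCreate()
    import spark.implicits._

    // 1. Nested case classes are rendered as row-like cells at every level,
    //    e.g.:
    //    +-------+
    //    |      f|
    //    +-------+
    //    |[foo,1]|
    //    |[bar,2]|
    //    +-------+
    Seq(Outer(Inner("foo", 1)), Outer(Inner("bar", 2))).toDS().show()

    // 2. show() displays the full contents of the underlying query plan:
    //    column `c` is not referred to by the Inner encoder, but it is still
    //    part of the plan and now appears in the output alongside `b` and `a`.
    val df = Seq((1, "foo", "extra"), (2, "bar", "extra")).toDF("b", "a", "c")
    df.as[Inner].show()

    spark.stop()
  }
}
```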
Diffstat (limited to 'sql')
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala      | 10
-rw-r--r-- sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala | 68
2 files changed, 52 insertions, 26 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 961ae32b0b..85f0cf8a60 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -237,19 +237,13 @@ class Dataset[T] private[sql](
*/
private[sql] def showString(_numRows: Int, truncate: Boolean = true): String = {
val numRows = _numRows.max(0)
- val takeResult = take(numRows + 1)
+ val takeResult = toDF().take(numRows + 1)
val hasMoreData = takeResult.length > numRows
val data = takeResult.take(numRows)
// For array values, replace Seq and Array with square brackets
// For cells that are beyond 20 characters, replace it with the first 17 and "..."
- val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map {
- case r: Row => r
- case tuple: Product => Row.fromTuple(tuple)
- case definedByCtor: DefinedByConstructorParams =>
- Row.fromSeq(ScalaReflection.getConstructorParameterValues(definedByCtor))
- case o => Row(o)
- }.map { row =>
+ val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row =>
row.toSeq.map { cell =>
val str = cell match {
case null => "null"
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 05de79eb2f..32320a6435 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -22,9 +22,8 @@ import java.sql.{Date, Timestamp}
import scala.language.postfixOps
-import org.scalatest.words.MatcherWords.be
-
import org.apache.spark.sql.catalyst.encoders.{OuterScopes, RowEncoder}
+import org.apache.spark.sql.catalyst.util.sideBySide
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions._
import org.apache.spark.sql.test.SharedSQLContext
@@ -217,7 +216,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS()
checkDataset(
ds.filter(_._1 == "b").select(expr("_1").as[String]),
- ("b"))
+ "b")
}
test("foreach") {
@@ -436,20 +435,6 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
assert(ds.toString == "[_1: int, _2: int]")
}
- test("showString: Kryo encoder") {
- implicit val kryoEncoder = Encoders.kryo[KryoData]
- val ds = Seq(KryoData(1), KryoData(2)).toDS()
-
- val expectedAnswer = """+-----------+
- || value|
- |+-----------+
- ||KryoData(1)|
- ||KryoData(2)|
- |+-----------+
- |""".stripMargin
- assert(ds.showString(10) === expectedAnswer)
- }
-
test("Kryo encoder") {
implicit val kryoEncoder = Encoders.kryo[KryoData]
val ds = Seq(KryoData(1), KryoData(2)).toDS()
@@ -677,7 +662,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
}
test("dataset.rdd with generic case class") {
- val ds = Seq(Generic(1, 1.0), Generic(2, 2.0)).toDS
+ val ds = Seq(Generic(1, 1.0), Generic(2, 2.0)).toDS()
val ds2 = ds.map(g => Generic(g.id, g.value))
assert(ds.rdd.map(r => r.id).count === 2)
assert(ds2.rdd.map(r => r.id).count === 2)
@@ -731,6 +716,53 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
val df = Seq(1 -> 2).toDF("a", "b")
checkAnswer(df.map(row => row)(RowEncoder(df.schema)).select("b", "a"), Row(2, 1))
}
+
+ private def checkShowString[T](ds: Dataset[T], expected: String): Unit = {
+ val numRows = expected.split("\n").length - 4
+ val actual = ds.showString(numRows, truncate = true)
+
+ if (expected != actual) {
+ fail(
+ "Dataset.showString() gives wrong result:\n\n" + sideBySide(
+ "== Expected ==\n" + expected,
+ "== Actual ==\n" + actual
+ ).mkString("\n")
+ )
+ }
+ }
+
+ test("SPARK-15550 Dataset.show() should show contents of the underlying logical plan") {
+ val df = Seq((1, "foo", "extra"), (2, "bar", "extra")).toDF("b", "a", "c")
+ val ds = df.as[ClassData]
+ val expected =
+ """+---+---+-----+
+ || b| a| c|
+ |+---+---+-----+
+ || 1|foo|extra|
+ || 2|bar|extra|
+ |+---+---+-----+
+ |""".stripMargin
+
+ checkShowString(ds, expected)
+ }
+
+ test("SPARK-15550 Dataset.show() should show inner nested products as rows") {
+ val ds = Seq(
+ NestedStruct(ClassData("foo", 1)),
+ NestedStruct(ClassData("bar", 2))
+ ).toDS()
+
+ val expected =
+ """+-------+
+ || f|
+ |+-------+
+ ||[foo,1]|
+ ||[bar,2]|
+ |+-------+
+ |""".stripMargin
+
+ checkShowString(ds, expected)
+ }
}
case class Generic[T](id: T, value: Double)