From e7082caeb4a53c1ee172d136894eece1ac880f65 Mon Sep 17 00:00:00 2001
From: Cheng Lian
Date: Thu, 26 May 2016 16:23:48 -0700
Subject: [SPARK-15550][SQL] Dataset.show() should show contents of nested products as rows

## What changes were proposed in this pull request?

This PR addresses two related issues:

1. `Dataset.showString()` should show case classes/Java beans at all levels as rows, while the master code only handles top-level ones.

2. `Dataset.showString()` should show the full contents produced by the underlying query plan.

   A Dataset is only a view of the underlying query plan. Columns not referred to by the encoder are still reachable using methods like `Dataset.col`, so it probably makes more sense to show the full contents of the query plan.

## How was this patch tested?

Two new test cases are added in `DatasetSuite` to check `.showString()` output.

Author: Cheng Lian

Closes #13331 from liancheng/spark-15550-ds-show.
---
 .../main/scala/org/apache/spark/sql/Dataset.scala | 10 +---
 .../scala/org/apache/spark/sql/DatasetSuite.scala | 68 ++++++++++++++++------
 2 files changed, 52 insertions(+), 26 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 961ae32b0b..85f0cf8a60 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -237,19 +237,13 @@ class Dataset[T] private[sql](
    */
   private[sql] def showString(_numRows: Int, truncate: Boolean = true): String = {
     val numRows = _numRows.max(0)
-    val takeResult = take(numRows + 1)
+    val takeResult = toDF().take(numRows + 1)
     val hasMoreData = takeResult.length > numRows
     val data = takeResult.take(numRows)
 
     // For array values, replace Seq and Array with square brackets
     // For cells that are beyond 20 characters, replace it with the first 17 and "..."
-    val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map {
-      case r: Row => r
-      case tuple: Product => Row.fromTuple(tuple)
-      case definedByCtor: DefinedByConstructorParams =>
-        Row.fromSeq(ScalaReflection.getConstructorParameterValues(definedByCtor))
-      case o => Row(o)
-    }.map { row =>
+    val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row =>
       row.toSeq.map { cell =>
         val str = cell match {
           case null => "null"

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 05de79eb2f..32320a6435 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -22,9 +22,8 @@ import java.sql.{Date, Timestamp}
 
 import scala.language.postfixOps
 
-import org.scalatest.words.MatcherWords.be
-
 import org.apache.spark.sql.catalyst.encoders.{OuterScopes, RowEncoder}
+import org.apache.spark.sql.catalyst.util.sideBySide
 import org.apache.spark.sql.execution.streaming.MemoryStream
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.test.SharedSQLContext
@@ -217,7 +216,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS()
     checkDataset(
       ds.filter(_._1 == "b").select(expr("_1").as[String]),
-      ("b"))
+      "b")
   }
 
   test("foreach") {
@@ -436,20 +435,6 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     assert(ds.toString == "[_1: int, _2: int]")
   }
 
-  test("showString: Kryo encoder") {
-    implicit val kryoEncoder = Encoders.kryo[KryoData]
-    val ds = Seq(KryoData(1), KryoData(2)).toDS()
-
-    val expectedAnswer = """+-----------+
-                           ||      value|
-                           |+-----------+
-                           ||KryoData(1)|
-                           ||KryoData(2)|
-                           |+-----------+
-                           |""".stripMargin
-    assert(ds.showString(10) === expectedAnswer)
-  }
-
   test("Kryo encoder") {
     implicit val kryoEncoder = Encoders.kryo[KryoData]
     val ds = Seq(KryoData(1), KryoData(2)).toDS()
@@ -677,7 +662,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
   }
 
   test("dataset.rdd with generic case class") {
-    val ds = Seq(Generic(1, 1.0), Generic(2, 2.0)).toDS
+    val ds = Seq(Generic(1, 1.0), Generic(2, 2.0)).toDS()
     val ds2 = ds.map(g => Generic(g.id, g.value))
     assert(ds.rdd.map(r => r.id).count === 2)
     assert(ds2.rdd.map(r => r.id).count === 2)
@@ -731,6 +716,53 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     val df = Seq(1 -> 2).toDF("a", "b")
     checkAnswer(df.map(row => row)(RowEncoder(df.schema)).select("b", "a"), Row(2, 1))
   }
+
+  private def checkShowString[T](ds: Dataset[T], expected: String): Unit = {
+    val numRows = expected.split("\n").length - 4
+    val actual = ds.showString(numRows, truncate = true)
+
+    if (expected != actual) {
+      fail(
+        "Dataset.showString() gives wrong result:\n\n" + sideBySide(
+          "== Expected ==\n" + expected,
+          "== Actual ==\n" + actual
+        ).mkString("\n")
+      )
+    }
+  }
+
+  test("SPARK-15550 Dataset.show() should show contents of the underlying logical plan") {
+    val df = Seq((1, "foo", "extra"), (2, "bar", "extra")).toDF("b", "a", "c")
+    val ds = df.as[ClassData]
+    val expected =
+      """+---+---+-----+
+        ||  b|  a|    c|
+        |+---+---+-----+
+        ||  1|foo|extra|
+        ||  2|bar|extra|
+        |+---+---+-----+
+        |""".stripMargin
+
+    checkShowString(ds, expected)
+  }
+
+  test("SPARK-15550 Dataset.show() should show inner nested products as rows") {
+    val ds = Seq(
+      NestedStruct(ClassData("foo", 1)),
+      NestedStruct(ClassData("bar", 2))
+    ).toDS()
+
+    val expected =
+      """+-------+
+        ||      f|
+        |+-------+
+        ||[foo,1]|
+        ||[bar,2]|
+        |+-------+
+        |""".stripMargin
+
+    checkShowString(ds, expected)
+  }
 }
 
 case class Generic[T](id: T, value: Double)
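As a quick illustration of the two behaviors described above, here is a minimal, self-contained sketch (not part of the patch). It assumes Spark 2.0-era APIs and a local master; the object name and session setup are illustrative, while the case classes mirror the `ClassData`/`NestedStruct` fixtures used by `DatasetSuite`.

```scala
// Minimal sketch (not part of the patch), assuming Spark 2.0-era APIs and a
// local master; object name and session setup are illustrative only. The
// case classes mirror the ClassData/NestedStruct fixtures from DatasetSuite.
import org.apache.spark.sql.SparkSession

case class ClassData(a: String, b: Int)
case class NestedStruct(f: ClassData)

object Spark15550Example {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("spark-15550-example")
      .getOrCreate()
    import spark.implicits._

    // Issue 2: the Dataset is a view of the underlying plan, so show() now
    // prints column "c" even though the ClassData encoder never reads it.
    val ds = Seq((1, "foo", "extra"), (2, "bar", "extra"))
      .toDF("b", "a", "c")
      .as[ClassData]
    ds.show()
    // +---+---+-----+
    // |  b|  a|    c|
    // +---+---+-----+
    // |  1|foo|extra|
    // |  2|bar|extra|
    // +---+---+-----+

    // Issue 1: nested products are rendered as rows at every level.
    val nested = Seq(
      NestedStruct(ClassData("foo", 1)),
      NestedStruct(ClassData("bar", 2))
    ).toDS()
    nested.show()
    // +-------+
    // |      f|
    // +-------+
    // |[foo,1]|
    // |[bar,2]|
    // +-------+

    spark.stop()
  }
}
```

Both outputs follow from the one-line change in `showString()`: routing through `toDF().take(...)` collects generic `Row`s from the full plan output, so every column is displayed and nested products fall through to the generic struct formatter.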