author    gatorsmile <gatorsmile@gmail.com>  2015-12-16 13:22:34 -0800
committer Michael Armbrust <michael@databricks.com>  2015-12-16 13:22:34 -0800
commit    edf65cd961b913ef54104770630a50fd4b120b4b (patch)
tree      cfa89be56bdac9dd5a66704de7c261f16c38d42f /sql
parent    a783a8ed49814a09fde653433a3d6de398ddf888 (diff)
[SPARK-12164][SQL] Decode the encoded values and then display
Based on the suggestions from marmbrus and cloud-fan in https://github.com/apache/spark/pull/10165, this PR prints the decoded values (user objects) in `Dataset.show`:

```scala
implicit val kryoEncoder = Encoders.kryo[KryoClassData]
val ds = Seq(KryoClassData("a", 1), KryoClassData("b", 2), KryoClassData("c", 3)).toDS()
ds.show(20, false)
```

The current output looks like:

```
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                             |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[1, 0, 111, 114, 103, 46, 97, 112, 97, 99, 104, 101, 46, 115, 112, 97, 114, 107, 46, 115, 113, 108, 46, 75, 114, 121, 111, 67, 108, 97, 115, 115, 68, 97, 116, -31, 1, 1, -126, 97, 2]|
|[1, 0, 111, 114, 103, 46, 97, 112, 97, 99, 104, 101, 46, 115, 112, 97, 114, 107, 46, 115, 113, 108, 46, 75, 114, 121, 111, 67, 108, 97, 115, 115, 68, 97, 116, -31, 1, 1, -126, 98, 4]|
|[1, 0, 111, 114, 103, 46, 97, 112, 97, 99, 104, 101, 46, 115, 112, 97, 114, 107, 46, 115, 113, 108, 46, 75, 114, 121, 111, 67, 108, 97, 115, 115, 68, 97, 116, -31, 1, 1, -126, 99, 6]|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
```

After the fix, the output looks like the table below, but only if the user overrides the `toString` method in the class `KryoClassData`:

```scala
override def toString: String = s"KryoClassData($a, $b)"
```

```
+-------------------+
|value              |
+-------------------+
|KryoClassData(a, 1)|
|KryoClassData(b, 2)|
|KryoClassData(c, 3)|
+-------------------+
```

If users do not override `toString`, the results look like:

```
+--------------------------------------+
|value                                 |
+--------------------------------------+
|org.apache.spark.sql.KryoClassData68ef|
|org.apache.spark.sql.KryoClassData6915|
|org.apache.spark.sql.KryoClassData693b|
+--------------------------------------+
```

Question: should we add another optional parameter to `show` that decides whether it displays the hex values or the decoded object values?

Author: gatorsmile <gatorsmile@gmail.com>

Closes #10215 from gatorsmile/showDecodedValue.
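For context, here is a minimal sketch of what the `KryoClassData` class used in the example might look like. The class itself is not part of this patch, so its shape (a plain non-case class with fields `a` and `b` plus a companion `apply`) is an assumption:

```scala
// Hypothetical reconstruction of the example class from the PR description;
// the real definition lives in the test sources, not in this diff.
class KryoClassData(val a: String, val b: Int) extends Serializable {
  // Without this override, Dataset.show would fall back to the default
  // Object.toString (class name plus hash code), as in the last table above.
  override def toString: String = s"KryoClassData($a, $b)"
}

object KryoClassData {
  // Companion apply so the example can write KryoClassData("a", 1) without `new`.
  def apply(a: String, b: Int): KryoClassData = new KryoClassData(a, b)
}
```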
Diffstat (limited to 'sql')
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala           | 50
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala             | 37
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/execution/Queryable.scala | 65
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala      | 15
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala        | 14
5 files changed, 133 insertions(+), 48 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 497bd48266..6250e95216 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -165,13 +165,11 @@ class DataFrame private[sql](
* @param _numRows Number of rows to show
* @param truncate Whether truncate long strings and align cells right
*/
- private[sql] def showString(_numRows: Int, truncate: Boolean = true): String = {
+ override private[sql] def showString(_numRows: Int, truncate: Boolean = true): String = {
val numRows = _numRows.max(0)
- val sb = new StringBuilder
val takeResult = take(numRows + 1)
val hasMoreData = takeResult.length > numRows
val data = takeResult.take(numRows)
- val numCols = schema.fieldNames.length
// For array values, replace Seq and Array with square brackets
// For cells that are beyond 20 characters, replace it with the first 17 and "..."
@@ -179,6 +177,7 @@ class DataFrame private[sql](
row.toSeq.map { cell =>
val str = cell match {
case null => "null"
+ case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]")
case array: Array[_] => array.mkString("[", ", ", "]")
case seq: Seq[_] => seq.mkString("[", ", ", "]")
case _ => cell.toString
@@ -187,50 +186,7 @@ class DataFrame private[sql](
}: Seq[String]
}
- // Initialise the width of each column to a minimum value of '3'
- val colWidths = Array.fill(numCols)(3)
-
- // Compute the width of each column
- for (row <- rows) {
- for ((cell, i) <- row.zipWithIndex) {
- colWidths(i) = math.max(colWidths(i), cell.length)
- }
- }
-
- // Create SeparateLine
- val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
-
- // column names
- rows.head.zipWithIndex.map { case (cell, i) =>
- if (truncate) {
- StringUtils.leftPad(cell, colWidths(i))
- } else {
- StringUtils.rightPad(cell, colWidths(i))
- }
- }.addString(sb, "|", "|", "|\n")
-
- sb.append(sep)
-
- // data
- rows.tail.map {
- _.zipWithIndex.map { case (cell, i) =>
- if (truncate) {
- StringUtils.leftPad(cell.toString, colWidths(i))
- } else {
- StringUtils.rightPad(cell.toString, colWidths(i))
- }
- }.addString(sb, "|", "|", "|\n")
- }
-
- sb.append(sep)
-
- // For Data that has more than "numRows" records
- if (hasMoreData) {
- val rowsString = if (numRows == 1) "row" else "rows"
- sb.append(s"only showing top $numRows $rowsString\n")
- }
-
- sb.toString()
+ formatString ( rows, numRows, hasMoreData, truncate )
}
/**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index dc69822e92..79b4244ac0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -225,7 +225,42 @@ class Dataset[T] private[sql](
*
* @since 1.6.0
*/
- def show(numRows: Int, truncate: Boolean): Unit = toDF().show(numRows, truncate)
+ // scalastyle:off println
+ def show(numRows: Int, truncate: Boolean): Unit = println(showString(numRows, truncate))
+ // scalastyle:on println
+
+ /**
+ * Compose the string representing rows for output
+ * @param _numRows Number of rows to show
+ * @param truncate Whether truncate long strings and align cells right
+ */
+ override private[sql] def showString(_numRows: Int, truncate: Boolean = true): String = {
+ val numRows = _numRows.max(0)
+ val takeResult = take(numRows + 1)
+ val hasMoreData = takeResult.length > numRows
+ val data = takeResult.take(numRows)
+
+ // For array values, replace Seq and Array with square brackets
+ // For cells that are beyond 20 characters, replace it with the first 17 and "..."
+ val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: (data.map {
+ case r: Row => r
+ case tuple: Product => Row.fromTuple(tuple)
+ case o => Row(o)
+ } map { row =>
+ row.toSeq.map { cell =>
+ val str = cell match {
+ case null => "null"
+ case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]")
+ case array: Array[_] => array.mkString("[", ", ", "]")
+ case seq: Seq[_] => seq.mkString("[", ", ", "]")
+ case _ => cell.toString
+ }
+ if (truncate && str.length > 20) str.substring(0, 17) + "..." else str
+ }: Seq[String]
+ })
+
+ formatString ( rows, numRows, hasMoreData, truncate )
+ }
/**
* Returns a new [[Dataset]] that has exactly `numPartitions` partitions.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Queryable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Queryable.scala
index f2f5997d1b..b397d42612 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Queryable.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Queryable.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution
import scala.util.control.NonFatal
+import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.StructType
@@ -42,4 +43,68 @@ private[sql] trait Queryable {
def explain(extended: Boolean): Unit
def explain(): Unit
+
+ private[sql] def showString(_numRows: Int, truncate: Boolean = true): String
+
+ /**
+ * Format the string representing rows for output
+ * @param rows The rows to show
+ * @param numRows Number of rows to show
+ * @param hasMoreData Whether some rows are not shown due to the limit
+ * @param truncate Whether truncate long strings and align cells right
+ *
+ */
+ private[sql] def formatString (
+ rows: Seq[Seq[String]],
+ numRows: Int,
+ hasMoreData : Boolean,
+ truncate: Boolean = true): String = {
+ val sb = new StringBuilder
+ val numCols = schema.fieldNames.length
+
+ // Initialise the width of each column to a minimum value of '3'
+ val colWidths = Array.fill(numCols)(3)
+
+ // Compute the width of each column
+ for (row <- rows) {
+ for ((cell, i) <- row.zipWithIndex) {
+ colWidths(i) = math.max(colWidths(i), cell.length)
+ }
+ }
+
+ // Create SeparateLine
+ val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+
+ // column names
+ rows.head.zipWithIndex.map { case (cell, i) =>
+ if (truncate) {
+ StringUtils.leftPad(cell, colWidths(i))
+ } else {
+ StringUtils.rightPad(cell, colWidths(i))
+ }
+ }.addString(sb, "|", "|", "|\n")
+
+ sb.append(sep)
+
+ // data
+ rows.tail.map {
+ _.zipWithIndex.map { case (cell, i) =>
+ if (truncate) {
+ StringUtils.leftPad(cell.toString, colWidths(i))
+ } else {
+ StringUtils.rightPad(cell.toString, colWidths(i))
+ }
+ }.addString(sb, "|", "|", "|\n")
+ }
+
+ sb.append(sep)
+
+ // For Data that has more than "numRows" records
+ if (hasMoreData) {
+ val rowsString = if (numRows == 1) "row" else "rows"
+ sb.append(s"only showing top $numRows $rowsString\n")
+ }
+
+ sb.toString()
+ }
}
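As an illustration of what the extracted `formatString` helper produces, the following standalone sketch (not part of the patch; `FormatDemo` and its rows are made-up example inputs) applies the same minimum-width, column-sizing, and commons-lang3 padding logic to a tiny row set:

```scala
import org.apache.commons.lang3.StringUtils

object FormatDemo extends App {
  // Example inputs: a header row followed by two data rows, truncate = true.
  val rows: Seq[Seq[String]] = Seq(Seq("key", "value"), Seq("a", "1"), Seq("b", "22"))
  val truncate = true

  // Each column starts at a minimum width of 3 and grows to fit its longest cell.
  val colWidths = Array.fill(rows.head.length)(3)
  for (row <- rows; (cell, i) <- row.zipWithIndex) {
    colWidths(i) = math.max(colWidths(i), cell.length)
  }

  val sep = colWidths.map("-" * _).mkString("+", "+", "+\n")
  val formatted = rows.map { row =>
    row.zipWithIndex.map { case (cell, i) =>
      // With truncate = true cells are right-aligned (left-padded), otherwise left-aligned.
      if (truncate) StringUtils.leftPad(cell, colWidths(i))
      else StringUtils.rightPad(cell, colWidths(i))
    }.mkString("|", "|", "|\n")
  }

  print(sep + formatted.head + sep + formatted.tail.mkString + sep)
  // +---+-----+
  // |key|value|
  // +---+-----+
  // |  a|    1|
  // |  b|   22|
  // +---+-----+
}
```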
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index c0bbf73ab1..0644bdaaa3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -585,6 +585,21 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
assert(df.showString(10) === expectedAnswer)
}
+ test("showString: binary") {
+ val df = Seq(
+ ("12".getBytes, "ABC.".getBytes),
+ ("34".getBytes, "12346".getBytes)
+ ).toDF()
+ val expectedAnswer = """+-------+----------------+
+ || _1| _2|
+ |+-------+----------------+
+ ||[31 32]| [41 42 43 2E]|
+ ||[33 34]|[31 32 33 34 36]|
+ |+-------+----------------+
+ |""".stripMargin
+ assert(df.showString(10) === expectedAnswer)
+ }
+
test("showString: minimum column width") {
val df = Seq(
(1, 1),
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 8f8db31826..f1b6b98dc1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -426,6 +426,20 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
assert(ds.toString == "[_1: int, _2: int]")
}
+ test("showString: Kryo encoder") {
+ implicit val kryoEncoder = Encoders.kryo[KryoData]
+ val ds = Seq(KryoData(1), KryoData(2)).toDS()
+
+ val expectedAnswer = """+-----------+
+ || value|
+ |+-----------+
+ ||KryoData(1)|
+ ||KryoData(2)|
+ |+-----------+
+ |""".stripMargin
+ assert(ds.showString(10) === expectedAnswer)
+ }
+
test("Kryo encoder") {
implicit val kryoEncoder = Encoders.kryo[KryoData]
val ds = Seq(KryoData(1), KryoData(2)).toDS()
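The `Array[Byte]` case added to the cell formatter explains the expected strings in the `showString: binary` test above: each byte is printed as two uppercase hex digits. A quick REPL-style check (example values only, not part of the patch):

```scala
// Same expression the patch uses for Array[Byte] cells in showString.
def hex(binary: Array[Byte]): String =
  binary.map("%02X".format(_)).mkString("[", " ", "]")

// "12" is the ASCII characters '1' (0x31) and '2' (0x32), hence "[31 32]".
println(hex("12".getBytes))   // prints: [31 32]
println(hex("ABC.".getBytes)) // prints: [41 42 43 2E]
```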