aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorPrashant Sharma <prashsh1@in.ibm.com>2016-06-28 17:11:06 +0530
committerPrashant Sharma <prashsh1@in.ibm.com>2016-06-28 17:11:06 +0530
commitf6b497fcdddc705a9e1022e20b0dbc15da1b5a5a (patch)
treeb28b241100d7ec0cbd79b0e7b79ac3f0a98dae04 /sql
parent4cbf611c1dc88111ff49d005e902ad5864799ede (diff)
downloadspark-f6b497fcdddc705a9e1022e20b0dbc15da1b5a5a.tar.gz
spark-f6b497fcdddc705a9e1022e20b0dbc15da1b5a5a.tar.bz2
spark-f6b497fcdddc705a9e1022e20b0dbc15da1b5a5a.zip
[SPARK-16128][SQL] Allow setting the number of characters to truncate to in the Dataset.show function.
## What changes were proposed in this pull request?

Allowing truncation to a specific number of characters is convenient at times, especially when working from the REPL. Sometimes those last few characters make all the difference, while showing everything brings in a whole lot of noise.

## How was this patch tested?

Existing tests, plus one new test in DataFrameSuite. For SparkR and PySpark, existing tests and manual testing.

Author: Prashant Sharma <prashsh1@in.ibm.com>
Author: Prashant Sharma <prashant@apache.org>

Closes #13839 from ScrapCodes/add_truncateTo_DF.show.
Diffstat (limited to 'sql')
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala47
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala27
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala2
3 files changed, 65 insertions, 11 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 85d060639c..9997162f7c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -232,16 +232,18 @@ class Dataset[T] private[sql](
* Compose the string representing rows for output
*
* @param _numRows Number of rows to show
- * @param truncate Whether truncate long strings and align cells right
+ * @param truncate If set to more than 0, truncates strings to `truncate` characters and
+ * all cells will be aligned right.
*/
- private[sql] def showString(_numRows: Int, truncate: Boolean = true): String = {
+ private[sql] def showString(_numRows: Int, truncate: Int = 20): String = {
val numRows = _numRows.max(0)
val takeResult = toDF().take(numRows + 1)
val hasMoreData = takeResult.length > numRows
val data = takeResult.take(numRows)
// For array values, replace Seq and Array with square brackets
- // For cells that are beyond 20 characters, replace it with the first 17 and "..."
+ // For cells longer than `truncate` characters, replace them with their first
+ // `truncate - 3` characters followed by "..." (no ellipsis when `truncate` < 4)
val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row =>
row.toSeq.map { cell =>
val str = cell match {
@@ -251,7 +253,13 @@ class Dataset[T] private[sql](
case seq: Seq[_] => seq.mkString("[", ", ", "]")
case _ => cell.toString
}
- if (truncate && str.length > 20) str.substring(0, 17) + "..." else str
+ if (truncate > 0 && str.length > truncate) {
+ // do not show ellipses when truncating to fewer than 4 characters.
+ if (truncate < 4) str.substring(0, truncate)
+ else str.substring(0, truncate - 3) + "..."
+ } else {
+ str
+ }
}: Seq[String]
}
@@ -273,7 +281,7 @@ class Dataset[T] private[sql](
// column names
rows.head.zipWithIndex.map { case (cell, i) =>
- if (truncate) {
+ if (truncate > 0) {
StringUtils.leftPad(cell, colWidths(i))
} else {
StringUtils.rightPad(cell, colWidths(i))
@@ -285,7 +293,7 @@ class Dataset[T] private[sql](
// data
rows.tail.map {
_.zipWithIndex.map { case (cell, i) =>
- if (truncate) {
+ if (truncate > 0) {
StringUtils.leftPad(cell.toString, colWidths(i))
} else {
StringUtils.rightPad(cell.toString, colWidths(i))
@@ -523,7 +531,32 @@ class Dataset[T] private[sql](
* @since 1.6.0
*/
// scalastyle:off println
- def show(numRows: Int, truncate: Boolean): Unit = println(showString(numRows, truncate))
+ def show(numRows: Int, truncate: Boolean): Unit = if (truncate) {
+ println(showString(numRows, truncate = 20))
+ } else {
+ println(showString(numRows, truncate = 0))
+ }
+ // scalastyle:on println
+
+ /**
+ * Displays the Dataset in a tabular form. For example:
+ * {{{
+ * year month AVG('Adj Close) MAX('Adj Close)
+ * 1980 12 0.503218 0.595103
+ * 1981 01 0.523289 0.570307
+ * 1982 02 0.436504 0.475256
+ * 1983 03 0.410516 0.442194
+ * 1984 04 0.450090 0.483521
+ * }}}
+ *
+ * @param numRows Number of rows to show
+ * @param truncate If set to more than 0, truncates strings to `truncate` characters and
+ * all cells will be aligned right.
+ * @group action
+ * @since 1.6.0
+ */
+ // scalastyle:off println
+ def show(numRows: Int, truncate: Int): Unit = println(showString(numRows, truncate))
// scalastyle:on println
/**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 1afee9f021..6a0a7df3f4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -723,7 +723,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
testData.select($"*").show(1000)
}
- test("showString: truncate = [true, false]") {
+ test("showString: truncate = [0, 20]") {
val longString = Array.fill(21)("1").mkString
val df = sparkContext.parallelize(Seq("1", longString)).toDF()
val expectedAnswerForFalse = """+---------------------+
@@ -733,7 +733,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
||111111111111111111111|
|+---------------------+
|""".stripMargin
- assert(df.showString(10, false) === expectedAnswerForFalse)
+ assert(df.showString(10, truncate = 0) === expectedAnswerForFalse)
val expectedAnswerForTrue = """+--------------------+
|| value|
|+--------------------+
@@ -741,7 +741,28 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
||11111111111111111...|
|+--------------------+
|""".stripMargin
- assert(df.showString(10, true) === expectedAnswerForTrue)
+ assert(df.showString(10, truncate = 20) === expectedAnswerForTrue)
+ }
+
+ test("showString: truncate = [3, 17]") {
+ val longString = Array.fill(21)("1").mkString
+ val df = sparkContext.parallelize(Seq("1", longString)).toDF()
+ val expectedAnswerForFalse = """+-----+
+ ||value|
+ |+-----+
+ || 1|
+ || 111|
+ |+-----+
+ |""".stripMargin
+ assert(df.showString(10, truncate = 3) === expectedAnswerForFalse)
+ val expectedAnswerForTrue = """+-----------------+
+ || value|
+ |+-----------------+
+ || 1|
+ ||11111111111111...|
+ |+-----------------+
+ |""".stripMargin
+ assert(df.showString(10, truncate = 17) === expectedAnswerForTrue)
}
test("showString(negative)") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 43cbc03b7a..0b6f40872f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -730,7 +730,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
private def checkShowString[T](ds: Dataset[T], expected: String): Unit = {
val numRows = expected.split("\n").length - 4
- val actual = ds.showString(numRows, truncate = true)
+ val actual = ds.showString(numRows, truncate = 20)
if (expected != actual) {
fail(