From f32e69ecc333867fc966f65cd0aeaeddd43e0945 Mon Sep 17 00:00:00 2001
From: 云峤
Date: Mon, 4 May 2015 12:08:38 -0700
Subject: [SPARK-7319][SQL] Improve the output from DataFrame.show()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Author: 云峤

Closes #5865 from kaka1992/df.show and squashes the following commits:

c79204b [云峤] Update
a1338f6 [云峤] Update python dataFrame show test and add empty df unit test.
734369c [云峤] Update python dataFrame show test and add empty df unit test.
84aec3e [云峤] Update python dataFrame show test and add empty df unit test.
159b3d5 [云峤] update
03ef434 [云峤] update
7394fd5 [云峤] update test show
ced487a [云峤] update pep8
b6e690b [云峤] Merge remote-tracking branch 'upstream/master' into df.show
30ac311 [云峤] [SPARK-7294] ADD BETWEEN
7d62368 [云峤] [SPARK-7294] ADD BETWEEN
baf839b [云峤] [SPARK-7294] ADD BETWEEN
d11d5b9 [云峤] [SPARK-7294] ADD BETWEEN
---
 R/pkg/R/DataFrame.R                            |   2 +-
 R/pkg/inst/tests/test_sparkSQL.R               |   2 +-
 python/pyspark/sql/dataframe.py                | 105 ++++++++++++++-------
 .../scala/org/apache/spark/sql/DataFrame.scala |  28 ++++--
 .../org/apache/spark/sql/DataFrameSuite.scala  |  19 ++++
 5 files changed, 112 insertions(+), 44 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index b59b700af5..841e77e55e 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -167,7 +167,7 @@ setMethod("isLocal",
 setMethod("showDF",
           signature(x = "DataFrame"),
           function(x, numRows = 20) {
-            cat(callJMethod(x@sdf, "showString", numToInt(numRows)), "\n")
+            callJMethod(x@sdf, "showString", numToInt(numRows))
           })
 
 #' show
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index af7a6c5820..f82e56fdd8 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -641,7 +641,7 @@ test_that("toJSON() returns an RDD of the correct values", {
 
 test_that("showDF()", {
   df <- jsonFile(sqlCtx, jsonPath)
-  expect_output(showDF(df), "age  name   \nnull Michael\n30   Andy   \n19   Justin ")
+  expect_output(showDF(df), "+----+-------+\n| age|   name|\n+----+-------+\n|null|Michael|\n|  30|   Andy|\n|  19| Justin|\n+----+-------+\n")
 })
 
 test_that("isLocal()", {
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index aac5b8c4c5..22762c5bbb 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -275,9 +275,12 @@ class DataFrame(object):
         >>> df
         DataFrame[age: int, name: string]
         >>> df.show()
-        age name
-        2   Alice
-        5   Bob
+        +---+-----+
+        |age| name|
+        +---+-----+
+        |  2|Alice|
+        |  5|  Bob|
+        +---+-----+
         """
         print(self._jdf.showString(n))
 
@@ -591,12 +594,15 @@ class DataFrame(object):
         given, this function computes statistics for all numerical columns.
 
         >>> df.describe().show()
-        summary age
-        count   2
-        mean    3.5
-        stddev  1.5
-        min     2
-        max     5
+        +-------+---+
+        |summary|age|
+        +-------+---+
+        |  count|  2|
+        |   mean|3.5|
+        | stddev|1.5|
+        |    min|  2|
+        |    max|  5|
+        +-------+---+
         """
         jdf = self._jdf.describe(self._jseq(cols))
         return DataFrame(jdf, self.sql_ctx)
@@ -801,12 +807,18 @@ class DataFrame(object):
         :param subset: optional list of column names to consider.
 
         >>> df4.dropna().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+
 
         >>> df4.na.drop().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+
         """
         if how is not None and how not in ['any', 'all']:
             raise ValueError("how ('" + how + "') should be 'any' or 'all'")
@@ -837,25 +849,34 @@ class DataFrame(object):
         then the non-string column is simply ignored.
 
         >>> df4.fillna(50).show()
-        age height name
-        10  80     Alice
-        5   50     Bob
-        50  50     Tom
-        50  50     null
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        |  5|    50|  Bob|
+        | 50|    50|  Tom|
+        | 50|    50| null|
+        +---+------+-----+
 
         >>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+
 
         >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+
         """
         if not isinstance(value, (float, int, long, basestring, dict)):
             raise ValueError("value should be a float, int, long, string, or dict")
@@ -1241,11 +1262,17 @@ class Column(object):
 
         >>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
        >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         >>> df.select(df.l[0], df.d["key"]).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         """
         return self[key]
 
@@ -1255,11 +1282,17 @@ class Column(object):
         >>> from pyspark.sql import Row
         >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
         >>> df.select(df.r.getField("b")).show()
-        r.b
-        b
+        +---+
+        |r.b|
+        +---+
+        |  b|
+        +---+
         >>> df.select(df.r.a).show()
-        r.a
-        1
+        +---+
+        |r.a|
+        +---+
+        |  1|
+        +---+
         """
         return Column(self._jc.getField(name))
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index c421006c8f..cf344710ff 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql
 
 import java.io.CharArrayWriter
 import java.sql.DriverManager
+
 import scala.collection.JavaConversions._
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
@@ -28,6 +29,7 @@ import scala.util.control.NonFatal
 
 import com.fasterxml.jackson.core.JsonFactory
 
+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.python.SerDeUtil
@@ -175,6 +177,7 @@ class DataFrame private[sql](
    * @param numRows Number of rows to show
    */
   private[sql] def showString(numRows: Int): String = {
+    val sb = new StringBuilder
     val data = take(numRows)
     val numCols = schema.fieldNames.length
 
@@ -194,12 +197,25 @@ class DataFrame private[sql](
       }
     }
 
-    // Pad the cells
-    rows.map { row =>
-      row.zipWithIndex.map { case (cell, i) =>
-        String.format(s"%-${colWidths(i)}s", cell)
-      }.mkString(" ")
-    }.mkString("\n")
+    // Create SeparateLine
+    val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+
+    // column names
+    rows.head.zipWithIndex.map { case (cell, i) =>
+      StringUtils.leftPad(cell.toString, colWidths(i))
+    }.addString(sb, "|", "|", "|\n")
+
+    sb.append(sep)
+
+    // data
+    rows.tail.map {
+      _.zipWithIndex.map { case (cell, i) =>
+        StringUtils.leftPad(cell.toString, colWidths(i))
+      }.addString(sb, "|", "|", "|\n")
+    }
+
+    sb.append(sep)
+    sb.toString()
   }
 
   override def toString: String = {
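The grid layout that the new showString builds can be illustrated with a small, self-contained Scala sketch. Everything below is illustrative only and not part of the patch: the names ShowStringSketch, render and fmt are invented for this note, and the sketch assumes plain, rectangular string cells, skipping the Row-to-string conversion, null handling and numRows truncation that the real method performs. The idea is the same, though: each column's width is the length of its widest cell (header included), a "+---+"-style separator line is derived from those widths, and every cell is left-padded (right-aligned) before the row is framed with "|".

object ShowStringSketch {
  // Render a header plus data rows in the framed style produced by the new showString.
  def render(header: Seq[String], data: Seq[Seq[String]]): String = {
    val rows = header +: data
    // Width of each column = widest cell in that column, header included.
    val colWidths = header.indices.map(i => rows.map(_(i).length).max)
    // Separator line such as "+---+-----+".
    val sep = colWidths.map("-" * _).mkString("+", "+", "+\n")
    // Left-pad (right-align) each cell to its column width and frame the row with "|".
    def fmt(row: Seq[String]): String =
      row.zip(colWidths).map { case (cell, w) => " " * (w - cell.length) + cell }
        .mkString("|", "|", "|\n")
    sep + fmt(header) + sep + data.map(fmt).mkString + sep
  }

  def main(args: Array[String]): Unit = {
    // Prints the same table that the updated df.show() docstring expects:
    // +---+-----+
    // |age| name|
    // +---+-----+
    // |  2|Alice|
    // |  5|  Bob|
    // +---+-----+
    print(render(Seq("age", "name"), Seq(Seq("2", "Alice"), Seq("5", "Bob"))))
  }
}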
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index e286fef23c..ff31e15e2d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -598,6 +598,25 @@ class DataFrameSuite extends QueryTest {
     testData.select($"*").show(1000)
   }
 
+  test("SPARK-7319 showString") {
+    val expectedAnswer = """+---+-----+
+                           ||key|value|
+                           |+---+-----+
+                           ||  1|    1|
+                           |+---+-----+
+                           |""".stripMargin
+    assert(testData.select($"*").showString(1) === expectedAnswer)
+  }
+
+  test("SPARK-7327 show with empty dataFrame") {
+    val expectedAnswer = """+---+-----+
+                           ||key|value|
+                           |+---+-----+
+                           |+---+-----+
+                           |""".stripMargin
+    assert(testData.select($"*").filter($"key" < 0).showString(1) === expectedAnswer)
+  }
+
   test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {
     val rowRDD = TestSQLContext.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
     val schema = StructType(Array(StructField("point", new ExamplePointUDT(), false)))
--
cgit v1.2.3
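The SPARK-7327 test above pins down the empty-result shape: when no rows survive the filter, the header is still framed by a separator line on each side and nothing appears in between. Under the same assumptions as the earlier note, and reusing the illustrative ShowStringSketch.render helper (again, not code from the patch), that shape can be reproduced like this:

object EmptyShowSketch {
  def main(args: Array[String]): Unit = {
    // Prints the string asserted by "SPARK-7327 show with empty dataFrame":
    // +---+-----+
    // |key|value|
    // +---+-----+
    // +---+-----+
    print(ShowStringSketch.render(Seq("key", "value"), Seq.empty))
  }
}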