author     云峤 <chensong.cs@alibaba-inc.com>          2015-05-04 12:08:38 -0700
committer  Reynold Xin <rxin@databricks.com>           2015-05-04 12:08:38 -0700
commit     f32e69ecc333867fc966f65cd0aeaeddd43e0945 (patch)
tree       2dbcc9704acb83d0a7b40f3cee8cc084e29f6dd9
parent     e0833c5958bbd73ff27cfe6865648d7b6e5a99bc (diff)
[SPARK-7319][SQL] Improve the output from DataFrame.show()
Author: 云峤 <chensong.cs@alibaba-inc.com>

Closes #5865 from kaka1992/df.show and squashes the following commits:

c79204b [云峤] Update
a1338f6 [云峤] Update python dataFrame show test and add empty df unit test.
734369c [云峤] Update python dataFrame show test and add empty df unit test.
84aec3e [云峤] Update python dataFrame show test and add empty df unit test.
159b3d5 [云峤] update
03ef434 [云峤] update
7394fd5 [云峤] update test show
ced487a [云峤] update pep8
b6e690b [云峤] Merge remote-tracking branch 'upstream/master' into df.show
30ac311 [云峤] [SPARK-7294] ADD BETWEEN
7d62368 [云峤] [SPARK-7294] ADD BETWEEN
baf839b [云峤] [SPARK-7294] ADD BETWEEN
d11d5b9 [云峤] [SPARK-7294] ADD BETWEEN
-rw-r--r--  R/pkg/R/DataFrame.R                                                   2
-rw-r--r--  R/pkg/inst/tests/test_sparkSQL.R                                      2
-rw-r--r--  python/pyspark/sql/dataframe.py                                     105
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala        28
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala   19
5 files changed, 112 insertions, 44 deletions
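
Before the per-file diffs, a hedged illustration of the behavior this patch targets: DataFrame.show() now frames its output with "+---+" separator lines and right-padded cells instead of the old space-aligned columns. The snippet below is a minimal sketch and not part of the commit; the object name, app name, and local-mode setup are assumptions, and it targets the Spark 1.x SQLContext API used on this branch.

// Minimal sketch (assumed names; Spark 1.x-era API) showing the reworked
// DataFrame.show() grid output introduced by this patch.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object ShowDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("show-demo").setMaster("local[*]"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Two-row sample frame, mirroring the Python doctest data in this diff
    val df = sc.parallelize(Seq((2, "Alice"), (5, "Bob"))).toDF("age", "name")
    df.show()
    // With this patch the call prints:
    // +---+-----+
    // |age| name|
    // +---+-----+
    // |  2|Alice|
    // |  5|  Bob|
    // +---+-----+
    sc.stop()
  }
}
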
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index b59b700af5..841e77e55e 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -167,7 +167,7 @@ setMethod("isLocal",
setMethod("showDF",
signature(x = "DataFrame"),
function(x, numRows = 20) {
- cat(callJMethod(x@sdf, "showString", numToInt(numRows)), "\n")
+ callJMethod(x@sdf, "showString", numToInt(numRows))
})
#' show
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index af7a6c5820..f82e56fdd8 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -641,7 +641,7 @@ test_that("toJSON() returns an RDD of the correct values", {
test_that("showDF()", {
df <- jsonFile(sqlCtx, jsonPath)
- expect_output(showDF(df), "age name \nnull Michael\n30 Andy \n19 Justin ")
+ expect_output(showDF(df), "+----+-------+\n| age| name|\n+----+-------+\n|null|Michael|\n| 30| Andy|\n| 19| Justin|\n+----+-------+\n")
})
test_that("isLocal()", {
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index aac5b8c4c5..22762c5bbb 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -275,9 +275,12 @@ class DataFrame(object):
>>> df
DataFrame[age: int, name: string]
>>> df.show()
- age name
- 2 Alice
- 5 Bob
+ +---+-----+
+ |age| name|
+ +---+-----+
+ | 2|Alice|
+ | 5| Bob|
+ +---+-----+
"""
print(self._jdf.showString(n))
@@ -591,12 +594,15 @@ class DataFrame(object):
given, this function computes statistics for all numerical columns.
>>> df.describe().show()
- summary age
- count 2
- mean 3.5
- stddev 1.5
- min 2
- max 5
+ +-------+---+
+ |summary|age|
+ +-------+---+
+ | count| 2|
+ | mean|3.5|
+ | stddev|1.5|
+ | min| 2|
+ | max| 5|
+ +-------+---+
"""
jdf = self._jdf.describe(self._jseq(cols))
return DataFrame(jdf, self.sql_ctx)
@@ -801,12 +807,18 @@ class DataFrame(object):
:param subset: optional list of column names to consider.
>>> df4.dropna().show()
- age height name
- 10 80 Alice
+ +---+------+-----+
+ |age|height| name|
+ +---+------+-----+
+ | 10| 80|Alice|
+ +---+------+-----+
>>> df4.na.drop().show()
- age height name
- 10 80 Alice
+ +---+------+-----+
+ |age|height| name|
+ +---+------+-----+
+ | 10| 80|Alice|
+ +---+------+-----+
"""
if how is not None and how not in ['any', 'all']:
raise ValueError("how ('" + how + "') should be 'any' or 'all'")
@@ -837,25 +849,34 @@ class DataFrame(object):
then the non-string column is simply ignored.
>>> df4.fillna(50).show()
- age height name
- 10 80 Alice
- 5 50 Bob
- 50 50 Tom
- 50 50 null
+ +---+------+-----+
+ |age|height| name|
+ +---+------+-----+
+ | 10| 80|Alice|
+ | 5| 50| Bob|
+ | 50| 50| Tom|
+ | 50| 50| null|
+ +---+------+-----+
>>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
- age height name
- 10 80 Alice
- 5 null Bob
- 50 null Tom
- 50 null unknown
+ +---+------+-------+
+ |age|height| name|
+ +---+------+-------+
+ | 10| 80| Alice|
+ | 5| null| Bob|
+ | 50| null| Tom|
+ | 50| null|unknown|
+ +---+------+-------+
>>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
- age height name
- 10 80 Alice
- 5 null Bob
- 50 null Tom
- 50 null unknown
+ +---+------+-------+
+ |age|height| name|
+ +---+------+-------+
+ | 10| 80| Alice|
+ | 5| null| Bob|
+ | 50| null| Tom|
+ | 50| null|unknown|
+ +---+------+-------+
"""
if not isinstance(value, (float, int, long, basestring, dict)):
raise ValueError("value should be a float, int, long, string, or dict")
@@ -1241,11 +1262,17 @@ class Column(object):
>>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
>>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
- l[0] d[key]
- 1 value
+ +----+------+
+ |l[0]|d[key]|
+ +----+------+
+ | 1| value|
+ +----+------+
>>> df.select(df.l[0], df.d["key"]).show()
- l[0] d[key]
- 1 value
+ +----+------+
+ |l[0]|d[key]|
+ +----+------+
+ | 1| value|
+ +----+------+
"""
return self[key]
@@ -1255,11 +1282,17 @@ class Column(object):
>>> from pyspark.sql import Row
>>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
>>> df.select(df.r.getField("b")).show()
- r.b
- b
+ +---+
+ |r.b|
+ +---+
+ | b|
+ +---+
>>> df.select(df.r.a).show()
- r.a
- 1
+ +---+
+ |r.a|
+ +---+
+ | 1|
+ +---+
"""
return Column(self._jc.getField(name))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index c421006c8f..cf344710ff 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql
import java.io.CharArrayWriter
import java.sql.DriverManager
+
import scala.collection.JavaConversions._
import scala.language.implicitConversions
import scala.reflect.ClassTag
@@ -28,6 +29,7 @@ import scala.util.control.NonFatal
import com.fasterxml.jackson.core.JsonFactory
+import org.apache.commons.lang3.StringUtils
import org.apache.spark.annotation.{DeveloperApi, Experimental}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.python.SerDeUtil
@@ -175,6 +177,7 @@ class DataFrame private[sql](
* @param numRows Number of rows to show
*/
private[sql] def showString(numRows: Int): String = {
+ val sb = new StringBuilder
val data = take(numRows)
val numCols = schema.fieldNames.length
@@ -194,12 +197,25 @@ class DataFrame private[sql](
}
}
- // Pad the cells
- rows.map { row =>
- row.zipWithIndex.map { case (cell, i) =>
- String.format(s"%-${colWidths(i)}s", cell)
- }.mkString(" ")
- }.mkString("\n")
+ // Create SeparateLine
+ val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+
+ // column names
+ rows.head.zipWithIndex.map { case (cell, i) =>
+ StringUtils.leftPad(cell.toString, colWidths(i))
+ }.addString(sb, "|", "|", "|\n")
+
+ sb.append(sep)
+
+ // data
+ rows.tail.map {
+ _.zipWithIndex.map { case (cell, i) =>
+ StringUtils.leftPad(cell.toString, colWidths(i))
+ }.addString(sb, "|", "|", "|\n")
+ }
+
+ sb.append(sep)
+ sb.toString()
}
override def toString: String = {
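
A note on the hunk above: the new showString builds one separator line from the per-column widths and right-aligns every cell with StringUtils.leftPad. The standalone sketch below reproduces that framing logic outside of Spark; the object name, the format helper, and the sample data are hypothetical, and only the leftPad/separator idea mirrors the committed code.

// Standalone sketch (hypothetical names) of the framing approach used by the
// new showString: compute per-column widths, build a "+---+" separator, and
// right-align each cell with StringUtils.leftPad from commons-lang3.
import org.apache.commons.lang3.StringUtils

object GridSketch {
  def format(rows: Seq[Seq[String]]): String = {
    // Column width = widest cell in that column (header row included)
    val colWidths = rows.head.indices.map(i => rows.map(_(i).length).max)
    val sep = colWidths.map("-" * _).mkString("+", "+", "+\n")
    def line(cells: Seq[String]): String =
      cells.zipWithIndex
        .map { case (cell, i) => StringUtils.leftPad(cell, colWidths(i)) }
        .mkString("|", "|", "|\n")
    // Assembled as: separator, header, separator, data rows, separator
    sep + line(rows.head) + sep + rows.tail.map(line).mkString + sep
  }

  def main(args: Array[String]): Unit =
    // Prints the same grid shape as the doctests above
    print(format(Seq(Seq("age", "name"), Seq("2", "Alice"), Seq("5", "Bob"))))
}
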
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index e286fef23c..ff31e15e2d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -598,6 +598,25 @@ class DataFrameSuite extends QueryTest {
testData.select($"*").show(1000)
}
+ test("SPARK-7319 showString") {
+ val expectedAnswer = """+---+-----+
+ ||key|value|
+ |+---+-----+
+ || 1| 1|
+ |+---+-----+
+ |""".stripMargin
+ assert(testData.select($"*").showString(1) === expectedAnswer)
+ }
+
+ test("SPARK-7327 show with empty dataFrame") {
+ val expectedAnswer = """+---+-----+
+ ||key|value|
+ |+---+-----+
+ |+---+-----+
+ |""".stripMargin
+ assert(testData.select($"*").filter($"key" < 0).showString(1) === expectedAnswer)
+ }
+
test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {
val rowRDD = TestSQLContext.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
val schema = StructType(Array(StructField("point", new ExamplePointUDT(), false)))