aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorNong Li <nong@databricks.com>2016-03-02 15:03:19 -0800
committerDavies Liu <davies.liu@gmail.com>2016-03-02 15:03:19 -0800
commite2780ce8252ded93a695125c0a745d8b93193cca (patch)
treea8b3463d65e82a5100286946c7cd059b2a97a5c9 /sql
parentb5a59a0fe2ea703ce2712561e7b9044f772660a2 (diff)
downloadspark-e2780ce8252ded93a695125c0a745d8b93193cca.tar.gz
spark-e2780ce8252ded93a695125c0a745d8b93193cca.tar.bz2
spark-e2780ce8252ded93a695125c0a745d8b93193cca.zip
[SPARK-13574] [SQL] Add benchmark to measure string dictionary decode.
## What changes were proposed in this pull request? Add a benchmark to measure string dictionary decode. Also updated the other benchmarks after the default was flipped to use vectorized decode. Author: Nong Li <nong@databricks.com> Closes #11454 from nongli/benchmark.
Diffstat (limited to 'sql')
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala | 70
1 file changed, 52 insertions(+), 18 deletions(-)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala
index 660f0f173a..14dbdf3409 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala
@@ -37,6 +37,10 @@ object ParquetReadBenchmark {
val sc = new SparkContext("local[1]", "test-sql-context", conf)
val sqlContext = new SQLContext(sc)
+ // Set default configs. Individual cases will change them if necessary.
+ sqlContext.conf.setConfString(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "true")
+ sqlContext.conf.setConfString(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true")
+
def withTempPath(f: File => Unit): Unit = {
val path = Utils.createTempDir()
path.delete()
@@ -72,7 +76,7 @@ object ParquetReadBenchmark {
.write.parquet(dir.getCanonicalPath)
sqlContext.read.parquet(dir.getCanonicalPath).registerTempTable("tempTable")
- sqlBenchmark.addCase("SQL Parquet Reader") { iter =>
+ sqlBenchmark.addCase("SQL Parquet Vectorized") { iter =>
sqlContext.sql("select sum(id) from tempTable").collect()
}
@@ -82,8 +86,8 @@ object ParquetReadBenchmark {
}
}
- sqlBenchmark.addCase("SQL Parquet Vectorized") { iter =>
- withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") {
+ sqlBenchmark.addCase("SQL Parquet Non-Vectorized") { iter =>
+ withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
sqlContext.sql("select sum(id) from tempTable").collect()
}
}
@@ -152,9 +156,9 @@ object ParquetReadBenchmark {
Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
SQL Single Int Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------
- SQL Parquet Reader 1042 / 1208 15.1 66.2 1.0X
- SQL Parquet MR 1544 / 1607 10.2 98.2 0.7X
- SQL Parquet Vectorized 674 / 739 23.3 42.9 1.5X
+ SQL Parquet Vectorized 657 / 778 23.9 41.8 1.0X
+ SQL Parquet MR 1606 / 1731 9.8 102.1 0.4X
+ SQL Parquet Non-Vectorized 1133 / 1216 13.9 72.1 0.6X
*/
sqlBenchmark.run()
@@ -172,8 +176,6 @@ object ParquetReadBenchmark {
}
def intStringScanBenchmark(values: Int): Unit = {
- val benchmark = new Benchmark("Int and String Scan", values)
-
withTempPath { dir =>
withTempTable("t1", "tempTable") {
sqlContext.range(values).registerTempTable("t1")
@@ -183,7 +185,7 @@ object ParquetReadBenchmark {
val benchmark = new Benchmark("Int and String Scan", values)
- benchmark.addCase("SQL Parquet Reader") { iter =>
+ benchmark.addCase("SQL Parquet Vectorized") { iter =>
sqlContext.sql("select sum(c1), sum(length(c2)) from tempTable").collect
}
@@ -193,15 +195,14 @@ object ParquetReadBenchmark {
}
}
- benchmark.addCase("SQL Parquet Vectorized") { iter =>
- withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") {
+ benchmark.addCase("SQL Parquet Non-vectorized") { iter =>
+ withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
sqlContext.sql("select sum(c1), sum(length(c2)) from tempTable").collect
}
}
-
val files = SpecificParquetRecordReaderBase.listDirectory(dir).toArray
- benchmark.addCase("ParquetReader") { num =>
+ benchmark.addCase("ParquetReader Non-vectorized") { num =>
var sum1 = 0L
var sum2 = 0L
files.map(_.asInstanceOf[String]).foreach { p =>
@@ -219,11 +220,43 @@ object ParquetReadBenchmark {
/*
Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- -------------------------------------------------------------------------------
- SQL Parquet Reader 1381 / 1679 7.6 131.7 1.0X
- SQL Parquet MR 2005 / 2177 5.2 191.2 0.7X
- SQL Parquet Vectorized 919 / 1044 11.4 87.6 1.5X
- ParquetReader 1035 / 1163 10.1 98.7 1.3X
+ -------------------------------------------------------------------------------------------
+ SQL Parquet Vectorized 1025 / 1180 10.2 97.8 1.0X
+ SQL Parquet MR 2157 / 2222 4.9 205.7 0.5X
+ SQL Parquet Non-vectorized 1450 / 1466 7.2 138.3 0.7X
+ ParquetReader Non-vectorized 1005 / 1022 10.4 95.9 1.0X
+ */
+ benchmark.run()
+ }
+ }
+ }
+
+ def stringDictionaryScanBenchmark(values: Int): Unit = {
+ withTempPath { dir =>
+ withTempTable("t1", "tempTable") {
+ sqlContext.range(values).registerTempTable("t1")
+ sqlContext.sql("select cast((id % 200) + 10000 as STRING) as c1 from t1")
+ .write.parquet(dir.getCanonicalPath)
+ sqlContext.read.parquet(dir.getCanonicalPath).registerTempTable("tempTable")
+
+ val benchmark = new Benchmark("String Dictionary", values)
+
+ benchmark.addCase("SQL Parquet Vectorized") { iter =>
+ sqlContext.sql("select sum(length(c1)) from tempTable").collect
+ }
+
+ benchmark.addCase("SQL Parquet MR") { iter =>
+ withSQLConf(SQLConf.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED.key -> "false") {
+ sqlContext.sql("select sum(length(c1)) from tempTable").collect
+ }
+ }
+
+ /*
+ Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
+ String Dictionary: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+ -------------------------------------------------------------------------------------------
+ SQL Parquet Vectorized 578 / 593 18.1 55.1 1.0X
+ SQL Parquet MR 1021 / 1032 10.3 97.4 0.6X
*/
benchmark.run()
}
@@ -233,5 +266,6 @@ object ParquetReadBenchmark {
def main(args: Array[String]): Unit = {
intScanBenchmark(1024 * 1024 * 15)
intStringScanBenchmark(1024 * 1024 * 10)
+ stringDictionaryScanBenchmark(1024 * 1024 * 10)
}
}