author    Jim Carroll <jim@dontcallme.com>         2014-11-14 15:11:53 -0800
committer Michael Armbrust <michael@databricks.com>  2014-11-14 15:12:07 -0800
commit    7f242dc2911bbc821e90fed81421af9b8d6dcd9a
tree      66df58384f53009eb64021d86c1c8f05edb2042f /sql
parent    1cac30083b97c98c3663e2d2cd057124f033eb34
[SPARK-4386] Improve performance when writing Parquet files.
If you profile the writing of a Parquet file, the single most time-consuming call inside org.apache.spark.sql.parquet.MutableRowWriteSupport.write is actually scala.collection.AbstractSequence.size. This is because the size call ends up COUNTING the elements via scala.collection.LinearSeqOptimized.length ("optimized?"), an O(n) traversal of the sequence.
This work is unnecessary: rather than calling "size" repeatedly wherever it is needed, it can be called once at the top of the method and stored in a 'val'.
Author: Jim Carroll <jim@dontcallme.com>
Closes #3254 from jimfcarroll/parquet-perf and squashes the following commits:
30cc0b5 [Jim Carroll] Improve performance when writing Parquet files.
(cherry picked from commit f76b9683706232c3d4e8e6e61627b8188dcb79dc)
Signed-off-by: Michael Armbrust <michael@databricks.com>
Diffstat (limited to 'sql')
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
```diff
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index 7bc2496600..ef3687e692 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -152,14 +152,15 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
   }
 
   override def write(record: Row): Unit = {
-    if (attributes.size > record.size) {
+    val attributesSize = attributes.size
+    if (attributesSize > record.size) {
       throw new IndexOutOfBoundsException(
-        s"Trying to write more fields than contained in row (${attributes.size}>${record.size})")
+        s"Trying to write more fields than contained in row (${attributesSize}>${record.size})")
     }
 
     var index = 0
     writer.startMessage()
-    while(index < attributes.size) {
+    while(index < attributesSize) {
       // null values indicate optional fields but we do not check currently
       if (record(index) != null) {
         writer.startField(attributes(index).name, index)
@@ -312,14 +313,15 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
 // Optimized for non-nested rows
 private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
   override def write(record: Row): Unit = {
-    if (attributes.size > record.size) {
+    val attributesSize = attributes.size
+    if (attributesSize > record.size) {
       throw new IndexOutOfBoundsException(
-        s"Trying to write more fields than contained in row (${attributes.size}>${record.size})")
+        s"Trying to write more fields than contained in row (${attributesSize}>${record.size})")
     }
 
     var index = 0
     writer.startMessage()
-    while(index < attributes.size) {
+    while(index < attributesSize) {
       // null values indicate optional fields but we do not check currently
       if (record(index) != null && record(index) != Nil) {
         writer.startField(attributes(index).name, index)
```