From f76b9683706232c3d4e8e6e61627b8188dcb79dc Mon Sep 17 00:00:00 2001
From: Jim Carroll
Date: Fri, 14 Nov 2014 15:11:53 -0800
Subject: [SPARK-4386] Improve performance when writing Parquet files.

If you profile the writing of a Parquet file, the single most
time-consuming call inside
org.apache.spark.sql.parquet.MutableRowWriteSupport.write is
scala.collection.AbstractSequence.size. This is because the size call
ends up COUNTING the elements one by one in
scala.collection.LinearSeqOptimized.length ("optimized?"), making it an
O(n) operation.

This is unnecessary: "size" is called repeatedly throughout the method
when it could be called once at the top and stored in a 'val'.

Author: Jim Carroll

Closes #3254 from jimfcarroll/parquet-perf and squashes the following commits:

30cc0b5 [Jim Carroll] Improve performance when writing Parquet files.
---
 .../org/apache/spark/sql/parquet/ParquetTableSupport.scala | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index 7bc2496600..ef3687e692 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -152,14 +152,15 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
   }
 
   override def write(record: Row): Unit = {
-    if (attributes.size > record.size) {
+    val attributesSize = attributes.size
+    if (attributesSize > record.size) {
       throw new IndexOutOfBoundsException(
-        s"Trying to write more fields than contained in row (${attributes.size}>${record.size})")
+        s"Trying to write more fields than contained in row (${attributesSize}>${record.size})")
     }
 
     var index = 0
     writer.startMessage()
-    while(index < attributes.size) {
+    while(index < attributesSize) {
       // null values indicate optional fields but we do not check currently
       if (record(index) != null) {
         writer.startField(attributes(index).name, index)
@@ -312,14 +313,15 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
 // Optimized for non-nested rows
 private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
   override def write(record: Row): Unit = {
-    if (attributes.size > record.size) {
+    val attributesSize = attributes.size
+    if (attributesSize > record.size) {
       throw new IndexOutOfBoundsException(
-        s"Trying to write more fields than contained in row (${attributes.size}>${record.size})")
+        s"Trying to write more fields than contained in row (${attributesSize}>${record.size})")
     }
 
     var index = 0
     writer.startMessage()
-    while(index < attributes.size) {
+    while(index < attributesSize) {
       // null values indicate optional fields but we do not check currently
       if (record(index) != null && record(index) != Nil) {
         writer.startField(attributes(index).name, index)
-- 
cgit v1.2.3
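
A minimal standalone sketch (not part of the patch; the object and
variable names are illustrative only) of why hoisting the size call
matters: a Scala List inherits length from LinearSeqOptimized, which
walks the whole list to count its elements, so evaluating "size" in a
loop condition turns a linear loop quadratic. The second loop below
uses the same hoist-into-a-val pattern the patch applies.

object SizeHoistSketch {
  // Tiny timing helper for comparing the two loops.
  def time[A](label: String)(body: => A): A = {
    val t0 = System.nanoTime()
    val result = body
    println(s"$label: ${(System.nanoTime() - t0) / 1e6} ms")
    result
  }

  def main(args: Array[String]): Unit = {
    // List is a linear Seq: each call to size traverses all elements, O(n).
    val attributes: Seq[Int] = List.tabulate(100000)(identity)

    // Before: size is re-evaluated on every loop test, O(n^2) overall.
    time("size in loop condition") {
      var index = 0
      while (index < attributes.size) {
        index += 1
      }
    }

    // After: count once, store in a val, reuse the cached value, O(n).
    time("size hoisted into a val") {
      val attributesSize = attributes.size
      var index = 0
      while (index < attributesSize) {
        index += 1
      }
    }
  }
}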