aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Michael Davies <Michael.BellDavies@gmail.com> 2014-12-30 13:40:51 -0800
committer Michael Armbrust <michael@databricks.com> 2014-12-30 13:41:08 -0800
commit 7a245412f7b1337c766981f43bcbb64890439002 (patch)
tree 0935515bae8a29c40ecd9d40851b1cb5d6546e84
parent cde8a310a76d780b9d178218b219f85ca30a1968 (diff)
download spark-7a245412f7b1337c766981f43bcbb64890439002.tar.gz
spark-7a245412f7b1337c766981f43bcbb64890439002.tar.bz2
spark-7a245412f7b1337c766981f43bcbb64890439002.zip
[SPARK-4386] Improve performance when writing Parquet files
Convert the type of RowWriteSupport.attributes to Array. Performance analysis of writing very wide tables shows that time is spent predominantly in the apply method on the attributes var. The type of attributes was previously LinearSeqOptimized, whose apply is O(N), which made the write O(N squared). Measurements on a 575-column table showed that this change gave a 6x improvement in write times.

Author: Michael Davies <Michael.BellDavies@gmail.com>

Closes #3843 from MickDavies/SPARK-4386 and squashes the following commits:

892519d [Michael Davies] [SPARK-4386] Improve performance when writing Parquet files

(cherry picked from commit 7425bec320227bf8818dc2844c12d5373d166364)
Signed-off-by: Michael Armbrust <michael@databricks.com>
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala | 4
1 files changed, 2 insertions, 2 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index ef3687e692..9049eb5932 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -130,7 +130,7 @@ private[parquet] object RowReadSupport {
private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
private[parquet] var writer: RecordConsumer = null
- private[parquet] var attributes: Seq[Attribute] = null
+ private[parquet] var attributes: Array[Attribute] = null
override def init(configuration: Configuration): WriteSupport.WriteContext = {
val origAttributesStr: String = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA)
@@ -138,7 +138,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
metadata.put(RowReadSupport.SPARK_METADATA_KEY, origAttributesStr)
if (attributes == null) {
- attributes = ParquetTypesConverter.convertFromString(origAttributesStr)
+ attributes = ParquetTypesConverter.convertFromString(origAttributesStr).toArray
}
log.debug(s"write support initialized for requested schema $attributes")