diff options
author | Forest Fang <forest.fang@outlook.com> | 2015-07-18 21:05:44 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-07-18 21:05:44 -0700 |
commit | 6cb6096c016178b9ce5c97592abe529ddb18cef2 (patch) | |
tree | 20c8f6292c8dcf610abb4fe58fe479e6e2999abf | |
parent | 45d798c323ffe32bc2eba4dbd271c4572f5a30cf (diff) | |
download | spark-6cb6096c016178b9ce5c97592abe529ddb18cef2.tar.gz spark-6cb6096c016178b9ce5c97592abe529ddb18cef2.tar.bz2 spark-6cb6096c016178b9ce5c97592abe529ddb18cef2.zip |
[SPARK-8443][SQL] Split GenerateMutableProjection Codegen due to JVM Code Size Limits
By grouping projection calls into multiple apply function, we are able to push the number of projections codegen can handle from ~1k to ~60k. I have set the unit test to test against 5k as 60k took 15s for the unit test to complete.
Author: Forest Fang <forest.fang@outlook.com>
Closes #7076 from saurfang/codegen_size_limit and squashes the following commits:
b7a7635 [Forest Fang] [SPARK-8443][SQL] Execute and verify split projections in test
adef95a [Forest Fang] [SPARK-8443][SQL] Use safer factor and rewrite splitting code
1b5aa7e [Forest Fang] [SPARK-8443][SQL] inline execution if one block only
9405680 [Forest Fang] [SPARK-8443][SQL] split projection code by size limit
2 files changed, 50 insertions, 3 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala index 71e47d4f9b..b82bd6814b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions._ +import scala.collection.mutable.ArrayBuffer + // MutableProjection is not accessible in Java abstract class BaseMutableProjection extends MutableProjection @@ -45,10 +47,41 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu else ${ctx.setColumn("mutableRow", e.dataType, i, evaluationCode.primitive)}; """ - }.mkString("\n") + } + // collect projections into blocks as function has 64kb codesize limit in JVM + val projectionBlocks = new ArrayBuffer[String]() + val blockBuilder = new StringBuilder() + for (projection <- projectionCode) { + if (blockBuilder.length > 16 * 1000) { + projectionBlocks.append(blockBuilder.toString()) + blockBuilder.clear() + } + blockBuilder.append(projection) + } + projectionBlocks.append(blockBuilder.toString()) + + val (projectionFuns, projectionCalls) = { + // inline execution if codesize limit was not broken + if (projectionBlocks.length == 1) { + ("", projectionBlocks.head) + } else { + ( + projectionBlocks.zipWithIndex.map { case (body, i) => + s""" + |private void apply$i(InternalRow i) { + | $body + |} + """.stripMargin + }.mkString, + projectionBlocks.indices.map(i => s"apply$i(i);").mkString("\n") + ) + } + } + val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) => s"private $javaType $variableName = $initialValue;" }.mkString("\n ") + val code = s""" public Object generate($exprType[] expr) { return new SpecificProjection(expr); @@ -75,9 +108,11 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu return (InternalRow) mutableRow; } + $projectionFuns + public Object apply(Object _i) { InternalRow i = (InternalRow) _i; - $projectionCode + $projectionCalls return mutableRow; } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 481b335d15..e05218a23a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ /** * Additional tests for code generation. */ -class CodeGenerationSuite extends SparkFunSuite { +class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { test("multithreaded eval") { import scala.concurrent._ @@ -42,4 +42,16 @@ class CodeGenerationSuite extends SparkFunSuite { futures.foreach(Await.result(_, 10.seconds)) } + + test("SPARK-8443: split wide projections into blocks due to JVM code size limit") { + val length = 5000 + val expressions = List.fill(length)(EqualTo(Literal(1), Literal(1))) + val plan = GenerateMutableProjection.generate(expressions)() + val actual = plan(new GenericMutableRow(length)).toSeq + val expected = Seq.fill(length)(true) + + if (!checkResult(actual, expected)) { + fail(s"Incorrect Evaluation: expressions: $expressions, actual: $actual, expected: $expected") + } + } } |