aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Forest Fang <forest.fang@outlook.com> 2015-07-18 21:05:44 -0700
committer Reynold Xin <rxin@databricks.com> 2015-07-18 21:05:44 -0700
commit 6cb6096c016178b9ce5c97592abe529ddb18cef2 (patch)
tree 20c8f6292c8dcf610abb4fe58fe479e6e2999abf
parent 45d798c323ffe32bc2eba4dbd271c4572f5a30cf (diff)
download spark-6cb6096c016178b9ce5c97592abe529ddb18cef2.tar.gz
spark-6cb6096c016178b9ce5c97592abe529ddb18cef2.tar.bz2
spark-6cb6096c016178b9ce5c97592abe529ddb18cef2.zip
[SPARK-8443][SQL] Split GenerateMutableProjection Codegen due to JVM Code Size Limits
By grouping projection calls into multiple apply functions, we are able to push the number of projections codegen can handle from ~1k to ~60k. I have set the unit test to run against 5k projections, since 60k took 15s for the unit test to complete.

Author: Forest Fang <forest.fang@outlook.com>

Closes #7076 from saurfang/codegen_size_limit and squashes the following commits:

b7a7635 [Forest Fang] [SPARK-8443][SQL] Execute and verify split projections in test
adef95a [Forest Fang] [SPARK-8443][SQL] Use safer factor and rewrite splitting code
1b5aa7e [Forest Fang] [SPARK-8443][SQL] inline execution if one block only
9405680 [Forest Fang] [SPARK-8443][SQL] split projection code by size limit
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala 39
-rw-r--r-- sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala 14
2 files changed, 50 insertions, 3 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
index 71e47d4f9b..b82bd6814b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
@@ -19,6 +19,8 @@ package org.apache.spark.sql.catalyst.expressions.codegen
import org.apache.spark.sql.catalyst.expressions._
+import scala.collection.mutable.ArrayBuffer
+
// MutableProjection is not accessible in Java
abstract class BaseMutableProjection extends MutableProjection
@@ -45,10 +47,41 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
else
${ctx.setColumn("mutableRow", e.dataType, i, evaluationCode.primitive)};
"""
- }.mkString("\n")
+ }
+ // collect projections into blocks as function has 64kb codesize limit in JVM
+ val projectionBlocks = new ArrayBuffer[String]()
+ val blockBuilder = new StringBuilder()
+ for (projection <- projectionCode) {
+ if (blockBuilder.length > 16 * 1000) {
+ projectionBlocks.append(blockBuilder.toString())
+ blockBuilder.clear()
+ }
+ blockBuilder.append(projection)
+ }
+ projectionBlocks.append(blockBuilder.toString())
+
+ val (projectionFuns, projectionCalls) = {
+ // inline execution if codesize limit was not broken
+ if (projectionBlocks.length == 1) {
+ ("", projectionBlocks.head)
+ } else {
+ (
+ projectionBlocks.zipWithIndex.map { case (body, i) =>
+ s"""
+ |private void apply$i(InternalRow i) {
+ | $body
+ |}
+ """.stripMargin
+ }.mkString,
+ projectionBlocks.indices.map(i => s"apply$i(i);").mkString("\n")
+ )
+ }
+ }
+
val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
s"private $javaType $variableName = $initialValue;"
}.mkString("\n ")
+
val code = s"""
public Object generate($exprType[] expr) {
return new SpecificProjection(expr);
@@ -75,9 +108,11 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
return (InternalRow) mutableRow;
}
+ $projectionFuns
+
public Object apply(Object _i) {
InternalRow i = (InternalRow) _i;
- $projectionCode
+ $projectionCalls
return mutableRow;
}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
index 481b335d15..e05218a23a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._
/**
* Additional tests for code generation.
*/
-class CodeGenerationSuite extends SparkFunSuite {
+class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper {
test("multithreaded eval") {
import scala.concurrent._
@@ -42,4 +42,16 @@ class CodeGenerationSuite extends SparkFunSuite {
futures.foreach(Await.result(_, 10.seconds))
}
+
+ test("SPARK-8443: split wide projections into blocks due to JVM code size limit") {
+ val length = 5000
+ val expressions = List.fill(length)(EqualTo(Literal(1), Literal(1)))
+ val plan = GenerateMutableProjection.generate(expressions)()
+ val actual = plan(new GenericMutableRow(length)).toSeq
+ val expected = Seq.fill(length)(true)
+
+ if (!checkResult(actual, expected)) {
+ fail(s"Incorrect Evaluation: expressions: $expressions, actual: $actual, expected: $expected")
+ }
+ }
}