From 78d3b6051eea1b5fa9b1a4caa186214b8cca17bf Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 8 Mar 2016 10:23:19 -0800 Subject: [SPARK-13657] [SQL] Support parsing very long AND/OR expressions ## What changes were proposed in this pull request? In order to avoid StackOverflow when parse a expression with hundreds of ORs, we should use loop instead of recursive functions to flatten the tree as list. This PR also build a balanced tree to reduce the depth of generated And/Or expression, to avoid StackOverflow in analyzer/optimizer. ## How was this patch tested? Add new unit tests. Manually tested with TPCDS Q3 with hundreds predicates in it [1]. These predicates help to reduce the number of partitions, then the query time went from 60 seconds to 8 seconds. [1] https://github.com/cloudera/impala-tpcds-kit/blob/master/queries/q3.sql Author: Davies Liu Closes #11501 from davies/long_or. --- .../spark/sql/catalyst/parser/CatalystQl.scala | 42 ++++++++++++++++++++-- .../sql/catalyst/parser/CatalystQlSuite.scala | 11 ++++++ 2 files changed, 51 insertions(+), 2 deletions(-) (limited to 'sql') diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala index 44f7d8a056..5d96d8e192 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala @@ -19,6 +19,9 @@ package org.apache.spark.sql.catalyst.parser import java.sql.Date +import scala.collection.mutable.ArrayBuffer +import scala.util.matching.Regex + import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis._ @@ -523,6 +526,39 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C noParseRule("Select", node) } + /** + * Flattens the left deep tree with the specified pattern into a list. + */ + private def flattenLeftDeepTree(node: ASTNode, pattern: Regex): Seq[ASTNode] = { + val collected = ArrayBuffer[ASTNode]() + var rest = node + while (rest match { + case Token(pattern(), l :: r :: Nil) => + collected += r + rest = l + true + case _ => false + }) { + // do nothing + } + collected += rest + // keep them in the same order as in SQL + collected.reverse + } + + /** + * Creates a balanced tree that has similar number of nodes on left and right. + * + * This help to reduce the depth of the tree to prevent StackOverflow in analyzer/optimizer. + */ + private def balancedTree( + expr: Seq[Expression], + f: (Expression, Expression) => Expression): Expression = expr.length match { + case 1 => expr.head + case 2 => f(expr.head, expr(1)) + case l => f(balancedTree(expr.slice(0, l / 2), f), balancedTree(expr.slice(l / 2, l), f)) + } + protected def nodeToExpr(node: ASTNode): Expression = node match { /* Attribute References */ case Token("TOK_TABLE_OR_COL", Token(name, Nil) :: Nil) => @@ -635,8 +671,10 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C } /* Boolean Logic */ - case Token(AND(), left :: right:: Nil) => And(nodeToExpr(left), nodeToExpr(right)) - case Token(OR(), left :: right:: Nil) => Or(nodeToExpr(left), nodeToExpr(right)) + case Token(AND(), left :: right:: Nil) => + balancedTree(flattenLeftDeepTree(node, AND).map(nodeToExpr), And) + case Token(OR(), left :: right:: Nil) => + balancedTree(flattenLeftDeepTree(node, OR).map(nodeToExpr), Or) case Token(NOT(), child :: Nil) => Not(nodeToExpr(child)) case Token("!", child :: Nil) => Not(nodeToExpr(child)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala index 0660791c4c..048b4f12b9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala @@ -203,6 +203,17 @@ class CatalystQlSuite extends PlanTest { "from windowData") } + test("very long AND/OR expression") { + val equals = (1 to 1000).map(x => s"$x == $x") + val expr = parser.parseExpression(equals.mkString(" AND ")) + assert(expr.isInstanceOf[And]) + assert(expr.collect( { case EqualTo(_, _) => true } ).size == 1000) + + val expr2 = parser.parseExpression(equals.mkString(" OR ")) + assert(expr2.isInstanceOf[Or]) + assert(expr2.collect( { case EqualTo(_, _) => true } ).size == 1000) + } + test("subquery") { parser.parsePlan("select (select max(b) from s) ss from t") parser.parsePlan("select * from t where a = (select b from s)") -- cgit v1.2.3