author    Davies Liu <davies@databricks.com>   2016-03-08 10:23:19 -0800
committer Davies Liu <davies.liu@gmail.com>    2016-03-08 10:23:19 -0800
commit    78d3b6051eea1b5fa9b1a4caa186214b8cca17bf (patch)
tree      5e6003b17ec007b0f7c5827a081cab499c8d6f30 /sql
parent    54040f8d350d2aad3078dcffef808c62b7c0b73d (diff)
[SPARK-13657] [SQL] Support parsing very long AND/OR expressions
## What changes were proposed in this pull request?

To avoid a StackOverflowError when parsing an expression with hundreds of ORs, the parser now uses a loop instead of recursive functions to flatten the tree into a list. This PR also builds a balanced tree to reduce the depth of the generated And/Or expressions, which avoids a StackOverflowError in the analyzer/optimizer.

## How was this patch tested?

Added new unit tests. Manually tested with TPC-DS Q3, which contains hundreds of predicates [1]. These predicates help reduce the number of partitions, and the query time went from 60 seconds to 8 seconds.

[1] https://github.com/cloudera/impala-tpcds-kit/blob/master/queries/q3.sql

Author: Davies Liu <davies@databricks.com>

Closes #11501 from davies/long_or.
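For readers skimming the patch below, the following is a small self-contained Scala sketch of the two ideas described above: walking the left spine of a left-deep OR tree with a plain loop (no recursion while collecting operands), and then recombining the collected operands into a balanced tree of logarithmic depth. All names and types in the sketch (`Expr`, `Pred`, `Or`, `flattenLeftSpine`, `balanced`) are illustrative stand-ins, not the identifiers added by this commit.

```scala
// A minimal, self-contained sketch (not the patch itself): toy expression
// types standing in for Catalyst's ASTNode/Expression.
object LongOrSketch {
  sealed trait Expr
  case class Pred(id: Int) extends Expr
  case class Or(left: Expr, right: Expr) extends Expr

  // Walk down the left spine iteratively, collecting the right-hand operands.
  // Using a loop instead of recursion keeps the call stack flat even for
  // thousands of chained ORs.
  def flattenLeftSpine(root: Expr): Seq[Expr] = {
    val collected = scala.collection.mutable.ArrayBuffer[Expr]()
    var rest = root
    var done = false
    while (!done) {
      rest match {
        case Or(l, r) => collected += r; rest = l
        case _        => done = true
      }
    }
    collected += rest
    collected.reverse.toSeq // restore the original SQL left-to-right order
  }

  // Recombine by halves: the depth is O(log n) instead of the O(n) of the
  // left-deep chain produced by a left-associative grammar.
  def balanced(exprs: Seq[Expr], f: (Expr, Expr) => Expr): Expr = exprs.length match {
    case 1 => exprs.head
    case 2 => f(exprs(0), exprs(1))
    case n => f(balanced(exprs.take(n / 2), f), balanced(exprs.drop(n / 2), f))
  }

  def depth(e: Expr): Int = e match {
    case Or(l, r) => 1 + math.max(depth(l), depth(r))
    case _        => 1
  }

  def main(args: Array[String]): Unit = {
    // Build the left-deep chain a left-associative parser would produce for
    // "p1 OR p2 OR ... OR p1000".
    val leftDeep = (2 to 1000).foldLeft(Pred(1): Expr)((acc, i) => Or(acc, Pred(i)))
    val rebuilt  = balanced(flattenLeftSpine(leftDeep), Or(_, _))
    println(s"left-deep depth = ${depth(leftDeep)}, balanced depth = ${depth(rebuilt)}")
    // Prints: left-deep depth = 1000, balanced depth = 11
  }
}
```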
Diffstat (limited to 'sql')
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala      | 42
-rw-r--r-- sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala | 11
2 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala
index 44f7d8a056..5d96d8e192 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala
@@ -19,6 +19,9 @@ package org.apache.spark.sql.catalyst.parser
import java.sql.Date
+import scala.collection.mutable.ArrayBuffer
+import scala.util.matching.Regex
+
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis._
@@ -523,6 +526,39 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
noParseRule("Select", node)
}
+ /**
+ * Flattens the left deep tree with the specified pattern into a list.
+ */
+ private def flattenLeftDeepTree(node: ASTNode, pattern: Regex): Seq[ASTNode] = {
+ val collected = ArrayBuffer[ASTNode]()
+ var rest = node
+ while (rest match {
+ case Token(pattern(), l :: r :: Nil) =>
+ collected += r
+ rest = l
+ true
+ case _ => false
+ }) {
+ // do nothing
+ }
+ collected += rest
+ // keep them in the same order as in SQL
+ collected.reverse
+ }
+
+ /**
+ * Creates a balanced tree that has similar number of nodes on left and right.
+ *
+ * This helps to reduce the depth of the tree to prevent StackOverflow in the analyzer/optimizer.
+ */
+ private def balancedTree(
+ expr: Seq[Expression],
+ f: (Expression, Expression) => Expression): Expression = expr.length match {
+ case 1 => expr.head
+ case 2 => f(expr.head, expr(1))
+ case l => f(balancedTree(expr.slice(0, l / 2), f), balancedTree(expr.slice(l / 2, l), f))
+ }
+
protected def nodeToExpr(node: ASTNode): Expression = node match {
/* Attribute References */
case Token("TOK_TABLE_OR_COL", Token(name, Nil) :: Nil) =>
@@ -635,8 +671,10 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
}
/* Boolean Logic */
- case Token(AND(), left :: right:: Nil) => And(nodeToExpr(left), nodeToExpr(right))
- case Token(OR(), left :: right:: Nil) => Or(nodeToExpr(left), nodeToExpr(right))
+ case Token(AND(), left :: right:: Nil) =>
+ balancedTree(flattenLeftDeepTree(node, AND).map(nodeToExpr), And)
+ case Token(OR(), left :: right:: Nil) =>
+ balancedTree(flattenLeftDeepTree(node, OR).map(nodeToExpr), Or)
case Token(NOT(), child :: Nil) => Not(nodeToExpr(child))
case Token("!", child :: Nil) => Not(nodeToExpr(child))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala
index 0660791c4c..048b4f12b9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala
@@ -203,6 +203,17 @@ class CatalystQlSuite extends PlanTest {
"from windowData")
}
+ test("very long AND/OR expression") {
+ val equals = (1 to 1000).map(x => s"$x == $x")
+ val expr = parser.parseExpression(equals.mkString(" AND "))
+ assert(expr.isInstanceOf[And])
+ assert(expr.collect( { case EqualTo(_, _) => true } ).size == 1000)
+
+ val expr2 = parser.parseExpression(equals.mkString(" OR "))
+ assert(expr2.isInstanceOf[Or])
+ assert(expr2.collect( { case EqualTo(_, _) => true } ).size == 1000)
+ }
+
test("subquery") {
parser.parsePlan("select (select max(b) from s) ss from t")
parser.parsePlan("select * from t where a = (select b from s)")