author    Davies Liu <davies@databricks.com>   2016-03-08 10:23:19 -0800
committer Davies Liu <davies.liu@gmail.com>    2016-03-08 10:23:19 -0800
commit    78d3b6051eea1b5fa9b1a4caa186214b8cca17bf (patch)
tree      5e6003b17ec007b0f7c5827a081cab499c8d6f30 /sql
parent    54040f8d350d2aad3078dcffef808c62b7c0b73d (diff)
[SPARK-13657] [SQL] Support parsing very long AND/OR expressions
## What changes were proposed in this pull request?

To avoid a StackOverflowError when parsing an expression with hundreds of ORs, the parser now uses a loop instead of recursive functions to flatten the tree into a list. This PR also builds a balanced tree to reduce the depth of the generated And/Or expressions, which avoids a StackOverflowError in the analyzer/optimizer.

## How was this patch tested?

Added new unit tests. Manually tested with TPC-DS Q3, which contains hundreds of predicates [1]. These predicates help reduce the number of partitions, and the query time went from 60 seconds to 8 seconds.

[1] https://github.com/cloudera/impala-tpcds-kit/blob/master/queries/q3.sql

Author: Davies Liu <davies@databricks.com>

Closes #11501 from davies/long_or.
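For readers skimming the patch below, the following is a small self-contained Scala sketch of the two ideas described above: walking the left spine of a left-deep OR tree with a plain loop (no recursion while collecting operands), and then recombining the collected operands into a balanced tree of logarithmic depth. All names and types in the sketch (`Expr`, `Pred`, `Or`, `flattenLeftSpine`, `balanced`) are illustrative stand-ins, not the identifiers added by this commit.

```scala
// A minimal, self-contained sketch (not the patch itself): toy expression
// types standing in for Catalyst's ASTNode/Expression.
object LongOrSketch {
  sealed trait Expr
  case class Pred(id: Int) extends Expr
  case class Or(left: Expr, right: Expr) extends Expr

  // Walk down the left spine iteratively, collecting the right-hand operands.
  // Using a loop instead of recursion keeps the call stack flat even for
  // thousands of chained ORs.
  def flattenLeftSpine(root: Expr): Seq[Expr] = {
    val collected = scala.collection.mutable.ArrayBuffer[Expr]()
    var rest = root
    var done = false
    while (!done) {
      rest match {
        case Or(l, r) => collected += r; rest = l
        case _        => done = true
      }
    }
    collected += rest
    collected.reverse.toSeq // restore the original SQL left-to-right order
  }

  // Recombine by halves: the depth is O(log n) instead of the O(n) of the
  // left-deep chain produced by a left-associative grammar.
  def balanced(exprs: Seq[Expr], f: (Expr, Expr) => Expr): Expr = exprs.length match {
    case 1 => exprs.head
    case 2 => f(exprs(0), exprs(1))
    case n => f(balanced(exprs.take(n / 2), f), balanced(exprs.drop(n / 2), f))
  }

  def depth(e: Expr): Int = e match {
    case Or(l, r) => 1 + math.max(depth(l), depth(r))
    case _        => 1
  }

  def main(args: Array[String]): Unit = {
    // Build the left-deep chain a left-associative parser would produce for
    // "p1 OR p2 OR ... OR p1000".
    val leftDeep = (2 to 1000).foldLeft(Pred(1): Expr)((acc, i) => Or(acc, Pred(i)))
    val rebuilt  = balanced(flattenLeftSpine(leftDeep), Or(_, _))
    println(s"left-deep depth = ${depth(leftDeep)}, balanced depth = ${depth(rebuilt)}")
    // Prints: left-deep depth = 1000, balanced depth = 11
  }
}
```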
Diffstat (limited to 'sql')
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala      | 42
-rw-r--r-- sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala | 11
2 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala
index 44f7d8a056..5d96d8e192 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala
@@ -19,6 +19,9 @@ package org.apache.spark.sql.catalyst.parser
import java.sql.Date
+import scala.collection.mutable.ArrayBuffer
+import scala.util.matching.Regex
+
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis._
@@ -523,6 +526,39 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
noParseRule("Select", node)
}
+ /**
+ * Flattens the left deep tree with the specified pattern into a list.
+ */
+ private def flattenLeftDeepTree(node: ASTNode, pattern: Regex): Seq[ASTNode] = {
+ val collected = ArrayBuffer[ASTNode]()
+ var rest = node
+ while (rest match {
+ case Token(pattern(), l :: r :: Nil) =>
+ collected += r
+ rest = l
+ true
+ case _ => false
+ }) {
+ // do nothing
+ }
+ collected += rest
+ // keep them in the same order as in SQL
+ collected.reverse
+ }
+
+ /**
+ * Creates a balanced tree that has similar number of nodes on left and right.
+ *
+ * This helps to reduce the depth of the tree to prevent StackOverflow in the analyzer/optimizer.
+ */
+ private def balancedTree(
+ expr: Seq[Expression],
+ f: (Expression, Expression) => Expression): Expression = expr.length match {
+ case 1 => expr.head
+ case 2 => f(expr.head, expr(1))
+ case l => f(balancedTree(expr.slice(0, l / 2), f), balancedTree(expr.slice(l / 2, l), f))
+ }
+
protected def nodeToExpr(node: ASTNode): Expression = node match {
/* Attribute References */
case Token("TOK_TABLE_OR_COL", Token(name, Nil) :: Nil) =>
@@ -635,8 +671,10 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
}
/* Boolean Logic */
- case Token(AND(), left :: right:: Nil) => And(nodeToExpr(left), nodeToExpr(right))
- case Token(OR(), left :: right:: Nil) => Or(nodeToExpr(left), nodeToExpr(right))
+ case Token(AND(), left :: right:: Nil) =>
+ balancedTree(flattenLeftDeepTree(node, AND).map(nodeToExpr), And)
+ case Token(OR(), left :: right:: Nil) =>
+ balancedTree(flattenLeftDeepTree(node, OR).map(nodeToExpr), Or)
case Token(NOT(), child :: Nil) => Not(nodeToExpr(child))
case Token("!", child :: Nil) => Not(nodeToExpr(child))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala
index 0660791c4c..048b4f12b9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala
@@ -203,6 +203,17 @@ class CatalystQlSuite extends PlanTest {
"from windowData")
}
+ test("very long AND/OR expression") {
+ val equals = (1 to 1000).map(x => s"$x == $x")
+ val expr = parser.parseExpression(equals.mkString(" AND "))
+ assert(expr.isInstanceOf[And])
+ assert(expr.collect( { case EqualTo(_, _) => true } ).size == 1000)
+
+ val expr2 = parser.parseExpression(equals.mkString(" OR "))
+ assert(expr2.isInstanceOf[Or])
+ assert(expr2.collect( { case EqualTo(_, _) => true } ).size == 1000)
+ }
+
test("subquery") {
parser.parsePlan("select (select max(b) from s) ss from t")
parser.parsePlan("select * from t where a = (select b from s)")