path: root/sql/catalyst
author    wangzhenhua <wangzhenhua@huawei.com>  2017-03-27 23:41:27 +0800
committer Wenchen Fan <wenchen@databricks.com>  2017-03-27 23:41:27 +0800
commit    890493458de396cfcffdd71233cfdd39e834944b (patch)
tree      c1e9184b89e4281fa1f2027943b68d8efaffa26b /sql/catalyst
parent    3fbf0a5f9297f438bc92db11f106d4a0ae568613 (diff)
[SPARK-20104][SQL] Don't estimate IsNull or IsNotNull predicates for non-leaf node
## What changes were proposed in this pull request?

At the current stage, we don't have advanced statistics such as sketches or histograms. As a result, some operators can't estimate `nullCount` accurately; for example, left outer join estimation does not accurately update `nullCount`. So for `IsNull` and `IsNotNull` predicates, we estimate them only when the child is a leaf node, whose `nullCount` is accurate.

## How was this patch tested?

A new test case is added in `FilterEstimationSuite`.

Author: wangzhenhua <wangzhenhua@huawei.com>

Closes #17438 from wzhfy/nullEstimation.
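To illustrate the guard pattern this patch introduces, here is a minimal, self-contained Scala sketch. It is not Spark's actual code: `Plan`, `Leaf`, `Node`, `Predicate`, and `estimateSelectivity` are all hypothetical stand-ins for the real `LogicalPlan`/`FilterEstimation` machinery, kept only to show why null-check selectivity is computed solely for leaf children.

```scala
// Minimal sketch (hypothetical types, not Spark's implementation) of the
// leaf-node guard: only a leaf node carries a trustworthy `nullCount`, so
// null-check predicates on non-leaf children produce no estimate.
sealed trait Plan
case class Leaf(rowCount: BigInt, nullCount: BigInt) extends Plan
case class Node(child: Plan) extends Plan

sealed trait Predicate
case class IsNull(col: String) extends Predicate
case class IsNotNull(col: String) extends Predicate

def estimateSelectivity(pred: Predicate, child: Plan): Option[Double] =
  (pred, child) match {
    // Guarded cases: estimate only when the child is a leaf node.
    case (IsNull(_), Leaf(rows, nulls)) =>
      Some(nulls.toDouble / rows.toDouble)
    case (IsNotNull(_), Leaf(rows, nulls)) =>
      Some(1.0 - nulls.toDouble / rows.toDouble)
    // Non-leaf child (e.g. the output of a left outer join): its
    // `nullCount` may be stale, so return no estimate rather than a
    // wrong one.
    case _ => None
  }

// estimateSelectivity(IsNull("c1"), Leaf(30, 10))       == Some(0.333...)
// estimateSelectivity(IsNull("c1"), Node(Leaf(30, 10))) == None
```

Returning no estimate mirrors how the patch lets unguarded predicates fall through to the default `case _`, leaving the filter's row count unchanged, which is exactly what the new test asserts (`expectedRowCount = 30` on a 30-row join child).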
Diffstat (limited to 'sql/catalyst')
-rw-r--r--  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala | 12
-rw-r--r--  sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala         | 25
2 files changed, 33 insertions(+), 4 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
index b10785b05d..f14df93160 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
@@ -24,7 +24,7 @@ import scala.math.BigDecimal.RoundingMode
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.CatalystConf
import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Filter, Statistics}
+import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Filter, LeafNode, Statistics}
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._
@@ -174,10 +174,16 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
case InSet(ar: Attribute, set) =>
evaluateInSet(ar, set, update)
- case IsNull(ar: Attribute) =>
+ // In current stage, we don't have advanced statistics such as sketches or histograms.
+ // As a result, some operator can't estimate `nullCount` accurately. E.g. left outer join
+ // estimation does not accurately update `nullCount` currently.
+ // So for IsNull and IsNotNull predicates, we only estimate them when the child is a leaf
+ // node, whose `nullCount` is accurate.
+ // This is a limitation due to lack of advanced stats. We should remove it in the future.
+ case IsNull(ar: Attribute) if plan.child.isInstanceOf[LeafNode] =>
evaluateNullCheck(ar, isNull = true, update)
- case IsNotNull(ar: Attribute) =>
+ case IsNotNull(ar: Attribute) if plan.child.isInstanceOf[LeafNode] =>
evaluateNullCheck(ar, isNull = false, update)
case _ =>
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
index 4691913c8c..07abe1ed28 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
@@ -20,7 +20,8 @@ package org.apache.spark.sql.catalyst.statsEstimation
import java.sql.Date
import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Filter, Statistics}
+import org.apache.spark.sql.catalyst.plans.LeftOuter
+import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Filter, Join, Statistics}
import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._
import org.apache.spark.sql.types._
@@ -340,6 +341,28 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
expectedRowCount = 2)
}
+ // This is a limitation test. We should remove it after the limitation is removed.
+ test("don't estimate IsNull or IsNotNull if the child is a non-leaf node") {
+ val attrIntLargerRange = AttributeReference("c1", IntegerType)()
+ val colStatIntLargerRange = ColumnStat(distinctCount = 20, min = Some(1), max = Some(20),
+ nullCount = 10, avgLen = 4, maxLen = 4)
+ val smallerTable = childStatsTestPlan(Seq(attrInt), 10L)
+ val largerTable = StatsTestPlan(
+ outputList = Seq(attrIntLargerRange),
+ rowCount = 30,
+ attributeStats = AttributeMap(Seq(attrIntLargerRange -> colStatIntLargerRange)))
+ val nonLeafChild = Join(largerTable, smallerTable, LeftOuter,
+ Some(EqualTo(attrIntLargerRange, attrInt)))
+
+ Seq(IsNull(attrIntLargerRange), IsNotNull(attrIntLargerRange)).foreach { predicate =>
+ validateEstimatedStats(
+ Filter(predicate, nonLeafChild),
+ // column stats don't change
+ Seq(attrInt -> colStatInt, attrIntLargerRange -> colStatIntLargerRange),
+ expectedRowCount = 30)
+ }
+ }
+
private def childStatsTestPlan(outList: Seq[Attribute], tableRowCount: BigInt): StatsTestPlan = {
StatsTestPlan(
outputList = outList,