diff options
author | wangzhenhua <wangzhenhua@huawei.com> | 2017-03-06 21:45:36 -0800 |
---|---|---|
committer | Xiao Li <gatorsmile@gmail.com> | 2017-03-06 21:45:36 -0800 |
commit | 9909f6d361fdf2b7ef30fa7fbbc91e00f2999794 (patch) | |
tree | 4c8e52db7af59664f066bd3b06ff93d576dccecc /sql/core/src | |
parent | b0a5cd89097c563e9949d8cfcf84d18b03b8d24c (diff) | |
download | spark-9909f6d361fdf2b7ef30fa7fbbc91e00f2999794.tar.gz spark-9909f6d361fdf2b7ef30fa7fbbc91e00f2999794.tar.bz2 spark-9909f6d361fdf2b7ef30fa7fbbc91e00f2999794.zip |
[SPARK-19350][SQL] Cardinality estimation of Limit and Sample
## What changes were proposed in this pull request?
Before this pr, LocalLimit/GlobalLimit/Sample propagates the same row count and column stats from its child, which is incorrect.
We can get the correct rowCount in Statistics for GlobalLimit/Sample whether cbo is enabled or not.
We don't know the rowCount for LocalLimit because we don't know the partition number at that time. Column stats should not be propagated because we don't know the distribution of columns after Limit or Sample.
## How was this patch tested?
Added test cases.
Author: wangzhenhua <wangzhenhua@huawei.com>
Closes #16696 from wzhfy/limitEstimation.
Diffstat (limited to 'sql/core/src')
-rw-r--r-- | sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala | 24 |
1 file changed, 0 insertions, 24 deletions
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index bbb31dbc8f..1f547c5a2a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -112,30 +112,6 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared spark.sessionState.conf.autoBroadcastJoinThreshold) } - test("estimates the size of limit") { - withTempView("test") { - Seq(("one", 1), ("two", 2), ("three", 3), ("four", 4)).toDF("k", "v") - .createOrReplaceTempView("test") - Seq((0, 1), (1, 24), (2, 48)).foreach { case (limit, expected) => - val df = sql(s"""SELECT * FROM test limit $limit""") - - val sizesGlobalLimit = df.queryExecution.analyzed.collect { case g: GlobalLimit => - g.stats(conf).sizeInBytes - } - assert(sizesGlobalLimit.size === 1, s"Size wrong for:\n ${df.queryExecution}") - assert(sizesGlobalLimit.head === BigInt(expected), - s"expected exact size $expected for table 'test', got: ${sizesGlobalLimit.head}") - - val sizesLocalLimit = df.queryExecution.analyzed.collect { case l: LocalLimit => - l.stats(conf).sizeInBytes - } - assert(sizesLocalLimit.size === 1, s"Size wrong for:\n ${df.queryExecution}") - assert(sizesLocalLimit.head === BigInt(expected), - s"expected exact size $expected for table 'test', got: ${sizesLocalLimit.head}") - } - } - } - test("column stats round trip serialization") { // Make sure we serialize and then deserialize and we will get the result data val df = data.toDF(stats.keys.toSeq :+ "carray" : _*) |