aboutsummaryrefslogtreecommitdiff
path: root/sql/catalyst
diff options
context:
space:
mode:
authorCheng Lian <lian@databricks.com>2014-10-26 16:10:09 -0700
committerMichael Armbrust <michael@databricks.com>2014-10-26 16:10:09 -0700
commit2838bf8aadd5228829c1a869863bc4da7877fdfb (patch)
tree474e9dc739631b81c20c812c38413d969fe47f2c /sql/catalyst
parent879a16585808e8fe34bdede741565efc4c9f9bb3 (diff)
downloadspark-2838bf8aadd5228829c1a869863bc4da7877fdfb.tar.gz
spark-2838bf8aadd5228829c1a869863bc4da7877fdfb.tar.bz2
spark-2838bf8aadd5228829c1a869863bc4da7877fdfb.zip
[SPARK-3537][SPARK-3914][SQL] Refines in-memory columnar table statistics
This PR refines in-memory columnar table statistics: 1. adds 2 more statistics for in-memory table columns: `count` and `sizeInBytes` 1. adds filter pushdown support for `IS NULL` and `IS NOT NULL`. 1. caches and propagates statistics in `InMemoryRelation` once the underlying cached RDD is materialized. Statistics are collected to driver side with an accumulator. This PR also fixes SPARK-3914 by properly propagating in-memory statistics. Author: Cheng Lian <lian@databricks.com> Closes #2860 from liancheng/propagates-in-mem-stats and squashes the following commits: 0cc5271 [Cheng Lian] Restricts visibility of o.a.s.s.c.p.l.Statistics c5ff904 [Cheng Lian] Fixes test table name conflict a8c818d [Cheng Lian] Refines tests 1d01074 [Cheng Lian] Bug fix: shouldn't call STRING.actualSize on null string value 7dc6a34 [Cheng Lian] Adds more in-memory table statistics and propagates them properly
Diffstat (limited to 'sql/catalyst')
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala10
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala31
2 files changed, 19 insertions, 22 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala
index 8364379644..82e760b6c6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala
@@ -23,8 +23,7 @@ package org.apache.spark.sql.catalyst.expressions
* of the name, or the expected nullability).
*/
object AttributeMap {
- def apply[A](kvs: Seq[(Attribute, A)]) =
- new AttributeMap(kvs.map(kv => (kv._1.exprId, (kv._1, kv._2))).toMap)
+ def apply[A](kvs: Seq[(Attribute, A)]) = new AttributeMap(kvs.map(kv => (kv._1.exprId, kv)).toMap)
}
class AttributeMap[A](baseMap: Map[ExprId, (Attribute, A)])
@@ -32,10 +31,9 @@ class AttributeMap[A](baseMap: Map[ExprId, (Attribute, A)])
override def get(k: Attribute): Option[A] = baseMap.get(k.exprId).map(_._2)
- override def + [B1 >: A](kv: (Attribute, B1)): Map[Attribute, B1] =
- (baseMap.map(_._2) + kv).toMap
+ override def + [B1 >: A](kv: (Attribute, B1)): Map[Attribute, B1] = baseMap.values.toMap + kv
- override def iterator: Iterator[(Attribute, A)] = baseMap.map(_._2).iterator
+ override def iterator: Iterator[(Attribute, A)] = baseMap.valuesIterator
- override def -(key: Attribute): Map[Attribute, A] = (baseMap.map(_._2) - key).toMap
+ override def -(key: Attribute): Map[Attribute, A] = baseMap.values.toMap - key
}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index 882e9c6110..ed578e081b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -26,25 +26,24 @@ import org.apache.spark.sql.catalyst.trees.TreeNode
import org.apache.spark.sql.catalyst.types.StructType
import org.apache.spark.sql.catalyst.trees
+/**
+ * Estimates of various statistics. The default estimation logic simply lazily multiplies the
+ * corresponding statistic produced by the children. To override this behavior, override
+ * `statistics` and assign it an overriden version of `Statistics`.
+ *
+ * '''NOTE''': concrete and/or overriden versions of statistics fields should pay attention to the
+ * performance of the implementations. The reason is that estimations might get triggered in
+ * performance-critical processes, such as query plan planning.
+ *
+ * @param sizeInBytes Physical size in bytes. For leaf operators this defaults to 1, otherwise it
+ * defaults to the product of children's `sizeInBytes`.
+ */
+private[sql] case class Statistics(sizeInBytes: BigInt)
+
abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
self: Product =>
- /**
- * Estimates of various statistics. The default estimation logic simply lazily multiplies the
- * corresponding statistic produced by the children. To override this behavior, override
- * `statistics` and assign it an overriden version of `Statistics`.
- *
- * '''NOTE''': concrete and/or overriden versions of statistics fields should pay attention to the
- * performance of the implementations. The reason is that estimations might get triggered in
- * performance-critical processes, such as query plan planning.
- *
- * @param sizeInBytes Physical size in bytes. For leaf operators this defaults to 1, otherwise it
- * defaults to the product of children's `sizeInBytes`.
- */
- case class Statistics(
- sizeInBytes: BigInt
- )
- lazy val statistics: Statistics = {
+ def statistics: Statistics = {
if (children.size == 0) {
throw new UnsupportedOperationException(s"LeafNode $nodeName must implement statistics.")
}