diff options
author | Davies Liu <davies@databricks.com> | 2016-06-23 11:48:48 -0700 |
---|---|---|
committer | Davies Liu <davies.liu@gmail.com> | 2016-06-23 11:48:48 -0700 |
commit | 10396d9505c752cc18b6424f415d4ff0f460ad65 (patch) | |
tree | b2f45ba2c96e182d2fad139d022651dbfe88494a /sql/catalyst | |
parent | 60398dabc50d402bbab4190fbe94ebed6d3a48dc (diff) | |
download | spark-10396d9505c752cc18b6424f415d4ff0f460ad65.tar.gz spark-10396d9505c752cc18b6424f415d4ff0f460ad65.tar.bz2 spark-10396d9505c752cc18b6424f415d4ff0f460ad65.zip |
[SPARK-16163] [SQL] Cache the statistics for logical plans
## What changes were proposed in this pull request?
This calculation of statistics is not trivial anymore, it could be very slow on large query (for example, TPC-DS Q64 took several minutes to plan).
During the planning of a query, the statistics of any logical plan should not change (even InMemoryRelation), so we should use `lazy val` to cache the statistics.
For InMemoryRelation, the statistics could be updated after materialization, it's only useful when used in another query (before planning), because once we finished the planning, the statistics will not be used anymore.
## How was this patch tested?
Testsed with TPC-DS Q64, it could be planned in a second after the patch.
Author: Davies Liu <davies@databricks.com>
Closes #13871 from davies/fix_statistics.
Diffstat (limited to 'sql/catalyst')
-rw-r--r-- | sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala | 20 |
1 files changed, 10 insertions, 10 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index ff3dcbc957..79f9a210a3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -159,7 +159,7 @@ case class Intersect(left: LogicalPlan, right: LogicalPlan) extends SetOperation } } - override def statistics: Statistics = { + override lazy val statistics: Statistics = { val leftSize = left.statistics.sizeInBytes val rightSize = right.statistics.sizeInBytes val sizeInBytes = if (leftSize < rightSize) leftSize else rightSize @@ -184,7 +184,7 @@ case class Except(left: LogicalPlan, right: LogicalPlan) extends SetOperation(le left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType } && duplicateResolved - override def statistics: Statistics = { + override lazy val statistics: Statistics = { left.statistics.copy() } } @@ -224,7 +224,7 @@ case class Union(children: Seq[LogicalPlan]) extends LogicalPlan { children.length > 1 && childrenResolved && allChildrenCompatible } - override def statistics: Statistics = { + override lazy val statistics: Statistics = { val sizeInBytes = children.map(_.statistics.sizeInBytes).sum Statistics(sizeInBytes = sizeInBytes) } @@ -333,7 +333,7 @@ case class Join( case _ => resolvedExceptNatural } - override def statistics: Statistics = joinType match { + override lazy val statistics: Statistics = joinType match { case LeftAnti | LeftSemi => // LeftSemi and LeftAnti won't ever be bigger than left left.statistics.copy() @@ -351,7 +351,7 @@ case class BroadcastHint(child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output // set isBroadcastable to true so the child will be broadcasted - override def statistics: Statistics = super.statistics.copy(isBroadcastable = true) + override lazy val statistics: Statistics = super.statistics.copy(isBroadcastable = true) } case class InsertIntoTable( @@ -451,7 +451,7 @@ case class Range( override def newInstance(): Range = copy(output = output.map(_.newInstance())) - override def statistics: Statistics = { + override lazy val statistics: Statistics = { val sizeInBytes = LongType.defaultSize * numElements Statistics( sizeInBytes = sizeInBytes ) } @@ -486,7 +486,7 @@ case class Aggregate( override def validConstraints: Set[Expression] = child.constraints.union(getAliasedConstraints(aggregateExpressions)) - override def statistics: Statistics = { + override lazy val statistics: Statistics = { if (groupingExpressions.isEmpty) { super.statistics.copy(sizeInBytes = 1) } else { @@ -586,7 +586,7 @@ case class Expand( override def references: AttributeSet = AttributeSet(projections.flatten.flatMap(_.references)) - override def statistics: Statistics = { + override lazy val statistics: Statistics = { val sizeInBytes = super.statistics.sizeInBytes * projections.length Statistics(sizeInBytes = sizeInBytes) } @@ -706,7 +706,7 @@ case class Sample( override def output: Seq[Attribute] = child.output - override def statistics: Statistics = { + override lazy val statistics: Statistics = { val ratio = upperBound - lowerBound // BigInt can't multiply with Double var sizeInBytes = child.statistics.sizeInBytes * (ratio * 100).toInt / 100 @@ -753,5 +753,5 @@ case object OneRowRelation extends LeafNode { * * [[LeafNode]]s must override this. */ - override def statistics: Statistics = Statistics(sizeInBytes = 1) + override lazy val statistics: Statistics = Statistics(sizeInBytes = 1) } |