author     Davies Liu <davies@databricks.com>    2016-02-10 23:23:01 -0800
committer  Davies Liu <davies.liu@gmail.com>     2016-02-10 23:23:01 -0800
commit     8f744fe3d931c2380613b8e5bafa1bb1fd292839 (patch)
tree       9f2b217223475d6cf242146b628720cfe51f0ef9 /sql/hive
parent     b5761d150b66ee0ae5f1be897d9d7a1abb039884 (diff)
[SPARK-13234] [SQL] remove duplicated SQL metrics
Many SQL operators currently have metrics for both input and output rows. Since the number of input rows is always exactly the number of output rows of the child, keeping only the output-row metric is enough. Now that whole-stage codegen has improved performance, the overhead of SQL metrics is no longer trivial, so we should avoid it where it is unnecessary.

This PR removes all the SQL metrics for the number of input rows, adds a number-of-output-rows metric to every LeafNode, and also removes the SQL metrics from operators whose output row count equals their input row count (for example, Project, which does not need one).

The new SQL UI will look like:

![metrics](https://cloud.githubusercontent.com/assets/40902/12965227/63614e5e-d009-11e5-88b3-84fea04f9c20.png)

Author: Davies Liu <davies@databricks.com>

Closes #11163 from davies/remove_metrics.
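The diff below follows a simple pattern: the operator declares a single `numOutputRows` metric and bumps it as each row leaves the operator, instead of instrumenting both the input and output sides. Here is a minimal, Spark-free sketch of that idea; `LongMetric` and `countingIterator` are illustrative stand-ins, not Spark's actual `SQLMetrics` API.

```scala
// Sketch: count rows only on the output side of an operator by wrapping
// its result iterator. Mirrors `iter.map { r => numOutputRows += 1; proj(r) }`
// in HiveTableScan, but with hypothetical, self-contained types.
object OutputRowCountingSketch {

  // Stand-in for a SQL long metric: a simple mutable counter.
  final class LongMetric(val name: String) {
    private var count = 0L
    def +=(n: Long): Unit = count += n
    def value: Long = count
  }

  // Wrap an iterator so each emitted element increments the metric.
  def countingIterator[A, B](iter: Iterator[A], metric: LongMetric)(f: A => B): Iterator[B] =
    iter.map { a =>
      metric += 1
      f(a)
    }

  def main(args: Array[String]): Unit = {
    val numOutputRows = new LongMetric("number of output rows")
    val rows = Iterator("a", "b", "c")
    val projected = countingIterator(rows, numOutputRows)(_.toUpperCase)
    println(projected.toList)     // List(A, B, C)
    println(numOutputRows.value)  // 3 -- counted lazily, once per output row
  }
}
```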
Diffstat (limited to 'sql/hive')
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala  10
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
index eff8833e92..235b80b7c6 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
@@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution._
+import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.hive._
import org.apache.spark.sql.types.{BooleanType, DataType}
import org.apache.spark.util.Utils
@@ -52,6 +53,9 @@ case class HiveTableScan(
require(partitionPruningPred.isEmpty || relation.hiveQlTable.isPartitioned,
"Partition pruning predicates only supported for partitioned tables.")
+ private[sql] override lazy val metrics = Map(
+ "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"))
+
override def producedAttributes: AttributeSet = outputSet ++
AttributeSet(partitionPruningPred.flatMap(_.references))
@@ -146,9 +150,13 @@ case class HiveTableScan(
prunePartitions(relation.getHiveQlPartitions(partitionPruningPred)))
}
}
+ val numOutputRows = longMetric("numOutputRows")
rdd.mapPartitionsInternal { iter =>
val proj = UnsafeProjection.create(schema)
- iter.map(proj)
+ iter.map { r =>
+ numOutputRows += 1
+ proj(r)
+ }
}
}