about | summary | refs | log | tree | commit | diff
path: root/sql
diff options
context:
space:
mode:
authorReynold Xin <rxin@databricks.com>2016-12-14 21:22:49 +0100
committerHerman van Hovell <hvanhovell@databricks.com>2016-12-14 21:22:49 +0100
commit5d799473696a15fddd54ec71a93b6f8cb169810c (patch)
tree128a9254f375abfb55340f255b6b18d531921ecf /sql
parent89ae26dcdb73266fbc3a8b6da9f5dff30dc4ec95 (diff)
downloadspark-5d799473696a15fddd54ec71a93b6f8cb169810c.tar.gz
spark-5d799473696a15fddd54ec71a93b6f8cb169810c.tar.bz2
spark-5d799473696a15fddd54ec71a93b6f8cb169810c.zip
[SPARK-18853][SQL] Project (UnaryNode) is way too aggressive in estimating statistics
## What changes were proposed in this pull request?

This patch reduces the default number-of-elements estimation for arrays and maps from 100 to 1. The issue with the 100 number is that when nested (e.g. an array of maps), 100 * 100 would be used as the default size. This sounds like just an overestimation, which doesn't seem that bad (since it is usually better to overestimate than underestimate). However, due to the way we compute the size estimate for Project (new estimated column size / old estimated column size), this overestimation can become underestimation. In this case it is actually safer in general to assume 1 default element.

## How was this patch tested?

This should be covered by existing tests.

Author: Reynold Xin <rxin@databricks.com>

Closes #16274 from rxin/SPARK-18853.
Diffstat (limited to 'sql')
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala | 6
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala | 6
-rw-r--r-- sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala | 14
3 files changed, 13 insertions, 13 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
index d409271fbc..98efba199a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
@@ -78,10 +78,10 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT
("containsNull" -> containsNull)
/**
- * The default size of a value of the ArrayType is 100 * the default size of the element type.
- * (We assume that there are 100 elements).
+ * The default size of a value of the ArrayType is the default size of the element type.
+ * We assume that there is only 1 element on average in an array. See SPARK-18853.
*/
- override def defaultSize: Int = 100 * elementType.defaultSize
+ override def defaultSize: Int = 1 * elementType.defaultSize
override def simpleString: String = s"array<${elementType.simpleString}>"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
index fbf3a61786..6691b81dce 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
@@ -56,10 +56,10 @@ case class MapType(
/**
* The default size of a value of the MapType is
- * 100 * (the default size of the key type + the default size of the value type).
- * (We assume that there are 100 elements).
+ * (the default size of the key type + the default size of the value type).
+ * We assume that there is only 1 element on average in a map. See SPARK-18853.
*/
- override def defaultSize: Int = 100 * (keyType.defaultSize + valueType.defaultSize)
+ override def defaultSize: Int = 1 * (keyType.defaultSize + valueType.defaultSize)
override def simpleString: String = s"map<${keyType.simpleString},${valueType.simpleString}>"
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
index b8ab9a9963..12d2c00dc9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -253,7 +253,7 @@ class DataTypeSuite extends SparkFunSuite {
checkDataTypeJsonRepr(structType)
def checkDefaultSize(dataType: DataType, expectedDefaultSize: Int): Unit = {
- test(s"Check the default size of ${dataType}") {
+ test(s"Check the default size of $dataType") {
assert(dataType.defaultSize === expectedDefaultSize)
}
}
@@ -272,18 +272,18 @@ class DataTypeSuite extends SparkFunSuite {
checkDefaultSize(TimestampType, 8)
checkDefaultSize(StringType, 20)
checkDefaultSize(BinaryType, 100)
- checkDefaultSize(ArrayType(DoubleType, true), 800)
- checkDefaultSize(ArrayType(StringType, false), 2000)
- checkDefaultSize(MapType(IntegerType, StringType, true), 2400)
- checkDefaultSize(MapType(IntegerType, ArrayType(DoubleType), false), 80400)
- checkDefaultSize(structType, 812)
+ checkDefaultSize(ArrayType(DoubleType, true), 8)
+ checkDefaultSize(ArrayType(StringType, false), 20)
+ checkDefaultSize(MapType(IntegerType, StringType, true), 24)
+ checkDefaultSize(MapType(IntegerType, ArrayType(DoubleType), false), 12)
+ checkDefaultSize(structType, 20)
def checkEqualsIgnoreCompatibleNullability(
from: DataType,
to: DataType,
expected: Boolean): Unit = {
val testName =
- s"equalsIgnoreCompatibleNullability: (from: ${from}, to: ${to})"
+ s"equalsIgnoreCompatibleNullability: (from: $from, to: $to)"
test(testName) {
assert(DataType.equalsIgnoreCompatibleNullability(from, to) === expected)
}