[SPARK-14838] [SQL] Set default size for ObjectType to avoid failure when estimating sizeInBytes in ObjectProducer

## What changes were proposed in this pull request?

Some logical plans produce domain objects, whose data type is `ObjectType`. Because the size of an `ObjectType` value cannot be estimated, `ObjectType.defaultSize` currently throws an `UnsupportedOperationException`, so estimating `sizeInBytes` for such plans fails. This patch gives `ObjectType` a default size so that the estimation no longer fails.

## How was this patch tested?

A new test in `DatasetSuite`.

Author: Liang-Chi Hsieh <simonh@tw.ibm.com>

Closes #12599 from viirya/skip-broadcast-objectproducer.
author: Liang-Chi Hsieh <simonh@tw.ibm.com> 2016-04-23 21:15:31 -0700
committer: Davies Liu <davies.liu@gmail.com> 2016-04-23 21:15:31 -0700
commit: ba5e0b87a043e46e9599695c82d90e7572185aa5 (patch)
tree: 7c13d9c4015caac6b187f858cf07ccac70db28da
parent: 1b7eab74e64f554bbf892c8ef7b7ec00b359d2c0 (diff)
download: spark-ba5e0b87a043e46e9599695c82d90e7572185aa5.tar.gz
spark-ba5e0b87a043e46e9599695c82d90e7572185aa5.tar.bz2
spark-ba5e0b87a043e46e9599695c82d90e7572185aa5.zip
2 files changed, 24 insertions, 2 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala
index b7b1acc582..c741a2dd3e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala
@@ -37,8 +37,7 @@ private[sql] object ObjectType extends AbstractDataType {
  * outside of the execution engine.
  */
 private[sql] case class ObjectType(cls: Class[_]) extends DataType {
-  override def defaultSize: Int =
-    throw new UnsupportedOperationException("No size estimation available for objects.")
+  override def defaultSize: Int = 4096
 
   def asNullable: DataType = this
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index a6e3bd3a91..eee21acf75 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -630,6 +630,29 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     // Make sure the generated code for this plan can compile and execute.
     checkDataset(wideDF.map(_.getLong(0)), 0L until 10 : _*)
   }
+
+  test("SPARK-14838: estimating sizeInBytes in operators with ObjectProducer shouldn't fail") {
+    val dataset = Seq(
+      (0, 3, 54f),
+      (0, 4, 44f),
+      (0, 5, 42f),
+      (1, 3, 39f),
+      (1, 5, 33f),
+      (1, 4, 26f),
+      (2, 3, 51f),
+      (2, 5, 45f),
+      (2, 4, 30f)
+    ).toDF("user", "item", "rating")
+
+    val actual = dataset
+      .select("user", "item")
+      .as[(Int, Int)]
+      .groupByKey(_._1)
+      .mapGroups { case (src, ids) => (src, ids.map(_._2).toArray) }
+      .toDF("id", "actual")
+
+    dataset.join(actual, dataset("user") === actual("id")).collect()
+  }
 }
 
 case class OtherTuple(_1: String, _2: Int)
author	Liang-Chi Hsieh <simonh@tw.ibm.com>	2016-04-23 21:15:31 -0700
committer	Davies Liu <davies.liu@gmail.com>	2016-04-23 21:15:31 -0700
commit	ba5e0b87a043e46e9599695c82d90e7572185aa5 (patch)
tree	7c13d9c4015caac6b187f858cf07ccac70db28da
parent	1b7eab74e64f554bbf892c8ef7b7ec00b359d2c0 (diff)
download	spark-ba5e0b87a043e46e9599695c82d90e7572185aa5.tar.gz spark-ba5e0b87a043e46e9599695c82d90e7572185aa5.tar.bz2 spark-ba5e0b87a043e46e9599695c82d90e7572185aa5.zip