From b4c32c4952f7af2733258aa4e27f21e8832c8a3a Mon Sep 17 00:00:00 2001
From: Yadong Qi <qiyadong2010@gmail.com>
Date: Sat, 28 May 2016 10:19:29 -0700
Subject: [SPARK-15549][SQL] Disable bucketing when the output doesn't contain
 all bucketing columns

## What changes were proposed in this pull request?
I create a bucketed table bucketed_table with bucket column i,
```scala
case class Data(i: Int, j: Int, k: Int)
sc.makeRDD(Array((1, 2, 3))).map(x => Data(x._1, x._2, x._3)).toDF.write.bucketBy(2, "i").saveAsTable("bucketed_table")
```

and I run the following SQLs:
```sql
SELECT j FROM bucketed_table;
Error in query: bucket column i not found in existing columns (j);

SELECT j, MAX(k) FROM bucketed_table GROUP BY j;
Error in query: bucket column i not found in existing columns (j, k);
```

I think we should add a check that, we only enable bucketing when it satisfies all conditions below:
1. the conf is enabled
2. the relation is bucketed
3. the output contains all bucketing columns

## How was this patch tested?
Updated test cases to reflect the changes.

Author: Yadong Qi <qiyadong2010@gmail.com>

Closes #13321 from watermen/SPARK-15549.
---
 .../org/apache/spark/sql/sources/BucketedReadSuite.scala      | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'sql/hive')

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala
index f9891ac571..bab0092c37 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala
@@ -362,4 +362,15 @@ class BucketedReadSuite extends QueryTest with SQLTestUtils with TestHiveSinglet
       assert(error.toString contains "Invalid bucket file")
     }
   }
+
+  test("disable bucketing when the output doesn't contain all bucketing columns") {
+    withTable("bucketed_table") {
+      df1.write.format("parquet").bucketBy(8, "i").saveAsTable("bucketed_table")
+
+      checkAnswer(hiveContext.table("bucketed_table").select("j"), df1.select("j"))
+
+      checkAnswer(hiveContext.table("bucketed_table").groupBy("j").agg(max("k")),
+        df1.groupBy("j").agg(max("k")))
+    }
+  }
 }
-- 
cgit v1.2.3