diff options
author | Wenchen Fan <wenchen@databricks.com> | 2017-02-15 08:15:03 -0800 |
---|---|---|
committer | Wenchen Fan <wenchen@databricks.com> | 2017-02-15 08:15:03 -0800 |
commit | 8b75f8c1c9acae9c5c0dee92ad4f50195bf185d4 (patch) | |
tree | 1a4b683efb6bd3c28ffd3021a0a8c58494c3dd06 /sql/hive | |
parent | 733c59ec1ee5746c322e68459cd06241f5fa0903 (diff) | |
download | spark-8b75f8c1c9acae9c5c0dee92ad4f50195bf185d4.tar.gz spark-8b75f8c1c9acae9c5c0dee92ad4f50195bf185d4.tar.bz2 spark-8b75f8c1c9acae9c5c0dee92ad4f50195bf185d4.zip |
[SPARK-19587][SQL] bucket sorting columns should not be picked from partition columns
## What changes were proposed in this pull request?
We will throw an exception if bucket columns are part of partition columns, this should also apply to sort columns.
This PR also moves the checking logic from `DataFrameWriter` to `PreprocessTableCreation`, which is the central place for checking and normalization.
## How was this patch tested?
updated test.
Author: Wenchen Fan <wenchen@databricks.com>
Closes #16931 from cloud-fan/bucket.
Diffstat (limited to 'sql/hive')
-rw-r--r-- | sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala | 15 |
1 file changed, 8 insertions, 7 deletions
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala index 2eafe18b85..8528dfc4ce 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala @@ -169,19 +169,20 @@ class BucketedWriteSuite extends QueryTest with SQLTestUtils with TestHiveSingle } } - test("write bucketed data with the overlapping bucketBy and partitionBy columns") { - intercept[AnalysisException](df.write + test("write bucketed data with the overlapping bucketBy/sortBy and partitionBy columns") { + val e1 = intercept[AnalysisException](df.write .partitionBy("i", "j") .bucketBy(8, "j", "k") .sortBy("k") .saveAsTable("bucketed_table")) - } + assert(e1.message.contains("bucketing column 'j' should not be part of partition columns")) - test("write bucketed data with the identical bucketBy and partitionBy columns") { - intercept[AnalysisException](df.write - .partitionBy("i") - .bucketBy(8, "i") + val e2 = intercept[AnalysisException](df.write + .partitionBy("i", "j") + .bucketBy(8, "k") + .sortBy("i") .saveAsTable("bucketed_table")) + assert(e2.message.contains("bucket sorting column 'i' should not be part of partition columns")) } test("write bucketed data without partitionBy") { |