path: root/sql/hive/src/test
author    Wenchen Fan <wenchen@databricks.com>  2016-11-17 17:31:12 -0800
committer Yin Huai <yhuai@databricks.com>       2016-11-17 17:31:12 -0800
commit    ce13c2672318242748f7520ed4ce6bcfad4fb428 (patch)
tree      76d5ca2b6d5bd8b69dcf4bf97bb77d9c22d67b3c /sql/hive/src/test
parent    b0aa1aa1af6c513a6a881eaea96abdd2b480ef98 (diff)
download  spark-ce13c2672318242748f7520ed4ce6bcfad4fb428.tar.gz
spark-ce13c2672318242748f7520ed4ce6bcfad4fb428.tar.bz2
spark-ce13c2672318242748f7520ed4ce6bcfad4fb428.zip
[SPARK-18360][SQL] default table path of tables in default database should depend on the location of default database
## What changes were proposed in this pull request?

The current semantics of the warehouse config are:

1. It's a static config, which means you can't change it once your Spark application is launched.
2. Once a database is created, its location won't change even if the warehouse path config is changed.
3. The default database is a special case: although its location is fixed, the locations of tables created in it are not. If a Spark app starts with warehouse path B (while the location of the default database is A) and a user creates a table `tbl` in the default database, its location will be `B/tbl` instead of `A/tbl`. If the user then changes the warehouse path config to C and creates another table `tbl2`, its location will still be `B/tbl2` instead of `C/tbl2`.

Rule 3 doesn't make sense and I think we made it by mistake, not intentionally. Data source tables don't follow rule 3 and treat the default database like normal ones. This PR fixes Hive serde tables to make them consistent with data source tables.

## How was this patch tested?

HiveSparkSubmitSuite

Author: Wenchen Fan <wenchen@databricks.com>

Closes #15812 from cloud-fan/default-db.
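To make the new semantics concrete, here is a minimal sketch (not part of the patch) that creates a managed table in the default database and checks that its path is derived from the default database's location rather than the current warehouse setting. The object name `DefaultDbTablePathSketch`, the table name `tbl`, and the `local[*]` master are illustrative; it assumes a Spark version in which `SparkSession.sharedState` is exposed as a developer API, since it reads table metadata through the same external catalog the new test uses.

import org.apache.spark.sql.SparkSession

object DefaultDbTablePathSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")                  // illustrative; any master works
      .config("spark.ui.enabled", "false")
      .enableHiveSupport()
      .getOrCreate()

    // Location of the default database ("A" in the description above).
    val defaultDbLocation = spark.catalog.getDatabase("default").locationUri

    // Create a managed table in the default database.
    spark.sql("CREATE TABLE tbl (i INT)")

    // Read the table metadata back through the external catalog, as the new test does.
    val tableLocation =
      spark.sharedState.externalCatalog.getTable("default", "tbl").storage.locationUri.get

    // After SPARK-18360 the table path is derived from the default database's location,
    // consistent with data source tables.
    assert(tableLocation.toString.contains(defaultDbLocation))

    spark.sql("DROP TABLE tbl")
    spark.stop()
  }
}

The assertion holds whether the created table ends up as a Hive serde table or a data source table, since after this change both derive their default path from the default database's location.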
Diffstat (limited to 'sql/hive/src/test')
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala  76
1 file changed, 64 insertions, 12 deletions
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
index fbd705172c..a670560c59 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
@@ -24,6 +24,7 @@ import java.util.Date
import scala.collection.mutable.ArrayBuffer
import scala.tools.nsc.Properties
+import org.apache.hadoop.fs.Path
import org.scalatest.{BeforeAndAfterEach, Matchers}
import org.scalatest.concurrent.Timeouts
import org.scalatest.exceptions.TestFailedDueToTimeoutException
@@ -33,11 +34,12 @@ import org.apache.spark._
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{QueryTest, Row, SparkSession}
import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
-import org.apache.spark.sql.catalyst.catalog.{CatalogFunction, FunctionResource, JarResource}
+import org.apache.spark.sql.catalyst.catalog._
+import org.apache.spark.sql.execution.command.DDLUtils
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.{TestHive, TestHiveContext}
import org.apache.spark.sql.test.ProcessTestUtils.ProcessOutputCapturer
-import org.apache.spark.sql.types.DecimalType
+import org.apache.spark.sql.types.{DecimalType, StructType}
import org.apache.spark.util.{ResetSystemProperties, Utils}
/**
@@ -295,6 +297,20 @@ class HiveSparkSubmitSuite
runSparkSubmit(args)
}
+ test("SPARK-18360: default table path of tables in default database should depend on the " +
+ "location of default database") {
+ val unusedJar = TestUtils.createJarWithClasses(Seq.empty)
+ val args = Seq(
+ "--class", SPARK_18360.getClass.getName.stripSuffix("$"),
+ "--name", "SPARK-18360",
+ "--master", "local-cluster[2,1,1024]",
+ "--conf", "spark.ui.enabled=false",
+ "--conf", "spark.master.rest.enabled=false",
+ "--driver-java-options", "-Dderby.system.durability=test",
+ unusedJar.toString)
+ runSparkSubmit(args)
+ }
+
// NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly.
// This is copied from org.apache.spark.deploy.SparkSubmitSuite
private def runSparkSubmit(args: Seq[String]): Unit = {
@@ -397,11 +413,7 @@ object SetWarehouseLocationTest extends Logging {
def main(args: Array[String]): Unit = {
Utils.configTestLog4j("INFO")
- val sparkConf = new SparkConf(loadDefaults = true)
- val builder = SparkSession.builder()
- .config(sparkConf)
- .config("spark.ui.enabled", "false")
- .enableHiveSupport()
+ val sparkConf = new SparkConf(loadDefaults = true).set("spark.ui.enabled", "false")
val providedExpectedWarehouseLocation =
sparkConf.getOption("spark.sql.test.expectedWarehouseDir")
@@ -410,7 +422,7 @@ object SetWarehouseLocationTest extends Logging {
// If spark.sql.test.expectedWarehouseDir is set, the warehouse dir is set
// through spark-summit. So, neither spark.sql.warehouse.dir nor
// hive.metastore.warehouse.dir is set at here.
- (builder.getOrCreate(), warehouseDir)
+ (new TestHiveContext(new SparkContext(sparkConf)).sparkSession, warehouseDir)
case None =>
val warehouseLocation = Utils.createTempDir()
warehouseLocation.delete()
@@ -420,10 +432,10 @@ object SetWarehouseLocationTest extends Logging {
// spark.sql.warehouse.dir and hive.metastore.warehouse.dir.
// We are expecting that the value of spark.sql.warehouse.dir will override the
// value of hive.metastore.warehouse.dir.
- val session = builder
- .config("spark.sql.warehouse.dir", warehouseLocation.toString)
- .config("hive.metastore.warehouse.dir", hiveWarehouseLocation.toString)
- .getOrCreate()
+ val session = new TestHiveContext(new SparkContext(sparkConf
+ .set("spark.sql.warehouse.dir", warehouseLocation.toString)
+ .set("hive.metastore.warehouse.dir", hiveWarehouseLocation.toString)))
+ .sparkSession
(session, warehouseLocation.toString)
}
@@ -801,3 +813,43 @@ object SPARK_14244 extends QueryTest {
}
}
}
+
+object SPARK_18360 {
+ def main(args: Array[String]): Unit = {
+ val spark = SparkSession.builder()
+ .config("spark.ui.enabled", "false")
+ .enableHiveSupport().getOrCreate()
+
+ val defaultDbLocation = spark.catalog.getDatabase("default").locationUri
+ assert(new Path(defaultDbLocation) == new Path(spark.sharedState.warehousePath))
+
+ val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client
+
+ try {
+ val tableMeta = CatalogTable(
+ identifier = TableIdentifier("test_tbl", Some("default")),
+ tableType = CatalogTableType.MANAGED,
+ storage = CatalogStorageFormat.empty,
+ schema = new StructType().add("i", "int"),
+ provider = Some(DDLUtils.HIVE_PROVIDER))
+
+ val newWarehousePath = Utils.createTempDir().getAbsolutePath
+ hiveClient.runSqlHive(s"SET hive.metastore.warehouse.dir=$newWarehousePath")
+ hiveClient.createTable(tableMeta, ignoreIfExists = false)
+ val rawTable = hiveClient.getTable("default", "test_tbl")
+ // Hive will use the value of `hive.metastore.warehouse.dir` to generate default table
+ // location for tables in default database.
+ assert(rawTable.storage.locationUri.get.contains(newWarehousePath))
+ hiveClient.dropTable("default", "test_tbl", ignoreIfNotExists = false, purge = false)
+
+ spark.sharedState.externalCatalog.createTable(tableMeta, ignoreIfExists = false)
+ val readBack = spark.sharedState.externalCatalog.getTable("default", "test_tbl")
+ // Spark SQL will use the location of default database to generate default table
+ // location for tables in default database.
+ assert(readBack.storage.locationUri.get.contains(defaultDbLocation))
+ } finally {
+ hiveClient.dropTable("default", "test_tbl", ignoreIfNotExists = true, purge = false)
+ hiveClient.runSqlHive(s"SET hive.metastore.warehouse.dir=$defaultDbLocation")
+ }
+ }
+}