author     eyal farago <eyal farago>                      2016-11-02 11:12:20 +0100
committer  Herman van Hovell <hvanhovell@databricks.com>  2016-11-02 11:12:20 +0100
commit     f151bd1af8a05d4b6c901ebe6ac0b51a4a1a20df (patch)
tree       ca9c328efdb1cf9961223196b0396800714eb72b /sql/hive
parent     9c8deef64efee20a0ddc9b612f90e77c80aede60 (diff)
[SPARK-16839][SQL] Simplify Struct creation code path
## What changes were proposed in this pull request?

Simplify struct creation, especially the aspect of `CleanupAliases`, which missed some aliases when handling trees created by `CreateStruct`.

This PR includes:

1. A failing test (create a struct with nested aliases; some of the aliases survive `CleanupAliases`).
2. A fix that transforms `CreateStruct` into a `CreateNamedStruct` constructor, effectively eliminating `CreateStruct` from all expression trees.
3. A `NamePlaceHolder` used by `CreateStruct` when column names cannot be extracted from an unresolved `NamedExpression`.
4. A new Analyzer rule that resolves `NamePlaceHolder` into a string literal once the `NamedExpression` is resolved.
5. Simplified `CleanupAliases` code, since it no longer has to deal with `CreateStruct`'s top-level columns.

## How was this patch tested?

Ran all test suites in the package org.apache.spark.sql, especially the analysis suite; verified that the added test initially fails and that the entire analysis package passes after applying the fix. Modified a few tests that expected `CreateStruct`, which is now transformed into `CreateNamedStruct`.

Author: eyal farago <eyal farago>
Author: Herman van Hovell <hvanhovell@databricks.com>
Author: eyal farago <eyal.farago@gmail.com>
Author: Eyal Farago <eyal.farago@actimize.com>
Author: Hyukjin Kwon <gurwls223@gmail.com>
Author: eyalfa <eyal.farago@gmail.com>

Closes #15718 from hvanhovell/SPARK-16839-2.
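To make item 2 concrete, here is a minimal, self-contained sketch of the rewrite; the `Expr` hierarchy and `toNamedStruct` below are simplified stand-ins for illustration, not the actual Catalyst classes:

```scala
// Toy expression model (not the real Catalyst API) showing how a
// CreateStruct(children) can be rewritten into a CreateNamedStruct whose
// children alternate between a field name and the corresponding value.
sealed trait Expr
case class Literal(value: String) extends Expr            // string literal
case class Alias(child: Expr, name: String) extends Expr  // named expression
case object NamePlaceholder extends Expr                  // name not yet known
case class CreateStruct(children: Seq[Expr]) extends Expr
case class CreateNamedStruct(nameValuePairs: Seq[Expr]) extends Expr

// Pair every child with a name; when no name can be extracted yet, emit a
// placeholder that a later analyzer rule resolves into a string literal.
def toNamedStruct(struct: CreateStruct): CreateNamedStruct =
  CreateNamedStruct(struct.children.flatMap {
    case a @ Alias(_, name) => Seq(Literal(name), a)
    case other              => Seq(NamePlaceholder, other)
  })
```

Once the underlying `NamedExpression` resolves, the new analyzer rule (item 4) replaces each placeholder with the resolved name, which is what lets `CleanupAliases` stop special-casing `CreateStruct`.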
Diffstat (limited to 'sql/hive')
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala             | 20
-rw-r--r--  sql/hive/src/test/resources/sqlgen/subquery_in_having_2.sql                       |  2
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala | 12
3 files changed, 22 insertions, 12 deletions
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 6eb571b91f..90000445df 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -190,6 +190,12 @@ private[hive] class TestHiveSparkSession(
new File(Thread.currentThread().getContextClassLoader.getResource(path).getFile)
}
+ private def quoteHiveFile(path : String) = if (Utils.isWindows) {
+ getHiveFile(path).getPath.replace('\\', '/')
+ } else {
+ getHiveFile(path).getPath
+ }
+
def getWarehousePath(): String = {
val tempConf = new SQLConf
sc.conf.getAll.foreach { case (k, v) => tempConf.setConfString(k, v) }
@@ -225,16 +231,16 @@ private[hive] class TestHiveSparkSession(
val hiveQTestUtilTables: Seq[TestTable] = Seq(
TestTable("src",
"CREATE TABLE src (key INT, value STRING)".cmd,
- s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/kv1.txt")}' INTO TABLE src".cmd),
+ s"LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv1.txt")}' INTO TABLE src".cmd),
TestTable("src1",
"CREATE TABLE src1 (key INT, value STRING)".cmd,
- s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/kv3.txt")}' INTO TABLE src1".cmd),
+ s"LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv3.txt")}' INTO TABLE src1".cmd),
TestTable("srcpart", () => {
sql(
"CREATE TABLE srcpart (key INT, value STRING) PARTITIONED BY (ds STRING, hr STRING)")
for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- Seq("11", "12")) {
sql(
- s"""LOAD DATA LOCAL INPATH '${getHiveFile("data/files/kv1.txt")}'
+ s"""LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv1.txt")}'
|OVERWRITE INTO TABLE srcpart PARTITION (ds='$ds',hr='$hr')
""".stripMargin)
}
@@ -244,7 +250,7 @@ private[hive] class TestHiveSparkSession(
"CREATE TABLE srcpart1 (key INT, value STRING) PARTITIONED BY (ds STRING, hr INT)")
for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- 11 to 12) {
sql(
- s"""LOAD DATA LOCAL INPATH '${getHiveFile("data/files/kv1.txt")}'
+ s"""LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv1.txt")}'
|OVERWRITE INTO TABLE srcpart1 PARTITION (ds='$ds',hr='$hr')
""".stripMargin)
}
@@ -269,7 +275,7 @@ private[hive] class TestHiveSparkSession(
sql(
s"""
- |LOAD DATA LOCAL INPATH '${getHiveFile("data/files/complex.seq")}'
+ |LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/complex.seq")}'
|INTO TABLE src_thrift
""".stripMargin)
}),
@@ -308,7 +314,7 @@ private[hive] class TestHiveSparkSession(
|)
""".stripMargin.cmd,
s"""
- |LOAD DATA LOCAL INPATH '${getHiveFile("data/files/episodes.avro")}'
+ |LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/episodes.avro")}'
|INTO TABLE episodes
""".stripMargin.cmd
),
@@ -379,7 +385,7 @@ private[hive] class TestHiveSparkSession(
TestTable("src_json",
s"""CREATE TABLE src_json (json STRING) STORED AS TEXTFILE
""".stripMargin.cmd,
- s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/json.txt")}' INTO TABLE src_json".cmd)
+ s"LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/json.txt")}' INTO TABLE src_json".cmd)
)
hiveQTestUtilTables.foreach(registerTestTable)
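The `quoteHiveFile` helper above exists because the interpolated `LOAD DATA LOCAL INPATH '...'` statements break on Windows-style backslash separators (presumably the backslash is taken as an escape inside the quoted path). A standalone sketch of the same normalization, with an illustrative name not taken from the patch:

```scala
// Hypothetical standalone version of the quoting logic added above:
// on Windows, rewrite backslashes to forward slashes so the path can be
// interpolated safely into a LOAD DATA statement.
def normalizeForHive(path: String, isWindows: Boolean): String =
  if (isWindows) path.replace('\\', '/') else path

// normalizeForHive("""C:\spark\data\files\kv1.txt""", isWindows = true)
// == "C:/spark/data/files/kv1.txt"
```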
diff --git a/sql/hive/src/test/resources/sqlgen/subquery_in_having_2.sql b/sql/hive/src/test/resources/sqlgen/subquery_in_having_2.sql
index de0116a4dc..cdda29af50 100644
--- a/sql/hive/src/test/resources/sqlgen/subquery_in_having_2.sql
+++ b/sql/hive/src/test/resources/sqlgen/subquery_in_having_2.sql
@@ -7,4 +7,4 @@ having b.key in (select a.key
where a.value > 'val_9' and a.value = min(b.value))
order by b.key
--------------------------------------------------------------------------------
-SELECT `gen_attr_0` AS `key`, `gen_attr_1` AS `min(value)` FROM (SELECT `gen_attr_0`, `gen_attr_1` FROM (SELECT `gen_attr_0`, min(`gen_attr_5`) AS `gen_attr_1`, min(`gen_attr_5`) AS `gen_attr_4` FROM (SELECT `key` AS `gen_attr_0`, `value` AS `gen_attr_5` FROM `default`.`src`) AS gen_subquery_0 GROUP BY `gen_attr_0` HAVING (struct(`gen_attr_0`, `gen_attr_4`) IN (SELECT `gen_attr_6` AS `_c0`, `gen_attr_7` AS `_c1` FROM (SELECT `gen_attr_2` AS `gen_attr_6`, `gen_attr_3` AS `gen_attr_7` FROM (SELECT `gen_attr_2`, `gen_attr_3` FROM (SELECT `key` AS `gen_attr_2`, `value` AS `gen_attr_3` FROM `default`.`src`) AS gen_subquery_3 WHERE (`gen_attr_3` > 'val_9')) AS gen_subquery_2) AS gen_subquery_4))) AS gen_subquery_1 ORDER BY `gen_attr_0` ASC NULLS FIRST) AS b
+SELECT `gen_attr_0` AS `key`, `gen_attr_1` AS `min(value)` FROM (SELECT `gen_attr_0`, `gen_attr_1` FROM (SELECT `gen_attr_0`, min(`gen_attr_5`) AS `gen_attr_1`, min(`gen_attr_5`) AS `gen_attr_4` FROM (SELECT `key` AS `gen_attr_0`, `value` AS `gen_attr_5` FROM `default`.`src`) AS gen_subquery_0 GROUP BY `gen_attr_0` HAVING (named_struct('gen_attr_0', `gen_attr_0`, 'gen_attr_4', `gen_attr_4`) IN (SELECT `gen_attr_6` AS `_c0`, `gen_attr_7` AS `_c1` FROM (SELECT `gen_attr_2` AS `gen_attr_6`, `gen_attr_3` AS `gen_attr_7` FROM (SELECT `gen_attr_2`, `gen_attr_3` FROM (SELECT `key` AS `gen_attr_2`, `value` AS `gen_attr_3` FROM `default`.`src`) AS gen_subquery_3 WHERE (`gen_attr_3` > 'val_9')) AS gen_subquery_2) AS gen_subquery_4))) AS gen_subquery_1 ORDER BY `gen_attr_0` ASC NULLS FIRST) AS b
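The golden-file update reflects the new SQL generation: the positional `struct(...)` call becomes a `named_struct(...)` call that spells out each field name. A rough sketch of how such a call string could be assembled from name/value pairs (illustrative only, not the actual SQLBuilder code):

```scala
// Build a named_struct(...) SQL fragment from (fieldName, valueSql) pairs,
// e.g. the gen_attr_* columns in the golden file above.
def namedStructSql(fields: Seq[(String, String)]): String =
  fields
    .map { case (name, value) => s"'$name', $value" }
    .mkString("named_struct(", ", ", ")")

// namedStructSql(Seq("gen_attr_0" -> "`gen_attr_0`", "gen_attr_4" -> "`gen_attr_4`"))
// == "named_struct('gen_attr_0', `gen_attr_0`, 'gen_attr_4', `gen_attr_4`)"
```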
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala
index c7f10e569f..12d18dc87c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, NoSuchFileException, Paths}
+import scala.io.Source
import scala.util.control.NonFatal
import org.apache.spark.sql.Column
@@ -109,12 +110,15 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils {
Files.write(path, answerText.getBytes(StandardCharsets.UTF_8))
} else {
val goldenFileName = s"sqlgen/$answerFile.sql"
- val resourceFile = getClass.getClassLoader.getResource(goldenFileName)
- if (resourceFile == null) {
+ val resourceStream = getClass.getClassLoader.getResourceAsStream(goldenFileName)
+ if (resourceStream == null) {
throw new NoSuchFileException(goldenFileName)
}
- val path = resourceFile.getPath
- val answerText = new String(Files.readAllBytes(Paths.get(path)), StandardCharsets.UTF_8)
+ val answerText = try {
+ Source.fromInputStream(resourceStream).mkString
+ } finally {
+ resourceStream.close
+ }
val sqls = answerText.split(separator)
assert(sqls.length == 2, "Golden sql files should have a separator.")
val expectedSQL = sqls(1).trim()
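The suite change above swaps the URL-based `getResource(...).getPath` lookup for `getResourceAsStream`, which also works when the golden files live inside a jar rather than on the filesystem. A minimal standalone version of the same read-and-close pattern (the `readResource` name is illustrative):

```scala
import scala.io.Source

// Read a classpath resource fully, closing the stream even on failure.
// Unlike getResource(...).getPath, a stream also works for jar entries.
def readResource(name: String): String = {
  val stream = getClass.getClassLoader.getResourceAsStream(name)
  if (stream == null) throw new java.nio.file.NoSuchFileException(name)
  try Source.fromInputStream(stream).mkString
  finally stream.close()
}
```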