author      jiangxingbo <jiangxb1987@gmail.com>            2017-02-09 19:35:39 +0100
committer   Herman van Hovell <hvanhovell@databricks.com>  2017-02-09 19:35:39 +0100
commit      af63c52fd36c59525d9504003b15142dc850fccb (patch)
tree        1241c433e4fc7250ff0b3f4ae6e6025f28cd4711 /sql/hive
parent      1af0dee4189fd00398864a2ea2d504079b67a7f5 (diff)
download    spark-af63c52fd36c59525d9504003b15142dc850fccb.tar.gz
            spark-af63c52fd36c59525d9504003b15142dc850fccb.tar.bz2
            spark-af63c52fd36c59525d9504003b15142dc850fccb.zip
[SPARK-19025][SQL] Remove SQL builder for operators
## What changes were proposed in this pull request?

With the new approach of view resolution, we can get rid of SQL generation on view creation, so let's remove the SQL builder for operators. Note that, since all SQL generation for operators is defined in one file (org.apache.spark.sql.catalyst.SQLBuilder), it'd be trivial to recover it in the future.

## How was this patch tested?

N/A

Author: jiangxingbo <jiangxb1987@gmail.com>

Closes #16869 from jiangxb1987/SQLBuilder.
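For orientation before the diff itself: every suite deleted below exercised the same round-trip, regenerating a SQL string from an analyzed plan with SQLBuilder and checking that re-running that string returns the same rows. A minimal sketch of that pattern, mirroring checkSqlGeneration in the deleted suites (it assumes a SparkSession in scope as `spark` and the `t1` fixture table those suites create; SQLBuilder is exactly what this patch deletes, so this compiles only against the parent revision):

```scala
import org.apache.spark.sql.catalyst.SQLBuilder

val df = spark.sql("SELECT max(value) FROM t1 GROUP BY key")
// Regenerate a SQL string from the analyzed logical plan; SQLBuilder
// throws for plans it cannot convert back to SQL.
val regenerated = new SQLBuilder(df).toSQL
// The regenerated string must parse, analyze, and return the same rows.
assert(spark.sql(regenerated).collect().toSet == df.collect().toSet)
```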
Diffstat (limited to 'sql/hive')
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala      290
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala     1201
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/catalyst/SQLBuilderTest.scala            29
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala  54
4 files changed, 2 insertions, 1572 deletions
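The bulk of the deletions is LogicalPlanToSQLSuite, which compared regenerated SQL against golden files under test/resources/sqlgen: each file holds a generated header plus the original query, a separator line of 80 dashes, and the expected SQL. A sketch of that comparison, lifted from the deleted checkSQLStructure (the resource name agg1.sql is only an illustrative example):

```scala
import scala.io.Source

val separator = "-" * 80
// Golden files live under test/resources/sqlgen on the test classpath.
val stream = getClass.getClassLoader.getResourceAsStream("sqlgen/agg1.sql")
val answerText =
  try Source.fromInputStream(stream).mkString
  finally stream.close()
val sqls = answerText.split(separator)
assert(sqls.length == 2, "Golden sql files should have a separator.")
// The part after the separator is the SQL recorded at generation time.
val expectedSQL = sqls(1).trim()
```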
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala
deleted file mode 100644
index 1daa6f7822..0000000000
--- a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst
-
-import scala.util.control.NonFatal
-
-import org.apache.spark.sql.functions._
-import org.apache.spark.sql.test.SQLTestUtils
-
-class ExpressionToSQLSuite extends SQLBuilderTest with SQLTestUtils {
- import testImplicits._
-
- protected override def beforeAll(): Unit = {
- super.beforeAll()
- sql("DROP TABLE IF EXISTS t0")
- sql("DROP TABLE IF EXISTS t1")
- sql("DROP TABLE IF EXISTS t2")
-
- val bytes = Array[Byte](1, 2, 3, 4)
- Seq((bytes, "AQIDBA==")).toDF("a", "b").write.saveAsTable("t0")
-
- spark
- .range(10)
- .select('id as 'key, concat(lit("val_"), 'id) as 'value)
- .write
- .saveAsTable("t1")
-
- spark.range(10).select('id as 'a, 'id as 'b, 'id as 'c, 'id as 'd).write.saveAsTable("t2")
- }
-
- override protected def afterAll(): Unit = {
- try {
- sql("DROP TABLE IF EXISTS t0")
- sql("DROP TABLE IF EXISTS t1")
- sql("DROP TABLE IF EXISTS t2")
- } finally {
- super.afterAll()
- }
- }
-
- private def checkSqlGeneration(hiveQl: String): Unit = {
- val df = sql(hiveQl)
-
- val convertedSQL = try new SQLBuilder(df).toSQL catch {
- case NonFatal(e) =>
- fail(
- s"""Cannot convert the following HiveQL query plan back to SQL query string:
- |
- |# Original HiveQL query string:
- |$hiveQl
- |
- |# Resolved query plan:
- |${df.queryExecution.analyzed.treeString}
- """.stripMargin)
- }
-
- try {
- checkAnswer(sql(convertedSQL), df)
- } catch { case cause: Throwable =>
- fail(
- s"""Failed to execute converted SQL string or got wrong answer:
- |
- |# Converted SQL query string:
- |$convertedSQL
- |
- |# Original HiveQL query string:
- |$hiveQl
- |
- |# Resolved query plan:
- |${df.queryExecution.analyzed.treeString}
- """.stripMargin,
- cause)
- }
- }
-
- test("misc non-aggregate functions") {
- checkSqlGeneration("SELECT abs(15), abs(-15)")
- checkSqlGeneration("SELECT array(1,2,3)")
- checkSqlGeneration("SELECT coalesce(null, 1, 2)")
- checkSqlGeneration("SELECT explode(array(1,2,3))")
- checkSqlGeneration("SELECT explode_outer(array())")
- checkSqlGeneration("SELECT greatest(1,null,3)")
- checkSqlGeneration("SELECT if(1==2, 'yes', 'no')")
- checkSqlGeneration("SELECT isnan(15), isnan('invalid')")
- checkSqlGeneration("SELECT isnull(null), isnull('a')")
- checkSqlGeneration("SELECT isnotnull(null), isnotnull('a')")
- checkSqlGeneration("SELECT least(1,null,3)")
- checkSqlGeneration("SELECT map(1, 'a', 2, 'b')")
- checkSqlGeneration("SELECT named_struct('c1',1,'c2',2,'c3',3)")
- checkSqlGeneration("SELECT nanvl(a, 5), nanvl(b, 10), nanvl(d, c) from t2")
- checkSqlGeneration("SELECT posexplode_outer(array())")
- checkSqlGeneration("SELECT inline_outer(array(struct('a', 1)))")
- checkSqlGeneration("SELECT rand(1)")
- checkSqlGeneration("SELECT randn(3)")
- checkSqlGeneration("SELECT struct(1,2,3)")
- }
-
- test("math functions") {
- checkSqlGeneration("SELECT acos(-1)")
- checkSqlGeneration("SELECT asin(-1)")
- checkSqlGeneration("SELECT atan(1)")
- checkSqlGeneration("SELECT atan2(1, 1)")
- checkSqlGeneration("SELECT bin(10)")
- checkSqlGeneration("SELECT cbrt(1000.0)")
- checkSqlGeneration("SELECT ceil(2.333)")
- checkSqlGeneration("SELECT ceiling(2.333)")
- checkSqlGeneration("SELECT cos(1.0)")
- checkSqlGeneration("SELECT cosh(1.0)")
- checkSqlGeneration("SELECT conv(15, 10, 16)")
- checkSqlGeneration("SELECT degrees(pi())")
- checkSqlGeneration("SELECT e()")
- checkSqlGeneration("SELECT exp(1.0)")
- checkSqlGeneration("SELECT expm1(1.0)")
- checkSqlGeneration("SELECT floor(-2.333)")
- checkSqlGeneration("SELECT factorial(5)")
- checkSqlGeneration("SELECT hex(10)")
- checkSqlGeneration("SELECT hypot(3, 4)")
- checkSqlGeneration("SELECT log(10.0)")
- checkSqlGeneration("SELECT log10(1000.0)")
- checkSqlGeneration("SELECT log1p(0.0)")
- checkSqlGeneration("SELECT log2(8.0)")
- checkSqlGeneration("SELECT ln(10.0)")
- checkSqlGeneration("SELECT negative(-1)")
- checkSqlGeneration("SELECT pi()")
- checkSqlGeneration("SELECT pmod(3, 2)")
- checkSqlGeneration("SELECT positive(3)")
- checkSqlGeneration("SELECT pow(2, 3)")
- checkSqlGeneration("SELECT power(2, 3)")
- checkSqlGeneration("SELECT radians(180.0)")
- checkSqlGeneration("SELECT rint(1.63)")
- checkSqlGeneration("SELECT round(31.415, -1)")
- checkSqlGeneration("SELECT shiftleft(2, 3)")
- checkSqlGeneration("SELECT shiftright(16, 3)")
- checkSqlGeneration("SELECT shiftrightunsigned(16, 3)")
- checkSqlGeneration("SELECT sign(-2.63)")
- checkSqlGeneration("SELECT signum(-2.63)")
- checkSqlGeneration("SELECT sin(1.0)")
- checkSqlGeneration("SELECT sinh(1.0)")
- checkSqlGeneration("SELECT sqrt(100.0)")
- checkSqlGeneration("SELECT tan(1.0)")
- checkSqlGeneration("SELECT tanh(1.0)")
- }
-
- test("aggregate functions") {
- checkSqlGeneration("SELECT approx_count_distinct(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT percentile_approx(value, 0.25) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT percentile_approx(value, array(0.25, 0.75)) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT percentile_approx(value, 0.25, 100) FROM t1 GROUP BY key")
- checkSqlGeneration(
- "SELECT percentile_approx(value, array(0.25, 0.75), 100) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT avg(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT corr(value, key) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT count(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT covar_pop(value, key) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT covar_samp(value, key) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT first(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT first_value(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT kurtosis(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT last(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT last_value(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT max(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT mean(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT min(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT percentile(value, 0.25) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT percentile(value, array(0.25, 0.75)) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT skewness(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT stddev(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT stddev_pop(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT stddev_samp(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT sum(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT variance(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT var_pop(value) FROM t1 GROUP BY key")
- checkSqlGeneration("SELECT var_samp(value) FROM t1 GROUP BY key")
- }
-
- test("string functions") {
- checkSqlGeneration("SELECT ascii('SparkSql')")
- checkSqlGeneration("SELECT base64(a) FROM t0")
- checkSqlGeneration("SELECT concat('This ', 'is ', 'a ', 'test')")
- checkSqlGeneration("SELECT concat_ws(' ', 'This', 'is', 'a', 'test')")
- checkSqlGeneration("SELECT decode(a, 'UTF-8') FROM t0")
- checkSqlGeneration("SELECT encode('SparkSql', 'UTF-8')")
- checkSqlGeneration("SELECT find_in_set('ab', 'abc,b,ab,c,def')")
- checkSqlGeneration("SELECT format_number(1234567.890, 2)")
- checkSqlGeneration("SELECT format_string('aa%d%s',123, 'cc')")
- checkSqlGeneration("SELECT get_json_object('{\"a\":\"bc\"}','$.a')")
- checkSqlGeneration("SELECT initcap('This is a test')")
- checkSqlGeneration("SELECT instr('This is a test', 'is')")
- checkSqlGeneration("SELECT lcase('SparkSql')")
- checkSqlGeneration("SELECT length('This is a test')")
- checkSqlGeneration("SELECT levenshtein('This is a test', 'Another test')")
- checkSqlGeneration("SELECT lower('SparkSql')")
- checkSqlGeneration("SELECT locate('is', 'This is a test', 3)")
- checkSqlGeneration("SELECT lpad('SparkSql', 16, 'Learning')")
- checkSqlGeneration("SELECT ltrim(' SparkSql ')")
- checkSqlGeneration("SELECT json_tuple('{\"f1\": \"value1\", \"f2\": \"value2\"}','f1')")
- checkSqlGeneration("SELECT printf('aa%d%s', 123, 'cc')")
- checkSqlGeneration("SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1)")
- checkSqlGeneration("SELECT regexp_replace('100-200', '(\\d+)', 'num')")
- checkSqlGeneration("SELECT repeat('SparkSql', 3)")
- checkSqlGeneration("SELECT reverse('SparkSql')")
- checkSqlGeneration("SELECT rpad('SparkSql', 16, ' is Cool')")
- checkSqlGeneration("SELECT rtrim(' SparkSql ')")
- checkSqlGeneration("SELECT soundex('SparkSql')")
- checkSqlGeneration("SELECT space(2)")
- checkSqlGeneration("SELECT split('aa2bb3cc', '[1-9]+')")
- checkSqlGeneration("SELECT space(2)")
- checkSqlGeneration("SELECT substr('This is a test', 1)")
- checkSqlGeneration("SELECT substring('This is a test', 1)")
- checkSqlGeneration("SELECT substring_index('www.apache.org','.',1)")
- checkSqlGeneration("SELECT translate('translate', 'rnlt', '123')")
- checkSqlGeneration("SELECT trim(' SparkSql ')")
- checkSqlGeneration("SELECT ucase('SparkSql')")
- checkSqlGeneration("SELECT unbase64('SparkSql')")
- checkSqlGeneration("SELECT unhex(41)")
- checkSqlGeneration("SELECT upper('SparkSql')")
- }
-
- test("datetime functions") {
- checkSqlGeneration("SELECT add_months('2001-03-31', 1)")
- checkSqlGeneration("SELECT count(current_date())")
- checkSqlGeneration("SELECT count(current_timestamp())")
- checkSqlGeneration("SELECT datediff('2001-01-02', '2001-01-01')")
- checkSqlGeneration("SELECT date_add('2001-01-02', 1)")
- checkSqlGeneration("SELECT date_format('2001-05-02', 'yyyy-dd')")
- checkSqlGeneration("SELECT date_sub('2001-01-02', 1)")
- checkSqlGeneration("SELECT day('2001-05-02')")
- checkSqlGeneration("SELECT dayofyear('2001-05-02')")
- checkSqlGeneration("SELECT dayofmonth('2001-05-02')")
- checkSqlGeneration("SELECT from_unixtime(1000, 'yyyy-MM-dd HH:mm:ss')")
- checkSqlGeneration("SELECT from_utc_timestamp('2015-07-24 00:00:00', 'PST')")
- checkSqlGeneration("SELECT hour('11:35:55')")
- checkSqlGeneration("SELECT last_day('2001-01-01')")
- checkSqlGeneration("SELECT minute('11:35:55')")
- checkSqlGeneration("SELECT month('2001-05-02')")
- checkSqlGeneration("SELECT months_between('2001-10-30 10:30:00', '1996-10-30')")
- checkSqlGeneration("SELECT next_day('2001-05-02', 'TU')")
- checkSqlGeneration("SELECT count(now())")
- checkSqlGeneration("SELECT quarter('2001-05-02')")
- checkSqlGeneration("SELECT second('11:35:55')")
- checkSqlGeneration("SELECT to_timestamp('2001-10-30 10:30:00', 'yyyy-MM-dd HH:mm:ss')")
- checkSqlGeneration("SELECT to_date('2001-10-30 10:30:00')")
- checkSqlGeneration("SELECT to_unix_timestamp('2015-07-24 00:00:00', 'yyyy-MM-dd HH:mm:ss')")
- checkSqlGeneration("SELECT to_utc_timestamp('2015-07-24 00:00:00', 'PST')")
- checkSqlGeneration("SELECT trunc('2001-10-30 10:30:00', 'YEAR')")
- checkSqlGeneration("SELECT unix_timestamp('2001-10-30 10:30:00')")
- checkSqlGeneration("SELECT weekofyear('2001-05-02')")
- checkSqlGeneration("SELECT year('2001-05-02')")
-
- checkSqlGeneration("SELECT interval 3 years - 3 month 7 week 123 microseconds as i")
- }
-
- test("collection functions") {
- checkSqlGeneration("SELECT array_contains(array(2, 9, 8), 9)")
- checkSqlGeneration("SELECT size(array('b', 'd', 'c', 'a'))")
- checkSqlGeneration("SELECT sort_array(array('b', 'd', 'c', 'a'))")
- }
-
- test("misc functions") {
- checkSqlGeneration("SELECT crc32('Spark')")
- checkSqlGeneration("SELECT md5('Spark')")
- checkSqlGeneration("SELECT hash('Spark')")
- checkSqlGeneration("SELECT sha('Spark')")
- checkSqlGeneration("SELECT sha1('Spark')")
- checkSqlGeneration("SELECT sha2('Spark', 0)")
- checkSqlGeneration("SELECT spark_partition_id()")
- checkSqlGeneration("SELECT input_file_name()")
- checkSqlGeneration("SELECT monotonically_increasing_id()")
- }
-
- test("subquery") {
- checkSqlGeneration("SELECT 1 + (SELECT 2)")
- checkSqlGeneration("SELECT 1 + (SELECT 2 + (SELECT 3 as a))")
- }
-}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala
deleted file mode 100644
index fe171a6ee8..0000000000
--- a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala
+++ /dev/null
@@ -1,1201 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst
-
-import java.nio.charset.StandardCharsets
-import java.nio.file.{Files, NoSuchFileException, Paths}
-
-import scala.io.Source
-import scala.util.control.NonFatal
-
-import org.apache.spark.TestUtils
-import org.apache.spark.sql.Column
-import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.catalyst.parser.ParseException
-import org.apache.spark.sql.catalyst.plans.logical.LeafNode
-import org.apache.spark.sql.functions._
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.test.SQLTestUtils
-
-/**
- * A test suite for LogicalPlan-to-SQL conversion.
- *
- * Each query has a golden generated SQL file in test/resources/sqlgen. The test suite also has
- * built-in functionality to automatically generate these golden files.
- *
- * To re-generate golden files, run:
- * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "hive/test-only *LogicalPlanToSQLSuite"
- */
-class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils {
- import testImplicits._
-
- // Used for generating new query answer files by saving
- private val regenerateGoldenFiles: Boolean = System.getenv("SPARK_GENERATE_GOLDEN_FILES") == "1"
- private val goldenSQLPath = {
- // If regenerateGoldenFiles is true, we must be running this in SBT and we use hard-coded
- // relative path. Otherwise, we use classloader's getResource to find the location.
- if (regenerateGoldenFiles) {
- java.nio.file.Paths.get("src", "test", "resources", "sqlgen").toFile.getCanonicalPath
- } else {
- getTestResourcePath("sqlgen")
- }
- }
-
- protected override def beforeAll(): Unit = {
- super.beforeAll()
- (0 to 3).foreach { i =>
- sql(s"DROP TABLE IF EXISTS parquet_t$i")
- }
- sql("DROP TABLE IF EXISTS t0")
-
- spark.range(10).write.saveAsTable("parquet_t0")
- sql("CREATE TABLE t0 AS SELECT * FROM parquet_t0")
-
- spark
- .range(10)
- .select('id as 'key, concat(lit("val_"), 'id) as 'value)
- .write
- .saveAsTable("parquet_t1")
-
- spark
- .range(10)
- .select('id as 'a, 'id as 'b, 'id as 'c, 'id as 'd)
- .write
- .saveAsTable("parquet_t2")
-
- def createArray(id: Column): Column = {
- when(id % 3 === 0, lit(null)).otherwise(array('id, 'id + 1))
- }
-
- spark
- .range(10)
- .select(
- createArray('id).as("arr"),
- array(array('id), createArray('id)).as("arr2"),
- lit("""{"f1": "1", "f2": "2", "f3": 3}""").as("json"),
- 'id
- )
- .write
- .saveAsTable("parquet_t3")
- }
-
- override protected def afterAll(): Unit = {
- try {
- (0 to 3).foreach { i =>
- sql(s"DROP TABLE IF EXISTS parquet_t$i")
- }
- sql("DROP TABLE IF EXISTS t0")
- } finally {
- super.afterAll()
- }
- }
-
- /**
- * Compare the generated SQL with the expected answer string.
- */
- private def checkSQLStructure(originalSQL: String, convertedSQL: String, answerFile: String) = {
- if (answerFile != null) {
- val separator = "-" * 80
- if (regenerateGoldenFiles) {
- val path = Paths.get(s"$goldenSQLPath/$answerFile.sql")
- val header = "-- This file is automatically generated by LogicalPlanToSQLSuite."
- val answerText = s"$header\n${originalSQL.trim()}\n${separator}\n$convertedSQL\n"
- Files.write(path, answerText.getBytes(StandardCharsets.UTF_8))
- } else {
- val goldenFileName = s"sqlgen/$answerFile.sql"
- val resourceStream = getClass.getClassLoader.getResourceAsStream(goldenFileName)
- if (resourceStream == null) {
- throw new NoSuchFileException(goldenFileName)
- }
- val answerText = try {
- Source.fromInputStream(resourceStream).mkString
- } finally {
- resourceStream.close
- }
- val sqls = answerText.split(separator)
- assert(sqls.length == 2, "Golden sql files should have a separator.")
- val expectedSQL = sqls(1).trim()
- assert(convertedSQL == expectedSQL)
- }
- }
- }
-
- /**
- * 1. Checks if SQL parsing succeeds.
- * 2. Checks if SQL generation succeeds.
- * 3. Checks the generated SQL against golden files.
- * 4. Verifies the execution result stays the same.
- */
- private def checkSQL(sqlString: String, answerFile: String = null): Unit = {
- val df = sql(sqlString)
-
- val convertedSQL = try new SQLBuilder(df).toSQL catch {
- case NonFatal(e) =>
- fail(
- s"""Cannot convert the following SQL query plan back to SQL query string:
- |
- |# Original SQL query string:
- |$sqlString
- |
- |# Resolved query plan:
- |${df.queryExecution.analyzed.treeString}
- """.stripMargin, e)
- }
-
- checkSQLStructure(sqlString, convertedSQL, answerFile)
-
- try {
- checkAnswer(sql(convertedSQL), df)
- } catch { case cause: Throwable =>
- fail(
- s"""Failed to execute converted SQL string or got wrong answer:
- |
- |# Converted SQL query string:
- |$convertedSQL
- |
- |# Original SQL query string:
- |$sqlString
- |
- |# Resolved query plan:
- |${df.queryExecution.analyzed.treeString}
- """.stripMargin, cause)
- }
- }
-
- // When saving golden files, these tests should be ignored to prevent making files.
- if (!regenerateGoldenFiles) {
- test("Test should fail if the SQL query cannot be parsed") {
- val m = intercept[ParseException] {
- checkSQL("SELE", "NOT_A_FILE")
- }.getMessage
- assert(m.contains("mismatched input"))
- }
-
- test("Test should fail if the golden file cannot be found") {
- val m2 = intercept[NoSuchFileException] {
- checkSQL("SELECT 1", "NOT_A_FILE")
- }.getMessage
- assert(m2.contains("NOT_A_FILE"))
- }
-
- test("Test should fail if the SQL query cannot be regenerated") {
- case class Unsupported() extends LeafNode with MultiInstanceRelation {
- override def newInstance(): Unsupported = copy()
- override def output: Seq[Attribute] = Nil
- }
- Unsupported().createOrReplaceTempView("not_sql_gen_supported_table_so_far")
- sql("select * from not_sql_gen_supported_table_so_far")
- val m3 = intercept[org.scalatest.exceptions.TestFailedException] {
- checkSQL("select * from not_sql_gen_supported_table_so_far", "in")
- }.getMessage
- assert(m3.contains("Cannot convert the following SQL query plan back to SQL query string"))
- }
-
- test("Test should fail if the SQL query did not equal to the golden SQL") {
- val m4 = intercept[org.scalatest.exceptions.TestFailedException] {
- checkSQL("SELECT 1", "in")
- }.getMessage
- assert(m4.contains("did not equal"))
- }
- }
-
- test("range") {
- checkSQL("select * from range(100)", "range")
- checkSQL("select * from range(1, 100, 20, 10)", "range_with_splits")
- }
-
- test("in") {
- checkSQL("SELECT id FROM parquet_t0 WHERE id IN (1, 2, 3)", "in")
- }
-
- test("not in") {
- checkSQL("SELECT id FROM t0 WHERE id NOT IN (1, 2, 3)", "not_in")
- }
-
- test("not like") {
- checkSQL("SELECT id FROM t0 WHERE id + 5 NOT LIKE '1%'", "not_like")
- }
-
- test("aggregate function in having clause") {
- checkSQL("SELECT COUNT(value) FROM parquet_t1 GROUP BY key HAVING MAX(key) > 0", "agg1")
- }
-
- test("aggregate function in order by clause") {
- checkSQL("SELECT COUNT(value) FROM parquet_t1 GROUP BY key ORDER BY MAX(key)", "agg2")
- }
-
- // When there are multiple aggregate functions in ORDER BY clause, all of them are extracted into
- // Aggregate operator and aliased to the same name "aggOrder". This is OK for normal query
- // execution since these aliases have different expression ID. But this introduces name collision
- // when converting resolved plans back to SQL query strings as expression IDs are stripped.
- test("aggregate function in order by clause with multiple order keys") {
- checkSQL("SELECT COUNT(value) FROM parquet_t1 GROUP BY key ORDER BY key, MAX(key)", "agg3")
- }
-
- test("order by asc nulls last") {
- checkSQL("SELECT COUNT(value) FROM parquet_t1 GROUP BY key ORDER BY key nulls last, MAX(key)",
- "sort_asc_nulls_last")
- }
-
- test("order by desc nulls first") {
- checkSQL("SELECT COUNT(value) FROM parquet_t1 GROUP BY key ORDER BY key desc nulls first," +
- "MAX(key)", "sort_desc_nulls_first")
- }
-
- test("type widening in union") {
- checkSQL("SELECT id FROM parquet_t0 UNION ALL SELECT CAST(id AS INT) AS id FROM parquet_t0",
- "type_widening")
- }
-
- test("union distinct") {
- checkSQL("SELECT * FROM t0 UNION SELECT * FROM t0", "union_distinct")
- }
-
- test("three-child union") {
- checkSQL(
- """
- |SELECT id FROM parquet_t0
- |UNION ALL SELECT id FROM parquet_t0
- |UNION ALL SELECT id FROM parquet_t0
- """.stripMargin,
- "three_child_union")
- }
-
- test("intersect") {
- checkSQL("SELECT * FROM t0 INTERSECT SELECT * FROM t0", "intersect")
- }
-
- test("except") {
- checkSQL("SELECT * FROM t0 EXCEPT SELECT * FROM t0", "except")
- }
-
- test("self join") {
- checkSQL("SELECT x.key FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key", "self_join")
- }
-
- test("self join with group by") {
- checkSQL(
- "SELECT x.key, COUNT(*) FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key group by x.key",
- "self_join_with_group_by")
- }
-
- test("case") {
- checkSQL("SELECT CASE WHEN id % 2 > 0 THEN 0 WHEN id % 2 = 0 THEN 1 END FROM parquet_t0",
- "case")
- }
-
- test("case with else") {
- checkSQL("SELECT CASE WHEN id % 2 > 0 THEN 0 ELSE 1 END FROM parquet_t0", "case_with_else")
- }
-
- test("case with key") {
- checkSQL("SELECT CASE id WHEN 0 THEN 'foo' WHEN 1 THEN 'bar' END FROM parquet_t0",
- "case_with_key")
- }
-
- test("case with key and else") {
- checkSQL("SELECT CASE id WHEN 0 THEN 'foo' WHEN 1 THEN 'bar' ELSE 'baz' END FROM parquet_t0",
- "case_with_key_and_else")
- }
-
- test("select distinct without aggregate functions") {
- checkSQL("SELECT DISTINCT id FROM parquet_t0", "select_distinct")
- }
-
- test("rollup/cube #1") {
- // Original logical plan:
- // Aggregate [(key#17L % cast(5 as bigint))#47L,grouping__id#46],
- // [(count(1),mode=Complete,isDistinct=false) AS cnt#43L,
- // (key#17L % cast(5 as bigint))#47L AS _c1#45L,
- // grouping__id#46 AS _c2#44]
- // +- Expand [List(key#17L, value#18, (key#17L % cast(5 as bigint))#47L, 0),
- // List(key#17L, value#18, null, 1)],
- // [key#17L,value#18,(key#17L % cast(5 as bigint))#47L,grouping__id#46]
- // +- Project [key#17L,
- // value#18,
- // (key#17L % cast(5 as bigint)) AS (key#17L % cast(5 as bigint))#47L]
- // +- Subquery t1
- // +- Relation[key#17L,value#18] ParquetRelation
- // Converted SQL:
- // SELECT count( 1) AS `cnt`,
- // (`t1`.`key` % CAST(5 AS BIGINT)),
- // grouping_id() AS `_c2`
- // FROM `default`.`t1`
- // GROUP BY (`t1`.`key` % CAST(5 AS BIGINT))
- // GROUPING SETS (((`t1`.`key` % CAST(5 AS BIGINT))), ())
- checkSQL(
- "SELECT count(*) as cnt, key%5, grouping_id() FROM parquet_t1 GROUP BY key % 5 WITH ROLLUP",
- "rollup_cube_1_1")
-
- checkSQL(
- "SELECT count(*) as cnt, key%5, grouping_id() FROM parquet_t1 GROUP BY key % 5 WITH CUBE",
- "rollup_cube_1_2")
- }
-
- test("rollup/cube #2") {
- checkSQL("SELECT key, value, count(value) FROM parquet_t1 GROUP BY key, value WITH ROLLUP",
- "rollup_cube_2_1")
-
- checkSQL("SELECT key, value, count(value) FROM parquet_t1 GROUP BY key, value WITH CUBE",
- "rollup_cube_2_2")
- }
-
- test("rollup/cube #3") {
- checkSQL(
- "SELECT key, count(value), grouping_id() FROM parquet_t1 GROUP BY key, value WITH ROLLUP",
- "rollup_cube_3_1")
-
- checkSQL(
- "SELECT key, count(value), grouping_id() FROM parquet_t1 GROUP BY key, value WITH CUBE",
- "rollup_cube_3_2")
- }
-
- test("rollup/cube #4") {
- checkSQL(
- s"""
- |SELECT count(*) as cnt, key % 5 as k1, key - 5 as k2, grouping_id() FROM parquet_t1
- |GROUP BY key % 5, key - 5 WITH ROLLUP
- """.stripMargin,
- "rollup_cube_4_1")
-
- checkSQL(
- s"""
- |SELECT count(*) as cnt, key % 5 as k1, key - 5 as k2, grouping_id() FROM parquet_t1
- |GROUP BY key % 5, key - 5 WITH CUBE
- """.stripMargin,
- "rollup_cube_4_2")
- }
-
- test("rollup/cube #5") {
- checkSQL(
- s"""
- |SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id(key % 5, key - 5) AS k3
- |FROM (SELECT key, key%2, key - 5 FROM parquet_t1) t GROUP BY key%5, key-5
- |WITH ROLLUP
- """.stripMargin,
- "rollup_cube_5_1")
-
- checkSQL(
- s"""
- |SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id(key % 5, key - 5) AS k3
- |FROM (SELECT key, key % 2, key - 5 FROM parquet_t1) t GROUP BY key % 5, key - 5
- |WITH CUBE
- """.stripMargin,
- "rollup_cube_5_2")
- }
-
- test("rollup/cube #6") {
- checkSQL("SELECT a, b, sum(c) FROM parquet_t2 GROUP BY ROLLUP(a, b) ORDER BY a, b",
- "rollup_cube_6_1")
-
- checkSQL("SELECT a, b, sum(c) FROM parquet_t2 GROUP BY CUBE(a, b) ORDER BY a, b",
- "rollup_cube_6_2")
-
- checkSQL("SELECT a, b, sum(a) FROM parquet_t2 GROUP BY ROLLUP(a, b) ORDER BY a, b",
- "rollup_cube_6_3")
-
- checkSQL("SELECT a, b, sum(a) FROM parquet_t2 GROUP BY CUBE(a, b) ORDER BY a, b",
- "rollup_cube_6_4")
-
- checkSQL("SELECT a + b, b, sum(a - b) FROM parquet_t2 GROUP BY a + b, b WITH ROLLUP",
- "rollup_cube_6_5")
-
- checkSQL("SELECT a + b, b, sum(a - b) FROM parquet_t2 GROUP BY a + b, b WITH CUBE",
- "rollup_cube_6_6")
- }
-
- test("rollup/cube #7") {
- checkSQL("SELECT a, b, grouping_id(a, b) FROM parquet_t2 GROUP BY cube(a, b)",
- "rollup_cube_7_1")
-
- checkSQL("SELECT a, b, grouping(b) FROM parquet_t2 GROUP BY cube(a, b)",
- "rollup_cube_7_2")
-
- checkSQL("SELECT a, b, grouping(a) FROM parquet_t2 GROUP BY cube(a, b)",
- "rollup_cube_7_3")
- }
-
- test("rollup/cube #8") {
- // grouping_id() is part of another expression
- checkSQL(
- s"""
- |SELECT hkey AS k1, value - 5 AS k2, hash(grouping_id()) AS hgid
- |FROM (SELECT hash(key) as hkey, key as value FROM parquet_t1) t GROUP BY hkey, value-5
- |WITH ROLLUP
- """.stripMargin,
- "rollup_cube_8_1")
-
- checkSQL(
- s"""
- |SELECT hkey AS k1, value - 5 AS k2, hash(grouping_id()) AS hgid
- |FROM (SELECT hash(key) as hkey, key as value FROM parquet_t1) t GROUP BY hkey, value-5
- |WITH CUBE
- """.stripMargin,
- "rollup_cube_8_2")
- }
-
- test("rollup/cube #9") {
- // self join is used as the child node of ROLLUP/CUBE with replaced quantifiers
- checkSQL(
- s"""
- |SELECT t.key - 5, cnt, SUM(cnt)
- |FROM (SELECT x.key, COUNT(*) as cnt
- |FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key GROUP BY x.key) t
- |GROUP BY cnt, t.key - 5
- |WITH ROLLUP
- """.stripMargin,
- "rollup_cube_9_1")
-
- checkSQL(
- s"""
- |SELECT t.key - 5, cnt, SUM(cnt)
- |FROM (SELECT x.key, COUNT(*) as cnt
- |FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key GROUP BY x.key) t
- |GROUP BY cnt, t.key - 5
- |WITH CUBE
- """.stripMargin,
- "rollup_cube_9_2")
- }
-
- test("grouping sets #1") {
- checkSQL(
- s"""
- |SELECT count(*) AS cnt, key % 5 AS k1, key - 5 AS k2, grouping_id() AS k3
- |FROM (SELECT key, key % 2, key - 5 FROM parquet_t1) t GROUP BY key % 5, key - 5
- |GROUPING SETS (key % 5, key - 5)
- """.stripMargin,
- "grouping_sets_1")
- }
-
- test("grouping sets #2") {
- checkSQL(
- "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (a, b) ORDER BY a, b",
- "grouping_sets_2_1")
-
- checkSQL(
- "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (a) ORDER BY a, b",
- "grouping_sets_2_2")
-
- checkSQL(
- "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (b) ORDER BY a, b",
- "grouping_sets_2_3")
-
- checkSQL(
- "SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b GROUPING SETS (()) ORDER BY a, b",
- "grouping_sets_2_4")
-
- checkSQL(
- s"""
- |SELECT a, b, sum(c) FROM parquet_t2 GROUP BY a, b
- |GROUPING SETS ((), (a), (a, b)) ORDER BY a, b
- """.stripMargin,
- "grouping_sets_2_5")
- }
-
- test("cluster by") {
- checkSQL("SELECT id FROM parquet_t0 CLUSTER BY id", "cluster_by")
- }
-
- test("distribute by") {
- checkSQL("SELECT id FROM parquet_t0 DISTRIBUTE BY id", "distribute_by")
- }
-
- test("distribute by with sort by") {
- checkSQL("SELECT id FROM parquet_t0 DISTRIBUTE BY id SORT BY id",
- "distribute_by_with_sort_by")
- }
-
- test("SPARK-13720: sort by after having") {
- checkSQL("SELECT COUNT(value) FROM parquet_t1 GROUP BY key HAVING MAX(key) > 0 SORT BY key",
- "sort_by_after_having")
- }
-
- test("distinct aggregation") {
- checkSQL("SELECT COUNT(DISTINCT id) FROM parquet_t0", "distinct_aggregation")
- }
-
- test("TABLESAMPLE") {
- // Project [id#2L]
- // +- Sample 0.0, 1.0, false, ...
- // +- Subquery s
- // +- Subquery parquet_t0
- // +- Relation[id#2L] ParquetRelation
- checkSQL("SELECT s.id FROM parquet_t0 TABLESAMPLE(100 PERCENT) s", "tablesample_1")
-
- // Project [id#2L]
- // +- Sample 0.0, 1.0, false, ...
- // +- Subquery parquet_t0
- // +- Relation[id#2L] ParquetRelation
- checkSQL("SELECT * FROM parquet_t0 TABLESAMPLE(100 PERCENT)", "tablesample_2")
-
- // Project [id#21L]
- // +- Sample 0.0, 1.0, false, ...
- // +- MetastoreRelation default, t0, Some(s)
- checkSQL("SELECT s.id FROM t0 TABLESAMPLE(100 PERCENT) s", "tablesample_3")
-
- // Project [id#24L]
- // +- Sample 0.0, 1.0, false, ...
- // +- MetastoreRelation default, t0, None
- checkSQL("SELECT * FROM t0 TABLESAMPLE(100 PERCENT)", "tablesample_4")
-
- // When a sampling fraction is not 100%, the returned results are random.
- // Thus, added an always-false filter here to check if the generated plan can be successfully
- // executed.
- checkSQL("SELECT s.id FROM parquet_t0 TABLESAMPLE(0.1 PERCENT) s WHERE 1=0", "tablesample_5")
- checkSQL("SELECT * FROM parquet_t0 TABLESAMPLE(0.1 PERCENT) WHERE 1=0", "tablesample_6")
- }
-
- test("multi-distinct columns") {
- checkSQL("SELECT a, COUNT(DISTINCT b), COUNT(DISTINCT c), SUM(d) FROM parquet_t2 GROUP BY a",
- "multi_distinct")
- }
-
- test("persisted data source relations") {
- Seq("orc", "json", "parquet").foreach { format =>
- val tableName = s"${format}_parquet_t0"
- withTable(tableName) {
- spark.range(10).write.format(format).saveAsTable(tableName)
- checkSQL(s"SELECT id FROM $tableName", s"data_source_$tableName")
- }
- }
- }
-
- test("script transformation - schemaless") {
- assume(TestUtils.testCommandAvailable("/bin/bash"))
-
- checkSQL("SELECT TRANSFORM (a, b, c, d) USING 'cat' FROM parquet_t2",
- "script_transformation_1")
- checkSQL("SELECT TRANSFORM (*) USING 'cat' FROM parquet_t2",
- "script_transformation_2")
- }
-
- test("script transformation - alias list") {
- assume(TestUtils.testCommandAvailable("/bin/bash"))
-
- checkSQL("SELECT TRANSFORM (a, b, c, d) USING 'cat' AS (d1, d2, d3, d4) FROM parquet_t2",
- "script_transformation_alias_list")
- }
-
- test("script transformation - alias list with type") {
- assume(TestUtils.testCommandAvailable("/bin/bash"))
-
- checkSQL(
- """FROM
- |(FROM parquet_t1 SELECT TRANSFORM(key, value) USING 'cat' AS (thing1 int, thing2 string)) t
- |SELECT thing1 + 1
- """.stripMargin,
- "script_transformation_alias_list_with_type")
- }
-
- test("script transformation - row format delimited clause with only one format property") {
- assume(TestUtils.testCommandAvailable("/bin/bash"))
-
- checkSQL(
- """SELECT TRANSFORM (key) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
- |USING 'cat' AS (tKey) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
- |FROM parquet_t1
- """.stripMargin,
- "script_transformation_row_format_one")
- }
-
- test("script transformation - row format delimited clause with multiple format properties") {
- assume(TestUtils.testCommandAvailable("/bin/bash"))
-
- checkSQL(
- """SELECT TRANSFORM (key)
- |ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\t'
- |USING 'cat' AS (tKey)
- |ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\t'
- |FROM parquet_t1
- """.stripMargin,
- "script_transformation_row_format_multiple")
- }
-
- test("script transformation - row format serde clauses with SERDEPROPERTIES") {
- assume(TestUtils.testCommandAvailable("/bin/bash"))
-
- checkSQL(
- """SELECT TRANSFORM (key, value)
- |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
- |WITH SERDEPROPERTIES('field.delim' = '|')
- |USING 'cat' AS (tKey, tValue)
- |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
- |WITH SERDEPROPERTIES('field.delim' = '|')
- |FROM parquet_t1
- """.stripMargin,
- "script_transformation_row_format_serde")
- }
-
- test("script transformation - row format serde clauses without SERDEPROPERTIES") {
- assume(TestUtils.testCommandAvailable("/bin/bash"))
-
- checkSQL(
- """SELECT TRANSFORM (key, value)
- |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
- |USING 'cat' AS (tKey, tValue)
- |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
- |FROM parquet_t1
- """.stripMargin,
- "script_transformation_row_format_without_serde")
- }
-
- test("plans with non-SQL expressions") {
- spark.udf.register("foo", (_: Int) * 2)
- intercept[UnsupportedOperationException](new SQLBuilder(sql("SELECT foo(id) FROM t0")).toSQL)
- }
-
- test("named expression in column names shouldn't be quoted") {
- def checkColumnNames(query: String, expectedColNames: String*): Unit = {
- checkSQL(query)
- assert(sql(query).columns === expectedColNames)
- }
-
- // Attributes
- checkColumnNames(
- """SELECT * FROM (
- | SELECT 1 AS a, 2 AS b, 3 AS `we``ird`
- |) s
- """.stripMargin,
- "a", "b", "we`ird"
- )
-
- checkColumnNames(
- """SELECT x.a, y.a, x.b, y.b
- |FROM (SELECT 1 AS a, 2 AS b) x
- |CROSS JOIN (SELECT 1 AS a, 2 AS b) y
- |ON x.a = y.a
- """.stripMargin,
- "a", "a", "b", "b"
- )
-
- // String literal
- checkColumnNames(
- "SELECT 'foo', '\"bar\\''",
- "foo", "\"bar\'"
- )
-
- // Numeric literals (should have CAST or suffixes in column names)
- checkColumnNames(
- "SELECT 1Y, 2S, 3, 4L, 5.1, 6.1D",
- "1", "2", "3", "4", "5.1", "6.1"
- )
-
- // Aliases
- checkColumnNames(
- "SELECT 1 AS a",
- "a"
- )
-
- // Complex type extractors
- checkColumnNames(
- """SELECT
- | a.f1, b[0].f1, b.f1, c["foo"], d[0]
- |FROM (
- | SELECT
- | NAMED_STRUCT("f1", 1, "f2", "foo") AS a,
- | ARRAY(NAMED_STRUCT("f1", 1, "f2", "foo")) AS b,
- | MAP("foo", 1) AS c,
- | ARRAY(1) AS d
- |) s
- """.stripMargin,
- "f1", "b[0].f1", "f1", "c[foo]", "d[0]"
- )
- }
-
- test("window basic") {
- checkSQL("SELECT MAX(value) OVER (PARTITION BY key % 3) FROM parquet_t1", "window_basic_1")
-
- checkSQL(
- """
- |SELECT key, value, ROUND(AVG(key) OVER (), 2)
- |FROM parquet_t1 ORDER BY key
- """.stripMargin,
- "window_basic_2")
-
- checkSQL(
- """
- |SELECT value, MAX(key + 1) OVER (PARTITION BY key % 5 ORDER BY key % 7) AS max
- |FROM parquet_t1
- """.stripMargin,
- "window_basic_3")
-
- checkSQL(
- """
- |SELECT key, value, ROUND(AVG(key) OVER (), 2)
- |FROM parquet_t1 ORDER BY key nulls last
- """.stripMargin,
- "window_basic_asc_nulls_last")
-
- checkSQL(
- """
- |SELECT key, value, ROUND(AVG(key) OVER (), 2)
- |FROM parquet_t1 ORDER BY key desc nulls first
- """.stripMargin,
- "window_basic_desc_nulls_first")
- }
-
- test("multiple window functions in one expression") {
- checkSQL(
- """
- |SELECT
- | MAX(key) OVER (ORDER BY key DESC, value) / MIN(key) OVER (PARTITION BY key % 3)
- |FROM parquet_t1
- """.stripMargin)
- }
-
- test("regular expressions and window functions in one expression") {
- checkSQL("SELECT MAX(key) OVER (PARTITION BY key % 3) + key FROM parquet_t1",
- "regular_expressions_and_window")
- }
-
- test("aggregate functions and window functions in one expression") {
- checkSQL("SELECT MAX(c) + COUNT(a) OVER () FROM parquet_t2 GROUP BY a, b",
- "aggregate_functions_and_window")
- }
-
- test("window with different window specification") {
- checkSQL(
- """
- |SELECT key, value,
- |DENSE_RANK() OVER (ORDER BY key, value) AS dr,
- |MAX(value) OVER (PARTITION BY key ORDER BY key ASC) AS max
- |FROM parquet_t1
- """.stripMargin)
- }
-
- test("window with the same window specification with aggregate + having") {
- checkSQL(
- """
- |SELECT key, value,
- |MAX(value) OVER (PARTITION BY key % 5 ORDER BY key DESC) AS max
- |FROM parquet_t1 GROUP BY key, value HAVING key > 5
- """.stripMargin,
- "window_with_the_same_window_with_agg_having")
- }
-
- test("window with the same window specification with aggregate functions") {
- checkSQL(
- """
- |SELECT key, value,
- |MAX(value) OVER (PARTITION BY key % 5 ORDER BY key) AS max
- |FROM parquet_t1 GROUP BY key, value
- """.stripMargin,
- "window_with_the_same_window_with_agg_functions")
- }
-
- test("window with the same window specification with aggregate") {
- checkSQL(
- """
- |SELECT key, value,
- |DENSE_RANK() OVER (DISTRIBUTE BY key SORT BY key, value) AS dr,
- |COUNT(key)
- |FROM parquet_t1 GROUP BY key, value
- """.stripMargin,
- "window_with_the_same_window_with_agg")
- }
-
- test("window with the same window specification without aggregate and filter") {
- checkSQL(
- """
- |SELECT key, value,
- |DENSE_RANK() OVER (DISTRIBUTE BY key SORT BY key, value) AS dr,
- |COUNT(key) OVER(DISTRIBUTE BY key SORT BY key, value) AS ca
- |FROM parquet_t1
- """.stripMargin,
- "window_with_the_same_window_with_agg_filter")
- }
-
- test("window clause") {
- checkSQL(
- """
- |SELECT key, MAX(value) OVER w1 AS MAX, MIN(value) OVER w2 AS min
- |FROM parquet_t1
- |WINDOW w1 AS (PARTITION BY key % 5 ORDER BY key), w2 AS (PARTITION BY key % 6)
- """.stripMargin)
- }
-
- test("special window functions") {
- checkSQL(
- """
- |SELECT
- | RANK() OVER w,
- | PERCENT_RANK() OVER w,
- | DENSE_RANK() OVER w,
- | ROW_NUMBER() OVER w,
- | NTILE(10) OVER w,
- | CUME_DIST() OVER w,
- | LAG(key, 2) OVER w,
- | LEAD(key, 2) OVER w
- |FROM parquet_t1
- |WINDOW w AS (PARTITION BY key % 5 ORDER BY key)
- """.stripMargin)
- }
-
- test("window with join") {
- checkSQL(
- """
- |SELECT x.key, MAX(y.key) OVER (PARTITION BY x.key % 5 ORDER BY x.key)
- |FROM parquet_t1 x JOIN parquet_t1 y ON x.key = y.key
- """.stripMargin,
- "window_with_join")
- }
-
- test("join 2 tables and aggregate function in having clause") {
- checkSQL(
- """
- |SELECT COUNT(a.value), b.KEY, a.KEY
- |FROM parquet_t1 a CROSS JOIN parquet_t1 b
- |GROUP BY a.KEY, b.KEY
- |HAVING MAX(a.KEY) > 0
- """.stripMargin,
- "join_2_tables")
- }
-
- test("generator in project list without FROM clause") {
- checkSQL("SELECT EXPLODE(ARRAY(1,2,3))", "generator_without_from_1")
- checkSQL("SELECT EXPLODE(ARRAY(1,2,3)) AS val", "generator_without_from_2")
- }
-
- test("generator in project list with non-referenced table") {
- checkSQL("SELECT EXPLODE(ARRAY(1,2,3)) FROM t0", "generator_non_referenced_table_1")
- checkSQL("SELECT EXPLODE(ARRAY(1,2,3)) AS val FROM t0", "generator_non_referenced_table_2")
- }
-
- test("generator in project list with referenced table") {
- checkSQL("SELECT EXPLODE(arr) FROM parquet_t3", "generator_referenced_table_1")
- checkSQL("SELECT EXPLODE(arr) AS val FROM parquet_t3", "generator_referenced_table_2")
- }
-
- test("generator in project list with non-UDTF expressions") {
- checkSQL("SELECT EXPLODE(arr), id FROM parquet_t3", "generator_non_udtf_1")
- checkSQL("SELECT EXPLODE(arr) AS val, id as a FROM parquet_t3", "generator_non_udtf_2")
- }
-
- test("generator in lateral view") {
- checkSQL("SELECT val, id FROM parquet_t3 LATERAL VIEW EXPLODE(arr) exp AS val",
- "generator_in_lateral_view_1")
- checkSQL("SELECT val, id FROM parquet_t3 LATERAL VIEW OUTER EXPLODE(arr) exp AS val",
- "generator_in_lateral_view_2")
- }
-
- test("generator in lateral view with ambiguous names") {
- checkSQL(
- """
- |SELECT exp.id, parquet_t3.id
- |FROM parquet_t3
- |LATERAL VIEW EXPLODE(arr) exp AS id
- """.stripMargin,
- "generator_with_ambiguous_names_1")
-
- checkSQL(
- """
- |SELECT exp.id, parquet_t3.id
- |FROM parquet_t3
- |LATERAL VIEW OUTER EXPLODE(arr) exp AS id
- """.stripMargin,
- "generator_with_ambiguous_names_2")
- }
-
- test("use JSON_TUPLE as generator") {
- checkSQL(
- """
- |SELECT c0, c1, c2
- |FROM parquet_t3
- |LATERAL VIEW JSON_TUPLE(json, 'f1', 'f2', 'f3') jt
- """.stripMargin,
- "json_tuple_generator_1")
-
- checkSQL(
- """
- |SELECT a, b, c
- |FROM parquet_t3
- |LATERAL VIEW JSON_TUPLE(json, 'f1', 'f2', 'f3') jt AS a, b, c
- """.stripMargin,
- "json_tuple_generator_2")
- }
-
- test("nested generator in lateral view") {
- checkSQL(
- """
- |SELECT val, id
- |FROM parquet_t3
- |LATERAL VIEW EXPLODE(arr2) exp1 AS nested_array
- |LATERAL VIEW EXPLODE(nested_array) exp1 AS val
- """.stripMargin,
- "nested_generator_in_lateral_view_1")
-
- checkSQL(
- """
- |SELECT val, id
- |FROM parquet_t3
- |LATERAL VIEW EXPLODE(arr2) exp1 AS nested_array
- |LATERAL VIEW OUTER EXPLODE(nested_array) exp1 AS val
- """.stripMargin,
- "nested_generator_in_lateral_view_2")
- }
-
- test("generate with other operators") {
- checkSQL(
- """
- |SELECT EXPLODE(arr) AS val, id
- |FROM parquet_t3
- |WHERE id > 2
- |ORDER BY val, id
- |LIMIT 5
- """.stripMargin,
- "generate_with_other_1")
-
- checkSQL(
- """
- |SELECT val, id
- |FROM parquet_t3
- |LATERAL VIEW EXPLODE(arr2) exp1 AS nested_array
- |LATERAL VIEW EXPLODE(nested_array) exp1 AS val
- |WHERE val > 2
- |ORDER BY val, id
- |LIMIT 5
- """.stripMargin,
- "generate_with_other_2")
- }
-
- test("filter after subquery") {
- checkSQL("SELECT a FROM (SELECT key + 1 AS a FROM parquet_t1) t WHERE a > 5",
- "filter_after_subquery")
- }
-
- test("SPARK-14933 - select parquet table") {
- withTable("parquet_t") {
- sql("create table parquet_t stored as parquet as select 1 as c1, 'abc' as c2")
- checkSQL("select * from parquet_t", "select_parquet_table")
- }
- }
-
- test("predicate subquery") {
- withTable("t1") {
- withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
- sql("CREATE TABLE t1(a int)")
- checkSQL("select * from t1 b where exists (select * from t1 a)", "predicate_subquery")
- }
- }
- }
-
- test("broadcast join") {
- checkSQL(
- """
- |SELECT /*+ MAPJOIN(srcpart) */ subq.key1, z.value
- |FROM (SELECT x.key as key1, x.value as value1, y.key as key2, y.value as value2
- | FROM src1 x JOIN src y ON (x.key = y.key)) subq
- |JOIN srcpart z ON (subq.key1 = z.key and z.ds='2008-04-08' and z.hr=11)
- |ORDER BY subq.key1, z.value
- """.stripMargin,
- "broadcast_join_subquery")
- }
-
- test("subquery using single table") {
- checkSQL(
- """
- |SELECT a.k, a.c
- |FROM (SELECT b.key as k, count(1) as c
- | FROM src b
- | GROUP BY b.key) a
- |WHERE a.k >= 90
- """.stripMargin,
- "subq2")
- }
-
- test("correlated subqueries using EXISTS on where clause") {
- checkSQL(
- """
- |select *
- |from src b
- |where exists (select a.key
- | from src a
- | where b.value = a.value and a.key = b.key and a.value > 'val_9')
- """.stripMargin,
- "subquery_exists_1")
-
- checkSQL(
- """
- |select *
- |from (select *
- | from src b
- | where exists (select a.key
- | from src a
- | where b.value = a.value and a.key = b.key and a.value > 'val_9')) a
- """.stripMargin,
- "subquery_exists_2")
- }
-
- test("correlated subqueries using EXISTS on having clause") {
- checkSQL(
- """
- |select b.key, count(*)
- |from src b
- |group by b.key
- |having exists (select a.key
- | from src a
- | where a.key = b.key and a.value > 'val_9')
- """.stripMargin,
- "subquery_exists_having_1")
-
- checkSQL(
- """
- |select *
- |from (select b.key, count(*)
- | from src b
- | group by b.key
- | having exists (select a.key
- | from src a
- | where a.key = b.key and a.value > 'val_9')) a
- """.stripMargin,
- "subquery_exists_having_2")
-
- checkSQL(
- """
- |select b.key, min(b.value)
- |from src b
- |group by b.key
- |having exists (select a.key
- | from src a
- | where a.value > 'val_9' and a.value = min(b.value))
- """.stripMargin,
- "subquery_exists_having_3")
- }
-
- test("correlated subqueries using NOT EXISTS on where clause") {
- checkSQL(
- """
- |select *
- |from src b
- |where not exists (select a.key
- | from src a
- | where b.value = a.value and a.key = b.key and a.value > 'val_2')
- """.stripMargin,
- "subquery_not_exists_1")
-
- checkSQL(
- """
- |select *
- |from src b
- |where not exists (select a.key
- | from src a
- | where b.value = a.value and a.value > 'val_2')
- """.stripMargin,
- "subquery_not_exists_2")
- }
-
- test("correlated subqueries using NOT EXISTS on having clause") {
- checkSQL(
- """
- |select *
- |from src b
- |group by key, value
- |having not exists (select a.key
- | from src a
- | where b.value = a.value and a.key = b.key and a.value > 'val_12')
- """.stripMargin,
- "subquery_not_exists_having_1")
-
- checkSQL(
- """
- |select *
- |from src b
- |group by key, value
- |having not exists (select distinct a.key
- | from src a
- | where b.value = a.value and a.value > 'val_12')
- """.stripMargin,
- "subquery_not_exists_having_2")
- }
-
- test("subquery using IN on where clause") {
- checkSQL(
- """
- |SELECT key
- |FROM src
- |WHERE key in (SELECT max(key) FROM src)
- """.stripMargin,
- "subquery_in")
- }
-
- test("subquery using IN on having clause") {
- checkSQL(
- """
- |select key, count(*)
- |from src
- |group by key
- |having count(*) in (select count(*) from src s1 where s1.key = '90' group by s1.key)
- |order by key
- """.stripMargin,
- "subquery_in_having_1")
-
- checkSQL(
- """
- |select b.key, min(b.value)
- |from src b
- |group by b.key
- |having b.key in (select a.key
- | from src a
- | where a.value > 'val_9' and a.value = min(b.value))
- |order by b.key
- """.stripMargin,
- "subquery_in_having_2")
- }
-
- test("SPARK-14933 - select orc table") {
- withTable("orc_t") {
- sql("create table orc_t stored as orc as select 1 as c1, 'abc' as c2")
- checkSQL("select * from orc_t", "select_orc_table")
- }
- }
-
- test("inline tables") {
- checkSQL(
- """
- |select * from values ("one", 1), ("two", 2), ("three", null) as data(a, b) where b > 1
- """.stripMargin,
- "inline_tables")
- }
-
- test("SPARK-17750 - interval arithmetic") {
- withTable("dates") {
- sql("create table dates (ts timestamp)")
- checkSQL(
- """
- |select ts + interval 1 day, ts + interval 2 days,
- | ts - interval 1 day, ts - interval 2 days,
- | ts + interval '1' day, ts + interval '2' days,
- | ts - interval '1' day, ts - interval '2' days
- |from dates
- """.stripMargin,
- "interval_arithmetic"
- )
- }
- }
-
- test("SPARK-17982 - limit") {
- withTable("tbl") {
- sql("CREATE TABLE tbl(id INT, name STRING)")
- checkSQL(
- "SELECT * FROM (SELECT id FROM tbl LIMIT 2)",
- "limit"
- )
- }
- }
-}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/SQLBuilderTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/SQLBuilderTest.scala
index 31755f56ec..157783abc8 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/SQLBuilderTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/SQLBuilderTest.scala
@@ -41,33 +41,4 @@ abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton {
""".stripMargin)
}
}
-
- protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = {
- val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) =>
- fail(
- s"""Cannot convert the following logical query plan to SQL:
- |
- |${plan.treeString}
- """.stripMargin)
- }
-
- try {
- assert(generatedSQL === expectedSQL)
- } catch {
- case cause: Throwable =>
- fail(
- s"""Wrong SQL generated for the following logical query plan:
- |
- |${plan.treeString}
- |
- |$cause
- """.stripMargin)
- }
-
- checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan))
- }
-
- protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = {
- checkSQL(df.queryExecution.analyzed, expectedSQL)
- }
}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
index 4772a264d6..f3151d52f2 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
@@ -27,7 +27,6 @@ import org.scalatest.{BeforeAndAfterAll, GivenWhenThen}
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Dataset
-import org.apache.spark.sql.catalyst.SQLBuilder
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.util._
@@ -343,57 +342,8 @@ abstract class HiveComparisonTest
// Run w/ catalyst
val catalystResults = queryList.zip(hiveResults).map { case (queryString, hive) =>
- var query: TestHiveQueryExecution = null
- try {
- query = {
- val originalQuery = new TestHiveQueryExecution(
- queryString.replace("../../data", testDataPath))
- val containsCommands = originalQuery.analyzed.collectFirst {
- case _: Command => ()
- case _: InsertIntoTable => ()
- }.nonEmpty
-
- if (containsCommands) {
- originalQuery
- } else {
- val convertedSQL = try {
- new SQLBuilder(originalQuery.analyzed).toSQL
- } catch {
- case NonFatal(e) => fail(
- s"""Cannot convert the following HiveQL query plan back to SQL query string:
- |
- |# Original HiveQL query string:
- |$queryString
- |
- |# Resolved query plan:
- |${originalQuery.analyzed.treeString}
- """.stripMargin, e)
- }
-
- try {
- val queryExecution = new TestHiveQueryExecution(convertedSQL)
- // Trigger the analysis of this converted SQL query.
- queryExecution.analyzed
- queryExecution
- } catch {
- case NonFatal(e) => fail(
- s"""Failed to analyze the converted SQL string:
- |
- |# Original HiveQL query string:
- |$queryString
- |
- |# Resolved query plan:
- |${originalQuery.analyzed.treeString}
- |
- |# Converted SQL query string:
- |$convertedSQL
- """.stripMargin, e)
- }
- }
- }
-
- (query, prepareAnswer(query, query.hiveResultString()))
- } catch {
+ val query = new TestHiveQueryExecution(queryString.replace("../../data", testDataPath))
+ try { (query, prepareAnswer(query, query.hiveResultString())) } catch {
case e: Throwable =>
val errorMessage =
s"""