From 77409967008415b7ea57e9349d296350e6519687 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Wed, 6 May 2015 14:48:25 -0700 Subject: [HOT-FIX] Move HiveWindowFunctionQuerySuite.scala to hive compatibility dir. Author: Yin Huai Closes #5951 from yhuai/fixBuildMaven and squashes the following commits: fdde183 [Yin Huai] Move HiveWindowFunctionQuerySuite.scala to hive compatibility dir. --- .../execution/HiveWindowFunctionQuerySuite.scala | 845 +++++++++++++++++++++ .../execution/HiveWindowFunctionQuerySuite.scala | 845 --------------------- 2 files changed, 845 insertions(+), 845 deletions(-) create mode 100644 sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala delete mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala (limited to 'sql') diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala new file mode 100644 index 0000000000..934452fe57 --- /dev/null +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala @@ -0,0 +1,845 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.execution + +import java.io.File +import java.util.{Locale, TimeZone} + +import org.scalatest.BeforeAndAfter + +import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.util.Utils + +/** + * The test suite for window functions. To actually compare results with Hive, + * every test should be created by `createQueryTest`. Because we are reusing tables + * for different tests and there are a few properties needed to let Hive generate golden + * files, every `createQueryTest` calls should explicitly set `reset` to `false`. + */ +abstract class HiveWindowFunctionQueryBaseSuite extends HiveComparisonTest with BeforeAndAfter { + private val originalTimeZone = TimeZone.getDefault + private val originalLocale = Locale.getDefault + private val testTempDir = Utils.createTempDir() + + override def beforeAll() { + TestHive.cacheTables = true + // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) + TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) + // Add Locale setting + Locale.setDefault(Locale.US) + + // Create the table used in windowing.q + sql("DROP TABLE IF EXISTS part") + sql( + """ + |CREATE TABLE part( + | p_partkey INT, + | p_name STRING, + | p_mfgr STRING, + | p_brand STRING, + | p_type STRING, + | p_size INT, + | p_container STRING, + | p_retailprice DOUBLE, + | p_comment STRING) + """.stripMargin) + val testData1 = TestHive.getHiveFile("data/files/part_tiny.txt").getCanonicalPath + sql( + s""" + |LOAD DATA LOCAL INPATH '$testData1' overwrite into table part + """.stripMargin) + + sql("DROP TABLE IF EXISTS over1k") + sql( + """ + |create table over1k( + | t tinyint, + | si smallint, + | i int, + | b bigint, + | f float, + | d double, + | bo boolean, + | s string, + | ts timestamp, + | dec decimal(4,2), + | bin binary) + |row format delimited + |fields terminated by '|' + """.stripMargin) + val testData2 = TestHive.getHiveFile("data/files/over1k").getCanonicalPath + sql( + s""" + |LOAD DATA LOCAL INPATH '$testData2' overwrite into table over1k + """.stripMargin) + + // The following settings are used for generating golden files with Hive. + // We have to use kryo to correctly let Hive serialize plans with window functions. + // This is used to generate golden files. + sql("set hive.plan.serialization.format=kryo") + // Explicitly set fs to local fs. + sql(s"set fs.default.name=file://$testTempDir/") + // Ask Hive to run jobs in-process as a single map and reduce task. + sql("set mapred.job.tracker=local") + } + + override def afterAll() { + TestHive.cacheTables = false + TimeZone.setDefault(originalTimeZone) + Locale.setDefault(originalLocale) + TestHive.reset() + } + + ///////////////////////////////////////////////////////////////////////////// + // Tests based on windowing_multipartitioning.q + // Results of the original query file are not deterministic. + ///////////////////////////////////////////////////////////////////////////// + createQueryTest("windowing_multipartitioning.q (deterministic) 1", + s""" + |select s, + |rank() over (partition by s order by si) r, + |sum(b) over (partition by s order by si) sum + |from over1k + |order by s, r, sum; + """.stripMargin, reset = false) + + /* timestamp comparison issue with Hive? + createQueryTest("windowing_multipartitioning.q (deterministic) 2", + s""" + |select s, + |rank() over (partition by s order by dec desc) r, + |sum(b) over (partition by s order by ts desc) as sum + |from over1k + |where s = 'tom allen' or s = 'bob steinbeck' + |order by s, r, sum; + """.stripMargin, reset = false) + */ + + createQueryTest("windowing_multipartitioning.q (deterministic) 3", + s""" + |select s, sum(i) over (partition by s), sum(f) over (partition by si) + |from over1k where s = 'tom allen' or s = 'bob steinbeck'; + """.stripMargin, reset = false) + + createQueryTest("windowing_multipartitioning.q (deterministic) 4", + s""" + |select s, rank() over (partition by s order by bo), + |rank() over (partition by si order by bin desc) from over1k + |where s = 'tom allen' or s = 'bob steinbeck'; + """.stripMargin, reset = false) + + createQueryTest("windowing_multipartitioning.q (deterministic) 5", + s""" + |select s, sum(f) over (partition by i), row_number() over (order by f) + |from over1k where s = 'tom allen' or s = 'bob steinbeck'; + """.stripMargin, reset = false) + + createQueryTest("windowing_multipartitioning.q (deterministic) 6", + s""" + |select s, rank() over w1, + |rank() over w2 + |from over1k + |where s = 'tom allen' or s = 'bob steinbeck' + |window + |w1 as (partition by s order by dec), + |w2 as (partition by si order by f) ; + """.stripMargin, reset = false) + + ///////////////////////////////////////////////////////////////////////////// + // Tests based on windowing_navfn.q + // Results of the original query file are not deterministic. + // Also, the original query of + // select i, lead(s) over (partition by bin order by d,i desc) from over1k ; + ///////////////////////////////////////////////////////////////////////////// + createQueryTest("windowing_navfn.q (deterministic)", + s""" + |select s, row_number() over (partition by d order by dec) rn from over1k + |order by s, rn desc; + |select i, lead(s) over (partition by cast(bin as string) order by d,i desc) as l + |from over1k + |order by i desc, l; + |select i, lag(dec) over (partition by i order by s,i,dec) l from over1k + |order by i, l; + |select s, last_value(t) over (partition by d order by f) l from over1k + |order by s, l; + |select s, first_value(s) over (partition by bo order by s) f from over1k + |order by s, f; + |select t, s, i, last_value(i) over (partition by t order by s) + |from over1k where (s = 'oscar allen' or s = 'oscar carson') and t = 10; + """.stripMargin, reset = false) + + ///////////////////////////////////////////////////////////////////////////// + // Tests based on windowing_ntile.q + // Results of the original query file are not deterministic. + ///////////////////////////////////////////////////////////////////////////// + createQueryTest("windowing_ntile.q (deterministic)", + s""" + |select i, ntile(10) over (partition by s order by i) n from over1k + |order by i, n; + |select s, ntile(100) over (partition by i order by s) n from over1k + |order by s, n; + |select f, ntile(4) over (partition by d order by f) n from over1k + |order by f, n; + |select d, ntile(1000) over (partition by dec order by d) n from over1k + |order by d, n; + """.stripMargin, reset = false) + + ///////////////////////////////////////////////////////////////////////////// + // Tests based on windowing_udaf.q + // Results of the original query file are not deterministic. + ///////////////////////////////////////////////////////////////////////////// + createQueryTest("windowing_udaf.q (deterministic)", + s""" + |select s, min(i) over (partition by s) m from over1k + |order by s, m; + |select s, avg(f) over (partition by si order by s) a from over1k + |order by s, a; + |select s, avg(i) over (partition by t, b order by s) a from over1k + |order by s, a; + |select max(i) over w m from over1k + |order by m window w as (partition by f) ; + |select s, avg(d) over (partition by t order by f) a from over1k + |order by s, a; + """.stripMargin, reset = false) + + ///////////////////////////////////////////////////////////////////////////// + // Tests based on windowing_windowspec.q + // Results of the original query file are not deterministic. + ///////////////////////////////////////////////////////////////////////////// + createQueryTest("windowing_windowspec.q (deterministic)", + s""" + |select s, sum(b) over (partition by i order by s,b rows unbounded preceding) as sum + |from over1k order by s, sum; + |select s, sum(f) over (partition by d order by s,f rows unbounded preceding) as sum + |from over1k order by s, sum; + |select s, sum(f) over + |(partition by ts order by f range between current row and unbounded following) as sum + |from over1k order by s, sum; + |select s, avg(f) + |over (partition by ts order by s,f rows between current row and 5 following) avg + |from over1k order by s, avg; + |select s, avg(d) over + |(partition by t order by s,d desc rows between 5 preceding and 5 following) avg + |from over1k order by s, avg; + |select s, sum(i) over(partition by ts order by s) sum from over1k + |order by s, sum; + |select f, sum(f) over + |(partition by ts order by f range between unbounded preceding and current row) sum + |from over1k order by f, sum; + |select s, i, round(avg(d) over (partition by s order by i) / 10.0 , 2) avg + |from over1k order by s, i, avg; + |select s, i, round((avg(d) over w1 + 10.0) - (avg(d) over w1 - 10.0),2) avg + |from over1k + |order by s, i, avg window w1 as (partition by s order by i); + """.stripMargin, reset = false) + + ///////////////////////////////////////////////////////////////////////////// + // Tests based on windowing_rank.q + // Results of the original query file are not deterministic. + ///////////////////////////////////////////////////////////////////////////// + createQueryTest("windowing_rank.q (deterministic) 1", + s""" + |select s, rank() over (partition by f order by t) r from over1k order by s, r; + |select s, dense_rank() over (partition by ts order by i,s desc) as r from over1k + |order by s desc, r desc; + |select s, cume_dist() over (partition by bo order by b,s) cd from over1k + |order by s, cd; + |select s, percent_rank() over (partition by dec order by f) r from over1k + |order by s desc, r desc; + """.stripMargin, reset = false) + + createQueryTest("windowing_rank.q (deterministic) 2", + s""" + |select ts, dec, rnk + |from + | (select ts, dec, + | rank() over (partition by ts order by dec) as rnk + | from + | (select other.ts, other.dec + | from over1k other + | join over1k on (other.b = over1k.b) + | ) joined + | ) ranked + |where rnk = 1 + |order by ts, dec, rnk; + """.stripMargin, reset = false) + + createQueryTest("windowing_rank.q (deterministic) 3", + s""" + |select ts, dec, rnk + |from + | (select ts, dec, + | rank() over (partition by ts order by dec) as rnk + | from + | (select other.ts, other.dec + | from over1k other + | join over1k on (other.b = over1k.b) + | ) joined + | ) ranked + |where dec = 89.5 + |order by ts, dec, rnk; + """.stripMargin, reset = false) + + createQueryTest("windowing_rank.q (deterministic) 4", + s""" + |select ts, dec, rnk + |from + | (select ts, dec, + | rank() over (partition by ts order by dec) as rnk + | from + | (select other.ts, other.dec + | from over1k other + | join over1k on (other.b = over1k.b) + | where other.t < 10 + | ) joined + | ) ranked + |where rnk = 1 + |order by ts, dec, rnk; + """.stripMargin, reset = false) + + ///////////////////////////////////////////////////////////////////////////// + // Tests from windowing.q + // We port tests in windowing.q to here because this query file contains too + // many tests and the syntax of test "-- 7. testJoinWithWindowingAndPTF" + // is not supported right now. + ///////////////////////////////////////////////////////////////////////////// + createQueryTest("windowing.q -- 1. testWindowing", + s""" + |select p_mfgr, p_name, p_size, + |rank() over(distribute by p_mfgr sort by p_name) as r, + |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, + |sum(p_retailprice) over + |(distribute by p_mfgr sort by p_name rows between unbounded preceding and current row) as s1 + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 2. testGroupByWithPartitioning", + s""" + |select p_mfgr, p_name, p_size, + |min(p_retailprice), + |rank() over(distribute by p_mfgr sort by p_name)as r, + |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, + |p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz + |from part + |group by p_mfgr, p_name, p_size + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 3. testGroupByHavingWithSWQ", + s""" + |select p_mfgr, p_name, p_size, min(p_retailprice), + |rank() over(distribute by p_mfgr sort by p_name) as r, + |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, + |p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz + |from part + |group by p_mfgr, p_name, p_size + |having p_size > 0 + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 4. testCount", + s""" + |select p_mfgr, p_name, + |count(p_size) over(distribute by p_mfgr sort by p_name) as cd + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 5. testCountWithWindowingUDAF", + s""" + |select p_mfgr, p_name, + |rank() over(distribute by p_mfgr sort by p_name) as r, + |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, + |count(p_size) over(distribute by p_mfgr sort by p_name) as cd, + |p_retailprice, sum(p_retailprice) over (distribute by p_mfgr sort by p_name + | rows between unbounded preceding and current row) as s1, + |p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 6. testCountInSubQ", + s""" + |select sub1.r, sub1.dr, sub1.cd, sub1.s1, sub1.deltaSz + |from (select p_mfgr, p_name, + |rank() over(distribute by p_mfgr sort by p_name) as r, + |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, + |count(p_size) over(distribute by p_mfgr sort by p_name) as cd, + |p_retailprice, sum(p_retailprice) over (distribute by p_mfgr sort by p_name + | rows between unbounded preceding and current row) as s1, + |p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz + |from part + |) sub1 + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 8. testMixedCaseAlias", + s""" + |select p_mfgr, p_name, p_size, + |rank() over(distribute by p_mfgr sort by p_name, p_size desc) as R + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 9. testHavingWithWindowingNoGBY", + s""" + |select p_mfgr, p_name, p_size, + |rank() over(distribute by p_mfgr sort by p_name) as r, + |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, + |sum(p_retailprice) over (distribute by p_mfgr sort by p_name + | rows between unbounded preceding and current row) as s1 + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 10. testHavingWithWindowingCondRankNoGBY", + s""" + |select p_mfgr, p_name, p_size, + |rank() over(distribute by p_mfgr sort by p_name) as r, + |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, + |sum(p_retailprice) over (distribute by p_mfgr sort by p_name + | rows between unbounded preceding and current row) as s1 + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 11. testFirstLast", + s""" + |select p_mfgr,p_name, p_size, + |sum(p_size) over (distribute by p_mfgr sort by p_name + |rows between current row and current row) as s2, + |first_value(p_size) over w1 as f, + |last_value(p_size, false) over w1 as l + |from part + |window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 12. testFirstLastWithWhere", + s""" + |select p_mfgr,p_name, p_size, + |rank() over(distribute by p_mfgr sort by p_name) as r, + |sum(p_size) over (distribute by p_mfgr sort by p_name + |rows between current row and current row) as s2, + |first_value(p_size) over w1 as f, + |last_value(p_size, false) over w1 as l + |from part + |where p_mfgr = 'Manufacturer#3' + |window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 13. testSumWindow", + s""" + |select p_mfgr,p_name, p_size, + |sum(p_size) over w1 as s1, + |sum(p_size) over (distribute by p_mfgr sort by p_name + |rows between current row and current row) as s2 + |from part + |window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 14. testNoSortClause", + s""" + |select p_mfgr,p_name, p_size, + |rank() over(distribute by p_mfgr sort by p_name) as r, + |dense_rank() over(distribute by p_mfgr sort by p_name) as dr + |from part + |window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 15. testExpressions", + s""" + |select p_mfgr,p_name, p_size, + |rank() over(distribute by p_mfgr sort by p_name) as r, + |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, + |cume_dist() over(distribute by p_mfgr sort by p_name) as cud, + |percent_rank() over(distribute by p_mfgr sort by p_name) as pr, + |ntile(3) over(distribute by p_mfgr sort by p_name) as nt, + |count(p_size) over(distribute by p_mfgr sort by p_name) as ca, + |avg(p_size) over(distribute by p_mfgr sort by p_name) as avg, + |stddev(p_size) over(distribute by p_mfgr sort by p_name) as st, + |first_value(p_size % 5) over(distribute by p_mfgr sort by p_name) as fv, + |last_value(p_size) over(distribute by p_mfgr sort by p_name) as lv, + |first_value(p_size) over w1 as fvW1 + |from part + |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name + | rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 16. testMultipleWindows", + s""" + |select p_mfgr,p_name, p_size, + |rank() over(distribute by p_mfgr sort by p_name) as r, + |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, + |cume_dist() over(distribute by p_mfgr sort by p_name) as cud, + |sum(p_size) over (distribute by p_mfgr sort by p_name + |range between unbounded preceding and current row) as s1, + |sum(p_size) over (distribute by p_mfgr sort by p_size + |range between 5 preceding and current row) as s2, + |first_value(p_size) over w1 as fv1 + |from part + |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name + | rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + + + createQueryTest("windowing.q -- 17. testCountStar", + s""" + |select p_mfgr,p_name, p_size, + |count(*) over(distribute by p_mfgr sort by p_name ) as c, + |count(p_size) over(distribute by p_mfgr sort by p_name) as ca, + |first_value(p_size) over w1 as fvW1 + |from part + |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name + | rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 18. testUDAFs", + s""" + |select p_mfgr,p_name, p_size, + |sum(p_retailprice) over w1 as s, + |min(p_retailprice) over w1 as mi, + |max(p_retailprice) over w1 as ma, + |avg(p_retailprice) over w1 as ag + |from part + |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name + | rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 19. testUDAFsWithGBY", + """ + |select p_mfgr,p_name, p_size, p_retailprice, + |sum(p_retailprice) over w1 as s, + |min(p_retailprice) as mi , + |max(p_retailprice) as ma , + |avg(p_retailprice) over w1 as ag + |from part + |group by p_mfgr,p_name, p_size, p_retailprice + |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name + | rows between 2 preceding and 2 following); + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 20. testSTATs", + """ + |select p_mfgr,p_name, p_size, + |stddev(p_retailprice) over w1 as sdev, + |stddev_pop(p_retailprice) over w1 as sdev_pop, + |collect_set(p_size) over w1 as uniq_size, + |variance(p_retailprice) over w1 as var, + |corr(p_size, p_retailprice) over w1 as cor, + |covar_pop(p_size, p_retailprice) over w1 as covarp + |from part + |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name + | rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 21. testDISTs", + """ + |select p_mfgr,p_name, p_size, + |histogram_numeric(p_retailprice, 5) over w1 as hist, + |percentile(p_partkey, 0.5) over w1 as per, + |row_number() over(distribute by p_mfgr sort by p_name) as rn + |from part + |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name + | rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 24. testLateralViews", + """ + |select p_mfgr, p_name, + |lv_col, p_size, sum(p_size) over w1 as s + |from (select p_mfgr, p_name, p_size, array(1,2,3) arr from part) p + |lateral view explode(arr) part_lv as lv_col + |window w1 as (distribute by p_mfgr sort by p_size, lv_col + | rows between 2 preceding and current row) + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 26. testGroupByHavingWithSWQAndAlias", + """ + |select p_mfgr, p_name, p_size, min(p_retailprice) as mi, + |rank() over(distribute by p_mfgr sort by p_name) as r, + |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, + |p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz + |from part + |group by p_mfgr, p_name, p_size + |having p_size > 0 + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 27. testMultipleRangeWindows", + """ + |select p_mfgr,p_name, p_size, + |sum(p_size) over (distribute by p_mfgr sort by p_size + |range between 10 preceding and current row) as s2, + |sum(p_size) over (distribute by p_mfgr sort by p_size + |range between current row and 10 following ) as s1 + |from part + |window w1 as (rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 28. testPartOrderInUDAFInvoke", + """ + |select p_mfgr, p_name, p_size, + |sum(p_size) over (partition by p_mfgr order by p_name + |rows between 2 preceding and 2 following) as s + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 29. testPartOrderInWdwDef", + """ + |select p_mfgr, p_name, p_size, + |sum(p_size) over w1 as s + |from part + |window w1 as (partition by p_mfgr order by p_name + | rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 30. testDefaultPartitioningSpecRules", + """ + |select p_mfgr, p_name, p_size, + |sum(p_size) over w1 as s, + |sum(p_size) over w2 as s2 + |from part + |window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following), + | w2 as (partition by p_mfgr order by p_name) + """.stripMargin, reset = false) + + /* p_name is not a numeric column. What is Hive's semantic? + createQueryTest("windowing.q -- 31. testWindowCrossReference", + """ + |select p_mfgr, p_name, p_size, + |sum(p_size) over w1 as s1, + |sum(p_size) over w2 as s2 + |from part + |window w1 as (partition by p_mfgr order by p_name + | range between 2 preceding and 2 following), + | w2 as w1 + """.stripMargin, reset = false) + */ + /* + createQueryTest("windowing.q -- 32. testWindowInheritance", + """ + |select p_mfgr, p_name, p_size, + |sum(p_size) over w1 as s1, + |sum(p_size) over w2 as s2 + |from part + |window w1 as (partition by p_mfgr order by p_name + | range between 2 preceding and 2 following), + | w2 as (w1 rows between unbounded preceding and current row) + """.stripMargin, reset = false) + */ + + /* p_name is not a numeric column. What is Hive's semantic? + createQueryTest("windowing.q -- 33. testWindowForwardReference", + """ + |select p_mfgr, p_name, p_size, + |sum(p_size) over w1 as s1, + |sum(p_size) over w2 as s2, + |sum(p_size) over w3 as s3 + |from part + |window w1 as (distribute by p_mfgr sort by p_name + | range between 2 preceding and 2 following), + | w2 as w3, + | w3 as (distribute by p_mfgr sort by p_name + | range between unbounded preceding and current row) + """.stripMargin, reset = false) + */ + /* + createQueryTest("windowing.q -- 34. testWindowDefinitionPropagation", + """ + |select p_mfgr, p_name, p_size, + |sum(p_size) over w1 as s1, + |sum(p_size) over w2 as s2, + |sum(p_size) over (w3 rows between 2 preceding and 2 following) as s3 + |from part + |window w1 as (distribute by p_mfgr sort by p_name + | range between 2 preceding and 2 following), + | w2 as w3, + | w3 as (distribute by p_mfgr sort by p_name + | range between unbounded preceding and current row) + """.stripMargin, reset = false) + */ + + /* Seems Hive evaluate SELECT DISTINCT before window functions? + createQueryTest("windowing.q -- 35. testDistinctWithWindowing", + """ + |select DISTINCT p_mfgr, p_name, p_size, + |sum(p_size) over w1 as s + |from part + |window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following) + """.stripMargin, reset = false) + */ + + createQueryTest("windowing.q -- 36. testRankWithPartitioning", + """ + |select p_mfgr, p_name, p_size, + |rank() over (partition by p_mfgr order by p_name ) as r + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 37. testPartitioningVariousForms", + """ + |select p_mfgr, + |round(sum(p_retailprice) over (partition by p_mfgr order by p_mfgr),2) as s1, + |min(p_retailprice) over (partition by p_mfgr) as s2, + |max(p_retailprice) over (distribute by p_mfgr sort by p_mfgr) as s3, + |round(avg(p_retailprice) over (distribute by p_mfgr),2) as s4, + |count(p_retailprice) over (cluster by p_mfgr ) as s5 + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 38. testPartitioningVariousForms2", + """ + |select p_mfgr, p_name, p_size, + |sum(p_retailprice) over (partition by p_mfgr, p_name order by p_mfgr, p_name + |rows between unbounded preceding and current row) as s1, + |min(p_retailprice) over (distribute by p_mfgr, p_name sort by p_mfgr, p_name + |rows between unbounded preceding and current row) as s2, + |max(p_retailprice) over (partition by p_mfgr, p_name order by p_name) as s3 + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 39. testUDFOnOrderCols", + """ + |select p_mfgr, p_type, substr(p_type, 2) as short_ptype, + |rank() over (partition by p_mfgr order by substr(p_type, 2)) as r + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 40. testNoBetweenForRows", + """ + |select p_mfgr, p_name, p_size, + |sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows unbounded preceding) as s1 + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 41. testNoBetweenForRange", + """ + |select p_mfgr, p_name, p_size, + |sum(p_retailprice) over (distribute by p_mfgr sort by p_size range unbounded preceding) as s1 + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 42. testUnboundedFollowingForRows", + """ + |select p_mfgr, p_name, p_size, + |sum(p_retailprice) over (distribute by p_mfgr sort by p_name + |rows between current row and unbounded following) as s1 + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 43. testUnboundedFollowingForRange", + """ + |select p_mfgr, p_name, p_size, + |sum(p_retailprice) over (distribute by p_mfgr sort by p_size + |range between current row and unbounded following) as s1 + |from part + """.stripMargin, reset = false) + + createQueryTest("windowing.q -- 44. testOverNoPartitionSingleAggregate", + """ + |select p_name, p_retailprice, + |round(avg(p_retailprice) over(),2) + |from part + |order by p_name + """.stripMargin, reset = false) +} + +class HiveWindowFunctionQueryWithoutCodeGenSuite extends HiveWindowFunctionQueryBaseSuite { + var originalCodegenEnabled: Boolean = _ + override def beforeAll(): Unit = { + super.beforeAll() + originalCodegenEnabled = conf.codegenEnabled + sql("set spark.sql.codegen=false") + } + + override def afterAll(): Unit = { + sql(s"set spark.sql.codegen=$originalCodegenEnabled") + super.afterAll() + } +} + +abstract class HiveWindowFunctionQueryFileBaseSuite + extends HiveCompatibilitySuite with BeforeAndAfter { + private val originalTimeZone = TimeZone.getDefault + private val originalLocale = Locale.getDefault + private val testTempDir = Utils.createTempDir() + + override def beforeAll() { + TestHive.cacheTables = true + // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) + TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) + // Add Locale setting + Locale.setDefault(Locale.US) + + // The following settings are used for generating golden files with Hive. + // We have to use kryo to correctly let Hive serialize plans with window functions. + // This is used to generate golden files. + sql("set hive.plan.serialization.format=kryo") + // Explicitly set fs to local fs. + sql(s"set fs.default.name=file://$testTempDir/") + // Ask Hive to run jobs in-process as a single map and reduce task. + sql("set mapred.job.tracker=local") + } + + override def afterAll() { + TestHive.cacheTables = false + TimeZone.setDefault(originalTimeZone) + Locale.setDefault(originalLocale) + TestHive.reset() + } + + override def blackList: Seq[String] = Seq( + // Partitioned table functions are not supported. + "ptf*", + // tests of windowing.q are in HiveWindowFunctionQueryBaseSuite + "windowing.q", + + // This one failed on the expression of + // sum(lag(p_retailprice,1,0.0)) over w1 + // lag(p_retailprice,1,0.0) is a GenericUDF and the argument inspector of + // p_retailprice created by HiveInspectors is + // PrimitiveObjectInspectorFactory.javaDoubleObjectInspector. + // However, seems Hive assumes it is + // PrimitiveObjectInspectorFactory.writableDoubleObjectInspector, which introduces an error. + "windowing_expressions", + + // Hive's results are not deterministic + "windowing_multipartitioning", + "windowing_navfn", + "windowing_ntile", + "windowing_udaf", + "windowing_windowspec", + "windowing_rank" + ) + + override def whiteList: Seq[String] = Seq( + "windowing_udaf2", + "windowing_columnPruning", + "windowing_adjust_rowcontainer_sz" + ) + + override def testCases: Seq[(String, File)] = super.testCases.filter { + case (name, _) => realWhiteList.contains(name) + } +} + +class HiveWindowFunctionQueryFileWithoutCodeGenSuite extends HiveWindowFunctionQueryFileBaseSuite { + var originalCodegenEnabled: Boolean = _ + override def beforeAll(): Unit = { + super.beforeAll() + originalCodegenEnabled = conf.codegenEnabled + sql("set spark.sql.codegen=false") + } + + override def afterAll(): Unit = { + sql(s"set spark.sql.codegen=$originalCodegenEnabled") + super.afterAll() + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala deleted file mode 100644 index 934452fe57..0000000000 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala +++ /dev/null @@ -1,845 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.hive.execution - -import java.io.File -import java.util.{Locale, TimeZone} - -import org.scalatest.BeforeAndAfter - -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.util.Utils - -/** - * The test suite for window functions. To actually compare results with Hive, - * every test should be created by `createQueryTest`. Because we are reusing tables - * for different tests and there are a few properties needed to let Hive generate golden - * files, every `createQueryTest` calls should explicitly set `reset` to `false`. - */ -abstract class HiveWindowFunctionQueryBaseSuite extends HiveComparisonTest with BeforeAndAfter { - private val originalTimeZone = TimeZone.getDefault - private val originalLocale = Locale.getDefault - private val testTempDir = Utils.createTempDir() - - override def beforeAll() { - TestHive.cacheTables = true - // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) - TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) - // Add Locale setting - Locale.setDefault(Locale.US) - - // Create the table used in windowing.q - sql("DROP TABLE IF EXISTS part") - sql( - """ - |CREATE TABLE part( - | p_partkey INT, - | p_name STRING, - | p_mfgr STRING, - | p_brand STRING, - | p_type STRING, - | p_size INT, - | p_container STRING, - | p_retailprice DOUBLE, - | p_comment STRING) - """.stripMargin) - val testData1 = TestHive.getHiveFile("data/files/part_tiny.txt").getCanonicalPath - sql( - s""" - |LOAD DATA LOCAL INPATH '$testData1' overwrite into table part - """.stripMargin) - - sql("DROP TABLE IF EXISTS over1k") - sql( - """ - |create table over1k( - | t tinyint, - | si smallint, - | i int, - | b bigint, - | f float, - | d double, - | bo boolean, - | s string, - | ts timestamp, - | dec decimal(4,2), - | bin binary) - |row format delimited - |fields terminated by '|' - """.stripMargin) - val testData2 = TestHive.getHiveFile("data/files/over1k").getCanonicalPath - sql( - s""" - |LOAD DATA LOCAL INPATH '$testData2' overwrite into table over1k - """.stripMargin) - - // The following settings are used for generating golden files with Hive. - // We have to use kryo to correctly let Hive serialize plans with window functions. - // This is used to generate golden files. - sql("set hive.plan.serialization.format=kryo") - // Explicitly set fs to local fs. - sql(s"set fs.default.name=file://$testTempDir/") - // Ask Hive to run jobs in-process as a single map and reduce task. - sql("set mapred.job.tracker=local") - } - - override def afterAll() { - TestHive.cacheTables = false - TimeZone.setDefault(originalTimeZone) - Locale.setDefault(originalLocale) - TestHive.reset() - } - - ///////////////////////////////////////////////////////////////////////////// - // Tests based on windowing_multipartitioning.q - // Results of the original query file are not deterministic. - ///////////////////////////////////////////////////////////////////////////// - createQueryTest("windowing_multipartitioning.q (deterministic) 1", - s""" - |select s, - |rank() over (partition by s order by si) r, - |sum(b) over (partition by s order by si) sum - |from over1k - |order by s, r, sum; - """.stripMargin, reset = false) - - /* timestamp comparison issue with Hive? - createQueryTest("windowing_multipartitioning.q (deterministic) 2", - s""" - |select s, - |rank() over (partition by s order by dec desc) r, - |sum(b) over (partition by s order by ts desc) as sum - |from over1k - |where s = 'tom allen' or s = 'bob steinbeck' - |order by s, r, sum; - """.stripMargin, reset = false) - */ - - createQueryTest("windowing_multipartitioning.q (deterministic) 3", - s""" - |select s, sum(i) over (partition by s), sum(f) over (partition by si) - |from over1k where s = 'tom allen' or s = 'bob steinbeck'; - """.stripMargin, reset = false) - - createQueryTest("windowing_multipartitioning.q (deterministic) 4", - s""" - |select s, rank() over (partition by s order by bo), - |rank() over (partition by si order by bin desc) from over1k - |where s = 'tom allen' or s = 'bob steinbeck'; - """.stripMargin, reset = false) - - createQueryTest("windowing_multipartitioning.q (deterministic) 5", - s""" - |select s, sum(f) over (partition by i), row_number() over (order by f) - |from over1k where s = 'tom allen' or s = 'bob steinbeck'; - """.stripMargin, reset = false) - - createQueryTest("windowing_multipartitioning.q (deterministic) 6", - s""" - |select s, rank() over w1, - |rank() over w2 - |from over1k - |where s = 'tom allen' or s = 'bob steinbeck' - |window - |w1 as (partition by s order by dec), - |w2 as (partition by si order by f) ; - """.stripMargin, reset = false) - - ///////////////////////////////////////////////////////////////////////////// - // Tests based on windowing_navfn.q - // Results of the original query file are not deterministic. - // Also, the original query of - // select i, lead(s) over (partition by bin order by d,i desc) from over1k ; - ///////////////////////////////////////////////////////////////////////////// - createQueryTest("windowing_navfn.q (deterministic)", - s""" - |select s, row_number() over (partition by d order by dec) rn from over1k - |order by s, rn desc; - |select i, lead(s) over (partition by cast(bin as string) order by d,i desc) as l - |from over1k - |order by i desc, l; - |select i, lag(dec) over (partition by i order by s,i,dec) l from over1k - |order by i, l; - |select s, last_value(t) over (partition by d order by f) l from over1k - |order by s, l; - |select s, first_value(s) over (partition by bo order by s) f from over1k - |order by s, f; - |select t, s, i, last_value(i) over (partition by t order by s) - |from over1k where (s = 'oscar allen' or s = 'oscar carson') and t = 10; - """.stripMargin, reset = false) - - ///////////////////////////////////////////////////////////////////////////// - // Tests based on windowing_ntile.q - // Results of the original query file are not deterministic. - ///////////////////////////////////////////////////////////////////////////// - createQueryTest("windowing_ntile.q (deterministic)", - s""" - |select i, ntile(10) over (partition by s order by i) n from over1k - |order by i, n; - |select s, ntile(100) over (partition by i order by s) n from over1k - |order by s, n; - |select f, ntile(4) over (partition by d order by f) n from over1k - |order by f, n; - |select d, ntile(1000) over (partition by dec order by d) n from over1k - |order by d, n; - """.stripMargin, reset = false) - - ///////////////////////////////////////////////////////////////////////////// - // Tests based on windowing_udaf.q - // Results of the original query file are not deterministic. - ///////////////////////////////////////////////////////////////////////////// - createQueryTest("windowing_udaf.q (deterministic)", - s""" - |select s, min(i) over (partition by s) m from over1k - |order by s, m; - |select s, avg(f) over (partition by si order by s) a from over1k - |order by s, a; - |select s, avg(i) over (partition by t, b order by s) a from over1k - |order by s, a; - |select max(i) over w m from over1k - |order by m window w as (partition by f) ; - |select s, avg(d) over (partition by t order by f) a from over1k - |order by s, a; - """.stripMargin, reset = false) - - ///////////////////////////////////////////////////////////////////////////// - // Tests based on windowing_windowspec.q - // Results of the original query file are not deterministic. - ///////////////////////////////////////////////////////////////////////////// - createQueryTest("windowing_windowspec.q (deterministic)", - s""" - |select s, sum(b) over (partition by i order by s,b rows unbounded preceding) as sum - |from over1k order by s, sum; - |select s, sum(f) over (partition by d order by s,f rows unbounded preceding) as sum - |from over1k order by s, sum; - |select s, sum(f) over - |(partition by ts order by f range between current row and unbounded following) as sum - |from over1k order by s, sum; - |select s, avg(f) - |over (partition by ts order by s,f rows between current row and 5 following) avg - |from over1k order by s, avg; - |select s, avg(d) over - |(partition by t order by s,d desc rows between 5 preceding and 5 following) avg - |from over1k order by s, avg; - |select s, sum(i) over(partition by ts order by s) sum from over1k - |order by s, sum; - |select f, sum(f) over - |(partition by ts order by f range between unbounded preceding and current row) sum - |from over1k order by f, sum; - |select s, i, round(avg(d) over (partition by s order by i) / 10.0 , 2) avg - |from over1k order by s, i, avg; - |select s, i, round((avg(d) over w1 + 10.0) - (avg(d) over w1 - 10.0),2) avg - |from over1k - |order by s, i, avg window w1 as (partition by s order by i); - """.stripMargin, reset = false) - - ///////////////////////////////////////////////////////////////////////////// - // Tests based on windowing_rank.q - // Results of the original query file are not deterministic. - ///////////////////////////////////////////////////////////////////////////// - createQueryTest("windowing_rank.q (deterministic) 1", - s""" - |select s, rank() over (partition by f order by t) r from over1k order by s, r; - |select s, dense_rank() over (partition by ts order by i,s desc) as r from over1k - |order by s desc, r desc; - |select s, cume_dist() over (partition by bo order by b,s) cd from over1k - |order by s, cd; - |select s, percent_rank() over (partition by dec order by f) r from over1k - |order by s desc, r desc; - """.stripMargin, reset = false) - - createQueryTest("windowing_rank.q (deterministic) 2", - s""" - |select ts, dec, rnk - |from - | (select ts, dec, - | rank() over (partition by ts order by dec) as rnk - | from - | (select other.ts, other.dec - | from over1k other - | join over1k on (other.b = over1k.b) - | ) joined - | ) ranked - |where rnk = 1 - |order by ts, dec, rnk; - """.stripMargin, reset = false) - - createQueryTest("windowing_rank.q (deterministic) 3", - s""" - |select ts, dec, rnk - |from - | (select ts, dec, - | rank() over (partition by ts order by dec) as rnk - | from - | (select other.ts, other.dec - | from over1k other - | join over1k on (other.b = over1k.b) - | ) joined - | ) ranked - |where dec = 89.5 - |order by ts, dec, rnk; - """.stripMargin, reset = false) - - createQueryTest("windowing_rank.q (deterministic) 4", - s""" - |select ts, dec, rnk - |from - | (select ts, dec, - | rank() over (partition by ts order by dec) as rnk - | from - | (select other.ts, other.dec - | from over1k other - | join over1k on (other.b = over1k.b) - | where other.t < 10 - | ) joined - | ) ranked - |where rnk = 1 - |order by ts, dec, rnk; - """.stripMargin, reset = false) - - ///////////////////////////////////////////////////////////////////////////// - // Tests from windowing.q - // We port tests in windowing.q to here because this query file contains too - // many tests and the syntax of test "-- 7. testJoinWithWindowingAndPTF" - // is not supported right now. - ///////////////////////////////////////////////////////////////////////////// - createQueryTest("windowing.q -- 1. testWindowing", - s""" - |select p_mfgr, p_name, p_size, - |rank() over(distribute by p_mfgr sort by p_name) as r, - |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, - |sum(p_retailprice) over - |(distribute by p_mfgr sort by p_name rows between unbounded preceding and current row) as s1 - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 2. testGroupByWithPartitioning", - s""" - |select p_mfgr, p_name, p_size, - |min(p_retailprice), - |rank() over(distribute by p_mfgr sort by p_name)as r, - |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, - |p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz - |from part - |group by p_mfgr, p_name, p_size - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 3. testGroupByHavingWithSWQ", - s""" - |select p_mfgr, p_name, p_size, min(p_retailprice), - |rank() over(distribute by p_mfgr sort by p_name) as r, - |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, - |p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz - |from part - |group by p_mfgr, p_name, p_size - |having p_size > 0 - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 4. testCount", - s""" - |select p_mfgr, p_name, - |count(p_size) over(distribute by p_mfgr sort by p_name) as cd - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 5. testCountWithWindowingUDAF", - s""" - |select p_mfgr, p_name, - |rank() over(distribute by p_mfgr sort by p_name) as r, - |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, - |count(p_size) over(distribute by p_mfgr sort by p_name) as cd, - |p_retailprice, sum(p_retailprice) over (distribute by p_mfgr sort by p_name - | rows between unbounded preceding and current row) as s1, - |p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 6. testCountInSubQ", - s""" - |select sub1.r, sub1.dr, sub1.cd, sub1.s1, sub1.deltaSz - |from (select p_mfgr, p_name, - |rank() over(distribute by p_mfgr sort by p_name) as r, - |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, - |count(p_size) over(distribute by p_mfgr sort by p_name) as cd, - |p_retailprice, sum(p_retailprice) over (distribute by p_mfgr sort by p_name - | rows between unbounded preceding and current row) as s1, - |p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz - |from part - |) sub1 - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 8. testMixedCaseAlias", - s""" - |select p_mfgr, p_name, p_size, - |rank() over(distribute by p_mfgr sort by p_name, p_size desc) as R - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 9. testHavingWithWindowingNoGBY", - s""" - |select p_mfgr, p_name, p_size, - |rank() over(distribute by p_mfgr sort by p_name) as r, - |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, - |sum(p_retailprice) over (distribute by p_mfgr sort by p_name - | rows between unbounded preceding and current row) as s1 - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 10. testHavingWithWindowingCondRankNoGBY", - s""" - |select p_mfgr, p_name, p_size, - |rank() over(distribute by p_mfgr sort by p_name) as r, - |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, - |sum(p_retailprice) over (distribute by p_mfgr sort by p_name - | rows between unbounded preceding and current row) as s1 - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 11. testFirstLast", - s""" - |select p_mfgr,p_name, p_size, - |sum(p_size) over (distribute by p_mfgr sort by p_name - |rows between current row and current row) as s2, - |first_value(p_size) over w1 as f, - |last_value(p_size, false) over w1 as l - |from part - |window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 12. testFirstLastWithWhere", - s""" - |select p_mfgr,p_name, p_size, - |rank() over(distribute by p_mfgr sort by p_name) as r, - |sum(p_size) over (distribute by p_mfgr sort by p_name - |rows between current row and current row) as s2, - |first_value(p_size) over w1 as f, - |last_value(p_size, false) over w1 as l - |from part - |where p_mfgr = 'Manufacturer#3' - |window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 13. testSumWindow", - s""" - |select p_mfgr,p_name, p_size, - |sum(p_size) over w1 as s1, - |sum(p_size) over (distribute by p_mfgr sort by p_name - |rows between current row and current row) as s2 - |from part - |window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 14. testNoSortClause", - s""" - |select p_mfgr,p_name, p_size, - |rank() over(distribute by p_mfgr sort by p_name) as r, - |dense_rank() over(distribute by p_mfgr sort by p_name) as dr - |from part - |window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 15. testExpressions", - s""" - |select p_mfgr,p_name, p_size, - |rank() over(distribute by p_mfgr sort by p_name) as r, - |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, - |cume_dist() over(distribute by p_mfgr sort by p_name) as cud, - |percent_rank() over(distribute by p_mfgr sort by p_name) as pr, - |ntile(3) over(distribute by p_mfgr sort by p_name) as nt, - |count(p_size) over(distribute by p_mfgr sort by p_name) as ca, - |avg(p_size) over(distribute by p_mfgr sort by p_name) as avg, - |stddev(p_size) over(distribute by p_mfgr sort by p_name) as st, - |first_value(p_size % 5) over(distribute by p_mfgr sort by p_name) as fv, - |last_value(p_size) over(distribute by p_mfgr sort by p_name) as lv, - |first_value(p_size) over w1 as fvW1 - |from part - |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name - | rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 16. testMultipleWindows", - s""" - |select p_mfgr,p_name, p_size, - |rank() over(distribute by p_mfgr sort by p_name) as r, - |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, - |cume_dist() over(distribute by p_mfgr sort by p_name) as cud, - |sum(p_size) over (distribute by p_mfgr sort by p_name - |range between unbounded preceding and current row) as s1, - |sum(p_size) over (distribute by p_mfgr sort by p_size - |range between 5 preceding and current row) as s2, - |first_value(p_size) over w1 as fv1 - |from part - |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name - | rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - - - createQueryTest("windowing.q -- 17. testCountStar", - s""" - |select p_mfgr,p_name, p_size, - |count(*) over(distribute by p_mfgr sort by p_name ) as c, - |count(p_size) over(distribute by p_mfgr sort by p_name) as ca, - |first_value(p_size) over w1 as fvW1 - |from part - |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name - | rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 18. testUDAFs", - s""" - |select p_mfgr,p_name, p_size, - |sum(p_retailprice) over w1 as s, - |min(p_retailprice) over w1 as mi, - |max(p_retailprice) over w1 as ma, - |avg(p_retailprice) over w1 as ag - |from part - |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name - | rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 19. testUDAFsWithGBY", - """ - |select p_mfgr,p_name, p_size, p_retailprice, - |sum(p_retailprice) over w1 as s, - |min(p_retailprice) as mi , - |max(p_retailprice) as ma , - |avg(p_retailprice) over w1 as ag - |from part - |group by p_mfgr,p_name, p_size, p_retailprice - |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name - | rows between 2 preceding and 2 following); - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 20. testSTATs", - """ - |select p_mfgr,p_name, p_size, - |stddev(p_retailprice) over w1 as sdev, - |stddev_pop(p_retailprice) over w1 as sdev_pop, - |collect_set(p_size) over w1 as uniq_size, - |variance(p_retailprice) over w1 as var, - |corr(p_size, p_retailprice) over w1 as cor, - |covar_pop(p_size, p_retailprice) over w1 as covarp - |from part - |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name - | rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 21. testDISTs", - """ - |select p_mfgr,p_name, p_size, - |histogram_numeric(p_retailprice, 5) over w1 as hist, - |percentile(p_partkey, 0.5) over w1 as per, - |row_number() over(distribute by p_mfgr sort by p_name) as rn - |from part - |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name - | rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 24. testLateralViews", - """ - |select p_mfgr, p_name, - |lv_col, p_size, sum(p_size) over w1 as s - |from (select p_mfgr, p_name, p_size, array(1,2,3) arr from part) p - |lateral view explode(arr) part_lv as lv_col - |window w1 as (distribute by p_mfgr sort by p_size, lv_col - | rows between 2 preceding and current row) - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 26. testGroupByHavingWithSWQAndAlias", - """ - |select p_mfgr, p_name, p_size, min(p_retailprice) as mi, - |rank() over(distribute by p_mfgr sort by p_name) as r, - |dense_rank() over(distribute by p_mfgr sort by p_name) as dr, - |p_size, p_size - lag(p_size,1,p_size) over(distribute by p_mfgr sort by p_name) as deltaSz - |from part - |group by p_mfgr, p_name, p_size - |having p_size > 0 - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 27. testMultipleRangeWindows", - """ - |select p_mfgr,p_name, p_size, - |sum(p_size) over (distribute by p_mfgr sort by p_size - |range between 10 preceding and current row) as s2, - |sum(p_size) over (distribute by p_mfgr sort by p_size - |range between current row and 10 following ) as s1 - |from part - |window w1 as (rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 28. testPartOrderInUDAFInvoke", - """ - |select p_mfgr, p_name, p_size, - |sum(p_size) over (partition by p_mfgr order by p_name - |rows between 2 preceding and 2 following) as s - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 29. testPartOrderInWdwDef", - """ - |select p_mfgr, p_name, p_size, - |sum(p_size) over w1 as s - |from part - |window w1 as (partition by p_mfgr order by p_name - | rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 30. testDefaultPartitioningSpecRules", - """ - |select p_mfgr, p_name, p_size, - |sum(p_size) over w1 as s, - |sum(p_size) over w2 as s2 - |from part - |window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following), - | w2 as (partition by p_mfgr order by p_name) - """.stripMargin, reset = false) - - /* p_name is not a numeric column. What is Hive's semantic? - createQueryTest("windowing.q -- 31. testWindowCrossReference", - """ - |select p_mfgr, p_name, p_size, - |sum(p_size) over w1 as s1, - |sum(p_size) over w2 as s2 - |from part - |window w1 as (partition by p_mfgr order by p_name - | range between 2 preceding and 2 following), - | w2 as w1 - """.stripMargin, reset = false) - */ - /* - createQueryTest("windowing.q -- 32. testWindowInheritance", - """ - |select p_mfgr, p_name, p_size, - |sum(p_size) over w1 as s1, - |sum(p_size) over w2 as s2 - |from part - |window w1 as (partition by p_mfgr order by p_name - | range between 2 preceding and 2 following), - | w2 as (w1 rows between unbounded preceding and current row) - """.stripMargin, reset = false) - */ - - /* p_name is not a numeric column. What is Hive's semantic? - createQueryTest("windowing.q -- 33. testWindowForwardReference", - """ - |select p_mfgr, p_name, p_size, - |sum(p_size) over w1 as s1, - |sum(p_size) over w2 as s2, - |sum(p_size) over w3 as s3 - |from part - |window w1 as (distribute by p_mfgr sort by p_name - | range between 2 preceding and 2 following), - | w2 as w3, - | w3 as (distribute by p_mfgr sort by p_name - | range between unbounded preceding and current row) - """.stripMargin, reset = false) - */ - /* - createQueryTest("windowing.q -- 34. testWindowDefinitionPropagation", - """ - |select p_mfgr, p_name, p_size, - |sum(p_size) over w1 as s1, - |sum(p_size) over w2 as s2, - |sum(p_size) over (w3 rows between 2 preceding and 2 following) as s3 - |from part - |window w1 as (distribute by p_mfgr sort by p_name - | range between 2 preceding and 2 following), - | w2 as w3, - | w3 as (distribute by p_mfgr sort by p_name - | range between unbounded preceding and current row) - """.stripMargin, reset = false) - */ - - /* Seems Hive evaluate SELECT DISTINCT before window functions? - createQueryTest("windowing.q -- 35. testDistinctWithWindowing", - """ - |select DISTINCT p_mfgr, p_name, p_size, - |sum(p_size) over w1 as s - |from part - |window w1 as (distribute by p_mfgr sort by p_name rows between 2 preceding and 2 following) - """.stripMargin, reset = false) - */ - - createQueryTest("windowing.q -- 36. testRankWithPartitioning", - """ - |select p_mfgr, p_name, p_size, - |rank() over (partition by p_mfgr order by p_name ) as r - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 37. testPartitioningVariousForms", - """ - |select p_mfgr, - |round(sum(p_retailprice) over (partition by p_mfgr order by p_mfgr),2) as s1, - |min(p_retailprice) over (partition by p_mfgr) as s2, - |max(p_retailprice) over (distribute by p_mfgr sort by p_mfgr) as s3, - |round(avg(p_retailprice) over (distribute by p_mfgr),2) as s4, - |count(p_retailprice) over (cluster by p_mfgr ) as s5 - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 38. testPartitioningVariousForms2", - """ - |select p_mfgr, p_name, p_size, - |sum(p_retailprice) over (partition by p_mfgr, p_name order by p_mfgr, p_name - |rows between unbounded preceding and current row) as s1, - |min(p_retailprice) over (distribute by p_mfgr, p_name sort by p_mfgr, p_name - |rows between unbounded preceding and current row) as s2, - |max(p_retailprice) over (partition by p_mfgr, p_name order by p_name) as s3 - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 39. testUDFOnOrderCols", - """ - |select p_mfgr, p_type, substr(p_type, 2) as short_ptype, - |rank() over (partition by p_mfgr order by substr(p_type, 2)) as r - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 40. testNoBetweenForRows", - """ - |select p_mfgr, p_name, p_size, - |sum(p_retailprice) over (distribute by p_mfgr sort by p_name rows unbounded preceding) as s1 - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 41. testNoBetweenForRange", - """ - |select p_mfgr, p_name, p_size, - |sum(p_retailprice) over (distribute by p_mfgr sort by p_size range unbounded preceding) as s1 - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 42. testUnboundedFollowingForRows", - """ - |select p_mfgr, p_name, p_size, - |sum(p_retailprice) over (distribute by p_mfgr sort by p_name - |rows between current row and unbounded following) as s1 - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 43. testUnboundedFollowingForRange", - """ - |select p_mfgr, p_name, p_size, - |sum(p_retailprice) over (distribute by p_mfgr sort by p_size - |range between current row and unbounded following) as s1 - |from part - """.stripMargin, reset = false) - - createQueryTest("windowing.q -- 44. testOverNoPartitionSingleAggregate", - """ - |select p_name, p_retailprice, - |round(avg(p_retailprice) over(),2) - |from part - |order by p_name - """.stripMargin, reset = false) -} - -class HiveWindowFunctionQueryWithoutCodeGenSuite extends HiveWindowFunctionQueryBaseSuite { - var originalCodegenEnabled: Boolean = _ - override def beforeAll(): Unit = { - super.beforeAll() - originalCodegenEnabled = conf.codegenEnabled - sql("set spark.sql.codegen=false") - } - - override def afterAll(): Unit = { - sql(s"set spark.sql.codegen=$originalCodegenEnabled") - super.afterAll() - } -} - -abstract class HiveWindowFunctionQueryFileBaseSuite - extends HiveCompatibilitySuite with BeforeAndAfter { - private val originalTimeZone = TimeZone.getDefault - private val originalLocale = Locale.getDefault - private val testTempDir = Utils.createTempDir() - - override def beforeAll() { - TestHive.cacheTables = true - // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) - TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) - // Add Locale setting - Locale.setDefault(Locale.US) - - // The following settings are used for generating golden files with Hive. - // We have to use kryo to correctly let Hive serialize plans with window functions. - // This is used to generate golden files. - sql("set hive.plan.serialization.format=kryo") - // Explicitly set fs to local fs. - sql(s"set fs.default.name=file://$testTempDir/") - // Ask Hive to run jobs in-process as a single map and reduce task. - sql("set mapred.job.tracker=local") - } - - override def afterAll() { - TestHive.cacheTables = false - TimeZone.setDefault(originalTimeZone) - Locale.setDefault(originalLocale) - TestHive.reset() - } - - override def blackList: Seq[String] = Seq( - // Partitioned table functions are not supported. - "ptf*", - // tests of windowing.q are in HiveWindowFunctionQueryBaseSuite - "windowing.q", - - // This one failed on the expression of - // sum(lag(p_retailprice,1,0.0)) over w1 - // lag(p_retailprice,1,0.0) is a GenericUDF and the argument inspector of - // p_retailprice created by HiveInspectors is - // PrimitiveObjectInspectorFactory.javaDoubleObjectInspector. - // However, seems Hive assumes it is - // PrimitiveObjectInspectorFactory.writableDoubleObjectInspector, which introduces an error. - "windowing_expressions", - - // Hive's results are not deterministic - "windowing_multipartitioning", - "windowing_navfn", - "windowing_ntile", - "windowing_udaf", - "windowing_windowspec", - "windowing_rank" - ) - - override def whiteList: Seq[String] = Seq( - "windowing_udaf2", - "windowing_columnPruning", - "windowing_adjust_rowcontainer_sz" - ) - - override def testCases: Seq[(String, File)] = super.testCases.filter { - case (name, _) => realWhiteList.contains(name) - } -} - -class HiveWindowFunctionQueryFileWithoutCodeGenSuite extends HiveWindowFunctionQueryFileBaseSuite { - var originalCodegenEnabled: Boolean = _ - override def beforeAll(): Unit = { - super.beforeAll() - originalCodegenEnabled = conf.codegenEnabled - sql("set spark.sql.codegen=false") - } - - override def afterAll(): Unit = { - sql(s"set spark.sql.codegen=$originalCodegenEnabled") - super.afterAll() - } -} -- cgit v1.2.3