From 04ad822534e8ded96a9ba4b7d43320e53c6d2808 Mon Sep 17 00:00:00 2001 From: windpiger Date: Mon, 13 Feb 2017 12:25:13 +0100 Subject: [SPARK-19496][SQL] to_date udf to return null when input date is invalid ## What changes were proposed in this pull request? Currently the udf `to_date` has different return value with an invalid date input. ``` SELECT to_date('2015-07-22', 'yyyy-dd-MM') -> return `2016-10-07` SELECT to_date('2014-31-12') -> return null ``` As discussed in JIRA [SPARK-19496](https://issues.apache.org/jira/browse/SPARK-19496), we should return null in both situations when the input date is invalid ## How was this patch tested? unit test added Author: windpiger Closes #16870 from windpiger/to_date. --- .../spark/sql/catalyst/util/DateTimeUtils.scala | 4 ++ .../org/apache/spark/sql/DateFunctionsSuite.scala | 75 ++++++++++++++++++++-- 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index af70efbb0a..9e1de0fd2f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -98,6 +98,10 @@ object DateTimeUtils { def newDateFormat(formatString: String, timeZone: TimeZone): DateFormat = { val sdf = new SimpleDateFormat(formatString, Locale.US) sdf.setTimeZone(timeZone) + // Enable strict parsing, if the input date/format is invalid, it will throw an exception. + // e.g. to parse invalid date '2016-13-12', or '2016-01-12' with invalid format 'yyyy-aa-dd', + // an exception will be throwed. + sdf.setLenient(false) sdf } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index 618db43446..2acda3f007 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -398,14 +398,27 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { Row(Date.valueOf("2014-12-31")))) checkAnswer( df.select(to_date(col("s"), "yyyy-MM-dd")), - Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), - Row(Date.valueOf("2016-07-12")))) + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null))) // now switch format checkAnswer( df.select(to_date(col("s"), "yyyy-dd-MM")), - Seq(Row(Date.valueOf("2016-10-07")), Row(Date.valueOf("2016-07-12")), - Row(Date.valueOf("2014-12-31")))) + Seq(Row(null), Row(null), Row(Date.valueOf("2014-12-31")))) + + // invalid format + checkAnswer( + df.select(to_date(col("s"), "yyyy-hh-MM")), + Seq(Row(null), Row(null), Row(null))) + checkAnswer( + df.select(to_date(col("s"), "yyyy-dd-aa")), + Seq(Row(null), Row(null), Row(null))) + + // february + val x1 = "2016-02-29" + val x2 = "2017-02-29" + val df1 = Seq(x1, x2).toDF("x") + checkAnswer( + df1.select(to_date(col("x"))), Row(Date.valueOf("2016-02-29")) :: Row(null) :: Nil) } test("function trunc") { @@ -477,6 +490,35 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { checkAnswer(df.selectExpr(s"unix_timestamp(s, '$fmt')"), Seq( Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L))) + val x1 = "2015-07-24 10:00:00" + val x2 = "2015-25-07 02:02:02" + val x3 = "2015-07-24 25:02:02" + val x4 = "2015-24-07 26:02:02" + val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") + val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") + + val df1 = Seq(x1, x2, x3, x4).toDF("x") + checkAnswer(df1.select(unix_timestamp(col("x"))), Seq( + Row(ts1.getTime / 1000L), Row(null), Row(null), Row(null))) + checkAnswer(df1.selectExpr("unix_timestamp(x)"), Seq( + Row(ts1.getTime / 1000L), Row(null), Row(null), Row(null))) + checkAnswer(df1.select(unix_timestamp(col("x"), "yyyy-dd-MM HH:mm:ss")), Seq( + Row(null), Row(ts2.getTime / 1000L), Row(null), Row(null))) + checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( + Row(ts4.getTime / 1000L), Row(null), Row(ts3.getTime / 1000L), Row(null))) + + // invalid format + checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd aa:HH:ss')"), Seq( + Row(null), Row(null), Row(null), Row(null))) + + // february + val y1 = "2016-02-29" + val y2 = "2017-02-29" + val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") + val df2 = Seq(y1, y2).toDF("y") + checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( + Row(ts5.getTime / 1000L), Row(null))) + val now = sql("select unix_timestamp()").collect().head.getLong(0) checkAnswer(sql(s"select cast ($now as timestamp)"), Row(new java.util.Date(now * 1000))) } @@ -500,6 +542,31 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { Row(date1.getTime / 1000L), Row(date2.getTime / 1000L))) checkAnswer(df.selectExpr(s"to_unix_timestamp(s, '$fmt')"), Seq( Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L))) + + val x1 = "2015-07-24 10:00:00" + val x2 = "2015-25-07 02:02:02" + val x3 = "2015-07-24 25:02:02" + val x4 = "2015-24-07 26:02:02" + val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") + val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") + + val df1 = Seq(x1, x2, x3, x4).toDF("x") + checkAnswer(df1.selectExpr("to_unix_timestamp(x)"), Seq( + Row(ts1.getTime / 1000L), Row(null), Row(null), Row(null))) + checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( + Row(ts4.getTime / 1000L), Row(null), Row(ts3.getTime / 1000L), Row(null))) + + // february + val y1 = "2016-02-29" + val y2 = "2017-02-29" + val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") + val df2 = Seq(y1, y2).toDF("y") + checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( + Row(ts5.getTime / 1000L), Row(null))) + + // invalid format + checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd bb:HH:ss')"), Seq( + Row(null), Row(null), Row(null), Row(null))) } -- cgit v1.2.3