aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorwindpiger <songjun@outlook.com>2017-02-13 12:25:13 +0100
committerHerman van Hovell <hvanhovell@databricks.com>2017-02-13 12:25:13 +0100
commit04ad822534e8ded96a9ba4b7d43320e53c6d2808 (patch)
tree82d0d202e0782587345738a1746b1cfae3327ce1
parent8f03ad547895b63825cff751265231b4fb75d660 (diff)
downloadspark-04ad822534e8ded96a9ba4b7d43320e53c6d2808.tar.gz
spark-04ad822534e8ded96a9ba4b7d43320e53c6d2808.tar.bz2
spark-04ad822534e8ded96a9ba4b7d43320e53c6d2808.zip
[SPARK-19496][SQL] to_date udf to return null when input date is invalid
## What changes were proposed in this pull request? Currently the udf `to_date` returns different values for invalid date inputs. ``` SELECT to_date('2015-07-22', 'yyyy-dd-MM') -> return `2016-10-07` SELECT to_date('2014-31-12') -> return null ``` As discussed in JIRA [SPARK-19496](https://issues.apache.org/jira/browse/SPARK-19496), we should return null in both situations when the input date is invalid. ## How was this patch tested? Unit tests added. Author: windpiger <songjun@outlook.com> Closes #16870 from windpiger/to_date.
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala4
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala75
2 files changed, 75 insertions, 4 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index af70efbb0a..9e1de0fd2f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -98,6 +98,10 @@ object DateTimeUtils {
def newDateFormat(formatString: String, timeZone: TimeZone): DateFormat = {
val sdf = new SimpleDateFormat(formatString, Locale.US)
sdf.setTimeZone(timeZone)
+ // Enable strict parsing, if the input date/format is invalid, it will throw an exception.
+ // e.g. to parse invalid date '2016-13-12', or '2016-01-12' with invalid format 'yyyy-aa-dd',
+ // an exception will be thrown.
+ sdf.setLenient(false)
sdf
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index 618db43446..2acda3f007 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -398,14 +398,27 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext {
Row(Date.valueOf("2014-12-31"))))
checkAnswer(
df.select(to_date(col("s"), "yyyy-MM-dd")),
- Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")),
- Row(Date.valueOf("2016-07-12"))))
+ Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null)))
// now switch format
checkAnswer(
df.select(to_date(col("s"), "yyyy-dd-MM")),
- Seq(Row(Date.valueOf("2016-10-07")), Row(Date.valueOf("2016-07-12")),
- Row(Date.valueOf("2014-12-31"))))
+ Seq(Row(null), Row(null), Row(Date.valueOf("2014-12-31"))))
+
+ // invalid format
+ checkAnswer(
+ df.select(to_date(col("s"), "yyyy-hh-MM")),
+ Seq(Row(null), Row(null), Row(null)))
+ checkAnswer(
+ df.select(to_date(col("s"), "yyyy-dd-aa")),
+ Seq(Row(null), Row(null), Row(null)))
+
+ // february
+ val x1 = "2016-02-29"
+ val x2 = "2017-02-29"
+ val df1 = Seq(x1, x2).toDF("x")
+ checkAnswer(
+ df1.select(to_date(col("x"))), Row(Date.valueOf("2016-02-29")) :: Row(null) :: Nil)
}
test("function trunc") {
@@ -477,6 +490,35 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext {
checkAnswer(df.selectExpr(s"unix_timestamp(s, '$fmt')"), Seq(
Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L)))
+ val x1 = "2015-07-24 10:00:00"
+ val x2 = "2015-25-07 02:02:02"
+ val x3 = "2015-07-24 25:02:02"
+ val x4 = "2015-24-07 26:02:02"
+ val ts3 = Timestamp.valueOf("2015-07-24 02:25:02")
+ val ts4 = Timestamp.valueOf("2015-07-24 00:10:00")
+
+ val df1 = Seq(x1, x2, x3, x4).toDF("x")
+ checkAnswer(df1.select(unix_timestamp(col("x"))), Seq(
+ Row(ts1.getTime / 1000L), Row(null), Row(null), Row(null)))
+ checkAnswer(df1.selectExpr("unix_timestamp(x)"), Seq(
+ Row(ts1.getTime / 1000L), Row(null), Row(null), Row(null)))
+ checkAnswer(df1.select(unix_timestamp(col("x"), "yyyy-dd-MM HH:mm:ss")), Seq(
+ Row(null), Row(ts2.getTime / 1000L), Row(null), Row(null)))
+ checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq(
+ Row(ts4.getTime / 1000L), Row(null), Row(ts3.getTime / 1000L), Row(null)))
+
+ // invalid format
+ checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd aa:HH:ss')"), Seq(
+ Row(null), Row(null), Row(null), Row(null)))
+
+ // february
+ val y1 = "2016-02-29"
+ val y2 = "2017-02-29"
+ val ts5 = Timestamp.valueOf("2016-02-29 00:00:00")
+ val df2 = Seq(y1, y2).toDF("y")
+ checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq(
+ Row(ts5.getTime / 1000L), Row(null)))
+
val now = sql("select unix_timestamp()").collect().head.getLong(0)
checkAnswer(sql(s"select cast ($now as timestamp)"), Row(new java.util.Date(now * 1000)))
}
@@ -500,6 +542,31 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext {
Row(date1.getTime / 1000L), Row(date2.getTime / 1000L)))
checkAnswer(df.selectExpr(s"to_unix_timestamp(s, '$fmt')"), Seq(
Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L)))
+
+ val x1 = "2015-07-24 10:00:00"
+ val x2 = "2015-25-07 02:02:02"
+ val x3 = "2015-07-24 25:02:02"
+ val x4 = "2015-24-07 26:02:02"
+ val ts3 = Timestamp.valueOf("2015-07-24 02:25:02")
+ val ts4 = Timestamp.valueOf("2015-07-24 00:10:00")
+
+ val df1 = Seq(x1, x2, x3, x4).toDF("x")
+ checkAnswer(df1.selectExpr("to_unix_timestamp(x)"), Seq(
+ Row(ts1.getTime / 1000L), Row(null), Row(null), Row(null)))
+ checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq(
+ Row(ts4.getTime / 1000L), Row(null), Row(ts3.getTime / 1000L), Row(null)))
+
+ // february
+ val y1 = "2016-02-29"
+ val y2 = "2017-02-29"
+ val ts5 = Timestamp.valueOf("2016-02-29 00:00:00")
+ val df2 = Seq(y1, y2).toDF("y")
+ checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq(
+ Row(ts5.getTime / 1000L), Row(null)))
+
+ // invalid format
+ checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd bb:HH:ss')"), Seq(
+ Row(null), Row(null), Row(null), Row(null)))
}