diff options
author | anabranch <wac.chambers@gmail.com> | 2017-02-07 15:50:30 +0100 |
---|---|---|
committer | Herman van Hovell <hvanhovell@databricks.com> | 2017-02-07 15:50:30 +0100 |
commit | 7a7ce272fe9a703f58b0180a9d2001ecb5c4b8db (patch) | |
tree | 4a2533e6d7dcc2e38d62db64f87634a15f3e2439 /sql | |
parent | 6ed285c68fee451c45db7b01ca8ec1dea2efd479 (diff) | |
download | spark-7a7ce272fe9a703f58b0180a9d2001ecb5c4b8db.tar.gz spark-7a7ce272fe9a703f58b0180a9d2001ecb5c4b8db.tar.bz2 spark-7a7ce272fe9a703f58b0180a9d2001ecb5c4b8db.zip |
[SPARK-16609] Add to_date/to_timestamp with format functions
## What changes were proposed in this pull request?
This pull request adds two new user facing functions:
- `to_date` which accepts an expression and a format and returns a date.
- `to_timestamp` which accepts an expression and a format and returns a timestamp.
For example, given a date string `2016-21-05` in the format `yyyy-dd-MM`:
### Date Function
*Previously*
```
to_date(unix_timestamp(lit("2016-21-05"), "yyyy-dd-MM").cast("timestamp"))
```
*Current*
```
to_date(lit("2016-21-05"), "yyyy-dd-MM")
```
### Timestamp Function
*Previously*
```
unix_timestamp(lit("2016-21-05"), "yyyy-dd-MM").cast("timestamp")
```
*Current*
```
to_timestamp(lit("2016-21-05"), "yyyy-dd-MM")
```
### Tasks
- [X] Add `to_date` to Scala Functions
- [x] Add `to_date` to Python Functions
- [x] Add `to_date` to SQL Functions
- [X] Add `to_timestamp` to Scala Functions
- [x] Add `to_timestamp` to Python Functions
- [x] Add `to_timestamp` to SQL Functions
- [x] Add function to R
## How was this patch tested?
- [x] Add Functions to `DateFunctionsSuite`
- Test new `ParseToTimestamp` Expression (*not necessary*)
- Test new `ParseToDate` Expression (*not necessary*)
- [x] Add test for R
- [x] Add test for Python in test.py
Please review http://spark.apache.org/contributing.html before opening a pull request.
Author: anabranch <wac.chambers@gmail.com>
Author: Bill Chambers <bill@databricks.com>
Author: anabranch <bill@databricks.com>
Closes #16138 from anabranch/SPARK-16609.
Diffstat (limited to 'sql')
5 files changed, 161 insertions, 8 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index eea3740be8..9c9465f6b8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -345,7 +345,8 @@ object FunctionRegistry { expression[CurrentTimestamp]("now"), expression[Quarter]("quarter"), expression[Second]("second"), - expression[ToDate]("to_date"), + expression[ParseToTimestamp]("to_timestamp"), + expression[ParseToDate]("to_date"), expression[ToUnixTimestamp]("to_unix_timestamp"), expression[ToUTCTimestamp]("to_utc_timestamp"), expression[TruncDate]("trunc"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index bad8a71230..f8fe774823 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -24,6 +24,7 @@ import java.util.{Calendar, TimeZone} import scala.util.control.NonFatal import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodegenFallback, ExprCode} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ @@ -1170,6 +1171,69 @@ case class ToDate(child: Expression) extends UnaryExpression with ImplicitCastIn } /** + * Parses a column to a date based on the given format. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(date_str, fmt) - Parses the `left` expression with the `fmt` expression. 
Returns null with invalid input.", + extended = """ + Examples: + > SELECT _FUNC_('2016-12-31', 'yyyy-MM-dd'); + 2016-12-31 + """) +// scalastyle:on line.size.limit +case class ParseToDate(left: Expression, format: Option[Expression], child: Expression) + extends RuntimeReplaceable { + + def this(left: Expression, format: Expression) { + this(left, Option(format), + Cast(Cast(UnixTimestamp(left, format), TimestampType), DateType)) + } + + def this(left: Expression) = { + // backwards compatability + this(left, Option(null), ToDate(left)) + } + + override def flatArguments: Iterator[Any] = Iterator(left, format) + override def sql: String = { + if (format.isDefined) { + s"$prettyName(${left.sql}, ${format.get.sql}" + } else { + s"$prettyName(${left.sql})" + } + } + + override def prettyName: String = "to_date" +} + +/** + * Parses a column to a timestamp based on the supplied format. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(timestamp, fmt) - Parses the `left` expression with the `format` expression to a timestamp. Returns null with invalid input.", + extended = """ + Examples: + > SELECT _FUNC_('2016-12-31', 'yyyy-MM-dd'); + 2016-12-31 00:00:00.0 + """) +// scalastyle:on line.size.limit +case class ParseToTimestamp(left: Expression, format: Expression, child: Expression) + extends RuntimeReplaceable { + + def this(left: Expression, format: Expression) = { + this(left, format, Cast(UnixTimestamp(left, format), TimestampType)) +} + + override def flatArguments: Iterator[Any] = Iterator(left, format) + override def sql: String = s"$prettyName(${left.sql}, ${format.sql})" + + override def prettyName: String = "to_timestamp" + override def dataType: DataType = TimestampType +} + +/** * Returns date truncated to the unit specified by the format. 
*/ // scalastyle:off line.size.limit diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 5e27484c11..24ed906d33 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2665,6 +2665,27 @@ object functions { def unix_timestamp(s: Column, p: String): Column = withExpr {UnixTimestamp(s.expr, Literal(p)) } /** + * Convert time string to a Unix timestamp (in seconds). + * Uses the pattern "yyyy-MM-dd HH:mm:ss" and will return null on failure. + * @group datetime_funcs + * @since 2.2.0 + */ + def to_timestamp(s: Column): Column = withExpr { + new ParseToTimestamp(s.expr, Literal("yyyy-MM-dd HH:mm:ss")) + } + + /** + * Convert time string to a Unix timestamp (in seconds) with a specified format + * (see [http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html]) + * to Unix timestamp (in seconds), return null if fail. + * @group datetime_funcs + * @since 2.2.0 + */ + def to_timestamp(s: Column, fmt: String): Column = withExpr { + new ParseToTimestamp(s.expr, Literal(fmt)) + } + + /** * Converts the column into DateType. * * @group datetime_funcs @@ -2673,6 +2694,18 @@ object functions { def to_date(e: Column): Column = withExpr { ToDate(e.expr) } /** + * Converts the column into a DateType with a specified format + * (see [http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html]) + * return null if fail. + * + * @group datetime_funcs + * @since 2.2.0 + */ + def to_date(e: Column, fmt: String): Column = withExpr { + new ParseToDate(e.expr, Literal(fmt)) + } + + /** * Returns date truncated to the unit specified by the format. 
* * @param format: 'year', 'yyyy', 'yy' for truncate by year, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index e05b2252ee..618db43446 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -354,31 +354,58 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { test("function to_date") { val d1 = Date.valueOf("2015-07-22") val d2 = Date.valueOf("2015-07-01") + val d3 = Date.valueOf("2014-12-31") val t1 = Timestamp.valueOf("2015-07-22 10:00:00") val t2 = Timestamp.valueOf("2014-12-31 23:59:59") + val t3 = Timestamp.valueOf("2014-12-31 23:59:59") val s1 = "2015-07-22 10:00:00" val s2 = "2014-12-31" - val df = Seq((d1, t1, s1), (d2, t2, s2)).toDF("d", "t", "s") + val s3 = "2014-31-12" + val df = Seq((d1, t1, s1), (d2, t2, s2), (d3, t3, s3)).toDF("d", "t", "s") checkAnswer( df.select(to_date(col("t"))), - Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")))) + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) checkAnswer( df.select(to_date(col("d"))), - Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01")))) + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) checkAnswer( df.select(to_date(col("s"))), - Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")))) + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null))) checkAnswer( df.selectExpr("to_date(t)"), - Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")))) + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) checkAnswer( df.selectExpr("to_date(d)"), - Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01")))) + 
Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) checkAnswer( df.selectExpr("to_date(s)"), - Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")))) + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null))) + + // Now with format + checkAnswer( + df.select(to_date(col("t"), "yyyy-MM-dd")), + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("d"), "yyyy-MM-dd")), + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("s"), "yyyy-MM-dd")), + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2016-07-12")))) + + // now switch format + checkAnswer( + df.select(to_date(col("s"), "yyyy-dd-MM")), + Seq(Row(Date.valueOf("2016-10-07")), Row(Date.valueOf("2016-07-12")), + Row(Date.valueOf("2014-12-31")))) } test("function trunc") { @@ -475,6 +502,33 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L))) } + + test("to_timestamp") { + val date1 = Date.valueOf("2015-07-24") + val date2 = Date.valueOf("2015-07-25") + val ts_date1 = Timestamp.valueOf("2015-07-24 00:00:00") + val ts_date2 = Timestamp.valueOf("2015-07-25 00:00:00") + val ts1 = Timestamp.valueOf("2015-07-24 10:00:00") + val ts2 = Timestamp.valueOf("2015-07-25 02:02:02") + val s1 = "2015/07/24 10:00:00.5" + val s2 = "2015/07/25 02:02:02.6" + val ss1 = "2015-07-24 10:00:00" + val ss2 = "2015-07-25 02:02:02" + val fmt = "yyyy/MM/dd HH:mm:ss.S" + val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") + + checkAnswer(df.select(to_timestamp(col("ss"))), + df.select(unix_timestamp(col("ss")).cast("timestamp"))) + checkAnswer(df.select(to_timestamp(col("ss"))), Seq( + Row(ts1), Row(ts2))) + 
checkAnswer(df.select(to_timestamp(col("s"), fmt)), Seq( + Row(ts1), Row(ts2))) + checkAnswer(df.select(to_timestamp(col("ts"), fmt)), Seq( + Row(ts1), Row(ts2))) + checkAnswer(df.select(to_timestamp(col("d"), "yyyy-MM-dd")), Seq( + Row(ts_date1), Row(ts_date2))) + } + test("datediff") { val df = Seq( (Date.valueOf("2015-07-24"), Timestamp.valueOf("2015-07-24 01:00:00"), diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala index df9390aec7..1daa6f7822 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala @@ -253,6 +253,7 @@ class ExpressionToSQLSuite extends SQLBuilderTest with SQLTestUtils { checkSqlGeneration("SELECT count(now())") checkSqlGeneration("SELECT quarter('2001-05-02')") checkSqlGeneration("SELECT second('11:35:55')") + checkSqlGeneration("SELECT to_timestamp('2001-10-30 10:30:00', 'yyyy-MM-dd HH:mm:ss')") checkSqlGeneration("SELECT to_date('2001-10-30 10:30:00')") checkSqlGeneration("SELECT to_unix_timestamp('2015-07-24 00:00:00', 'yyyy-MM-dd HH:mm:ss')") checkSqlGeneration("SELECT to_utc_timestamp('2015-07-24 00:00:00', 'PST')") |