aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authoranabranch <wac.chambers@gmail.com>2017-02-07 15:50:30 +0100
committerHerman van Hovell <hvanhovell@databricks.com>2017-02-07 15:50:30 +0100
commit7a7ce272fe9a703f58b0180a9d2001ecb5c4b8db (patch)
tree4a2533e6d7dcc2e38d62db64f87634a15f3e2439 /sql
parent6ed285c68fee451c45db7b01ca8ec1dea2efd479 (diff)
downloadspark-7a7ce272fe9a703f58b0180a9d2001ecb5c4b8db.tar.gz
spark-7a7ce272fe9a703f58b0180a9d2001ecb5c4b8db.tar.bz2
spark-7a7ce272fe9a703f58b0180a9d2001ecb5c4b8db.zip
[SPARK-16609] Add to_date/to_timestamp with format functions
## What changes were proposed in this pull request? This pull request adds two new user-facing functions: - `to_date` which accepts an expression and a format and returns a date. - `to_timestamp` which accepts an expression and a format and returns a timestamp. For example, given a date in the format `2016-21-05` (yyyy-dd-MM): ### Date Function *Previously* ``` to_date(unix_timestamp(lit("2016-21-05"), "yyyy-dd-MM").cast("timestamp")) ``` *Current* ``` to_date(lit("2016-21-05"), "yyyy-dd-MM") ``` ### Timestamp Function *Previously* ``` unix_timestamp(lit("2016-21-05"), "yyyy-dd-MM").cast("timestamp") ``` *Current* ``` to_timestamp(lit("2016-21-05"), "yyyy-dd-MM") ``` ### Tasks - [X] Add `to_date` to Scala Functions - [x] Add `to_date` to Python Functions - [x] Add `to_date` to SQL Functions - [X] Add `to_timestamp` to Scala Functions - [x] Add `to_timestamp` to Python Functions - [x] Add `to_timestamp` to SQL Functions - [x] Add function to R ## How was this patch tested? - [x] Add Functions to `DateFunctionsSuite` - Test new `ParseToTimestamp` Expression (*not necessary*) - Test new `ParseToDate` Expression (*not necessary*) - [x] Add test for R - [x] Add test for Python in test.py Please review http://spark.apache.org/contributing.html before opening a pull request. Author: anabranch <wac.chambers@gmail.com> Author: Bill Chambers <bill@databricks.com> Author: anabranch <bill@databricks.com> Closes #16138 from anabranch/SPARK-16609.
Diffstat (limited to 'sql')
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala3
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala64
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/functions.scala33
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala68
-rw-r--r--sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala1
5 files changed, 161 insertions, 8 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index eea3740be8..9c9465f6b8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -345,7 +345,8 @@ object FunctionRegistry {
expression[CurrentTimestamp]("now"),
expression[Quarter]("quarter"),
expression[Second]("second"),
- expression[ToDate]("to_date"),
+ expression[ParseToTimestamp]("to_timestamp"),
+ expression[ParseToDate]("to_date"),
expression[ToUnixTimestamp]("to_unix_timestamp"),
expression[ToUTCTimestamp]("to_utc_timestamp"),
expression[TruncDate]("trunc"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
index bad8a71230..f8fe774823 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
@@ -24,6 +24,7 @@ import java.util.{Calendar, TimeZone}
import scala.util.control.NonFatal
import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodegenFallback, ExprCode}
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._
@@ -1170,6 +1171,69 @@ case class ToDate(child: Expression) extends UnaryExpression with ImplicitCastIn
}
/**
+ * Parses a column to a date based on the given format.
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = "_FUNC_(date_str, fmt) - Parses the `left` expression with the `fmt` expression. Returns null with invalid input.",
+ extended = """
+ Examples:
+ > SELECT _FUNC_('2016-12-31', 'yyyy-MM-dd');
+ 2016-12-31
+ """)
+// scalastyle:on line.size.limit
+case class ParseToDate(left: Expression, format: Option[Expression], child: Expression)
+ extends RuntimeReplaceable {
+
+ def this(left: Expression, format: Expression) {
+ this(left, Option(format),
+ Cast(Cast(UnixTimestamp(left, format), TimestampType), DateType))
+ }
+
+ def this(left: Expression) = {
+ // backwards compatibility
+ this(left, Option(null), ToDate(left))
+ }
+
+ override def flatArguments: Iterator[Any] = Iterator(left, format)
+ override def sql: String = {
+ if (format.isDefined) {
+ s"$prettyName(${left.sql}, ${format.get.sql}"
+ } else {
+ s"$prettyName(${left.sql})"
+ }
+ }
+
+ override def prettyName: String = "to_date"
+}
+
+/**
+ * Parses a column to a timestamp based on the supplied format.
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = "_FUNC_(timestamp, fmt) - Parses the `left` expression with the `format` expression to a timestamp. Returns null with invalid input.",
+ extended = """
+ Examples:
+ > SELECT _FUNC_('2016-12-31', 'yyyy-MM-dd');
+ 2016-12-31 00:00:00.0
+ """)
+// scalastyle:on line.size.limit
+case class ParseToTimestamp(left: Expression, format: Expression, child: Expression)
+ extends RuntimeReplaceable {
+
+ def this(left: Expression, format: Expression) = {
+ this(left, format, Cast(UnixTimestamp(left, format), TimestampType))
+}
+
+ override def flatArguments: Iterator[Any] = Iterator(left, format)
+ override def sql: String = s"$prettyName(${left.sql}, ${format.sql})"
+
+ override def prettyName: String = "to_timestamp"
+ override def dataType: DataType = TimestampType
+}
+
+/**
* Returns date truncated to the unit specified by the format.
*/
// scalastyle:off line.size.limit
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 5e27484c11..24ed906d33 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2665,6 +2665,27 @@ object functions {
def unix_timestamp(s: Column, p: String): Column = withExpr {UnixTimestamp(s.expr, Literal(p)) }
/**
+ * Convert time string to a Unix timestamp (in seconds).
+ * Uses the pattern "yyyy-MM-dd HH:mm:ss" and will return null on failure.
+ * @group datetime_funcs
+ * @since 2.2.0
+ */
+ def to_timestamp(s: Column): Column = withExpr {
+ new ParseToTimestamp(s.expr, Literal("yyyy-MM-dd HH:mm:ss"))
+ }
+
+ /**
+ * Convert time string to a Unix timestamp (in seconds) with a specified format
+ * (see [http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html])
+ * to Unix timestamp (in seconds), return null if fail.
+ * @group datetime_funcs
+ * @since 2.2.0
+ */
+ def to_timestamp(s: Column, fmt: String): Column = withExpr {
+ new ParseToTimestamp(s.expr, Literal(fmt))
+ }
+
+ /**
* Converts the column into DateType.
*
* @group datetime_funcs
@@ -2673,6 +2694,18 @@ object functions {
def to_date(e: Column): Column = withExpr { ToDate(e.expr) }
/**
+ * Converts the column into a DateType with a specified format
+ * (see [http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html])
+ * return null if fail.
+ *
+ * @group datetime_funcs
+ * @since 2.2.0
+ */
+ def to_date(e: Column, fmt: String): Column = withExpr {
+ new ParseToDate(e.expr, Literal(fmt))
+ }
+
+ /**
* Returns date truncated to the unit specified by the format.
*
* @param format: 'year', 'yyyy', 'yy' for truncate by year,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index e05b2252ee..618db43446 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -354,31 +354,58 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext {
test("function to_date") {
val d1 = Date.valueOf("2015-07-22")
val d2 = Date.valueOf("2015-07-01")
+ val d3 = Date.valueOf("2014-12-31")
val t1 = Timestamp.valueOf("2015-07-22 10:00:00")
val t2 = Timestamp.valueOf("2014-12-31 23:59:59")
+ val t3 = Timestamp.valueOf("2014-12-31 23:59:59")
val s1 = "2015-07-22 10:00:00"
val s2 = "2014-12-31"
- val df = Seq((d1, t1, s1), (d2, t2, s2)).toDF("d", "t", "s")
+ val s3 = "2014-31-12"
+ val df = Seq((d1, t1, s1), (d2, t2, s2), (d3, t3, s3)).toDF("d", "t", "s")
checkAnswer(
df.select(to_date(col("t"))),
- Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31"))))
+ Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")),
+ Row(Date.valueOf("2014-12-31"))))
checkAnswer(
df.select(to_date(col("d"))),
- Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01"))))
+ Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01")),
+ Row(Date.valueOf("2014-12-31"))))
checkAnswer(
df.select(to_date(col("s"))),
- Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31"))))
+ Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null)))
checkAnswer(
df.selectExpr("to_date(t)"),
- Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31"))))
+ Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")),
+ Row(Date.valueOf("2014-12-31"))))
checkAnswer(
df.selectExpr("to_date(d)"),
- Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01"))))
+ Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01")),
+ Row(Date.valueOf("2014-12-31"))))
checkAnswer(
df.selectExpr("to_date(s)"),
- Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31"))))
+ Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null)))
+
+ // Now with format
+ checkAnswer(
+ df.select(to_date(col("t"), "yyyy-MM-dd")),
+ Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")),
+ Row(Date.valueOf("2014-12-31"))))
+ checkAnswer(
+ df.select(to_date(col("d"), "yyyy-MM-dd")),
+ Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01")),
+ Row(Date.valueOf("2014-12-31"))))
+ checkAnswer(
+ df.select(to_date(col("s"), "yyyy-MM-dd")),
+ Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")),
+ Row(Date.valueOf("2016-07-12"))))
+
+ // now switch format
+ checkAnswer(
+ df.select(to_date(col("s"), "yyyy-dd-MM")),
+ Seq(Row(Date.valueOf("2016-10-07")), Row(Date.valueOf("2016-07-12")),
+ Row(Date.valueOf("2014-12-31"))))
}
test("function trunc") {
@@ -475,6 +502,33 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext {
Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L)))
}
+
+ test("to_timestamp") {
+ val date1 = Date.valueOf("2015-07-24")
+ val date2 = Date.valueOf("2015-07-25")
+ val ts_date1 = Timestamp.valueOf("2015-07-24 00:00:00")
+ val ts_date2 = Timestamp.valueOf("2015-07-25 00:00:00")
+ val ts1 = Timestamp.valueOf("2015-07-24 10:00:00")
+ val ts2 = Timestamp.valueOf("2015-07-25 02:02:02")
+ val s1 = "2015/07/24 10:00:00.5"
+ val s2 = "2015/07/25 02:02:02.6"
+ val ss1 = "2015-07-24 10:00:00"
+ val ss2 = "2015-07-25 02:02:02"
+ val fmt = "yyyy/MM/dd HH:mm:ss.S"
+ val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss")
+
+ checkAnswer(df.select(to_timestamp(col("ss"))),
+ df.select(unix_timestamp(col("ss")).cast("timestamp")))
+ checkAnswer(df.select(to_timestamp(col("ss"))), Seq(
+ Row(ts1), Row(ts2)))
+ checkAnswer(df.select(to_timestamp(col("s"), fmt)), Seq(
+ Row(ts1), Row(ts2)))
+ checkAnswer(df.select(to_timestamp(col("ts"), fmt)), Seq(
+ Row(ts1), Row(ts2)))
+ checkAnswer(df.select(to_timestamp(col("d"), "yyyy-MM-dd")), Seq(
+ Row(ts_date1), Row(ts_date2)))
+ }
+
test("datediff") {
val df = Seq(
(Date.valueOf("2015-07-24"), Timestamp.valueOf("2015-07-24 01:00:00"),
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala
index df9390aec7..1daa6f7822 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionToSQLSuite.scala
@@ -253,6 +253,7 @@ class ExpressionToSQLSuite extends SQLBuilderTest with SQLTestUtils {
checkSqlGeneration("SELECT count(now())")
checkSqlGeneration("SELECT quarter('2001-05-02')")
checkSqlGeneration("SELECT second('11:35:55')")
+ checkSqlGeneration("SELECT to_timestamp('2001-10-30 10:30:00', 'yyyy-MM-dd HH:mm:ss')")
checkSqlGeneration("SELECT to_date('2001-10-30 10:30:00')")
checkSqlGeneration("SELECT to_unix_timestamp('2015-07-24 00:00:00', 'yyyy-MM-dd HH:mm:ss')")
checkSqlGeneration("SELECT to_utc_timestamp('2015-07-24 00:00:00', 'PST')")