aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShilei <shilei.qian@intel.com>2015-06-19 10:49:27 -0700
committerReynold Xin <rxin@databricks.com>2015-06-19 10:49:27 -0700
commit0c32fc125c45e59f06cb55f3ba7da612d840ca86 (patch)
treeb8c545da9dc49400a5008a045b11dfe8e58bbc35
parentfe08561e2ee13fc8f641db8b6e6c1499bdfd4d29 (diff)
downloadspark-0c32fc125c45e59f06cb55f3ba7da612d840ca86.tar.gz
spark-0c32fc125c45e59f06cb55f3ba7da612d840ca86.tar.bz2
spark-0c32fc125c45e59f06cb55f3ba7da612d840ca86.zip
[SPARK-8234][SQL] misc function: md5
Author: Shilei <shilei.qian@intel.com> Closes #6779 from qiansl127/MD5 and squashes the following commits: 11fcdb2 [Shilei] Fix the indent 04bd27b [Shilei] Add codegen da60eb3 [Shilei] Remove checkInputDataTypes function 9509ad0 [Shilei] Format code 12c61f4 [Shilei] Accept only BinaryType for Md5 1df0b5b [Shilei] format to scala type 60ccde1 [Shilei] Add more test case b8c73b4 [Shilei] Rewrite the type check for Md5 c166167 [Shilei] Add md5 function
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala3
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala50
-rw-r--r--sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala32
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/functions.scala21
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala11
5 files changed, 117 insertions, 0 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 79273a7840..5fb3369f85 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -133,6 +133,9 @@ object FunctionRegistry {
expression[ToDegrees]("degrees"),
expression[ToRadians]("radians"),
+ // misc functions
+ expression[Md5]("md5"),
+
// aggregate functions
expression[Average]("avg"),
expression[Count]("count"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
new file mode 100644
index 0000000000..4bee8cb728
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.commons.codec.digest.DigestUtils
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.types.{BinaryType, StringType, DataType}
+import org.apache.spark.unsafe.types.UTF8String
+
+/**
+ * A function that calculates an MD5 128-bit checksum and returns it as a hex string
+ * For input of type [[BinaryType]]
+ */
+case class Md5(child: Expression)
+ extends UnaryExpression with ExpectsInputTypes {
+
+ override def dataType: DataType = StringType
+
+ override def expectedChildTypes: Seq[DataType] = Seq(BinaryType)
+
+ override def eval(input: InternalRow): Any = {
+ val value = child.eval(input)
+ if (value == null) {
+ null
+ } else {
+ UTF8String.fromString(DigestUtils.md5Hex(value.asInstanceOf[Array[Byte]]))
+ }
+ }
+
+ override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+ defineCodeGen(ctx, ev, c =>
+ "org.apache.spark.unsafe.types.UTF8String.fromString" +
+ s"(org.apache.commons.codec.digest.DigestUtils.md5Hex($c))")
+ }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala
new file mode 100644
index 0000000000..48b84130b4
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.types.{StringType, BinaryType}
+
+class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+ test("md5") {
+ checkEvaluation(Md5(Literal("ABC".getBytes)), "902fbdd2b1df0c4f70b4a5d23525e932")
+ checkEvaluation(Md5(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)),
+ "6ac1e56bc78f031059be7be854522c4c")
+ checkEvaluation(Md5(Literal.create(null, BinaryType)), null)
+ }
+
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 40ae9f5df8..7e7a099a83 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -36,6 +36,7 @@ import org.apache.spark.util.Utils
* @groupname sort_funcs Sorting functions
* @groupname normal_funcs Non-aggregate functions
* @groupname math_funcs Math functions
+ * @groupname misc_funcs Misc functions
* @groupname window_funcs Window functions
* @groupname string_funcs String functions
* @groupname Ungrouped Support functions for DataFrames.
@@ -1377,6 +1378,26 @@ object functions {
def toRadians(columnName: String): Column = toRadians(Column(columnName))
//////////////////////////////////////////////////////////////////////////////////////////////
+ // Misc functions
+ //////////////////////////////////////////////////////////////////////////////////////////////
+
+ /**
+ * Calculates the MD5 digest and returns the value as a 32 character hex string.
+ *
+ * @group misc_funcs
+ * @since 1.5.0
+ */
+ def md5(e: Column): Column = Md5(e.expr)
+
+ /**
+ * Calculates the MD5 digest and returns the value as a 32 character hex string.
+ *
+ * @group misc_funcs
+ * @since 1.5.0
+ */
+ def md5(columnName: String): Column = md5(Column(columnName))
+
+ //////////////////////////////////////////////////////////////////////////////////////////////
// String functions
//////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 70819fe287..8b53b384a2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -133,6 +133,17 @@ class DataFrameFunctionsSuite extends QueryTest {
Row("x", "y", null))
}
+ test("misc md5 function") {
+ val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b")
+ checkAnswer(
+ df.select(md5($"a"), md5("b")),
+ Row("902fbdd2b1df0c4f70b4a5d23525e932", "6ac1e56bc78f031059be7be854522c4c"))
+
+ checkAnswer(
+ df.selectExpr("md5(a)", "md5(b)"),
+ Row("902fbdd2b1df0c4f70b4a5d23525e932", "6ac1e56bc78f031059be7be854522c4c"))
+ }
+
test("string length function") {
checkAnswer(
nullStrings.select(strlen($"s"), strlen("s")),