[SPARK-8227] [SQL] Add function unhex

cc chenghao-intel adrian-wang Author: zhichao.li <zhichao.li@intel.com> Closes #7113 from zhichao-li/unhex and squashes the following commits: 379356e [zhichao.li] remove exception checking a4ae6dc [zhichao.li] add udf_unhex to whitelist fe5c14a [zhichao.li] add todigit 607d7a3 [zhichao.li] use checkInputTypes bffd37f [zhichao.li] change to use Hex in apache common package cde73f5 [zhichao.li] update to use AutoCastInputTypes 11945c7 [zhichao.li] style c852d46 [zhichao.li] Add function unhex
author: zhichao.li <zhichao.li@intel.com> 2015-07-01 22:19:51 -0700
committer: Davies Liu <davies@databricks.com> 2015-07-01 22:19:51 -0700
commit: b285ac5ba85fe0b32b00726ad7d3a2efb602e885 (patch)
tree: dd4903378a6ff8c7543c2740782ddccb9c7801af /sql
parent: 4e4f74b5e1267d1ada4a8f57b86aee0d9c17d90a (diff)
download: spark-b285ac5ba85fe0b32b00726ad7d3a2efb602e885.tar.gz
spark-b285ac5ba85fe0b32b00726ad7d3a2efb602e885.tar.bz2
spark-b285ac5ba85fe0b32b00726ad7d3a2efb602e885.zip
6 files changed, 88 insertions, 0 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index d53eaedda5..6f04298d47 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -157,6 +157,7 @@ object FunctionRegistry {
     expression[Substring]("substr"),
     expression[Substring]("substring"),
     expression[Upper]("ucase"),
+    expression[UnHex]("unhex"),
     expression[Upper]("upper")
   )
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index b51318dd50..8633eb06ff 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -351,6 +351,58 @@ case class Pow(left: Expression, right: Expression)
   }
 }
 
+/**
+ * Performs the inverse operation of HEX.
+ * Resulting characters are returned as a byte array.
+ */
+case class UnHex(child: Expression) extends UnaryExpression with Serializable {
+
+  override def dataType: DataType = BinaryType
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (child.dataType.isInstanceOf[StringType] || child.dataType == NullType) {
+      TypeCheckResult.TypeCheckSuccess
+    } else {
+      TypeCheckResult.TypeCheckFailure(s"unHex accepts String type, not ${child.dataType}")
+    }
+  }
+
+  override def eval(input: InternalRow): Any = {
+    val num = child.eval(input)
+    if (num == null) {
+      null
+    } else {
+      unhex(num.asInstanceOf[UTF8String].getBytes)
+    }
+  }
+
+  private val unhexDigits = {
+    val array = Array.fill[Byte](128)(-1)
+    (0 to 9).foreach(i => array('0' + i) = i.toByte)
+    (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
+    (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
+    array
+  }
+
+  private def unhex(inputBytes: Array[Byte]): Array[Byte] = {
+    var bytes = inputBytes
+    if ((bytes.length & 0x01) != 0) {
+      bytes = '0'.toByte +: bytes
+    }
+    val out = new Array[Byte](bytes.length >> 1)
+    // two characters form the hex value.
+    var i = 0
+    while (i < bytes.length) {
+        val first = unhexDigits(bytes(i))
+        val second = unhexDigits(bytes(i + 1))
+        if (first == -1 || second == -1) { return null}
+        out(i / 2) = (((first << 4) | second) & 0xFF).toByte
+        i += 2
+    }
+    out
+  }
+}
+
 case class Hypot(left: Expression, right: Expression)
   extends BinaryMathExpression(math.hypot, "HYPOT")
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index b932d4ab85..b3345d7069 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -238,6 +238,12 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     // scalastyle:on
   }
 
+  test("unhex") {
+    checkEvaluation(UnHex(Literal("737472696E67")), "string".getBytes)
+    checkEvaluation(UnHex(Literal("")), new Array[Byte](0))
+    checkEvaluation(UnHex(Literal("0")), Array[Byte](0))
+  }
+
   test("hypot") {
     testBinary(Hypot, math.hypot)
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 4e8f3f96bf..e6f623bdf3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1054,6 +1054,24 @@ object functions {
   def hex(colName: String): Column = hex(Column(colName))
 
   /**
+   * Inverse of hex. Interprets each pair of characters as a hexadecimal number
+   * and converts to the byte representation of number.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def unhex(column: Column): Column = UnHex(column.expr)
+
+  /**
+   * Inverse of hex. Interprets each pair of characters as a hexadecimal number
+   * and converts to the byte representation of number.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def unhex(colName: String): Column = unhex(Column(colName))
+
+  /**
    * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
    *
    * @group math_funcs
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index d6331aa4ff..c03cde38d7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -225,6 +225,16 @@ class MathExpressionsSuite extends QueryTest {
     checkAnswer(data.selectExpr("hex(cast(d as binary))"), Seq(Row("68656C6C6F")))
   }
 
+  test("unhex") {
+    val data = Seq(("1C", "737472696E67")).toDF("a", "b")
+    checkAnswer(data.select(unhex('a)), Row(Array[Byte](28.toByte)))
+    checkAnswer(data.select(unhex('b)), Row("string".getBytes))
+    checkAnswer(data.selectExpr("unhex(a)"), Row(Array[Byte](28.toByte)))
+    checkAnswer(data.selectExpr("unhex(b)"), Row("string".getBytes))
+    checkAnswer(data.selectExpr("""unhex("##")"""), Row(null))
+    checkAnswer(data.selectExpr("""unhex("G123")"""), Row(null))
+  }
+
   test("hypot") {
     testTwoToOneMathFunction(hypot, hypot, math.hypot)
   }
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index f88e62763c..415a81644c 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -949,6 +949,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_trim",
     "udf_ucase",
     "udf_unix_timestamp",
+    "udf_unhex",
     "udf_upper",
     "udf_var_pop",
     "udf_var_samp",
author	zhichao.li <zhichao.li@intel.com>	2015-07-01 22:19:51 -0700
committer	Davies Liu <davies@databricks.com>	2015-07-01 22:19:51 -0700
commit	b285ac5ba85fe0b32b00726ad7d3a2efb602e885 (patch)
tree	dd4903378a6ff8c7543c2740782ddccb9c7801af /sql
parent	4e4f74b5e1267d1ada4a8f57b86aee0d9c17d90a (diff)
download	spark-b285ac5ba85fe0b32b00726ad7d3a2efb602e885.tar.gz spark-b285ac5ba85fe0b32b00726ad7d3a2efb602e885.tar.bz2 spark-b285ac5ba85fe0b32b00726ad7d3a2efb602e885.zip