diff options
author | Cheng Hao <hao.cheng@intel.com> | 2015-07-03 23:45:21 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-07-03 23:45:21 -0700 |
commit | f35b0c3436898f22860d2c6c1d12f3a661005201 (patch) | |
tree | 2e2437cf66f651c62b521557c8fea4957f999c70 /sql/core/src | |
parent | f32487b7ca86f768336a7c9b173f7c610fcde86f (diff) | |
download | spark-f35b0c3436898f22860d2c6c1d12f3a661005201.tar.gz spark-f35b0c3436898f22860d2c6c1d12f3a661005201.tar.bz2 spark-f35b0c3436898f22860d2c6c1d12f3a661005201.zip |
[SPARK-8238][SPARK-8239][SPARK-8242][SPARK-8243][SPARK-8268][SQL]Add ascii/base64/unbase64/encode/decode functions
Add `ascii`,`base64`,`unbase64`,`encode` and `decode` expressions.
Author: Cheng Hao <hao.cheng@intel.com>
Closes #6843 from chenghao-intel/str_funcs2 and squashes the following commits:
78dee7d [Cheng Hao] base 64 -> base64
9d6f9f4 [Cheng Hao] remove the toString method for expressions
ed5c19c [Cheng Hao] update code as comments
96170fc [Cheng Hao] scalastyle issues
e2df768 [Cheng Hao] remove the unused import
491ce7b [Cheng Hao] add ascii/base64/unbase64/encode/decode functions
Diffstat (limited to 'sql/core/src')
-rw-r--r-- | sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 93 | ||||
-rw-r--r-- | sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala | 38 |
2 files changed, 131 insertions, 0 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 25e37ff67a..b63c6ee8ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1581,6 +1581,7 @@ object functions { /** * Computes the length of a given string value + * * @group string_funcs * @since 1.5.0 */ @@ -1588,11 +1589,103 @@ object functions { /** * Computes the length of a given string column + * * @group string_funcs * @since 1.5.0 */ def strlen(columnName: String): Column = strlen(Column(columnName)) + /** + * Computes the numeric value of the first character of the specified string value. + * + * @group string_funcs + * @since 1.5.0 + */ + def ascii(e: Column): Column = Ascii(e.expr) + + /** + * Computes the numeric value of the first character of the specified string column. + * + * @group string_funcs + * @since 1.5.0 + */ + def ascii(columnName: String): Column = ascii(Column(columnName)) + + /** + * Computes the specified value from binary to a base64 string. + * + * @group string_funcs + * @since 1.5.0 + */ + def base64(e: Column): Column = Base64(e.expr) + + /** + * Computes the specified column from binary to a base64 string. + * + * @group string_funcs + * @since 1.5.0 + */ + def base64(columnName: String): Column = base64(Column(columnName)) + + /** + * Computes the specified value from a base64 string to binary. + * + * @group string_funcs + * @since 1.5.0 + */ + def unbase64(e: Column): Column = UnBase64(e.expr) + + /** + * Computes the specified column from a base64 string to binary. + * + * @group string_funcs + * @since 1.5.0 + */ + def unbase64(columnName: String): Column = unbase64(Column(columnName)) + + /** + * Computes the first argument into a binary from a string using the provided character set + * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). + * If either argument is null, the result will also be null. + * + * @group string_funcs + * @since 1.5.0 + */ + def encode(value: Column, charset: Column): Column = Encode(value.expr, charset.expr) + + /** + * Computes the first argument into a binary from a string using the provided character set + * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). + * If either argument is null, the result will also be null. + * + * @group string_funcs + * @since 1.5.0 + */ + def encode(columnName: String, charsetColumnName: String): Column = + encode(Column(columnName), Column(charsetColumnName)) + + /** + * Computes the first argument into a string from a binary using the provided character set + * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). + * If either argument is null, the result will also be null. + * + * @group string_funcs + * @since 1.5.0 + */ + def decode(value: Column, charset: Column): Column = Decode(value.expr, charset.expr) + + /** + * Computes the first argument into a string from a binary using the provided character set + * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). + * If either argument is null, the result will also be null. + * + * @group string_funcs + * @since 1.5.0 + */ + def decode(columnName: String, charsetColumnName: String): Column = + decode(Column(columnName), Column(charsetColumnName)) + + ////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 0d43aca877..bd9fa400e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -225,4 +225,42 @@ class DataFrameFunctionsSuite extends QueryTest { Row(l) }) } + + test("string ascii function") { + val df = Seq(("abc", "")).toDF("a", "b") + checkAnswer( + df.select(ascii($"a"), ascii("b")), + Row(97, 0)) + + checkAnswer( + df.selectExpr("ascii(a)", "ascii(b)"), + Row(97, 0)) + } + + test("string base64/unbase64 function") { + val bytes = Array[Byte](1, 2, 3, 4) + val df = Seq((bytes, "AQIDBA==")).toDF("a", "b") + checkAnswer( + df.select(base64("a"), base64($"a"), unbase64("b"), unbase64($"b")), + Row("AQIDBA==", "AQIDBA==", bytes, bytes)) + + checkAnswer( + df.selectExpr("base64(a)", "unbase64(b)"), + Row("AQIDBA==", bytes)) + } + + test("string encode/decode function") { + val bytes = Array[Byte](-27, -92, -89, -27, -115, -125, -28, -72, -106, -25, -107, -116) + // scalastyle:off + // non ascii characters are not allowed in the code, so we disable the scalastyle here. + val df = Seq(("大千世界", "utf-8", bytes)).toDF("a", "b", "c") + checkAnswer( + df.select(encode($"a", $"b"), encode("a", "b"), decode($"c", $"b"), decode("c", "b")), + Row(bytes, bytes, "大千世界", "大千世界")) + + checkAnswer( + df.selectExpr("encode(a, b)", "decode(c, b)"), + Row(bytes, "大千世界")) + // scalastyle:on + } } |