aboutsummaryrefslogtreecommitdiff
path: root/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
diff options
context:
space:
mode:
Diffstat (limited to 'sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala')
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala117
1 files changed, 112 insertions, 5 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 3ee19cc4ad..a17482697d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -35,6 +35,9 @@ import org.apache.spark.unsafe.types.{ByteArray, UTF8String}
* An expression that concatenates multiple input strings into a single string.
* If any input is null, concat returns null.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(str1, str2, ..., strN) - Returns the concatenation of str1, str2, ..., strN",
+ extended = "> SELECT _FUNC_('Spark','SQL');\n 'SparkSQL'")
case class Concat(children: Seq[Expression]) extends Expression with ImplicitCastInputTypes {
override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringType)
@@ -70,6 +73,10 @@ case class Concat(children: Seq[Expression]) extends Expression with ImplicitCas
*
* Returns null if the separator is null. Otherwise, concat_ws skips all null values.
*/
+@ExpressionDescription(
+ usage =
+ "_FUNC_(sep, [str | array(str)]+) - Returns the concatenation of the strings separated by sep.",
+ extended = "> SELECT _FUNC_(' ', 'Spark', 'SQL');\n 'Spark SQL'")
case class ConcatWs(children: Seq[Expression])
extends Expression with ImplicitCastInputTypes {
@@ -188,7 +195,7 @@ case class Upper(child: Expression)
*/
@ExpressionDescription(
usage = "_FUNC_(str) - Returns str with all characters changed to lowercase",
- extended = "> SELECT _FUNC_('SparkSql');\n'sparksql'")
+ extended = "> SELECT _FUNC_('SparkSql');\n 'sparksql'")
case class Lower(child: Expression) extends UnaryExpression with String2StringExpression {
override def convert(v: UTF8String): UTF8String = v.toLowerCase
@@ -270,6 +277,11 @@ object StringTranslate {
* The translate will happen when any character in the string matching with the character
* in the `matchingExpr`.
*/
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = """_FUNC_(input, from, to) - Translates the input string by replacing the characters present in the from string with the corresponding characters in the to string""",
+ extended = "> SELECT _FUNC_('AaBbCc', 'abc', '123');\n 'A1B2C3'")
+// scalastyle:on line.size.limit
case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replaceExpr: Expression)
extends TernaryExpression with ImplicitCastInputTypes {
@@ -325,6 +337,12 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac
* delimited list (right). Returns 0, if the string wasn't found or if the given
* string (left) contains a comma.
*/
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = """_FUNC_(str, str_array) - Returns the index (1-based) of the given string (left) in the comma-delimited list (right).
+ Returns 0, if the string wasn't found or if the given string (left) contains a comma.""",
+ extended = "> SELECT _FUNC_('ab','abc,b,ab,c,def');\n 3")
+// scalastyle:on line.size.limit
case class FindInSet(left: Expression, right: Expression) extends BinaryExpression
with ImplicitCastInputTypes {
@@ -347,6 +365,9 @@ case class FindInSet(left: Expression, right: Expression) extends BinaryExpressi
/**
* A function that trim the spaces from both ends for the specified string.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(str) - Removes the leading and trailing space characters from str.",
+ extended = "> SELECT _FUNC_(' SparkSQL ');\n 'SparkSQL'")
case class StringTrim(child: Expression)
extends UnaryExpression with String2StringExpression {
@@ -362,6 +383,9 @@ case class StringTrim(child: Expression)
/**
* A function that trim the spaces from left end for given string.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(str) - Removes the leading space characters from str.",
+ extended = "> SELECT _FUNC_(' SparkSQL ');\n 'SparkSQL '")
case class StringTrimLeft(child: Expression)
extends UnaryExpression with String2StringExpression {
@@ -377,6 +401,9 @@ case class StringTrimLeft(child: Expression)
/**
* A function that trim the spaces from right end for given string.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(str) - Removes the trailing space characters from str.",
+ extended = "> SELECT _FUNC_(' SparkSQL ');\n ' SparkSQL'")
case class StringTrimRight(child: Expression)
extends UnaryExpression with String2StringExpression {
@@ -396,6 +423,9 @@ case class StringTrimRight(child: Expression)
*
* NOTE: that this is not zero based, but 1-based index. The first character in str has index 1.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(str, substr) - Returns the (1-based) index of the first occurrence of substr in str.",
+ extended = "> SELECT _FUNC_('SparkSQL', 'SQL');\n 6")
case class StringInstr(str: Expression, substr: Expression)
extends BinaryExpression with ImplicitCastInputTypes {
@@ -422,6 +452,15 @@ case class StringInstr(str: Expression, substr: Expression)
* returned. If count is negative, everything to the right of the final delimiter (counting from the
* right) is returned. substring_index performs a case-sensitive match when searching for delim.
*/
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = """_FUNC_(str, delim, count) - Returns the substring from str before count occurrences of the delimiter delim.
+ If count is positive, everything to the left of the final delimiter (counting from the
+ left) is returned. If count is negative, everything to the right of the final delimiter
+ (counting from the right) is returned. Substring_index performs a case-sensitive match
+ when searching for delim.""",
+ extended = "> SELECT _FUNC_('www.apache.org', '.', 2);\n 'www.apache'")
+// scalastyle:on line.size.limit
case class SubstringIndex(strExpr: Expression, delimExpr: Expression, countExpr: Expression)
extends TernaryExpression with ImplicitCastInputTypes {
@@ -445,6 +484,12 @@ case class SubstringIndex(strExpr: Expression, delimExpr: Expression, countExpr:
* A function that returns the position of the first occurrence of substr
* in given string after position pos.
*/
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = """_FUNC_(substr, str[, pos]) - Returns the position of the first occurrence of substr in str after position pos.
+ The given pos and return value are 1-based.""",
+ extended = "> SELECT _FUNC_('bar', 'foobarbar', 5);\n 7")
+// scalastyle:on line.size.limit
case class StringLocate(substr: Expression, str: Expression, start: Expression)
extends TernaryExpression with ImplicitCastInputTypes {
@@ -510,6 +555,11 @@ case class StringLocate(substr: Expression, str: Expression, start: Expression)
/**
* Returns str, left-padded with pad to a length of len.
*/
+@ExpressionDescription(
+ usage = """_FUNC_(str, len, pad) - Returns str, left-padded with pad to a length of len.
+ If str is longer than len, the return value is shortened to len characters.""",
+ extended = "> SELECT _FUNC_('hi', 5, '??');\n '???hi'\n" +
+ "> SELECT _FUNC_('hi', 1, '??');\n 'h'")
case class StringLPad(str: Expression, len: Expression, pad: Expression)
extends TernaryExpression with ImplicitCastInputTypes {
@@ -531,6 +581,11 @@ case class StringLPad(str: Expression, len: Expression, pad: Expression)
/**
* Returns str, right-padded with pad to a length of len.
*/
+@ExpressionDescription(
+ usage = """_FUNC_(str, len, pad) - Returns str, right-padded with pad to a length of len.
+ If str is longer than len, the return value is shortened to len characters.""",
+ extended = "> SELECT _FUNC_('hi', 5, '??');\n 'hi???'\n" +
+ "> SELECT _FUNC_('hi', 1, '??');\n 'h'")
case class StringRPad(str: Expression, len: Expression, pad: Expression)
extends TernaryExpression with ImplicitCastInputTypes {
@@ -552,6 +607,11 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression)
/**
* Returns the input formatted according to printf-style format strings
*/
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = "_FUNC_(String format, Obj... args) - Returns a formatted string from printf-style format strings.",
+ extended = "> SELECT _FUNC_(\"Hello World %d %s\", 100, \"days\");\n 'Hello World 100 days'")
+// scalastyle:on line.size.limit
case class FormatString(children: Expression*) extends Expression with ImplicitCastInputTypes {
require(children.nonEmpty, "format_string() should take at least 1 argument")
@@ -618,25 +678,33 @@ case class FormatString(children: Expression*) extends Expression with ImplicitC
}
/**
- * Returns string, with the first letter of each word in uppercase.
+ * Returns string, with the first letter of each word in uppercase, all other letters in lowercase.
* Words are delimited by whitespace.
*/
+@ExpressionDescription(
+ usage =
+ """_FUNC_(str) - Returns str with the first letter of each word in uppercase.
+ All other letters are in lowercase. Words are delimited by white space.""",
+ extended = "> SELECT _FUNC_('sPark sql');\n 'Spark Sql'")
case class InitCap(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
override def inputTypes: Seq[DataType] = Seq(StringType)
override def dataType: DataType = StringType
override def nullSafeEval(string: Any): Any = {
- string.asInstanceOf[UTF8String].toTitleCase
+ string.asInstanceOf[UTF8String].toLowerCase.toTitleCase
}
override def genCode(ctx: CodegenContext, ev: ExprCode): String = {
- defineCodeGen(ctx, ev, str => s"$str.toTitleCase()")
+ defineCodeGen(ctx, ev, str => s"$str.toLowerCase().toTitleCase()")
}
}
/**
* Returns the string which repeat the given string value n times.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(str, n) - Returns the string which repeats the given string value n times.",
+ extended = "> SELECT _FUNC_('123', 2);\n '123123'")
case class StringRepeat(str: Expression, times: Expression)
extends BinaryExpression with ImplicitCastInputTypes {
@@ -659,6 +727,9 @@ case class StringRepeat(str: Expression, times: Expression)
/**
* Returns the reversed given string.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(str) - Returns the reversed given string.",
+ extended = "> SELECT _FUNC_('Spark SQL');\n 'LQS krapS'")
case class StringReverse(child: Expression) extends UnaryExpression with String2StringExpression {
override def convert(v: UTF8String): UTF8String = v.reverse()
@@ -672,6 +743,9 @@ case class StringReverse(child: Expression) extends UnaryExpression with String2
/**
* Returns a n spaces string.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(n) - Returns a string consisting of n spaces.",
+ extended = "> SELECT _FUNC_(2);\n ' '")
case class StringSpace(child: Expression)
extends UnaryExpression with ImplicitCastInputTypes {
@@ -694,7 +768,14 @@ case class StringSpace(child: Expression)
/**
* A function that takes a substring of its first argument starting at a given position.
* Defined for String and Binary types.
+ *
+ * NOTE: that this is not zero based, but 1-based index. The first character in str has index 1.
*/
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = "_FUNC_(str, pos[, len]) - Returns the substring of str that starts at pos and is of length len or the slice of byte array that starts at pos and is of length len.",
+ extended = "> SELECT _FUNC_('Spark SQL', 5);\n 'k SQL'\n> SELECT _FUNC_('Spark SQL', -3);\n 'SQL'\n> SELECT _FUNC_('Spark SQL', 5, 1);\n 'k'")
+// scalastyle:on line.size.limit
case class Substring(str: Expression, pos: Expression, len: Expression)
extends TernaryExpression with ImplicitCastInputTypes {
@@ -732,6 +813,9 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
/**
* A function that return the length of the given string or binary expression.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(str | binary) - Returns the length of str or number of bytes in binary data.",
+ extended = "> SELECT _FUNC_('Spark SQL');\n 9")
case class Length(child: Expression) extends UnaryExpression with ExpectsInputTypes {
override def dataType: DataType = IntegerType
override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))
@@ -752,6 +836,9 @@ case class Length(child: Expression) extends UnaryExpression with ExpectsInputTy
/**
* A function that return the Levenshtein distance between the two given strings.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(str1, str2) - Returns the Levenshtein distance between the two given strings.",
+ extended = "> SELECT _FUNC_('kitten', 'sitting');\n 3")
case class Levenshtein(left: Expression, right: Expression) extends BinaryExpression
with ImplicitCastInputTypes {
@@ -770,6 +857,9 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
/**
* A function that return soundex code of the given string expression.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(str) - Returns soundex code of the string.",
+ extended = "> SELECT _FUNC_('Miller');\n 'M460'")
case class SoundEx(child: Expression) extends UnaryExpression with ExpectsInputTypes {
override def dataType: DataType = StringType
@@ -786,6 +876,10 @@ case class SoundEx(child: Expression) extends UnaryExpression with ExpectsInputT
/**
* Returns the numeric value of the first character of str.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(str) - Returns the numeric value of the first character of str.",
+ extended = "> SELECT _FUNC_('222');\n 50\n" +
+ "> SELECT _FUNC_(2);\n 50")
case class Ascii(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
override def dataType: DataType = IntegerType
@@ -817,6 +911,8 @@ case class Ascii(child: Expression) extends UnaryExpression with ImplicitCastInp
/**
* Converts the argument from binary to a base 64 string.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(bin) - Convert the argument from binary to a base 64 string.")
case class Base64(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
override def dataType: DataType = StringType
@@ -839,6 +935,8 @@ case class Base64(child: Expression) extends UnaryExpression with ImplicitCastIn
/**
* Converts the argument from a base 64 string to BINARY.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(str) - Convert the argument from a base 64 string to binary.")
case class UnBase64(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
override def dataType: DataType = BinaryType
@@ -860,6 +958,8 @@ case class UnBase64(child: Expression) extends UnaryExpression with ImplicitCast
* (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
* If either argument is null, the result will also be null.
*/
+@ExpressionDescription(
+ usage = "_FUNC_(bin, str) - Decode the first argument using the second argument character set.")
case class Decode(bin: Expression, charset: Expression)
extends BinaryExpression with ImplicitCastInputTypes {
@@ -889,7 +989,9 @@ case class Decode(bin: Expression, charset: Expression)
* Encodes the first argument into a BINARY using the provided character set
* (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
* If either argument is null, the result will also be null.
-*/
+ */
+@ExpressionDescription(
+ usage = "_FUNC_(str, str) - Encode the first argument using the second argument character set.")
case class Encode(value: Expression, charset: Expression)
extends BinaryExpression with ImplicitCastInputTypes {
@@ -919,6 +1021,11 @@ case class Encode(value: Expression, charset: Expression)
* and returns the result as a string. If D is 0, the result has no decimal point or
* fractional part.
*/
+@ExpressionDescription(
+ usage = """_FUNC_(X, D) - Formats the number X like '#,###,###.##', rounded to D decimal places.
+ If D is 0, the result has no decimal point or fractional part.
+ This is supposed to function like MySQL's FORMAT.""",
+ extended = "> SELECT _FUNC_(12332.123456, 4);\n '12,332.1235'")
case class FormatNumber(x: Expression, d: Expression)
extends BinaryExpression with ExpectsInputTypes {