aboutsummaryrefslogtreecommitdiff
path: root/sql/catalyst
diff options
context:
space:
mode:
author: HuJiayin <jiayin.hu@intel.com> 2015-07-31 16:05:26 -0700
committer: Reynold Xin <rxin@databricks.com> 2015-07-31 16:05:26 -0700
commit: 4d5a6e7b60b315968973e2298eeee5eb174ec721 (patch)
tree: 8967ec9a096760ab45668136bb070f5d9d72179e /sql/catalyst
parent: 3fc0cb92001798167a14c1377362a3335397dd4c (diff)
download: spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.tar.gz
spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.tar.bz2
spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.zip
[SPARK-8271][SQL]string function: soundex
This PR brings the SQL function soundex(); see https://issues.apache.org/jira/browse/HIVE-9738
It's based on #7115, thanks to HuJiayin.
Author: HuJiayin <jiayin.hu@intel.com>
Author: Davies Liu <davies@databricks.com>
Closes #7812 from davies/soundex and squashes the following commits:
fa75941 [Davies Liu] Merge branch 'master' of github.com:apache/spark into soundex
a4bd6d8 [Davies Liu] fix soundex
2538908 [HuJiayin] add codegen soundex
d15d329 [HuJiayin] add back ut
ded1a14 [HuJiayin] Merge branch 'master' of https://github.com/apache/spark
e2dec2c [HuJiayin] support soundex rebase code
Diffstat (limited to 'sql/catalyst')
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala | 1
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala | 16
-rw-r--r-- sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala | 28
3 files changed, 45 insertions, 0 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 1bf7204a25..3f61a9af1f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -194,6 +194,7 @@ object FunctionRegistry {
expression[StringRepeat]("repeat"),
expression[StringReverse]("reverse"),
expression[StringTrimRight]("rtrim"),
+ expression[SoundEx]("soundex"),
expression[StringSpace]("space"),
expression[StringSplit]("split"),
expression[Substring]("substr"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 684eac12bd..160e72f384 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -719,6 +719,22 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
}
/**
+ * A function that returns the soundex code of the given string expression.
+ */
+case class SoundEx(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+
+ override def dataType: DataType = StringType
+
+ override def inputTypes: Seq[DataType] = Seq(StringType)
+
+ override def nullSafeEval(input: Any): Any = input.asInstanceOf[UTF8String].soundex()
+
+ override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+ defineCodeGen(ctx, ev, c => s"$c.soundex()")
+ }
+}
+
+/**
* Returns the numeric value of the first character of str.
*/
case class Ascii(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 3ecd0d374c..fb72fe1714 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -347,6 +347,34 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
// scalastyle:on
}
+ test("soundex unit test") {
+ checkEvaluation(SoundEx(Literal("ZIN")), "Z500")
+ checkEvaluation(SoundEx(Literal("SU")), "S000")
+ checkEvaluation(SoundEx(Literal("")), "")
+ checkEvaluation(SoundEx(Literal.create(null, StringType)), null)
+
+ // scalastyle:off
+ // Non-ASCII characters are not allowed in the code, so we disable scalastyle here.
+ checkEvaluation(SoundEx(Literal("测试")), "测试")
+ checkEvaluation(SoundEx(Literal("Tschüss")), "T220")
+ // scalastyle:on
+ checkEvaluation(SoundEx(Literal("zZ")), "Z000", create_row("s8"))
+ checkEvaluation(SoundEx(Literal("RAGSSEEESSSVEEWE")), "R221")
+ checkEvaluation(SoundEx(Literal("Ashcraft")), "A261")
+ checkEvaluation(SoundEx(Literal("Aswcraft")), "A261")
+ checkEvaluation(SoundEx(Literal("Tymczak")), "T522")
+ checkEvaluation(SoundEx(Literal("Pfister")), "P236")
+ checkEvaluation(SoundEx(Literal("Miller")), "M460")
+ checkEvaluation(SoundEx(Literal("Peterson")), "P362")
+ checkEvaluation(SoundEx(Literal("Peters")), "P362")
+ checkEvaluation(SoundEx(Literal("Auerbach")), "A612")
+ checkEvaluation(SoundEx(Literal("Uhrbach")), "U612")
+ checkEvaluation(SoundEx(Literal("Moskowitz")), "M232")
+ checkEvaluation(SoundEx(Literal("Moskovitz")), "M213")
+ checkEvaluation(SoundEx(Literal("relyheewsgeessg")), "R422")
+ checkEvaluation(SoundEx(Literal("!!")), "!!")
+ }
+
test("TRIM/LTRIM/RTRIM") {
val s = 'a.string.at(0)
checkEvaluation(StringTrim(Literal(" aa ")), "aa", create_row(" abdef "))