diff options
author | HuJiayin <jiayin.hu@intel.com> | 2015-07-31 16:05:26 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-07-31 16:05:26 -0700 |
commit | 4d5a6e7b60b315968973e2298eeee5eb174ec721 (patch) | |
tree | 8967ec9a096760ab45668136bb070f5d9d72179e /sql | |
parent | 3fc0cb92001798167a14c1377362a3335397dd4c (diff) | |
download | spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.tar.gz spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.tar.bz2 spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.zip |
[SPARK-8271][SQL] string function: soundex
This PR adds the SQL function soundex(); see https://issues.apache.org/jira/browse/HIVE-9738
It is based on #7115; thanks to HuJiayin.
Author: HuJiayin <jiayin.hu@intel.com>
Author: Davies Liu <davies@databricks.com>
Closes #7812 from davies/soundex and squashes the following commits:
fa75941 [Davies Liu] Merge branch 'master' of github.com:apache/spark into soundex
a4bd6d8 [Davies Liu] fix soundex
2538908 [HuJiayin] add codegen soundex
d15d329 [HuJiayin] add back ut
ded1a14 [HuJiayin] Merge branch 'master' of https://github.com/apache/spark
e2dec2c [HuJiayin] support soundex rebase code
Diffstat (limited to 'sql')
5 files changed, 62 insertions, 0 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 1bf7204a25..3f61a9af1f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -194,6 +194,7 @@ object FunctionRegistry { expression[StringRepeat]("repeat"), expression[StringReverse]("reverse"), expression[StringTrimRight]("rtrim"), + expression[SoundEx]("soundex"), expression[StringSpace]("space"), expression[StringSplit]("split"), expression[Substring]("substr"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index 684eac12bd..160e72f384 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -719,6 +719,22 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres } /** + * A function that return soundex code of the given string expression. + */ +case class SoundEx(child: Expression) extends UnaryExpression with ExpectsInputTypes { + + override def dataType: DataType = StringType + + override def inputTypes: Seq[DataType] = Seq(StringType) + + override def nullSafeEval(input: Any): Any = input.asInstanceOf[UTF8String].soundex() + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + defineCodeGen(ctx, ev, c => s"$c.soundex()") + } +} + +/** * Returns the numeric value of the first character of str. 
*/ case class Ascii(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 3ecd0d374c..fb72fe1714 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -347,6 +347,34 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // scalastyle:on } + test("soundex unit test") { + checkEvaluation(SoundEx(Literal("ZIN")), "Z500") + checkEvaluation(SoundEx(Literal("SU")), "S000") + checkEvaluation(SoundEx(Literal("")), "") + checkEvaluation(SoundEx(Literal.create(null, StringType)), null) + + // scalastyle:off + // non ascii characters are not allowed in the code, so we disable the scalastyle here. 
+ checkEvaluation(SoundEx(Literal("测试")), "测试") + checkEvaluation(SoundEx(Literal("Tschüss")), "T220") + // scalastyle:on + checkEvaluation(SoundEx(Literal("zZ")), "Z000", create_row("s8")) + checkEvaluation(SoundEx(Literal("RAGSSEEESSSVEEWE")), "R221") + checkEvaluation(SoundEx(Literal("Ashcraft")), "A261") + checkEvaluation(SoundEx(Literal("Aswcraft")), "A261") + checkEvaluation(SoundEx(Literal("Tymczak")), "T522") + checkEvaluation(SoundEx(Literal("Pfister")), "P236") + checkEvaluation(SoundEx(Literal("Miller")), "M460") + checkEvaluation(SoundEx(Literal("Peterson")), "P362") + checkEvaluation(SoundEx(Literal("Peters")), "P362") + checkEvaluation(SoundEx(Literal("Auerbach")), "A612") + checkEvaluation(SoundEx(Literal("Uhrbach")), "U612") + checkEvaluation(SoundEx(Literal("Moskowitz")), "M232") + checkEvaluation(SoundEx(Literal("Moskovitz")), "M213") + checkEvaluation(SoundEx(Literal("relyheewsgeessg")), "R422") + checkEvaluation(SoundEx(Literal("!!")), "!!") + } + test("TRIM/LTRIM/RTRIM") { val s = 'a.string.at(0) checkEvaluation(StringTrim(Literal(" aa ")), "aa", create_row(" abdef ")) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 5d82a5eadd..89ffa9c50d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1903,6 +1903,14 @@ object functions { } /** + * * Return the soundex code for the specified expression. + * + * @group string_funcs + * @since 1.5.0 + */ + def soundex(e: Column): Column = SoundEx(e.expr) + + /** * Splits str around pattern (pattern is a regular expression). * NOTE: pattern is a string represent the regular expression. 
* diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 8e0ea76d15..b7f073cccb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -142,6 +142,15 @@ class StringFunctionsSuite extends QueryTest { Row("aa123cc")) } + test("soundex function") { + val df = Seq(("MARY", "SU")).toDF("l", "r") + checkAnswer( + df.select(soundex($"l"), soundex($"r")), Row("M600", "S000")) + + checkAnswer( + df.selectExpr("SoundEx(l)", "SoundEx(r)"), Row("M600", "S000")) + } + test("string instr function") { val df = Seq(("aaads", "aa", "zz")).toDF("a", "b", "c") |