about | summary | refs | log | tree | commit | diff
path: root/sql
diff options
context:
space:
mode:
author Tarek Auel <tarek.auel@googlemail.com> 2015-07-09 09:22:24 -0700
committer Davies Liu <davies.liu@gmail.com> 2015-07-09 09:23:35 -0700
commit a1964e9d902bb31f001893da8bc81f6dce08c908 (patch)
tree c1c4d090804316358c959637558dd5dc5141ffec /sql
parent 23448a9e988a1b92bd05ee8c6c1a096c83375a12 (diff)
download spark-a1964e9d902bb31f001893da8bc81f6dce08c908.tar.gz
spark-a1964e9d902bb31f001893da8bc81f6dce08c908.tar.bz2
spark-a1964e9d902bb31f001893da8bc81f6dce08c908.zip
[SPARK-8830] [SQL] native levenshtein distance
Jira: https://issues.apache.org/jira/browse/SPARK-8830 rxin and HuJiayin, could you take a look at it? Author: Tarek Auel <tarek.auel@googlemail.com> Closes #7236 from tarekauel/native-levenshtein-distance and squashes the following commits: ee4c4de [Tarek Auel] [SPARK-8830] implemented improvement proposals c252e71 [Tarek Auel] [SPARK-8830] removed chartAt; use unsafe method for byte array comparison ddf2222 [Tarek Auel] Merge branch 'master' into native-levenshtein-distance 179920a [Tarek Auel] [SPARK-8830] added description 5e9ed54 [Tarek Auel] [SPARK-8830] removed StringUtils import dce4308 [Tarek Auel] [SPARK-8830] native levenshtein distance
Diffstat (limited to 'sql')
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala 9
-rw-r--r-- sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala 5
2 files changed, 9 insertions, 5 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 47fc7cdaa8..57f436485b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -284,13 +284,12 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
override def dataType: DataType = IntegerType
- protected override def nullSafeEval(input1: Any, input2: Any): Any =
- StringUtils.getLevenshteinDistance(input1.toString, input2.toString)
+ protected override def nullSafeEval(leftValue: Any, rightValue: Any): Any =
+ leftValue.asInstanceOf[UTF8String].levenshteinDistance(rightValue.asInstanceOf[UTF8String])
override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
- val stringUtils = classOf[StringUtils].getName
- defineCodeGen(ctx, ev, (left, right) =>
- s"$stringUtils.getLevenshteinDistance($left.toString(), $right.toString())")
+ nullSafeCodeGen(ctx, ev, (left, right) =>
+ s"${ev.primitive} = $left.levenshteinDistance($right);")
}
}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
index 1efbe1a245..69bef1c63e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
@@ -282,5 +282,10 @@ class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(Levenshtein(Literal("abc"), Literal("abc")), 0)
checkEvaluation(Levenshtein(Literal("kitten"), Literal("sitting")), 3)
checkEvaluation(Levenshtein(Literal("frog"), Literal("fog")), 1)
+ // scalastyle:off
+ // non ascii characters are not allowed in the code, so we disable the scalastyle here.
+ checkEvaluation(Levenshtein(Literal("千世"), Literal("fog")), 3)
+ checkEvaluation(Levenshtein(Literal("世界千世"), Literal("大a界b")), 4)
+ // scalastyle:on
}
}