diff options
author | Tarek Auel <tarek.auel@googlemail.com> | 2015-07-04 01:10:52 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-07-04 01:10:52 -0700 |
commit | 6b3574e68704d58ba41efe0ea4fe928cc166afcd (patch) | |
tree | c8dc9f32d4081d94063df0d7cf6665d99e797641 /sql/core/src | |
parent | f35b0c3436898f22860d2c6c1d12f3a661005201 (diff) | |
download | spark-6b3574e68704d58ba41efe0ea4fe928cc166afcd.tar.gz spark-6b3574e68704d58ba41efe0ea4fe928cc166afcd.tar.bz2 spark-6b3574e68704d58ba41efe0ea4fe928cc166afcd.zip |
[SPARK-8270][SQL] levenshtein distance
Jira: https://issues.apache.org/jira/browse/SPARK-8270
Info: I cannot build the latest master; it gets stuck during the build process: `[INFO] Dependency-reduced POM written at: /Users/tarek/test/spark/bagel/dependency-reduced-pom.xml`
Author: Tarek Auel <tarek.auel@googlemail.com>
Closes #7214 from tarekauel/SPARK-8270 and squashes the following commits:
ab348b9 [Tarek Auel] Merge branch 'master' into SPARK-8270
a2ad318 [Tarek Auel] [SPARK-8270] changed order of fields
d91b12c [Tarek Auel] [SPARK-8270] python fix
adbd075 [Tarek Auel] [SPARK-8270] fixed typo
23185c9 [Tarek Auel] [SPARK-8270] levenshtein distance
Diffstat (limited to 'sql/core/src')
-rw-r--r-- | sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 23 | ||||
-rw-r--r-- | sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala | 6 |
2 files changed, 25 insertions, 4 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index b63c6ee8ab..e4109da08e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1580,22 +1580,37 @@ object functions { ////////////////////////////////////////////////////////////////////////////////////////////// /** - * Computes the length of a given string value - * + * Computes the length of a given string value. + * * @group string_funcs * @since 1.5.0 */ def strlen(e: Column): Column = StringLength(e.expr) /** - * Computes the length of a given string column - * + * Computes the length of a given string column. + * * @group string_funcs * @since 1.5.0 */ def strlen(columnName: String): Column = strlen(Column(columnName)) /** + * Computes the Levenshtein distance of the two given strings. + * @group string_funcs + * @since 1.5.0 + */ + def levenshtein(l: Column, r: Column): Column = Levenshtein(l.expr, r.expr) + + /** + * Computes the Levenshtein distance of the two given strings. + * @group string_funcs + * @since 1.5.0 + */ + def levenshtein(leftColumnName: String, rightColumnName: String): Column = + levenshtein(Column(leftColumnName), Column(rightColumnName)) + + /** * Computes the numeric value of the first character of the specified string value. 
* * @group string_funcs diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index bd9fa400e5..bc455a922d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -226,6 +226,12 @@ class DataFrameFunctionsSuite extends QueryTest { }) } + test("Levenshtein distance") { + val df = Seq(("kitten", "sitting"), ("frog", "fog")).toDF("l", "r") + checkAnswer(df.select(levenshtein("l", "r")), Seq(Row(3), Row(1))) + checkAnswer(df.selectExpr("levenshtein(l, r)"), Seq(Row(3), Row(1))) + } + test("string ascii function") { val df = Seq(("abc", "")).toDF("a", "b") checkAnswer( |