diff options
author | Tarek Auel <tarek.auel@googlemail.com> | 2015-07-09 09:22:24 -0700 |
---|---|---|
committer | Davies Liu <davies.liu@gmail.com> | 2015-07-09 09:23:35 -0700 |
commit | a1964e9d902bb31f001893da8bc81f6dce08c908 (patch) | |
tree | c1c4d090804316358c959637558dd5dc5141ffec /unsafe/src/main | |
parent | 23448a9e988a1b92bd05ee8c6c1a096c83375a12 (diff) | |
download | spark-a1964e9d902bb31f001893da8bc81f6dce08c908.tar.gz spark-a1964e9d902bb31f001893da8bc81f6dce08c908.tar.bz2 spark-a1964e9d902bb31f001893da8bc81f6dce08c908.zip |
[SPARK-8830] [SQL] native levenshtein distance
Jira: https://issues.apache.org/jira/browse/SPARK-8830
rxin and HuJiayin can you have a look on it.
Author: Tarek Auel <tarek.auel@googlemail.com>
Closes #7236 from tarekauel/native-levenshtein-distance and squashes the following commits:
ee4c4de [Tarek Auel] [SPARK-8830] implemented improvement proposals
c252e71 [Tarek Auel] [SPARK-8830] removed chartAt; use unsafe method for byte array comparison
ddf2222 [Tarek Auel] Merge branch 'master' into native-levenshtein-distance
179920a [Tarek Auel] [SPARK-8830] added description
5e9ed54 [Tarek Auel] [SPARK-8830] removed StringUtils import
dce4308 [Tarek Auel] [SPARK-8830] native levenshtein distance
Diffstat (limited to 'unsafe/src/main')
-rw-r--r-- | unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java | 66 |
1 files changed, 64 insertions, 2 deletions
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index d2a25096a5..847d80ad58 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -99,8 +99,6 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable { /** * Returns the number of code points in it. - * - * This is only used by Substring() when `start` is negative. */ public int numChars() { int len = 0; @@ -254,6 +252,70 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable { } } + /** + * Levenshtein distance is a metric for measuring the distance of two strings. The distance is + * defined by the minimum number of single-character edits (i.e. insertions, deletions or + * substitutions) that are required to change one of the strings into the other. + */ + public int levenshteinDistance(UTF8String other) { + // Implementation adopted from org.apache.common.lang3.StringUtils.getLevenshteinDistance + + int n = numChars(); + int m = other.numChars(); + + if (n == 0) { + return m; + } else if (m == 0) { + return n; + } + + UTF8String s, t; + + if (n <= m) { + s = this; + t = other; + } else { + s = other; + t = this; + int swap; + swap = n; + n = m; + m = swap; + } + + int p[] = new int[n + 1]; + int d[] = new int[n + 1]; + int swap[]; + + int i, i_bytes, j, j_bytes, num_bytes_j, cost; + + for (i = 0; i <= n; i++) { + p[i] = i; + } + + for (j = 0, j_bytes = 0; j < m; j_bytes += num_bytes_j, j++) { + num_bytes_j = numBytesForFirstByte(t.getByte(j_bytes)); + d[0] = j + 1; + + for (i = 0, i_bytes = 0; i < n; i_bytes += numBytesForFirstByte(s.getByte(i_bytes)), i++) { + if (s.getByte(i_bytes) != t.getByte(j_bytes) || + num_bytes_j != numBytesForFirstByte(s.getByte(i_bytes))) { + cost = 1; + } else { + cost = (ByteArrayMethods.arrayEquals(t.base, t.offset + j_bytes, s.base, + s.offset + i_bytes, num_bytes_j)) ? 0 : 1; + } + d[i + 1] = Math.min(Math.min(d[i] + 1, p[i + 1] + 1), p[i] + cost); + } + + swap = p; + p = d; + d = swap; + } + + return p[n]; + } + @Override public int hashCode() { int result = 1; |