diff options
author | HuJiayin <jiayin.hu@intel.com> | 2015-07-31 16:05:26 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-07-31 16:05:26 -0700 |
commit | 4d5a6e7b60b315968973e2298eeee5eb174ec721 (patch) | |
tree | 8967ec9a096760ab45668136bb070f5d9d72179e /unsafe | |
parent | 3fc0cb92001798167a14c1377362a3335397dd4c (diff) | |
download | spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.tar.gz spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.tar.bz2 spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.zip |
[SPARK-8271][SQL]string function: soundex
This PR brings SQL function soundex(), see https://issues.apache.org/jira/browse/HIVE-9738
It's based on #7115, thanks to HuJiayin.
Author: HuJiayin <jiayin.hu@intel.com>
Author: Davies Liu <davies@databricks.com>
Closes #7812 from davies/soundex and squashes the following commits:
fa75941 [Davies Liu] Merge branch 'master' of github.com:apache/spark into soundex
a4bd6d8 [Davies Liu] fix soundex
2538908 [HuJiayin] add codegen soundex
d15d329 [HuJiayin] add back ut
ded1a14 [HuJiayin] Merge branch 'master' of https://github.com/apache/spark
e2dec2c [HuJiayin] support soundex rebase code
Diffstat (limited to 'unsafe')
-rw-r--r-- | unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java | 53 | ||||
-rw-r--r-- | unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java | 48 |
2 files changed, 101 insertions, 0 deletions
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index c38953f65d..9d4998fd48 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -680,4 +680,57 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
     }
     return result;
   }
+
+  /**
+   * Soundex mapping table
+   */
+  private static final byte[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '7',
+    '0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '7', '2', '0', '2'};
+
+  /**
+   * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names,
+   * but can also be used as a general purpose scheme to find word with similar phonemes.
+   * https://en.wikipedia.org/wiki/Soundex
+   */
+  public UTF8String soundex() {
+    if (numBytes == 0) {
+      return EMPTY_UTF8;
+    }
+
+    byte b = getByte(0);
+    if ('a' <= b && b <= 'z') {
+      b -= 32;
+    } else if (b < 'A' || 'Z' < b) {
+      // first character must be a letter
+      return this;
+    }
+    byte sx[] = {'0', '0', '0', '0'};
+    sx[0] = b;
+    int sxi = 1;
+    int idx = b - 'A';
+    byte lastCode = US_ENGLISH_MAPPING[idx];
+
+    for (int i = 1; i < numBytes; i++) {
+      b = getByte(i);
+      if ('a' <= b && b <= 'z') {
+        b -= 32;
+      } else if (b < 'A' || 'Z' < b) {
+        // not a letter, skip it
+        lastCode = '0';
+        continue;
+      }
+      idx = b - 'A';
+      byte code = US_ENGLISH_MAPPING[idx];
+      if (code == '7') {
+        // ignore it
+      } else {
+        if (code != '0' && code != lastCode) {
+          sx[sxi++] = code;
+          if (sxi > 3) break;
+        }
+        lastCode = code;
+      }
+    }
+    return UTF8String.fromBytes(sx);
+  }
 }
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index f2cc19ca6b..c565210872 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -341,4 +341,52 @@ public class UTF8StringSuite {
     assertEquals(fromString("   "), blankString(3));
     assertEquals(fromString(""), blankString(0));
   }
+
+  @Test
+  public void soundex() {
+    assertEquals(fromString("Robert").soundex(), fromString("R163"));
+    assertEquals(fromString("Rupert").soundex(), fromString("R163"));
+    assertEquals(fromString("Rubin").soundex(), fromString("R150"));
+    assertEquals(fromString("Ashcraft").soundex(), fromString("A261"));
+    assertEquals(fromString("Ashcroft").soundex(), fromString("A261"));
+    assertEquals(fromString("Burroughs").soundex(), fromString("B620"));
+    assertEquals(fromString("Burrows").soundex(), fromString("B620"));
+    assertEquals(fromString("Ekzampul").soundex(), fromString("E251"));
+    assertEquals(fromString("Example").soundex(), fromString("E251"));
+    assertEquals(fromString("Ellery").soundex(), fromString("E460"));
+    assertEquals(fromString("Euler").soundex(), fromString("E460"));
+    assertEquals(fromString("Ghosh").soundex(), fromString("G200"));
+    assertEquals(fromString("Gauss").soundex(), fromString("G200"));
+    assertEquals(fromString("Gutierrez").soundex(), fromString("G362"));
+    assertEquals(fromString("Heilbronn").soundex(), fromString("H416"));
+    assertEquals(fromString("Hilbert").soundex(), fromString("H416"));
+    assertEquals(fromString("Jackson").soundex(), fromString("J250"));
+    assertEquals(fromString("Kant").soundex(), fromString("K530"));
+    assertEquals(fromString("Knuth").soundex(), fromString("K530"));
+    assertEquals(fromString("Lee").soundex(), fromString("L000"));
+    assertEquals(fromString("Lukasiewicz").soundex(), fromString("L222"));
+    assertEquals(fromString("Lissajous").soundex(), fromString("L222"));
+    assertEquals(fromString("Ladd").soundex(), fromString("L300"));
+    assertEquals(fromString("Lloyd").soundex(), fromString("L300"));
+    assertEquals(fromString("Moses").soundex(), fromString("M220"));
+    assertEquals(fromString("O'Hara").soundex(), fromString("O600"));
+    assertEquals(fromString("Pfister").soundex(), fromString("P236"));
+    assertEquals(fromString("Rubin").soundex(), fromString("R150"));
+    assertEquals(fromString("Robert").soundex(), fromString("R163"));
+    assertEquals(fromString("Rupert").soundex(), fromString("R163"));
+    assertEquals(fromString("Soundex").soundex(), fromString("S532"));
+    assertEquals(fromString("Sownteks").soundex(), fromString("S532"));
+    assertEquals(fromString("Tymczak").soundex(), fromString("T522"));
+    assertEquals(fromString("VanDeusen").soundex(), fromString("V532"));
+    assertEquals(fromString("Washington").soundex(), fromString("W252"));
+    assertEquals(fromString("Wheaton").soundex(), fromString("W350"));
+
+    assertEquals(fromString("a").soundex(), fromString("A000"));
+    assertEquals(fromString("ab").soundex(), fromString("A100"));
+    assertEquals(fromString("abc").soundex(), fromString("A120"));
+    assertEquals(fromString("abcd").soundex(), fromString("A123"));
+    assertEquals(fromString("").soundex(), fromString(""));
+    assertEquals(fromString("123").soundex(), fromString("123"));
+    assertEquals(fromString("世界千世").soundex(), fromString("世界千世"));
+  }
 }