diff options
author | HuJiayin <jiayin.hu@intel.com> | 2015-07-31 16:05:26 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-07-31 16:05:26 -0700 |
commit | 4d5a6e7b60b315968973e2298eeee5eb174ec721 (patch) | |
tree | 8967ec9a096760ab45668136bb070f5d9d72179e /unsafe/src/main | |
parent | 3fc0cb92001798167a14c1377362a3335397dd4c (diff) | |
download | spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.tar.gz spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.tar.bz2 spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.zip |
[SPARK-8271][SQL]string function: soundex
This PR adds the SQL function soundex(); see https://issues.apache.org/jira/browse/HIVE-9738
It's based on #7115, thanks to HuJiayin.
Author: HuJiayin <jiayin.hu@intel.com>
Author: Davies Liu <davies@databricks.com>
Closes #7812 from davies/soundex and squashes the following commits:
fa75941 [Davies Liu] Merge branch 'master' of github.com:apache/spark into soundex
a4bd6d8 [Davies Liu] fix soundex
2538908 [HuJiayin] add codegen soundex
d15d329 [HuJiayin] add back ut
ded1a14 [HuJiayin] Merge branch 'master' of https://github.com/apache/spark
e2dec2c [HuJiayin] support soundex rebase code
Diffstat (limited to 'unsafe/src/main')
-rw-r--r-- | unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java | 53 |
1 file changed, 53 insertions, 0 deletions
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index c38953f65d..9d4998fd48 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -680,4 +680,57 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable { } return result; } + + /** + * Soundex mapping table + */ + private static final byte[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '7', + '0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '7', '2', '0', '2'}; + + /** + * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, + * but can also be used as a general-purpose scheme to find words with similar phonemes. + * https://en.wikipedia.org/wiki/Soundex + */ + public UTF8String soundex() { + if (numBytes == 0) { + return EMPTY_UTF8; + } + + byte b = getByte(0); + if ('a' <= b && b <= 'z') { + b -= 32; + } else if (b < 'A' || 'Z' < b) { + // first character must be a letter + return this; + } + byte sx[] = {'0', '0', '0', '0'}; + sx[0] = b; + int sxi = 1; + int idx = b - 'A'; + byte lastCode = US_ENGLISH_MAPPING[idx]; + + for (int i = 1; i < numBytes; i++) { + b = getByte(i); + if ('a' <= b && b <= 'z') { + b -= 32; + } else if (b < 'A' || 'Z' < b) { + // not a letter, skip it + lastCode = '0'; + continue; + } + idx = b - 'A'; + byte code = US_ENGLISH_MAPPING[idx]; + if (code == '7') { + // ignore it + } else { + if (code != '0' && code != lastCode) { + sx[sxi++] = code; + if (sxi > 3) break; + } + lastCode = code; + } + } + return UTF8String.fromBytes(sx); + } } |