aboutsummaryrefslogtreecommitdiff
path: root/unsafe/src/main
diff options
context:
space:
mode:
authorHuJiayin <jiayin.hu@intel.com>2015-07-31 16:05:26 -0700
committerReynold Xin <rxin@databricks.com>2015-07-31 16:05:26 -0700
commit4d5a6e7b60b315968973e2298eeee5eb174ec721 (patch)
tree8967ec9a096760ab45668136bb070f5d9d72179e /unsafe/src/main
parent3fc0cb92001798167a14c1377362a3335397dd4c (diff)
downloadspark-4d5a6e7b60b315968973e2298eeee5eb174ec721.tar.gz
spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.tar.bz2
spark-4d5a6e7b60b315968973e2298eeee5eb174ec721.zip
[SPARK-8271][SQL]string function: soundex
This PR brings the SQL function soundex(); see https://issues.apache.org/jira/browse/HIVE-9738. It's based on #7115, thanks to HuJiayin. Author: HuJiayin <jiayin.hu@intel.com> Author: Davies Liu <davies@databricks.com> Closes #7812 from davies/soundex and squashes the following commits: fa75941 [Davies Liu] Merge branch 'master' of github.com:apache/spark into soundex a4bd6d8 [Davies Liu] fix soundex 2538908 [HuJiayin] add codegen soundex d15d329 [HuJiayin] add back ut ded1a14 [HuJiayin] Merge branch 'master' of https://github.com/apache/spark e2dec2c [HuJiayin] support soundex rebase code
Diffstat (limited to 'unsafe/src/main')
-rw-r--r--unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java53
1 files changed, 53 insertions, 0 deletions
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index c38953f65d..9d4998fd48 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -680,4 +680,57 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
}
return result;
}
+
+ /**
+ * Soundex mapping table
+ */
+ private static final byte[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '7',
+ '0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '7', '2', '0', '2'};
+
+ /**
+  * Computes the Soundex code of this string. Soundex is an encoding used to relate
+  * similar-sounding names, but it can also serve as a general purpose scheme to find words
+  * with similar phonemes. See https://en.wikipedia.org/wiki/Soundex
+  *
+  * The result is four bytes long: the first letter in upper case followed by up to three
+  * code digits, padded on the right with '0'. Only ASCII letters are encoded.
+  *
+  * @return the empty string if this string is empty; this string unchanged if its first
+  *         byte is not an ASCII letter; otherwise the four-byte Soundex code
+  */
+ public UTF8String soundex() {
+   if (numBytes == 0) {
+     return EMPTY_UTF8;
+   }
+
+   byte first = getByte(0);
+   if (first >= 'a' && first <= 'z') {
+     // fold lower case to upper case
+     first -= 32;
+   } else if (!(first >= 'A' && first <= 'Z')) {
+     // the code must start with a letter; anything else is returned untouched
+     return this;
+   }
+
+   final byte[] encoded = {first, '0', '0', '0'};
+   int filled = 1;
+   byte previous = US_ENGLISH_MAPPING[first - 'A'];
+
+   // stop scanning as soon as all three digit slots have been written
+   for (int pos = 1; pos < numBytes && filled < encoded.length; pos++) {
+     byte current = getByte(pos);
+     if (current >= 'a' && current <= 'z') {
+       current -= 32;
+     } else if (!(current >= 'A' && current <= 'Z')) {
+       // a non-letter emits nothing, but it does separate equal codes on either side
+       previous = '0';
+       continue;
+     }
+
+     final byte digit = US_ENGLISH_MAPPING[current - 'A'];
+     if (digit == '7') {
+       // skipped letter: leave the previous code as it is
+       continue;
+     }
+     if (digit != '0' && digit != previous) {
+       encoded[filled++] = digit;
+     }
+     previous = digit;
+   }
+   return UTF8String.fromBytes(encoded);
+ }
}