diff options
author | HuJiayin <jiayin.hu@intel.com> | 2015-08-01 21:44:57 -0700 |
---|---|---|
committer | Davies Liu <davies.liu@gmail.com> | 2015-08-01 21:44:57 -0700 |
commit | 00cd92f32f17ca57d47aa2dcc716eb707aaee799 (patch) | |
tree | 87fae8a2daea19abc7dee69b551c5c0e6f54bf4b /unsafe/src | |
parent | 5d9e33d9a2633e45082ac395a64646364f22f4c4 (diff) | |
download | spark-00cd92f32f17ca57d47aa2dcc716eb707aaee799.tar.gz spark-00cd92f32f17ca57d47aa2dcc716eb707aaee799.tar.bz2 spark-00cd92f32f17ca57d47aa2dcc716eb707aaee799.zip |
[SPARK-8269] [SQL] string function: initcap
This PR is based on #7208, thanks to HuJiayin.
Closes #7208
Author: HuJiayin <jiayin.hu@intel.com>
Author: Davies Liu <davies@databricks.com>
Closes #7850 from davies/initcap and squashes the following commits:
54472e9 [Davies Liu] fix python test
17ffe51 [Davies Liu] Merge branch 'master' of github.com:apache/spark into initcap
ca46390 [Davies Liu] Merge branch 'master' of github.com:apache/spark into initcap
3a906e4 [Davies Liu] implement title case in UTF8String
8b2506a [HuJiayin] Update functions.py
2cd43e5 [HuJiayin] fix python style check
b616c0e [HuJiayin] add python api
1f5a0ef [HuJiayin] add codegen
7e0c604 [HuJiayin] Merge branch 'master' of https://github.com/apache/spark into initcap
6a0b958 [HuJiayin] add column
c79482d [HuJiayin] support soundex
7ce416b [HuJiayin] support initcap rebase code
Diffstat (limited to 'unsafe/src')
-rw-r--r-- | unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java | 88 | ||||
-rw-r--r-- | unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java | 8 |
2 files changed, 96 insertions, 0 deletions
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 208503d2fd..213dc761bb 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -279,6 +279,29 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable { * Returns the upper case of this string */ public UTF8String toUpperCase() { + if (numBytes == 0) { + return EMPTY_UTF8; + } + + byte[] bytes = new byte[numBytes]; + bytes[0] = (byte) Character.toTitleCase(getByte(0)); + for (int i = 0; i < numBytes; i++) { + byte b = getByte(i); + if (numBytesForFirstByte(b) != 1) { + // fallback + return toUpperCaseSlow(); + } + int upper = Character.toUpperCase((int) b); + if (upper > 127) { + // fallback + return toUpperCaseSlow(); + } + bytes[i] = (byte) upper; + } + return fromBytes(bytes); + } + + private UTF8String toUpperCaseSlow() { return fromString(toString().toUpperCase()); } @@ -286,10 +309,75 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable { * Returns the lower case of this string */ public UTF8String toLowerCase() { + if (numBytes == 0) { + return EMPTY_UTF8; + } + + byte[] bytes = new byte[numBytes]; + bytes[0] = (byte) Character.toTitleCase(getByte(0)); + for (int i = 0; i < numBytes; i++) { + byte b = getByte(i); + if (numBytesForFirstByte(b) != 1) { + // fallback + return toLowerCaseSlow(); + } + int lower = Character.toLowerCase((int) b); + if (lower > 127) { + // fallback + return toLowerCaseSlow(); + } + bytes[i] = (byte) lower; + } + return fromBytes(bytes); + } + + private UTF8String toLowerCaseSlow() { return fromString(toString().toLowerCase()); } /** + * Returns the title case of this string, that could be used as title. 
+ */ + public UTF8String toTitleCase() { + if (numBytes == 0) { + return EMPTY_UTF8; + } + + byte[] bytes = new byte[numBytes]; + for (int i = 0; i < numBytes; i++) { + byte b = getByte(i); + if (i == 0 || getByte(i - 1) == ' ') { + if (numBytesForFirstByte(b) != 1) { + // fallback + return toTitleCaseSlow(); + } + int upper = Character.toTitleCase(b); + if (upper > 127) { + // fallback + return toTitleCaseSlow(); + } + bytes[i] = (byte) upper; + } else { + bytes[i] = b; + } + } + return fromBytes(bytes); + } + + private UTF8String toTitleCaseSlow() { + StringBuffer sb = new StringBuffer(); + String s = toString(); + sb.append(s); + sb.setCharAt(0, Character.toTitleCase(sb.charAt(0))); + for (int i = 1; i < s.length(); i++) { + if (sb.charAt(i - 1) == ' ') { + sb.setCharAt(i, Character.toTitleCase(sb.charAt(i))); + } + } + return fromString(sb.toString()); + } + + /** * Copy the bytes from the current UTF8String, and make a new UTF8String. * @param start the start position of the current UTF8String in bytes. * @param end the end position of the current UTF8String in bytes. diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index ed50cdcb29..9b3190f8f0 100644 --- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -115,6 +115,14 @@ public class UTF8StringSuite { } @Test + public void titleCase() { + assertEquals(fromString(""), fromString("").toTitleCase()); + assertEquals(fromString("Ab Bc Cd"), fromString("ab bc cd").toTitleCase()); + assertEquals(fromString("Ѐ Ё Ђ Ѻ Ώ Ề"), fromString("ѐ ё ђ ѻ ώ ề").toTitleCase()); + assertEquals(fromString("大千世界 数据砖头"), fromString("大千世界 数据砖头").toTitleCase()); + } + + @Test public void concatTest() { assertEquals(EMPTY_UTF8, concat()); assertEquals(null, concat((UTF8String) null)); |