[SPARK-8269] [SQL] string function: initcap

This PR is based on #7208 , thanks to HuJiayin Closes #7208 Author: HuJiayin <jiayin.hu@intel.com> Author: Davies Liu <davies@databricks.com> Closes #7850 from davies/initcap and squashes the following commits: 54472e9 [Davies Liu] fix python test 17ffe51 [Davies Liu] Merge branch 'master' of github.com:apache/spark into initcap ca46390 [Davies Liu] Merge branch 'master' of github.com:apache/spark into initcap 3a906e4 [Davies Liu] implement title case in UTF8String 8b2506a [HuJiayin] Update functions.py 2cd43e5 [HuJiayin] fix python style check b616c0e [HuJiayin] add python api 1f5a0ef [HuJiayin] add codegen 7e0c604 [HuJiayin] Merge branch 'master' of https://github.com/apache/spark into initcap 6a0b958 [HuJiayin] add column c79482d [HuJiayin] support soundex 7ce416b [HuJiayin] support initcap rebase code
author: HuJiayin <jiayin.hu@intel.com> 2015-08-01 21:44:57 -0700
committer: Davies Liu <davies.liu@gmail.com> 2015-08-01 21:44:57 -0700
commit: 00cd92f32f17ca57d47aa2dcc716eb707aaee799 (patch)
tree: 87fae8a2daea19abc7dee69b551c5c0e6f54bf4b /unsafe/src
parent: 5d9e33d9a2633e45082ac395a64646364f22f4c4 (diff)
download: spark-00cd92f32f17ca57d47aa2dcc716eb707aaee799.tar.gz
spark-00cd92f32f17ca57d47aa2dcc716eb707aaee799.tar.bz2
spark-00cd92f32f17ca57d47aa2dcc716eb707aaee799.zip
2 files changed, 96 insertions, 0 deletions
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 208503d2fd..213dc761bb 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -279,6 +279,29 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
    * Returns the upper case of this string
    */
   public UTF8String toUpperCase() {
+    if (numBytes == 0) {
+      return EMPTY_UTF8;
+    }
+
+    byte[] bytes = new byte[numBytes];
+    bytes[0] = (byte) Character.toTitleCase(getByte(0));
+    for (int i = 0; i < numBytes; i++) {
+      byte b = getByte(i);
+      if (numBytesForFirstByte(b) != 1) {
+        // fallback
+        return toUpperCaseSlow();
+      }
+      int upper = Character.toUpperCase((int) b);
+      if (upper > 127) {
+        // fallback
+        return toUpperCaseSlow();
+      }
+      bytes[i] = (byte) upper;
+    }
+    return fromBytes(bytes);
+  }
+
+  private UTF8String toUpperCaseSlow() {
     return fromString(toString().toUpperCase());
   }
 
@@ -286,10 +309,75 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
    * Returns the lower case of this string
    */
   public UTF8String toLowerCase() {
+    if (numBytes == 0) {
+      return EMPTY_UTF8;
+    }
+
+    byte[] bytes = new byte[numBytes];
+    bytes[0] = (byte) Character.toTitleCase(getByte(0));
+    for (int i = 0; i < numBytes; i++) {
+      byte b = getByte(i);
+      if (numBytesForFirstByte(b) != 1) {
+        // fallback
+        return toLowerCaseSlow();
+      }
+      int lower = Character.toLowerCase((int) b);
+      if (lower > 127) {
+        // fallback
+        return toLowerCaseSlow();
+      }
+      bytes[i] = (byte) lower;
+    }
+    return fromBytes(bytes);
+  }
+
+  private UTF8String toLowerCaseSlow() {
     return fromString(toString().toLowerCase());
   }
 
   /**
+   * Returns the title case of this string, that could be used as title.
+   */
+  public UTF8String toTitleCase() {
+    if (numBytes == 0) {
+      return EMPTY_UTF8;
+    }
+
+    byte[] bytes = new byte[numBytes];
+    for (int i = 0; i < numBytes; i++) {
+      byte b = getByte(i);
+      if (i == 0 || getByte(i - 1) == ' ') {
+        if (numBytesForFirstByte(b) != 1) {
+          // fallback
+          return toTitleCaseSlow();
+        }
+        int upper = Character.toTitleCase(b);
+        if (upper > 127) {
+          // fallback
+          return toTitleCaseSlow();
+        }
+        bytes[i] = (byte) upper;
+      } else {
+        bytes[i] = b;
+      }
+    }
+    return fromBytes(bytes);
+  }
+
+  private UTF8String toTitleCaseSlow() {
+    StringBuffer sb = new StringBuffer();
+    String s = toString();
+    sb.append(s);
+    sb.setCharAt(0, Character.toTitleCase(sb.charAt(0)));
+    for (int i = 1; i < s.length(); i++) {
+      if (sb.charAt(i - 1) == ' ') {
+        sb.setCharAt(i, Character.toTitleCase(sb.charAt(i)));
+      }
+    }
+    return fromString(sb.toString());
+  }
+
+  /**
    * Copy the bytes from the current UTF8String, and make a new UTF8String.
    * @param start the start position of the current UTF8String in bytes.
    * @param end the end position of the current UTF8String in bytes.
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index ed50cdcb29..9b3190f8f0 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -115,6 +115,14 @@ public class UTF8StringSuite {
   }
 
   @Test
+  public void titleCase() {
+    assertEquals(fromString(""), fromString("").toTitleCase());
+    assertEquals(fromString("Ab Bc Cd"), fromString("ab bc cd").toTitleCase());
+    assertEquals(fromString("Ѐ Ё Ђ Ѻ Ώ Ề"), fromString("ѐ ё ђ ѻ ώ ề").toTitleCase());
+    assertEquals(fromString("大千世界 数据砖头"), fromString("大千世界 数据砖头").toTitleCase());
+  }
+
+  @Test
   public void concatTest() {
     assertEquals(EMPTY_UTF8, concat());
     assertEquals(null, concat((UTF8String) null));
author	HuJiayin <jiayin.hu@intel.com>	2015-08-01 21:44:57 -0700
committer	Davies Liu <davies.liu@gmail.com>	2015-08-01 21:44:57 -0700
commit	00cd92f32f17ca57d47aa2dcc716eb707aaee799 (patch)
tree	87fae8a2daea19abc7dee69b551c5c0e6f54bf4b /unsafe/src
parent	5d9e33d9a2633e45082ac395a64646364f22f4c4 (diff)
download	spark-00cd92f32f17ca57d47aa2dcc716eb707aaee799.tar.gz spark-00cd92f32f17ca57d47aa2dcc716eb707aaee799.tar.bz2 spark-00cd92f32f17ca57d47aa2dcc716eb707aaee799.zip