[SPARK-8286] Rewrite UTF8String in Java and move it into unsafe package.

Unit test is still in Scala.

Author: Reynold Xin <rxin@databricks.com>

Closes #6738 from rxin/utf8string-java and squashes the following commits:

562dc6e [Reynold Xin] Flag...
98e600b [Reynold Xin] Another try with encoding setting ..
cfa6bdf [Reynold Xin] Merge branch 'master' into utf8string-java
a3b124d [Reynold Xin] Try different UTF-8 encoded characters.
1ff7c82 [Reynold Xin] Enable UTF-8 encoding.
82d58cc [Reynold Xin] Reset run-tests.
2cb3c69 [Reynold Xin] Use utf-8 encoding in set bytes.
53f8ef4 [Reynold Xin] Hack Jenkins to run one test.
9a48e8d [Reynold Xin] Fixed runtime compilation error.
911c450 [Reynold Xin] Moved unit test also to Java.
4eff7bd [Reynold Xin] Improved unit test coverage.
8e89a3c [Reynold Xin] Fixed tests.
77c64bd [Reynold Xin] Fixed string type codegen.
ffedb62 [Reynold Xin] Code review feedback.
0967ce6 [Reynold Xin] Fixed import ordering.
45a123d [Reynold Xin] [SPARK-8286] Rewrite UTF8String in Java and move it into unsafe package.
author: Reynold Xin <rxin@databricks.com> 2015-06-11 16:07:15 -0700
committer: Reynold Xin <rxin@databricks.com> 2015-06-11 16:07:15 -0700
commit: 7d669a56ffc7a4f5827830ef3c27d45cc0e8774f (patch)
tree: 26345664c6976534f8ba212e60c4b65855053f3e /unsafe/src
parent: 9cbdf31ec1399d4d43a1863c15688ce78b6dfd92 (diff)
download: spark-7d669a56ffc7a4f5827830ef3c27d45cc0e8774f.tar.gz
spark-7d669a56ffc7a4f5827830ef3c27d45cc0e8774f.tar.bz2
spark-7d669a56ffc7a4f5827830ef3c27d45cc0e8774f.zip
3 files changed, 305 insertions, 1 deletions
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
new file mode 100644
index 0000000000..a351680195
--- /dev/null
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.types;
+
+import java.io.Serializable;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import javax.annotation.Nullable;
+
+import org.apache.spark.unsafe.PlatformDependent;
+
+/**
+ * A UTF-8 String for internal Spark use.
+ * <p>
+ * A String encoded in UTF-8 as an Array[Byte], which can be used for comparison,
+ * search, see http://en.wikipedia.org/wiki/UTF-8 for details.
+ * <p>
+ * Note: This is not designed for general use cases, should not be used outside SQL.
+ */
+public final class UTF8String implements Comparable<UTF8String>, Serializable {
+
+  @Nullable
+  private byte[] bytes;
+
+  private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    5, 5, 5, 5,
+    6, 6, 6, 6};
+
+  public static UTF8String fromBytes(byte[] bytes) {
+    return (bytes != null) ? new UTF8String().set(bytes) : null;  // no copy; null stays null
+  }
+
+  public static UTF8String fromString(String str) {
+    return (str != null) ? new UTF8String().set(str) : null;  // null stays null
+  }
+
+  /**
+   * Replaces the content with the UTF-8 encoding of `str` and returns this instance.
+   */
+  public UTF8String set(final String str) {
+    try {
+      bytes = str.getBytes("utf-8");
+    } catch (UnsupportedEncodingException e) {
+      // Turn the exception into unchecked so we can find out about it at runtime, but
+      // don't need to add lots of boilerplate code everywhere.
+      PlatformDependent.throwException(e);
+    }
+    return this;
+  }
+
+  /**
+   * Stores `bytes` (expected to be UTF-8) as the backing array without copying it.
+   */
+  public UTF8String set(final byte[] bytes) {
+    this.bytes = bytes;
+    return this;
+  }
+
+  /**
+   * Returns the length in bytes of the UTF-8 sequence whose first byte is `b`.
+   * @param b The first byte of a code point
+   */
+  public int numBytes(final byte b) {
+    final int unsigned = b & 0xFF;
+    return (unsigned < 192) ? 1 : bytesOfCodePointInUTF8[unsigned - 192];
+  }
+
+  /**
+   * Returns the number of code points in it.
+   *
+   * Walks the bytes once, hopping from one lead byte to the next via numBytes.
+   */
+  public int length() {
+    int count = 0;
+    for (int pos = 0; pos < bytes.length; count++) {
+      pos += numBytes(bytes[pos]);
+    }
+    return count;
+  }
+
+  public byte[] getBytes() {
+    return bytes;  // the backing array itself, not a copy
+  }
+
+  /**
+   * Returns the code points in [start, until) as a new UTF8String.
+   * @param start the position of first code point
+   * @param until the position after last code point, exclusive.
+   */
+  public UTF8String substring(final int start, final int until) {
+    // Code points never outnumber bytes, so start >= bytes.length is out of range.
+    if (until <= start || start >= bytes.length) {
+      return UTF8String.fromBytes(new byte[0]);
+    }
+    int i = 0;
+    int c = 0;
+    for (; i < bytes.length && c < start; i += numBytes(bytes[i])) {
+      c += 1;
+    }
+
+    // Step by the width of the code point at j, not i, so mixed-width text is cut correctly.
+    int j = i;
+    for (; j < bytes.length && c < until; j += numBytes(bytes[j])) {
+      c += 1;
+    }
+    return UTF8String.fromBytes(Arrays.copyOfRange(bytes, i, j));
+  }
+
+  public boolean contains(final UTF8String substring) {
+    final byte[] b = substring.getBytes();
+    if (b.length == 0) {
+      return true;
+    }
+
+    for (int i = 0; i <= bytes.length - b.length; i++) {
+      // TODO: Avoid copying.
+      if (bytes[i] == b[0] && Arrays.equals(Arrays.copyOfRange(bytes, i, i + b.length), b)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public boolean startsWith(final UTF8String prefix) {
+    final byte[] b = prefix.getBytes();
+    // Only the first b.length bytes are compared. TODO: Avoid copying.
+    return b.length <= bytes.length && Arrays.equals(Arrays.copyOfRange(bytes, 0, b.length), b);
+  }
+
+  public boolean endsWith(final UTF8String suffix) {
+    final byte[] b = suffix.getBytes();
+    return b.length <= bytes.length &&  // TODO: Avoid copying (as in startsWith).
+      Arrays.equals(Arrays.copyOfRange(bytes, bytes.length - b.length, bytes.length), b);
+  }
+
+  public UTF8String toUpperCase() {
+    return UTF8String.fromString(toString().toUpperCase());  // uses the JVM default locale
+  }
+
+  public UTF8String toLowerCase() {
+    return UTF8String.fromString(toString().toLowerCase());  // uses the JVM default locale
+  }
+
+  @Override
+  public String toString() {
+    try {
+      return new String(bytes, "utf-8");
+    } catch (UnsupportedEncodingException e) {
+      // Turn the exception into unchecked so we can find out about it at runtime, but
+      // don't need to add lots of boilerplate code everywhere.
+      PlatformDependent.throwException(e);
+      return "unknown";  // not reached if throwException always throws; needed to compile
+    }
+  }
+
+  @Override
+  public UTF8String clone() {
+    return new UTF8String().set(bytes);  // shares the byte array with this instance
+  }
+
+  @Override
+  public int compareTo(final UTF8String other) {
+    final byte[] b = other.getBytes();
+    for (int i = 0; i < bytes.length && i < b.length; i++) {
+      // Java bytes are signed; mask to compare as unsigned, which matches code point order.
+      if (bytes[i] != b[i]) {
+        return (bytes[i] & 0xFF) - (b[i] & 0xFF);
+      }
+    }
+    return bytes.length - b.length;
+  }
+
+  public int compare(final UTF8String other) {
+    return compareTo(other);  // alias for compareTo
+  }
+
+  @Override
+  public boolean equals(final Object other) {
+    if (other instanceof UTF8String) {
+      return Arrays.equals(bytes, ((UTF8String) other).getBytes());
+    } else if (other instanceof String) {
+      // Used only in unit tests; asymmetric, since String.equals(UTF8String) is always false.
+      String s = (String) other;
+      return bytes.length >= s.length() && length() == s.length() && toString().equals(s);
+    } else {
+      return false;
+    }
+  }
+
+  @Override
+  public int hashCode() {
+    return Arrays.hashCode(bytes);  // hash of the byte content, matching equals(UTF8String)
+  }
+}
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java
index 18393db9f3..a93fc0ee29 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java
@@ -18,7 +18,6 @@
 package org.apache.spark.unsafe.bitset;
 
 import junit.framework.Assert;
-import org.apache.spark.unsafe.bitset.BitSet;
 import org.junit.Test;
 
 import org.apache.spark.unsafe.memory.MemoryBlock;
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
new file mode 100644
index 0000000000..80c179a1b5
--- /dev/null
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -0,0 +1,93 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.unsafe.types;
+
+import java.io.UnsupportedEncodingException;
+
+import junit.framework.Assert;
+import org.junit.Test;
+
+public class UTF8StringSuite {
+
+  private void checkBasic(String str, int len) throws UnsupportedEncodingException {
+    Assert.assertEquals(UTF8String.fromString(str).length(), len);
+    Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")).length(), len);
+
+    Assert.assertEquals(UTF8String.fromString(str), str);
+    Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")), str);
+    Assert.assertEquals(UTF8String.fromString(str).toString(), str);
+    Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")).toString(), str);
+    Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")), UTF8String.fromString(str));
+
+    Assert.assertEquals(UTF8String.fromString(str).hashCode(),
+      UTF8String.fromBytes(str.getBytes("utf8")).hashCode());
+  }
+
+  @Test
+  public void basicTest() throws UnsupportedEncodingException {
+    checkBasic("hello", 5);
+    checkBasic("世 界", 3);  // two CJK characters around one ASCII space
+  }
+
+  @Test
+  public void contains() {  // ASCII and CJK needles: present, absent, and longer than the text
+    Assert.assertTrue(UTF8String.fromString("hello").contains(UTF8String.fromString("ello")));
+    Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("vello")));
+    Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("hellooo")));
+    Assert.assertTrue(UTF8String.fromString("大千世界").contains(UTF8String.fromString("千世")));
+    Assert.assertFalse(UTF8String.fromString("大千世界").contains(UTF8String.fromString("世千")));
+    Assert.assertFalse(
+      UTF8String.fromString("大千世界").contains(UTF8String.fromString("大千世界好")));
+  }
+
+  @Test
+  public void startsWith() {  // ASCII and CJK: true prefix, non-prefix, and over-long prefix
+    Assert.assertTrue(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hell")));
+    Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("ell")));
+    Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hellooo")));
+    Assert.assertTrue(UTF8String.fromString("数据砖头").startsWith(UTF8String.fromString("数据")));
+    Assert.assertFalse(UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("千")));
+    Assert.assertFalse(
+      UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千世界好")));
+  }
+
+  @Test
+  public void endsWith() {  // ASCII and CJK: true suffix, non-suffix, and over-long suffix
+    Assert.assertTrue(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ello")));
+    Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ellov")));
+    Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("hhhello")));
+    Assert.assertTrue(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世界")));
+    Assert.assertFalse(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世")));
+    Assert.assertFalse(
+      UTF8String.fromString("数据砖头").endsWith(UTF8String.fromString("我的数据砖头")));
+  }
+
+  @Test
+  public void substring() {  // empty range, ASCII, and CJK ranges incl. `until` past the end
+    Assert.assertEquals(
+      UTF8String.fromString("hello").substring(0, 0), UTF8String.fromString(""));
+    Assert.assertEquals(
+      UTF8String.fromString("hello").substring(1, 3), UTF8String.fromString("el"));
+    Assert.assertEquals(
+      UTF8String.fromString("数据砖头").substring(0, 1), UTF8String.fromString("数"));
+    Assert.assertEquals(
+      UTF8String.fromString("数据砖头").substring(1, 3), UTF8String.fromString("据砖"));
+    Assert.assertEquals(
+      UTF8String.fromString("数据砖头").substring(3, 5), UTF8String.fromString("头"));
+  }
+}
author	Reynold Xin <rxin@databricks.com>	2015-06-11 16:07:15 -0700
committer	Reynold Xin <rxin@databricks.com>	2015-06-11 16:07:15 -0700
commit	7d669a56ffc7a4f5827830ef3c27d45cc0e8774f (patch)
tree	26345664c6976534f8ba212e60c4b65855053f3e /unsafe/src
parent	9cbdf31ec1399d4d43a1863c15688ce78b6dfd92 (diff)
download	spark-7d669a56ffc7a4f5827830ef3c27d45cc0e8774f.tar.gz spark-7d669a56ffc7a4f5827830ef3c27d45cc0e8774f.tar.bz2 spark-7d669a56ffc7a4f5827830ef3c27d45cc0e8774f.zip