aboutsummaryrefslogtreecommitdiff
path: root/unsafe/src
diff options
context:
space:
mode:
authorReynold Xin <rxin@databricks.com>2015-06-11 16:07:15 -0700
committerReynold Xin <rxin@databricks.com>2015-06-11 16:07:15 -0700
commit7d669a56ffc7a4f5827830ef3c27d45cc0e8774f (patch)
tree26345664c6976534f8ba212e60c4b65855053f3e /unsafe/src
parent9cbdf31ec1399d4d43a1863c15688ce78b6dfd92 (diff)
downloadspark-7d669a56ffc7a4f5827830ef3c27d45cc0e8774f.tar.gz
spark-7d669a56ffc7a4f5827830ef3c27d45cc0e8774f.tar.bz2
spark-7d669a56ffc7a4f5827830ef3c27d45cc0e8774f.zip
[SPARK-8286] Rewrite UTF8String in Java and move it into unsafe package.
Unit test is still in Scala. Author: Reynold Xin <rxin@databricks.com> Closes #6738 from rxin/utf8string-java and squashes the following commits: 562dc6e [Reynold Xin] Flag... 98e600b [Reynold Xin] Another try with encoding setting .. cfa6bdf [Reynold Xin] Merge branch 'master' into utf8string-java a3b124d [Reynold Xin] Try different UTF-8 encoded characters. 1ff7c82 [Reynold Xin] Enable UTF-8 encoding. 82d58cc [Reynold Xin] Reset run-tests. 2cb3c69 [Reynold Xin] Use utf-8 encoding in set bytes. 53f8ef4 [Reynold Xin] Hack Jenkins to run one test. 9a48e8d [Reynold Xin] Fixed runtime compilation error. 911c450 [Reynold Xin] Moved unit test also to Java. 4eff7bd [Reynold Xin] Improved unit test coverage. 8e89a3c [Reynold Xin] Fixed tests. 77c64bd [Reynold Xin] Fixed string type codegen. ffedb62 [Reynold Xin] Code review feedback. 0967ce6 [Reynold Xin] Fixed import ordering. 45a123d [Reynold Xin] [SPARK-8286] Rewrite UTF8String in Java and move it into unsafe package.
Diffstat (limited to 'unsafe/src')
-rw-r--r--unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java212
-rw-r--r--unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java1
-rw-r--r--unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java93
3 files changed, 305 insertions, 1 deletions
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
new file mode 100644
index 0000000000..a351680195
--- /dev/null
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.types;
+
+import java.io.Serializable;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import javax.annotation.Nullable;
+
+import org.apache.spark.unsafe.PlatformDependent;
+
+/**
+ * A UTF-8 encoded string for internal Spark use.
+ * <p>
+ * The value is held as a {@code byte[]} of UTF-8 data, so comparison and search work directly
+ * on the encoded bytes; see http://en.wikipedia.org/wiki/UTF-8 for details of the encoding.
+ * <p>
+ * Note: This is not designed for general use cases, should not be used outside SQL.
+ */
+public final class UTF8String implements Comparable<UTF8String>, Serializable {
+
+  @Nullable
+  private byte[] bytes;
+
+  /**
+   * Sequence length in bytes, indexed by {@code (leadByte & 0xFF) - 192}, i.e. for lead bytes
+   * 0xC0 through 0xFF. Bytes below 0xC0 are handled by {@link #numBytes(byte)} as one byte.
+   */
+  private static final int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    5, 5, 5, 5,
+    6, 6, 6, 6};
+
+  /**
+   * Wraps the given UTF-8 bytes without copying them.
+   * @return a new UTF8String, or null if `bytes` is null
+   */
+  public static UTF8String fromBytes(byte[] bytes) {
+    return (bytes != null) ? new UTF8String().set(bytes) : null;
+  }
+
+  /**
+   * Encodes the given String as UTF-8.
+   * @return a new UTF8String, or null if `str` is null
+   */
+  public static UTF8String fromString(String str) {
+    return (str != null) ? new UTF8String().set(str) : null;
+  }
+
+  /**
+   * Updates the UTF8String with String.
+   */
+  public UTF8String set(final String str) {
+    try {
+      bytes = str.getBytes("utf-8");
+    } catch (UnsupportedEncodingException e) {
+      // Turn the exception into unchecked so we can find out about it at runtime, but
+      // don't need to add lots of boilerplate code everywhere.
+      PlatformDependent.throwException(e);
+    }
+    return this;
+  }
+
+  /**
+   * Updates the UTF8String with byte[], which should be encoded in UTF-8.
+   * The array is stored as-is (not copied).
+   */
+  public UTF8String set(final byte[] bytes) {
+    this.bytes = bytes;
+    return this;
+  }
+
+  /**
+   * Returns the number of bytes for a code point with the first byte as `b`
+   * @param b The first byte of a code point
+   */
+  public int numBytes(final byte b) {
+    // Mask to an unsigned value first: Java bytes are signed, lead bytes are >= 0xC0.
+    final int offset = (b & 0xFF) - 192;
+    return (offset >= 0) ? bytesOfCodePointInUTF8[offset] : 1;
+  }
+
+  /**
+   * Returns the number of code points in it.
+   *
+   * This is only used by Substring() when `start` is negative.
+   */
+  public int length() {
+    int len = 0;
+    for (int i = 0; i < bytes.length; i += numBytes(bytes[i])) {
+      len += 1;
+    }
+    return len;
+  }
+
+  /**
+   * Returns the backing byte array itself, not a copy.
+   */
+  public byte[] getBytes() {
+    return bytes;
+  }
+
+  /**
+   * Returns a substring of this.
+   * @param start the position of first code point
+   * @param until the position after last code point, exclusive.
+   */
+  public UTF8String substring(final int start, final int until) {
+    // A string never has more code points than bytes, so `start >= bytes.length` is a cheap
+    // way to detect a start position that is certainly past the end.
+    if (until <= start || start >= bytes.length) {
+      return UTF8String.fromBytes(new byte[0]);
+    }
+
+    // Find the byte offset of the `start`-th code point.
+    int i = 0;
+    int c = 0;
+    for (; i < bytes.length && c < start; i += numBytes(bytes[i])) {
+      c += 1;
+    }
+
+    // Find the byte offset just past the last wanted code point. Each step must be sized by
+    // the lead byte at `j`; sizing it by the byte at `i` cuts mixed-width text mid-character.
+    int j = i;
+    for (; j < bytes.length && c < until; j += numBytes(bytes[j])) {
+      c += 1;
+    }
+
+    // A truncated trailing sequence can push either offset past the end of the array; clamp
+    // so copyOfRange neither throws nor pads the result with zero bytes.
+    final int from = Math.min(i, bytes.length);
+    final int to = Math.min(j, bytes.length);
+    return UTF8String.fromBytes(Arrays.copyOfRange(bytes, from, to));
+  }
+
+  /**
+   * Returns whether the bytes of `substring` occur anywhere in this string.
+   * An empty `substring` is always contained.
+   */
+  public boolean contains(final UTF8String substring) {
+    final byte[] b = substring.getBytes();
+    if (b.length == 0) {
+      return true;
+    }
+
+    for (int i = 0; i <= bytes.length - b.length; i++) {
+      if (bytes[i] == b[0] && matchAt(b, i)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Returns whether `b` equals the bytes of this string starting at byte offset `pos`,
+   * comparing in place (no intermediate copy).
+   */
+  private boolean matchAt(final byte[] b, final int pos) {
+    if (pos < 0 || b.length > bytes.length - pos) {
+      return false;
+    }
+    for (int k = 0; k < b.length; k++) {
+      if (bytes[pos + k] != b[k]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Returns whether this string begins with the bytes of `prefix`.
+   */
+  public boolean startsWith(final UTF8String prefix) {
+    return matchAt(prefix.getBytes(), 0);
+  }
+
+  /**
+   * Returns whether this string ends with the bytes of `suffix`.
+   */
+  public boolean endsWith(final UTF8String suffix) {
+    final byte[] b = suffix.getBytes();
+    return matchAt(b, bytes.length - b.length);
+  }
+
+  /**
+   * Returns the upper-cased string, converted via java.lang.String (default locale).
+   */
+  public UTF8String toUpperCase() {
+    return UTF8String.fromString(toString().toUpperCase());
+  }
+
+  /**
+   * Returns the lower-cased string, converted via java.lang.String (default locale).
+   */
+  public UTF8String toLowerCase() {
+    return UTF8String.fromString(toString().toLowerCase());
+  }
+
+  @Override
+  public String toString() {
+    try {
+      return new String(bytes, "utf-8");
+    } catch (UnsupportedEncodingException e) {
+      // Turn the exception into unchecked so we can find out about it at runtime, but
+      // don't need to add lots of boilerplate code everywhere.
+      PlatformDependent.throwException(e);
+      return "unknown";  // we will never reach here.
+    }
+  }
+
+  /**
+   * Returns a new UTF8String that shares this string's byte array (the array is not copied).
+   */
+  @Override
+  public UTF8String clone() {
+    return new UTF8String().set(bytes);
+  }
+
+  /**
+   * Compares byte by byte, treating each byte as unsigned; a string that is a proper prefix
+   * of the other sorts first.
+   */
+  @Override
+  public int compareTo(final UTF8String other) {
+    final byte[] b = other.getBytes();
+    for (int i = 0; i < bytes.length && i < b.length; i++) {
+      // Java bytes are signed, so without the mask every non-ASCII byte (>= 0x80) would be
+      // negative and sort before ASCII.
+      int res = (bytes[i] & 0xFF) - (b[i] & 0xFF);
+      if (res != 0) {
+        return res;
+      }
+    }
+    return bytes.length - b.length;
+  }
+
+  public int compare(final UTF8String other) {
+    return compareTo(other);
+  }
+
+  /**
+   * Equal to another UTF8String with the same bytes. Also accepts a java.lang.String with the
+   * same content; that direction is not symmetric ({@code String.equals} will not return
+   * true for a UTF8String) and exists only for unit tests.
+   */
+  @Override
+  public boolean equals(final Object other) {
+    if (other instanceof UTF8String) {
+      return Arrays.equals(bytes, ((UTF8String) other).getBytes());
+    } else if (other instanceof String) {
+      // Used only in unit tests.
+      String s = (String) other;
+      // The byte-length test is only a cheap pre-filter. Do not compare length() with
+      // s.length(): one counts code points, the other UTF-16 units, and they differ for
+      // characters outside the BMP.
+      return bytes.length >= s.length() && toString().equals(s);
+    } else {
+      return false;
+    }
+  }
+
+  @Override
+  public int hashCode() {
+    return Arrays.hashCode(bytes);
+  }
+}
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java
index 18393db9f3..a93fc0ee29 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java
@@ -18,7 +18,6 @@
package org.apache.spark.unsafe.bitset;
import junit.framework.Assert;
-import org.apache.spark.unsafe.bitset.BitSet;
import org.junit.Test;
import org.apache.spark.unsafe.memory.MemoryBlock;
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
new file mode 100644
index 0000000000..80c179a1b5
--- /dev/null
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -0,0 +1,93 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.unsafe.types;
+
+import java.io.UnsupportedEncodingException;
+
+import junit.framework.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link UTF8String}: construction, length, equality, hashing, search
+ * (contains / startsWith / endsWith) and substring.
+ */
+public class UTF8StringSuite {
+
+  /**
+   * Checks that `str` round-trips through both factory methods and that the resulting
+   * UTF8String reports `len` code points.
+   */
+  private void checkBasic(String str, int len) throws UnsupportedEncodingException {
+    // JUnit's assertEquals takes (expected, actual); keeping that order makes failure
+    // messages read the right way round.
+    Assert.assertEquals(len, UTF8String.fromString(str).length());
+    Assert.assertEquals(len, UTF8String.fromBytes(str.getBytes("utf8")).length());
+
+    // UTF8String.equals(String) is one-directional (String.equals(UTF8String) is false), so
+    // call it explicitly on the UTF8String rather than relying on assertEquals' receiver.
+    Assert.assertTrue(UTF8String.fromString(str).equals(str));
+    Assert.assertTrue(UTF8String.fromBytes(str.getBytes("utf8")).equals(str));
+    Assert.assertEquals(str, UTF8String.fromString(str).toString());
+    Assert.assertEquals(str, UTF8String.fromBytes(str.getBytes("utf8")).toString());
+    Assert.assertEquals(UTF8String.fromString(str), UTF8String.fromBytes(str.getBytes("utf8")));
+
+    Assert.assertEquals(UTF8String.fromString(str).hashCode(),
+      UTF8String.fromBytes(str.getBytes("utf8")).hashCode());
+  }
+
+  @Test
+  public void basicTest() throws UnsupportedEncodingException {
+    checkBasic("hello", 5);
+    checkBasic("世 界", 3);
+  }
+
+  @Test
+  public void contains() {
+    Assert.assertTrue(UTF8String.fromString("hello").contains(UTF8String.fromString("ello")));
+    Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("vello")));
+    Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("hellooo")));
+    Assert.assertTrue(UTF8String.fromString("大千世界").contains(UTF8String.fromString("千世")));
+    Assert.assertFalse(UTF8String.fromString("大千世界").contains(UTF8String.fromString("世千")));
+    Assert.assertFalse(
+      UTF8String.fromString("大千世界").contains(UTF8String.fromString("大千世界好")));
+  }
+
+  @Test
+  public void startsWith() {
+    Assert.assertTrue(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hell")));
+    Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("ell")));
+    Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hellooo")));
+    Assert.assertTrue(UTF8String.fromString("数据砖头").startsWith(UTF8String.fromString("数据")));
+    Assert.assertFalse(UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("千")));
+    Assert.assertFalse(
+      UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千世界好")));
+  }
+
+  @Test
+  public void endsWith() {
+    Assert.assertTrue(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ello")));
+    Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ellov")));
+    Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("hhhello")));
+    Assert.assertTrue(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世界")));
+    Assert.assertFalse(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世")));
+    Assert.assertFalse(
+      UTF8String.fromString("数据砖头").endsWith(UTF8String.fromString("我的数据砖头")));
+  }
+
+  @Test
+  public void substring() {
+    // Positions are code-point indices; `until` is exclusive.
+    Assert.assertEquals(
+      UTF8String.fromString(""), UTF8String.fromString("hello").substring(0, 0));
+    Assert.assertEquals(
+      UTF8String.fromString("el"), UTF8String.fromString("hello").substring(1, 3));
+    Assert.assertEquals(
+      UTF8String.fromString("数"), UTF8String.fromString("数据砖头").substring(0, 1));
+    Assert.assertEquals(
+      UTF8String.fromString("据砖"), UTF8String.fromString("数据砖头").substring(1, 3));
+    // `until` past the end is truncated to the end of the string.
+    Assert.assertEquals(
+      UTF8String.fromString("头"), UTF8String.fromString("数据砖头").substring(3, 5));
+  }
+}