aboutsummaryrefslogtreecommitdiff
path: root/unsafe/src
diff options
context:
space:
mode:
authorReynold Xin <rxin@databricks.com>2015-07-30 13:09:43 -0700
committerReynold Xin <rxin@databricks.com>2015-07-30 13:09:43 -0700
commita20e743fb863de809863652931bc982aac2d1f86 (patch)
treef948f807fb34b8861809e91a8b04fa22fb752af9 /unsafe/src
parent6d94bf6ac10ac851636c62439f8f2737f3526a2a (diff)
downloadspark-a20e743fb863de809863652931bc982aac2d1f86.tar.gz
spark-a20e743fb863de809863652931bc982aac2d1f86.tar.bz2
spark-a20e743fb863de809863652931bc982aac2d1f86.zip
[SPARK-9460] Fix prefix generation for UTF8String.
Previously, prefix generation could return garbage data if the number of bytes was 0, on JVMs that are 4-byte aligned, or when compressed oops is enabled. Author: Reynold Xin <rxin@databricks.com> Closes #7789 from rxin/utf8string and squashes the following commits: 86ffa3e [Reynold Xin] Mask out data outside of valid range. 4d647ed [Reynold Xin] Mask out data. c6e8794 [Reynold Xin] [SPARK-9460] Fix prefix generation for UTF8String.
Diffstat (limited to 'unsafe/src')
-rw-r--r--unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java36
-rw-r--r--unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java8
2 files changed, 41 insertions, 3 deletions
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 57522003ba..c38953f65d 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -66,6 +66,19 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
}
/**
+ * Creates an UTF8String from byte array, which should be encoded in UTF-8.
+ *
+ * Note: `bytes` will be held by the returned UTF8String.
+ */
+ public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes) {
+ if (bytes != null) {
+ return new UTF8String(bytes, BYTE_ARRAY_OFFSET + offset, numBytes);
+ } else {
+ return null;
+ }
+ }
+
+ /**
* Creates an UTF8String from String.
*/
public static UTF8String fromString(String str) {
@@ -89,10 +102,10 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
return fromBytes(spaces);
}
- protected UTF8String(Object base, long offset, int size) {
+ protected UTF8String(Object base, long offset, int numBytes) {
this.base = base;
this.offset = offset;
- this.numBytes = size;
+ this.numBytes = numBytes;
}
/**
@@ -141,7 +154,24 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
* Returns a 64-bit integer that can be used as the prefix used in sorting.
*/
public long getPrefix() {
- long p = PlatformDependent.UNSAFE.getLong(base, offset);
+ // Since JVMs are either 4-byte aligned or 8-byte aligned, we check the size of the string.
+ // If size is 0, just return 0.
+ // If size is between 1 and 4 (inclusive), assume data is 4-byte aligned under the hood and
+ // use a getInt to fetch the prefix.
+ // If size is greater than 4, assume we have at least 8 bytes of data to fetch.
+ // For sizes below 8, we then use a mask to mask out data that is not part of the string.
+ long p;
+ if (numBytes >= 8) {
+ p = PlatformDependent.UNSAFE.getLong(base, offset);
+ } else if (numBytes > 4) {
+ p = PlatformDependent.UNSAFE.getLong(base, offset);
+ p = p & ((1L << numBytes * 8) - 1);
+ } else if (numBytes > 0) {
+ p = (long) PlatformDependent.UNSAFE.getInt(base, offset);
+ p = p & ((1L << numBytes * 8) - 1);
+ } else {
+ p = 0;
+ }
p = java.lang.Long.reverseBytes(p);
return p;
}
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 42e09e435a..f2cc19ca6b 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -71,6 +71,14 @@ public class UTF8StringSuite {
fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
+
+ byte[] buf1 = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+ byte[] buf2 = {1, 2, 3};
+ UTF8String str1 = UTF8String.fromBytes(buf1, 0, 3);
+ UTF8String str2 = UTF8String.fromBytes(buf1, 0, 8);
+ UTF8String str3 = UTF8String.fromBytes(buf2);
+ assertTrue(str1.getPrefix() - str2.getPrefix() < 0);
+ assertEquals(str1.getPrefix(), str3.getPrefix());
}
@Test