diff options
author | Reynold Xin <rxin@databricks.com> | 2015-07-29 21:18:43 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-07-29 21:18:43 -0700 |
commit | 07fd7d36471dfb823c1ce3e3a18464043affde18 (patch) | |
tree | d52b8914c9afbd1b382f8112c80490df9b3cb51e /unsafe | |
parent | 9514d874f0cf61f1eb4ec4f5f66e053119f769c9 (diff) | |
download | spark-07fd7d36471dfb823c1ce3e3a18464043affde18.tar.gz spark-07fd7d36471dfb823c1ce3e3a18464043affde18.tar.bz2 spark-07fd7d36471dfb823c1ce3e3a18464043affde18.zip |
[SPARK-9460] Avoid byte array allocation in StringPrefixComparator.
As of today, StringPrefixComparator converts the long values back to byte arrays in order to compare them. This patch optimizes this to compare the longs directly, rather than turning the longs into byte arrays and comparing them byte by byte (unsigned).
This only works on little-endian architectures right now.
Author: Reynold Xin <rxin@databricks.com>
Closes #7765 from rxin/SPARK-9460 and squashes the following commits:
e4908cc [Reynold Xin] Stricter randomized tests.
4c8d094 [Reynold Xin] [SPARK-9460] Avoid byte array allocation in StringPrefixComparator.
Diffstat (limited to 'unsafe')
-rw-r--r-- | unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java | 9 | ||||
-rw-r--r-- | unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java | 11 |
2 files changed, 20 insertions, 0 deletions
```diff
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 3e1cc67dbf..57522003ba 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -138,6 +138,15 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
   }
 
   /**
+   * Returns a 64-bit integer that can be used as the prefix used in sorting.
+   */
+  public long getPrefix() {
+    long p = PlatformDependent.UNSAFE.getLong(base, offset);
+    p = java.lang.Long.reverseBytes(p);
+    return p;
+  }
+
+  /**
    * Returns the underline bytes, will be a copy of it if it's part of another array.
    */
   public byte[] getBytes() {
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index e2a5628ff4..42e09e435a 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -64,7 +64,18 @@ public class UTF8StringSuite {
   }
 
   @Test
+  public void prefix() {
+    assertTrue(fromString("a").getPrefix() - fromString("b").getPrefix() < 0);
+    assertTrue(fromString("ab").getPrefix() - fromString("b").getPrefix() < 0);
+    assertTrue(
+      fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
+    assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
+    assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
+  }
+
+  @Test
   public void compareTo() {
+    assertTrue(fromString("").compareTo(fromString("a")) < 0);
     assertTrue(fromString("abc").compareTo(fromString("ABC")) > 0);
     assertTrue(fromString("abc0").compareTo(fromString("abc")) > 0);
     assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabc")) == 0);
```