aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java29
-rw-r--r--core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala19
-rw-r--r--unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java9
-rw-r--r--unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java11
4 files changed, 36 insertions, 32 deletions
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
index 5624e067da..a9ee6042fe 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
@@ -17,9 +17,7 @@
package org.apache.spark.util.collection.unsafe.sort;
-import com.google.common.base.Charsets;
-import com.google.common.primitives.Longs;
-import com.google.common.primitives.UnsignedBytes;
+import com.google.common.primitives.UnsignedLongs;
import org.apache.spark.annotation.Private;
import org.apache.spark.unsafe.types.UTF8String;
@@ -36,32 +34,11 @@ public class PrefixComparators {
public static final class StringPrefixComparator extends PrefixComparator {
@Override
public int compare(long aPrefix, long bPrefix) {
- // TODO: can done more efficiently
- byte[] a = Longs.toByteArray(aPrefix);
- byte[] b = Longs.toByteArray(bPrefix);
- for (int i = 0; i < 8; i++) {
- int c = UnsignedBytes.compare(a[i], b[i]);
- if (c != 0) return c;
- }
- return 0;
- }
-
- public long computePrefix(byte[] bytes) {
- if (bytes == null) {
- return 0L;
- } else {
- byte[] padded = new byte[8];
- System.arraycopy(bytes, 0, padded, 0, Math.min(bytes.length, 8));
- return Longs.fromByteArray(padded);
- }
- }
-
- public long computePrefix(String value) {
- return value == null ? 0L : computePrefix(value.getBytes(Charsets.UTF_8));
+ return UnsignedLongs.compare(aPrefix, bPrefix);
}
public long computePrefix(UTF8String value) {
- return value == null ? 0L : computePrefix(value.getBytes());
+ return value == null ? 0L : value.getPrefix();
}
}
diff --git a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
index 28fe925945..26b7a9e816 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
@@ -17,22 +17,29 @@
package org.apache.spark.util.collection.unsafe.sort
+import com.google.common.primitives.UnsignedBytes
import org.scalatest.prop.PropertyChecks
-
import org.apache.spark.SparkFunSuite
+import org.apache.spark.unsafe.types.UTF8String
class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
test("String prefix comparator") {
def testPrefixComparison(s1: String, s2: String): Unit = {
- val s1Prefix = PrefixComparators.STRING.computePrefix(s1)
- val s2Prefix = PrefixComparators.STRING.computePrefix(s2)
+ val utf8string1 = UTF8String.fromString(s1)
+ val utf8string2 = UTF8String.fromString(s2)
+ val s1Prefix = PrefixComparators.STRING.computePrefix(utf8string1)
+ val s2Prefix = PrefixComparators.STRING.computePrefix(utf8string2)
val prefixComparisonResult = PrefixComparators.STRING.compare(s1Prefix, s2Prefix)
+
+ val cmp = UnsignedBytes.lexicographicalComparator().compare(
+ utf8string1.getBytes.take(8), utf8string2.getBytes.take(8))
+
assert(
- (prefixComparisonResult == 0) ||
- (prefixComparisonResult < 0 && s1 < s2) ||
- (prefixComparisonResult > 0 && s1 > s2))
+ (prefixComparisonResult == 0 && cmp == 0) ||
+ (prefixComparisonResult < 0 && s1.compareTo(s2) < 0) ||
+ (prefixComparisonResult > 0 && s1.compareTo(s2) > 0))
}
// scalastyle:off
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 3e1cc67dbf..57522003ba 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -138,6 +138,15 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
}
/**
+ * Returns a 64-bit integer that can be used as the prefix used in sorting.
+ */
+ public long getPrefix() {
+ long p = PlatformDependent.UNSAFE.getLong(base, offset);
+ p = java.lang.Long.reverseBytes(p);
+ return p;
+ }
+
+ /**
* Returns the underline bytes, will be a copy of it if it's part of another array.
*/
public byte[] getBytes() {
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index e2a5628ff4..42e09e435a 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -64,7 +64,18 @@ public class UTF8StringSuite {
}
@Test
+ public void prefix() {
+ assertTrue(fromString("a").getPrefix() - fromString("b").getPrefix() < 0);
+ assertTrue(fromString("ab").getPrefix() - fromString("b").getPrefix() < 0);
+ assertTrue(
+ fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
+ assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
+ assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
+ }
+
+ @Test
public void compareTo() {
+ assertTrue(fromString("").compareTo(fromString("a")) < 0);
assertTrue(fromString("abc").compareTo(fromString("ABC")) > 0);
assertTrue(fromString("abc0").compareTo(fromString("abc")) > 0);
assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabc")) == 0);