From c2ea79f96acd076351b48162644ed1cff4c8e090 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 13 Jan 2016 12:29:02 -0800 Subject: [SPARK-12642][SQL] improve the hash expression to be decoupled from unsafe row https://issues.apache.org/jira/browse/SPARK-12642 Author: Wenchen Fan Closes #10694 from cloud-fan/hash-expr. --- .../apache/spark/unsafe/hash/Murmur3_x86_32.java | 28 +++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'unsafe/src') diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java index 4276f25c21..5e7ee480ca 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java @@ -38,6 +38,10 @@ public final class Murmur3_x86_32 { } public int hashInt(int input) { + return hashInt(input, seed); + } + + public static int hashInt(int input, int seed) { int k1 = mixK1(input); int h1 = mixH1(seed, k1); @@ -51,16 +55,38 @@ public final class Murmur3_x86_32 { public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed) { // This is based on Guava's `Murmur32_Hasher.processRemaining(ByteBuffer)` method. assert (lengthInBytes % 8 == 0): "lengthInBytes must be a multiple of 8 (word-aligned)"; + int h1 = hashBytesByInt(base, offset, lengthInBytes, seed); + return fmix(h1, lengthInBytes); + } + + public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) { + assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; + int lengthAligned = lengthInBytes - lengthInBytes % 4; + int h1 = hashBytesByInt(base, offset, lengthAligned, seed); + for (int i = lengthAligned; i < lengthInBytes; i++) { + int halfWord = Platform.getByte(base, offset + i); + int k1 = mixK1(halfWord); + h1 = mixH1(h1, k1); + } + return fmix(h1, lengthInBytes); + } + + private static int hashBytesByInt(Object base, long offset, int lengthInBytes, int seed) { + assert (lengthInBytes % 4 == 0); int h1 = seed; for (int i = 0; i < lengthInBytes; i += 4) { int halfWord = Platform.getInt(base, offset + i); int k1 = mixK1(halfWord); h1 = mixH1(h1, k1); } - return fmix(h1, lengthInBytes); + return h1; } public int hashLong(long input) { + return hashLong(input, seed); + } + + public static int hashLong(long input, int seed) { int low = (int) input; int high = (int) (input >>> 32); -- cgit v1.2.3