author Tejas Patil <tejasp@fb.com> 2016-10-04 18:59:31 -0700
committer Herman van Hovell <hvanhovell@databricks.com> 2016-10-04 18:59:31 -0700
commit a99743d053e84f695dc3034550939555297b0a05 (patch)
tree 566a00324e1d3fdabc416e31efd3c25a3e6cf2cb /sql/catalyst/src/test/java/org/apache
parent 8d969a2125d915da1506c17833aa98da614a257f (diff)
[SPARK-17495][SQL] Add Hash capability semantically equivalent to Hive's
## What changes were proposed in this pull request?

Jira: https://issues.apache.org/jira/browse/SPARK-17495

Spark internally uses Murmur3Hash for partitioning. This is different from the hash function used by Hive, so queries that use bucketing produce different results on the two engines. We want users to have backward compatibility so that they can switch parts of an application across the engines without observing regressions.

This PR adds `HiveHash`, `HiveHashFunction`, and `HiveHasher`, which mimic Hive's hashing at https://github.com/apache/hive/blob/master/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java#L638

I am intentionally not introducing any usages of this hash function in the rest of the code, to keep this PR small. My eventual goal is to have Hive bucketing support in Spark. Once this PR gets in, I will make the hash function pluggable in the relevant areas (e.g. `HashPartitioning`'s `partitionIdExpression` has Murmur3 hardcoded: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala#L265).

## How was this patch tested?

Added `HiveHashSuite`.

Author: Tejas Patil <tejasp@fb.com>

Closes #15047 from tejasapatil/SPARK-17495_hive_hash.
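For context, here is a minimal, self-contained sketch of the hashing contract that the test suite in the diff below pins down. The actual `HiveHasher` lives in the main source tree and is not part of this diffstat, so the method bodies here are inferred from the expected values the tests assert (e.g. `hashLong(-42L) == 41`, `hashLong(Long.MAX_VALUE) == Integer.MIN_VALUE`); treat it as an illustration, not Spark's implementation.

```java
// Sketch of Hive-compatible hashing semantics, inferred from the test
// expectations below. Not the real HiveHasher; names and bodies are
// illustrative.
public class HiveHashSketch {

  // Hive hashes an int to itself: hashInt(i) == i.
  static int hashInt(int input) {
    return input;
  }

  // Hive folds a long by XOR-ing its upper and lower 32 bits, so
  // hashLong(42L) == 42 and hashLong(Long.MIN_VALUE) == Integer.MIN_VALUE.
  static int hashLong(long input) {
    return (int) ((input >>> 32) ^ input);
  }

  // Bytes are hashed with the classic 31-based polynomial, starting at 0.
  static int hashBytes(byte[] bytes) {
    int result = 0;
    for (byte b : bytes) {
      result = result * 31 + (int) b;
    }
    return result;
  }

  public static void main(String[] args) {
    System.out.println(hashLong(-42L));            // 41
    System.out.println(hashLong(Long.MAX_VALUE));  // -2147483648
  }
}
```

Murmur3 would produce entirely different values for these inputs, which is why matching Hive's scheme bit for bit matters for bucketing compatibility.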
Diffstat (limited to 'sql/catalyst/src/test/java/org/apache')
-rw-r--r-- sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java | 128
1 file changed, 128 insertions(+), 0 deletions(-)
diff --git a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java
new file mode 100644
index 0000000000..67a5eb0c7f
--- /dev/null
+++ b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions;
+
+import org.apache.spark.unsafe.Platform;
+import org.apache.spark.unsafe.types.UTF8String;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+public class HiveHasherSuite {
+ private static final HiveHasher hasher = new HiveHasher();
+
+ @Test
+ public void testKnownIntegerInputs() {
+ int[] inputs = {0, Integer.MIN_VALUE, Integer.MAX_VALUE, 593689054, -189366624};
+ for (int input : inputs) {
+ Assert.assertEquals(input, HiveHasher.hashInt(input));
+ }
+ }
+
+ @Test
+ public void testKnownLongInputs() {
+ Assert.assertEquals(0, HiveHasher.hashLong(0L));
+ Assert.assertEquals(41, HiveHasher.hashLong(-42L));
+ Assert.assertEquals(42, HiveHasher.hashLong(42L));
+ Assert.assertEquals(Integer.MIN_VALUE, HiveHasher.hashLong(Long.MIN_VALUE));
+ Assert.assertEquals(Integer.MIN_VALUE, HiveHasher.hashLong(Long.MAX_VALUE));
+ }
+
+ @Test
+ public void testKnownStringAndIntInputs() {
+ int[] inputs = {84, 19, 8};
+ int[] expected = {-823832826, -823835053, 111972242};
+
+ for (int i = 0; i < inputs.length; i++) {
+ UTF8String s = UTF8String.fromString("val_" + inputs[i]);
+ int hash = HiveHasher.hashUnsafeBytes(s.getBaseObject(), s.getBaseOffset(), s.numBytes());
+ Assert.assertEquals(expected[i], ((31 * inputs[i]) + hash));
+ }
+ }
+
+ @Test
+ public void randomizedStressTest() {
+ int size = 65536;
+ Random rand = new Random();
+
+ // A set used to track collision rate.
+ Set<Integer> hashcodes = new HashSet<>();
+ for (int i = 0; i < size; i++) {
+ int vint = rand.nextInt();
+ long lint = rand.nextLong();
+ Assert.assertEquals(HiveHasher.hashInt(vint), HiveHasher.hashInt(vint));
+ Assert.assertEquals(HiveHasher.hashLong(lint), HiveHasher.hashLong(lint));
+
+ hashcodes.add(HiveHasher.hashLong(lint));
+ }
+
+ // A very loose bound.
+ Assert.assertTrue(hashcodes.size() > size * 0.95);
+ }
+
+ @Test
+ public void randomizedStressTestBytes() {
+ int size = 65536;
+ Random rand = new Random();
+
+ // A set used to track collision rate.
+ Set<Integer> hashcodes = new HashSet<>();
+ for (int i = 0; i < size; i++) {
+ int byteArrSize = rand.nextInt(100) * 8;
+ byte[] bytes = new byte[byteArrSize];
+ rand.nextBytes(bytes);
+
+ Assert.assertEquals(
+ HiveHasher.hashUnsafeBytes(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize),
+ HiveHasher.hashUnsafeBytes(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
+
+ hashcodes.add(HiveHasher.hashUnsafeBytes(
+ bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
+ }
+
+ // A very loose bound.
+ Assert.assertTrue(hashcodes.size() > size * 0.95);
+ }
+
+ @Test
+ public void randomizedStressTestPaddedStrings() {
+ int size = 64000;
+ // A set used to track collision rate.
+ Set<Integer> hashcodes = new HashSet<>();
+ for (int i = 0; i < size; i++) {
+ int byteArrSize = 8;
+ byte[] strBytes = String.valueOf(i).getBytes(StandardCharsets.UTF_8);
+ byte[] paddedBytes = new byte[byteArrSize];
+ System.arraycopy(strBytes, 0, paddedBytes, 0, strBytes.length);
+
+ Assert.assertEquals(
+ HiveHasher.hashUnsafeBytes(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize),
+ HiveHasher.hashUnsafeBytes(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
+
+ hashcodes.add(HiveHasher.hashUnsafeBytes(
+ paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
+ }
+
+ // A very loose bound.
+ Assert.assertTrue(hashcodes.size() > size * 0.95);
+ }
+}
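As the commit message notes, the eventual goal is Hive bucketing support, with the hash function pluggable where `HashPartitioning`'s `partitionIdExpression` currently hardcodes Murmur3. Below is a hedged sketch of how a bucket id would be derived from such a hash: the modulo-with-sign-mask formula follows Hive's `ObjectInspectorUtils.getBucketNumber`, while the helper names and wiring are hypothetical, not Spark code.

```java
// Hypothetical illustration of where a Hive-compatible hash plugs in:
// turning a row hash into a bucket id. The (hash & Integer.MAX_VALUE) % n
// formula mirrors Hive's ObjectInspectorUtils.getBucketNumber.
public class HiveBucketIdSketch {

  // Clear the sign bit before the modulo so the bucket id stays in
  // [0, numBuckets) even for negative hashes like Integer.MIN_VALUE.
  static int getBucketNumber(int hashCode, int numBuckets) {
    return (hashCode & Integer.MAX_VALUE) % numBuckets;
  }

  // Hive folds per-column hashes as hash = hash * 31 + columnHash: the
  // same (31 * intHash) + stringHash pattern testKnownStringAndIntInputs
  // asserts above.
  static int combine(int accumulated, int columnHash) {
    return accumulated * 31 + columnHash;
  }

  public static void main(String[] args) {
    // Row of (84, "val_84"): combining 84 with the string's hash
    // (-823835430, derived from the test's expected value) reproduces
    // the suite's expected row hash of -823832826.
    int rowHash = combine(84, -823835430);
    System.out.println(rowHash);                     // -823832826
    System.out.println(getBucketNumber(rowHash, 8)); // a bucket id in [0, 8)
  }
}
```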