author Tejas Patil <tejasp@fb.com> 2016-10-04 18:59:31 -0700
committer Herman van Hovell <hvanhovell@databricks.com> 2016-10-04 18:59:31 -0700
commit a99743d053e84f695dc3034550939555297b0a05 (patch)
tree 566a00324e1d3fdabc416e31efd3c25a3e6cf2cb /sql/catalyst/src/test/java/org/apache
parent 8d969a2125d915da1506c17833aa98da614a257f (diff)
[SPARK-17495][SQL] Add Hash capability semantically equivalent to Hive's
## What changes were proposed in this pull request?

Jira: https://issues.apache.org/jira/browse/SPARK-17495

Spark internally uses Murmur3Hash for partitioning. This is different from the hash function used by Hive, so queries that use bucketing produce different results on the two engines. We want users to have backward compatibility so that they can switch parts of an application across the engines without observing regressions.

This PR adds `HiveHash`, `HiveHashFunction`, and `HiveHasher`, which mimic Hive's hashing at https://github.com/apache/hive/blob/master/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java#L638

I am intentionally not introducing any usages of this hash function in the rest of the code, to keep this PR small. My eventual goal is to have Hive bucketing support in Spark. Once this PR gets in, I will make the hash function pluggable in the relevant areas (e.g. `HashPartitioning`'s `partitionIdExpression` has Murmur3 hardcoded: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala#L265).

## How was this patch tested?

Added `HiveHashSuite`.

Author: Tejas Patil <tejasp@fb.com>

Closes #15047 from tejasapatil/SPARK-17495_hive_hash.
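For context, here is a minimal, self-contained sketch of the hashing contract that the test suite in the diff below pins down. The actual `HiveHasher` lives in the main source tree and is not part of this diffstat, so the method bodies here are inferred from the expected values the tests assert (e.g. `hashLong(-42L) == 41`, `hashLong(Long.MAX_VALUE) == Integer.MIN_VALUE`); treat it as an illustration, not Spark's implementation.

```java
// Sketch of Hive-compatible hashing semantics, inferred from the test
// expectations below. Not the real HiveHasher; names and bodies are
// illustrative.
public class HiveHashSketch {

  // Hive hashes an int to itself: hashInt(i) == i.
  static int hashInt(int input) {
    return input;
  }

  // Hive folds a long by XOR-ing its upper and lower 32 bits, so
  // hashLong(42L) == 42 and hashLong(Long.MIN_VALUE) == Integer.MIN_VALUE.
  static int hashLong(long input) {
    return (int) ((input >>> 32) ^ input);
  }

  // Bytes are hashed with the classic 31-based polynomial, starting at 0.
  static int hashBytes(byte[] bytes) {
    int result = 0;
    for (byte b : bytes) {
      result = result * 31 + (int) b;
    }
    return result;
  }

  public static void main(String[] args) {
    System.out.println(hashLong(-42L));            // 41
    System.out.println(hashLong(Long.MAX_VALUE));  // -2147483648
  }
}
```

Murmur3 would produce entirely different values for these inputs, which is why matching Hive's scheme bit for bit matters for bucketing compatibility.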
Diffstat (limited to 'sql/catalyst/src/test/java/org/apache')
-rw-r--r-- sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java | 128
1 file changed, 128 insertions(+), 0 deletions(-)
diff --git a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java
new file mode 100644
index 0000000000..67a5eb0c7f
--- /dev/null
+++ b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions;
+
+import org.apache.spark.unsafe.Platform;
+import org.apache.spark.unsafe.types.UTF8String;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+public class HiveHasherSuite {
+ private static final HiveHasher hasher = new HiveHasher();
+
+ @Test
+ public void testKnownIntegerInputs() {
+ int[] inputs = {0, Integer.MIN_VALUE, Integer.MAX_VALUE, 593689054, -189366624};
+ for (int input : inputs) {
+ Assert.assertEquals(input, HiveHasher.hashInt(input));
+ }
+ }
+
+ @Test
+ public void testKnownLongInputs() {
+ Assert.assertEquals(0, HiveHasher.hashLong(0L));
+ Assert.assertEquals(41, HiveHasher.hashLong(-42L));
+ Assert.assertEquals(42, HiveHasher.hashLong(42L));
+ Assert.assertEquals(Integer.MIN_VALUE, HiveHasher.hashLong(Long.MIN_VALUE));
+ Assert.assertEquals(Integer.MIN_VALUE, HiveHasher.hashLong(Long.MAX_VALUE));
+ }
+
+ @Test
+ public void testKnownStringAndIntInputs() {
+ int[] inputs = {84, 19, 8};
+ int[] expected = {-823832826, -823835053, 111972242};
+
+ for (int i = 0; i < inputs.length; i++) {
+ UTF8String s = UTF8String.fromString("val_" + inputs[i]);
+ int hash = HiveHasher.hashUnsafeBytes(s.getBaseObject(), s.getBaseOffset(), s.numBytes());
+ Assert.assertEquals(expected[i], ((31 * inputs[i]) + hash));
+ }
+ }
+
+ @Test
+ public void randomizedStressTest() {
+ int size = 65536;
+ Random rand = new Random();
+
+ // A set used to track collision rate.
+ Set<Integer> hashcodes = new HashSet<>();
+ for (int i = 0; i < size; i++) {
+ int vint = rand.nextInt();
+ long lint = rand.nextLong();
+ Assert.assertEquals(HiveHasher.hashInt(vint), HiveHasher.hashInt(vint));
+ Assert.assertEquals(HiveHasher.hashLong(lint), HiveHasher.hashLong(lint));
+
+ hashcodes.add(HiveHasher.hashLong(lint));
+ }
+
+ // A very loose bound.
+ Assert.assertTrue(hashcodes.size() > size * 0.95);
+ }
+
+ @Test
+ public void randomizedStressTestBytes() {
+ int size = 65536;
+ Random rand = new Random();
+
+ // A set used to track collision rate.
+ Set<Integer> hashcodes = new HashSet<>();
+ for (int i = 0; i < size; i++) {
+ int byteArrSize = rand.nextInt(100) * 8;
+ byte[] bytes = new byte[byteArrSize];
+ rand.nextBytes(bytes);
+
+ Assert.assertEquals(
+ HiveHasher.hashUnsafeBytes(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize),
+ HiveHasher.hashUnsafeBytes(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
+
+ hashcodes.add(HiveHasher.hashUnsafeBytes(
+ bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
+ }
+
+ // A very loose bound.
+ Assert.assertTrue(hashcodes.size() > size * 0.95);
+ }
+
+ @Test
+ public void randomizedStressTestPaddedStrings() {
+ int size = 64000;
+ // A set used to track collision rate.
+ Set<Integer> hashcodes = new HashSet<>();
+ for (int i = 0; i < size; i++) {
+ int byteArrSize = 8;
+ byte[] strBytes = String.valueOf(i).getBytes(StandardCharsets.UTF_8);
+ byte[] paddedBytes = new byte[byteArrSize];
+ System.arraycopy(strBytes, 0, paddedBytes, 0, strBytes.length);
+
+ Assert.assertEquals(
+ HiveHasher.hashUnsafeBytes(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize),
+ HiveHasher.hashUnsafeBytes(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
+
+ hashcodes.add(HiveHasher.hashUnsafeBytes(
+ paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
+ }
+
+ // A very loose bound.
+ Assert.assertTrue(hashcodes.size() > size * 0.95);
+ }
+}
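As the commit message notes, the eventual goal is Hive bucketing support, with the hash function pluggable where `HashPartitioning`'s `partitionIdExpression` currently hardcodes Murmur3. Below is a hedged sketch of how a bucket id would be derived from such a hash: the modulo-with-sign-mask formula follows Hive's `ObjectInspectorUtils.getBucketNumber`, while the helper names and wiring are hypothetical, not Spark code.

```java
// Hypothetical illustration of where a Hive-compatible hash plugs in:
// turning a row hash into a bucket id. The (hash & Integer.MAX_VALUE) % n
// formula mirrors Hive's ObjectInspectorUtils.getBucketNumber.
public class HiveBucketIdSketch {

  // Clear the sign bit before the modulo so the bucket id stays in
  // [0, numBuckets) even for negative hashes like Integer.MIN_VALUE.
  static int getBucketNumber(int hashCode, int numBuckets) {
    return (hashCode & Integer.MAX_VALUE) % numBuckets;
  }

  // Hive folds per-column hashes as hash = hash * 31 + columnHash: the
  // same (31 * intHash) + stringHash pattern testKnownStringAndIntInputs
  // asserts above.
  static int combine(int accumulated, int columnHash) {
    return accumulated * 31 + columnHash;
  }

  public static void main(String[] args) {
    // Row of (84, "val_84"): combining 84 with the string's hash
    // (-823835430, derived from the test's expected value) reproduces
    // the suite's expected row hash of -823832826.
    int rowHash = combine(84, -823835430);
    System.out.println(rowHash);                     // -823832826
    System.out.println(getBucketNumber(rowHash, 8)); // a bucket id in [0, 8)
  }
}
```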