aboutsummaryrefslogtreecommitdiff
path: root/common
diff options
context:
space:
mode:
authorTejas Patil <tejasp@fb.com>2016-10-04 18:59:31 -0700
committerHerman van Hovell <hvanhovell@databricks.com>2016-10-04 18:59:31 -0700
commita99743d053e84f695dc3034550939555297b0a05 (patch)
tree566a00324e1d3fdabc416e31efd3c25a3e6cf2cb /common
parent8d969a2125d915da1506c17833aa98da614a257f (diff)
downloadspark-a99743d053e84f695dc3034550939555297b0a05.tar.gz
spark-a99743d053e84f695dc3034550939555297b0a05.tar.bz2
spark-a99743d053e84f695dc3034550939555297b0a05.zip
[SPARK-17495][SQL] Add Hash capability semantically equivalent to Hive's
## What changes were proposed in this pull request? Jira : https://issues.apache.org/jira/browse/SPARK-17495 Spark internally uses Murmur3Hash for partitioning. This is different from the one used by Hive. For queries which use bucketing this leads to different results if one tries the same query on both engines. For us, we want users to have backward compatibility so that one can switch parts of applications across the engines without observing regressions. This PR includes `HiveHash`, `HiveHashFunction`, `HiveHasher` which mimics Hive's hashing at https://github.com/apache/hive/blob/master/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java#L638 I am intentionally not introducing any usages of this hash function in rest of the code to keep this PR small. My eventual goal is to have Hive bucketing support in Spark. Once this PR gets in, I will make hash function pluggable in relevant areas (eg. `HashPartitioning`'s `partitionIdExpression` has Murmur3 hardcoded : https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala#L265) ## How was this patch tested? Added `HiveHashSuite` Author: Tejas Patil <tejasp@fb.com> Closes #15047 from tejasapatil/SPARK-17495_hive_hash.
Diffstat (limited to 'common')
-rw-r--r--common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java49
1 files changed, 49 insertions, 0 deletions
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java
new file mode 100644
index 0000000000..c7ea9085eb
--- /dev/null
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions;
+
+import org.apache.spark.unsafe.Platform;
+
+/**
+ * Simulates Hive's hashing function at
+ * org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils#hashcode()
+ */
+public class HiveHasher {
+
+ @Override
+ public String toString() {
+ return HiveHasher.class.getSimpleName();
+ }
+
+ public static int hashInt(int input) {
+ return input;
+ }
+
+ public static int hashLong(long input) {
+ return (int) ((input >>> 32) ^ input);
+ }
+
+ public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes) {
+ assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
+ int result = 0;
+ for (int i = 0; i < lengthInBytes; i++) {
+ result = (result * 31) + (int) Platform.getByte(base, offset + i);
+ }
+ return result;
+ }
+}