aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTakeshi YAMAMURO <linguin.m.s@gmail.com>2016-06-27 21:45:22 +0800
committerWenchen Fan <wenchen@databricks.com>2016-06-27 21:45:22 +0800
commit3e4e868c850e6b6da2c0d005167316e1abdc7460 (patch)
tree36193ba937b0c449ecfbf09aaf747fbdc84dbbe8
parent11f420b4bbcd607346204fb6fd7db7efe948cdac (diff)
downloadspark-3e4e868c850e6b6da2c0d005167316e1abdc7460.tar.gz
spark-3e4e868c850e6b6da2c0d005167316e1abdc7460.tar.bz2
spark-3e4e868c850e6b6da2c0d005167316e1abdc7460.zip
[SPARK-16135][SQL] Remove hashCode and equals in ArrayBasedMapData
## What changes were proposed in this pull request? This PR removes `hashCode` and `equals` in `ArrayBasedMapData` because the type cannot be used as join keys, grouping keys, or in equality tests. ## How was this patch tested? Add a new test suite `MapDataSuite` for comparison tests. Author: Takeshi YAMAMURO <linguin.m.s@gmail.com> Closes #13847 from maropu/UnsafeMapTest.
-rw-r--r--sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java4
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala17
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala5
-rw-r--r--sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala8
-rw-r--r--sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala8
-rw-r--r--sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MapDataSuite.scala57
6 files changed, 76 insertions, 23 deletions
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java
index 02a863b2bb..6302660548 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java
@@ -298,6 +298,10 @@ public final class UnsafeArrayData extends ArrayData {
return map;
}
+ // This `hashCode` computation could consume much processor time for large data.
+ // If the computation becomes a bottleneck, we can use a light-weight logic; the first fixed bytes
+ // are used to compute `hashCode` (See `Vector.hashCode`).
+ // The same issue exists in `UnsafeRow.hashCode`.
@Override
public int hashCode() {
return Murmur3_x86_32.hashUnsafeBytes(baseObject, baseOffset, sizeInBytes, 42);
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala
index d46f03ad8f..4449da13c0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala
@@ -24,23 +24,6 @@ class ArrayBasedMapData(val keyArray: ArrayData, val valueArray: ArrayData) exte
override def copy(): MapData = new ArrayBasedMapData(keyArray.copy(), valueArray.copy())
- override def equals(o: Any): Boolean = {
- if (!o.isInstanceOf[ArrayBasedMapData]) {
- return false
- }
-
- val other = o.asInstanceOf[ArrayBasedMapData]
- if (other eq null) {
- return false
- }
-
- this.keyArray == other.keyArray && this.valueArray == other.valueArray
- }
-
- override def hashCode: Int = {
- keyArray.hashCode() * 37 + valueArray.hashCode()
- }
-
override def toString: String = {
s"keys: $keyArray, values: $valueArray"
}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala
index 40db6067ad..94e8824cd1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala
@@ -19,6 +19,11 @@ package org.apache.spark.sql.catalyst.util
import org.apache.spark.sql.types.DataType
+/**
+ * This is an internal data representation for map type in Spark SQL. This should not implement
+ * `equals` and `hashCode` because the type cannot be used as join keys, grouping keys, or
+ * in equality tests. See SPARK-9415 and PR#13847 for the discussions.
+ */
abstract class MapData extends Serializable {
def numElements(): Int
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
index 62429a22f0..60dd03f5d0 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
@@ -110,10 +110,10 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper {
case (expr, i) => Seq(Literal(i), expr)
}))
val plan = GenerateMutableProjection.generate(expressions)
- val actual = plan(new GenericMutableRow(length)).toSeq(expressions.map(_.dataType))
- val expected = Seq(new ArrayBasedMapData(
- new GenericArrayData(0 until length),
- new GenericArrayData(Seq.fill(length)(true))))
+ val actual = plan(new GenericMutableRow(length)).toSeq(expressions.map(_.dataType)).map {
+ case m: ArrayBasedMapData => ArrayBasedMapData.toScalaMap(m)
+ }
+ val expected = (0 until length).map((_, true)).toMap :: Nil
if (!checkResult(actual, expected)) {
fail(s"Incorrect Evaluation: expressions: $expressions, actual: $actual, expected: $expected")
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 8a9617cfbf..e58a0df317 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.catalyst.optimizer.SimpleTestOptimizer
import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project}
+import org.apache.spark.sql.catalyst.util.MapData
import org.apache.spark.sql.types.DataType
import org.apache.spark.util.Utils
@@ -52,7 +53,7 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks {
/**
* Check the equality between result of expression and expected value, it will handle
- * Array[Byte] and Spread[Double].
+ * Array[Byte], Spread[Double], and MapData.
*/
protected def checkResult(result: Any, expected: Any): Boolean = {
(result, expected) match {
@@ -60,7 +61,10 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks {
java.util.Arrays.equals(result, expected)
case (result: Double, expected: Spread[Double @unchecked]) =>
expected.asInstanceOf[Spread[Double]].isWithin(result)
- case _ => result == expected
+ case (result: MapData, expected: MapData) =>
+ result.keyArray() == expected.keyArray() && result.valueArray() == expected.valueArray()
+ case _ =>
+ result == expected
}
}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MapDataSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MapDataSuite.scala
new file mode 100644
index 0000000000..0f1264c7c3
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MapDataSuite.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import scala.collection._
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.util.ArrayBasedMapData
+import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType}
+import org.apache.spark.unsafe.types.UTF8String
+
+class MapDataSuite extends SparkFunSuite {
+
+ test("inequality tests") {
+ def u(str: String): UTF8String = UTF8String.fromString(str)
+
+ // test data
+ val testMap1 = Map(u("key1") -> 1)
+ val testMap2 = Map(u("key1") -> 1, u("key2") -> 2)
+ val testMap3 = Map(u("key1") -> 1)
+ val testMap4 = Map(u("key1") -> 1, u("key2") -> 2)
+
+ // ArrayBasedMapData
+ val testArrayMap1 = ArrayBasedMapData(testMap1.toMap)
+ val testArrayMap2 = ArrayBasedMapData(testMap2.toMap)
+ val testArrayMap3 = ArrayBasedMapData(testMap3.toMap)
+ val testArrayMap4 = ArrayBasedMapData(testMap4.toMap)
+ assert(testArrayMap1 !== testArrayMap3)
+ assert(testArrayMap2 !== testArrayMap4)
+
+ // UnsafeMapData
+ val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType)))
+ val row = new GenericMutableRow(1)
+ def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = {
+ row.update(0, map)
+ val unsafeRow = unsafeConverter.apply(row)
+ unsafeRow.getMap(0).copy
+ }
+ assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3))
+ assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4))
+ }
+}