aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWenchen Fan <wenchen@databricks.com>2016-01-20 15:08:27 -0800
committerReynold Xin <rxin@databricks.com>2016-01-20 15:08:27 -0800
commitf3934a8d656f1668bec065751b2a11411229b6f5 (patch)
treea353c7b5de468627aeb1c63a673a54cf122db960
parent8f90c151878571e20625e2a53561441ec0035dfc (diff)
downloadspark-f3934a8d656f1668bec065751b2a11411229b6f5.tar.gz
spark-f3934a8d656f1668bec065751b2a11411229b6f5.tar.bz2
spark-f3934a8d656f1668bec065751b2a11411229b6f5.zip
[SPARK-12888][SQL] benchmark the new hash expression
Benchmark it on 4 different schemas, the result: ``` Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz Hash For simple: Avg Time(ms) Avg Rate(M/s) Relative Rate ------------------------------------------------------------------------------- interpreted version 31.47 266.54 1.00 X codegen version 64.52 130.01 0.49 X ``` ``` Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz Hash For normal: Avg Time(ms) Avg Rate(M/s) Relative Rate ------------------------------------------------------------------------------- interpreted version 4068.11 0.26 1.00 X codegen version 1175.92 0.89 3.46 X ``` ``` Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz Hash For array: Avg Time(ms) Avg Rate(M/s) Relative Rate ------------------------------------------------------------------------------- interpreted version 9276.70 0.06 1.00 X codegen version 14762.23 0.04 0.63 X ``` ``` Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz Hash For map: Avg Time(ms) Avg Rate(M/s) Relative Rate ------------------------------------------------------------------------------- interpreted version 58869.79 0.01 1.00 X codegen version 9285.36 0.06 6.34 X ``` Author: Wenchen Fan <wenchen@databricks.com> Closes #10816 from cloud-fan/hash-benchmark.
-rw-r--r--sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala104
1 files changed, 104 insertions, 0 deletions
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala
new file mode 100644
index 0000000000..184f845b4d
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.catalyst.encoders.RowEncoder
+import org.apache.spark.sql.catalyst.expressions.{Murmur3Hash, UnsafeProjection}
+import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection
+import org.apache.spark.sql.types._
+import org.apache.spark.util.Benchmark
+
+/**
+ * Benchmark for the previous interpreted hash function(InternalRow.hashCode) vs the new codegen
+ * hash expression(Murmur3Hash).
+ */
+object HashBenchmark {
+
+ def test(name: String, schema: StructType, iters: Int): Unit = {
+ val numRows = 1024 * 8
+
+ val generator = RandomDataGenerator.forType(schema, nullable = false).get
+ val encoder = RowEncoder(schema)
+ val attrs = schema.toAttributes
+ val safeProjection = GenerateSafeProjection.generate(attrs, attrs)
+
+ val rows = (1 to numRows).map(_ =>
+ // The output of encoder is UnsafeRow, use safeProjection to turn in into safe format.
+ safeProjection(encoder.toRow(generator().asInstanceOf[Row])).copy()
+ ).toArray
+
+ val benchmark = new Benchmark("Hash For " + name, iters * numRows)
+ benchmark.addCase("interpreted version") { _: Int =>
+ for (_ <- 0L until iters) {
+ var sum = 0
+ var i = 0
+ while (i < numRows) {
+ sum += rows(i).hashCode()
+ i += 1
+ }
+ }
+ }
+
+ val getHashCode = UnsafeProjection.create(new Murmur3Hash(attrs) :: Nil, attrs)
+ benchmark.addCase("codegen version") { _: Int =>
+ for (_ <- 0L until iters) {
+ var sum = 0
+ var i = 0
+ while (i < numRows) {
+ sum += getHashCode(rows(i)).getInt(0)
+ i += 1
+ }
+ }
+ }
+ benchmark.run()
+ }
+
+ def main(args: Array[String]): Unit = {
+ val simple = new StructType().add("i", IntegerType)
+ test("simple", simple, 1024)
+
+ val normal = new StructType()
+ .add("null", NullType)
+ .add("boolean", BooleanType)
+ .add("byte", ByteType)
+ .add("short", ShortType)
+ .add("int", IntegerType)
+ .add("long", LongType)
+ .add("float", FloatType)
+ .add("double", DoubleType)
+ .add("bigDecimal", DecimalType.SYSTEM_DEFAULT)
+ .add("smallDecimal", DecimalType.USER_DEFAULT)
+ .add("string", StringType)
+ .add("binary", BinaryType)
+ .add("date", DateType)
+ .add("timestamp", TimestampType)
+ test("normal", normal, 128)
+
+ val arrayOfInt = ArrayType(IntegerType)
+ val array = new StructType()
+ .add("array", arrayOfInt)
+ .add("arrayOfArray", ArrayType(arrayOfInt))
+ test("array", array, 64)
+
+ val mapOfInt = MapType(IntegerType, IntegerType)
+ val map = new StructType()
+ .add("map", mapOfInt)
+ .add("mapOfMap", MapType(IntegerType, mapOfInt))
+ test("map", map, 64)
+ }
+}