From a5014447861a5678c8b595e235019bb8fec098a7 Mon Sep 17 00:00:00 2001 From: Pap Lőrinc Date: Tue, 22 Nov 2016 14:42:48 +0200 Subject: Optimized HashTable.index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (`ops/s`, smaller is better) `Before (9c5d3f8)`: ```scala [info] # Run complete. Total time: 00:08:15 [info] [info] Benchmark (size) Mode Cnt Score Error Units [info] s.c.immutable.VectorMapBenchmark.groupBy 10 avgt 20 645.594 ± 9.435 ns/op [info] s.c.immutable.VectorMapBenchmark.groupBy 100 avgt 20 2084.216 ± 37.814 ns/op [info] s.c.immutable.VectorMapBenchmark.groupBy 1000 avgt 20 19878.481 ± 262.404 ns/op [info] s.c.mutable.HashMapBenchmark.get 10 avgt 20 689.941 ± 5.850 ns/op [info] s.c.mutable.HashMapBenchmark.get 100 avgt 20 7357.330 ± 45.956 ns/op [info] s.c.mutable.HashMapBenchmark.get 1000 avgt 20 95767.200 ± 1550.771 ns/op [info] s.c.mutable.HashMapBenchmark.getOrElseUpdate 10 avgt 20 509.181 ± 2.683 ns/op [info] s.c.mutable.HashMapBenchmark.getOrElseUpdate 100 avgt 20 5563.301 ± 32.335 ns/op [info] s.c.mutable.HashMapBenchmark.getOrElseUpdate 1000 avgt 20 71965.365 ± 1809.738 ns/op [info] s.c.mutable.HashMapBenchmark.put 10 avgt 20 247.270 ± 3.972 ns/op [info] s.c.mutable.HashMapBenchmark.put 100 avgt 20 5646.185 ± 106.172 ns/op [info] s.c.mutable.HashMapBenchmark.put 1000 avgt 20 81303.663 ± 954.938 ns/op ``` `Changed modulo to bitwise and in hash calculation (4c729fe)`: ```scala [info] Benchmark (size) Mode Cnt Score Error Units [info] s.c.immutable.VectorMapBenchmark.groupBy 10 avgt 20 631.291 ± 9.269 ns/op [info] s.c.immutable.VectorMapBenchmark.groupBy 100 avgt 20 2077.885 ± 59.737 ns/op [info] s.c.immutable.VectorMapBenchmark.groupBy 1000 avgt 20 15458.278 ± 317.347 ns/op [info] s.c.mutable.HashMapBenchmark.get 10 avgt 20 678.013 ± 4.453 ns/op [info] s.c.mutable.HashMapBenchmark.get 100 avgt 20 7258.522 ± 76.088 ns/op [info] s.c.mutable.HashMapBenchmark.get 1000 avgt 20 94748.845 ± 1226.120 ns/op [info] s.c.mutable.HashMapBenchmark.getOrElseUpdate 10 avgt 20 498.042 ± 5.006 ns/op [info] s.c.mutable.HashMapBenchmark.getOrElseUpdate 100 avgt 20 5243.154 ± 110.372 ns/op [info] s.c.mutable.HashMapBenchmark.getOrElseUpdate 1000 avgt 20 68194.752 ± 655.436 ns/op [info] s.c.mutable.HashMapBenchmark.put 10 avgt 20 257.275 ± 1.411 ns/op [info] s.c.mutable.HashMapBenchmark.put 100 avgt 20 5318.532 ± 152.923 ns/op [info] s.c.mutable.HashMapBenchmark.put 1000 avgt 20 79607.160 ± 651.779 ns/op ``` `Optimized HashTable.index (6cc1504)`: ```scala [info] Benchmark (size) Mode Cnt Score Error Units [info] s.c.immutable.VectorMapBenchmark.groupBy 10 avgt 20 616.164 ± 4.712 ns/op [info] s.c.immutable.VectorMapBenchmark.groupBy 100 avgt 20 2034.447 ± 14.495 ns/op [info] s.c.immutable.VectorMapBenchmark.groupBy 1000 avgt 20 14712.164 ± 119.983 ns/op [info] s.c.mutable.HashMapBenchmark.get 10 avgt 20 679.046 ± 6.872 ns/op [info] s.c.mutable.HashMapBenchmark.get 100 avgt 20 7242.097 ± 41.244 ns/op [info] s.c.mutable.HashMapBenchmark.get 1000 avgt 20 95342.919 ± 1521.328 ns/op [info] s.c.mutable.HashMapBenchmark.getOrElseUpdate 10 avgt 20 488.034 ± 4.554 ns/op [info] s.c.mutable.HashMapBenchmark.getOrElseUpdate 100 avgt 20 4883.123 ± 59.268 ns/op [info] s.c.mutable.HashMapBenchmark.getOrElseUpdate 1000 avgt 20 65174.034 ± 496.759 ns/op [info] s.c.mutable.HashMapBenchmark.put 10 avgt 20 267.983 ± 1.797 ns/op [info] s.c.mutable.HashMapBenchmark.put 100 avgt 20 5097.351 ± 104.538 ns/op [info] s.c.mutable.HashMapBenchmark.put 1000 avgt 20 78772.540 ± 543.935 ns/op ``` Summary, i.e. the effect of this PR, according to the benchmarks: * `groupBy` has a `~35%` speedup * `get` didn't change * `getOrElseUpdate` has a `~10%` speedup * `put` has a `~3%` speedup Note: caching the `exponent` to a local private field (`Byte` or `Int`) didn't have any performance advantage (only a minor slowdown was measured, possibly because it's accessed via an interface now) --- .../scala/collection/mutable/HashTable.scala | 14 +- .../scala/scala/BitManipulationBenchmark.scala | 170 +++++++++++++++++++++ 2 files changed, 177 insertions(+), 7 deletions(-) create mode 100644 test/benchmarks/src/main/scala/scala/BitManipulationBenchmark.scala diff --git a/src/library/scala/collection/mutable/HashTable.scala b/src/library/scala/collection/mutable/HashTable.scala index 9cb40e3f50..776eafaccc 100644 --- a/src/library/scala/collection/mutable/HashTable.scala +++ b/src/library/scala/collection/mutable/HashTable.scala @@ -360,14 +360,14 @@ trait HashTable[A, Entry >: Null <: HashEntry[A, Entry]] extends HashTable.HashU protected def elemEquals(key1: A, key2: A): Boolean = (key1 == key2) - // Note: - // we take the most significant bits of the hashcode, not the lower ones - // this is of crucial importance when populating the table in parallel - protected final def index(hcode: Int) = { + /** + * Note: we take the most significant bits of the hashcode, not the lower ones + * this is of crucial importance when populating the table in parallel + */ + protected final def index(hcode: Int): Int = { val ones = table.length - 1 - val improved = improve(hcode, seedvalue) - val shifted = (improved >> (32 - java.lang.Integer.bitCount(ones))) & ones - shifted + val exponent = Integer.numberOfLeadingZeros(ones) + (improve(hcode, seedvalue) >>> exponent) & ones } protected def initWithContents(c: HashTable.Contents[A, Entry]) = { diff --git a/test/benchmarks/src/main/scala/scala/BitManipulationBenchmark.scala b/test/benchmarks/src/main/scala/scala/BitManipulationBenchmark.scala new file mode 100644 index 0000000000..23e303ede0 --- /dev/null +++ b/test/benchmarks/src/main/scala/scala/BitManipulationBenchmark.scala @@ -0,0 +1,170 @@ +package scala.collection + +import org.openjdk.jmh.annotations._ +import org.openjdk.jmh.infra._ +import org.openjdk.jmh.runner.IterationType +import benchmark._ +import java.util.concurrent.TimeUnit + +@BenchmarkMode(Array(Mode.AverageTime)) +@Fork(2) +@Threads(1) +@Warmup(iterations = 10) +@Measurement(iterations = 10) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@State(Scope.Benchmark) +class BitManipulationBenchmark { + val powersOfTwo = Array(1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824) + + ////////////////////////////////////////////// + + @Benchmark def withIntegerBitCount(bh: Blackhole) { + for (v <- powersOfTwo) { + val leadingZeros = withIntegerBitCount(v) + // assert (leadingZeros == withLoop(v), s"$leadingZeros != ${withLoop(v)} ($v)") + bh.consume(leadingZeros) + } + } + + private def withIntegerBitCount(v: Int) = Integer.SIZE - Integer.bitCount(v - 1) + + ////////////////////////////////////////////// + + @Benchmark def withIntegerNumberOfLeadingZeros(bh: Blackhole) { + for (v <- powersOfTwo) { + val leadingZeros = withIntegerNumberOfLeadingZeros(v) + // assert (leadingZeros == withLoop(v), s"$leadingZeros != ${withLoop(v)} ($v)") + bh.consume(leadingZeros) + } + } + + private def withIntegerNumberOfLeadingZeros(v: Int) = Integer.numberOfLeadingZeros(v - 1) + + ////////////////////////////////////////////// + + @Benchmark def withLoop(bh: Blackhole) { + for (v <- powersOfTwo) { + val leadingZeros = withLoop(v) + bh.consume(leadingZeros) + } + } + + private def withLoop(v: Int): Int = { + var r = Integer.SIZE + var copy = v >> 1 + while (copy != 0) { + r -= 1 + copy = copy >> 1 + } + r + } + + ////////////////////////////////////////////// + + @Benchmark def withMatch(bh: Blackhole) { + for (v <- powersOfTwo) { + val leadingZeros = withMatch(v) + // assert (leadingZeros == withLoop(v), s"$leadingZeros != ${withLoop(v)} ($v)") + bh.consume(leadingZeros) + } + } + + private def withMatch(i: Int) = i match { + case 1 => 32 + case 2 => 31 + case 4 => 30 + case 8 => 29 + case 16 => 28 + case 32 => 27 + case 64 => 26 + case 128 => 25 + case 256 => 24 + case 512 => 23 + case 1024 => 22 + case 2048 => 21 + case 4096 => 20 + case 8192 => 19 + case 16384 => 18 + case 32768 => 17 + case 65536 => 16 + case 131072 => 15 + case 262144 => 14 + case 524288 => 13 + case 1048576 => 12 + case 2097152 => 11 + case 4194304 => 10 + case 8388608 => 9 + case 16777216 => 8 + case 33554432 => 7 + case 67108864 => 6 + case 134217728 => 5 + case 268435456 => 4 + case 536870912 => 3 + case 1073741824 => 2 + } + + + ////////////////////////////////////////////// + + @Benchmark def with2DeBruijn(bh: Blackhole) { + for (v <- powersOfTwo) { + val leadingZeros = with2DeBruijn(v) + // assert (leadingZeros == withLoop(v), s"$leadingZeros != ${withLoop(v)} ($v)") + bh.consume(leadingZeros) + } + } + + // https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn + private val multiplyDeBruijnBitPosition2 = Array(32, 31, 4, 30, 3, 18, 8, 29, 2, 10, 12, 17, 7, 15, 28, 24, 1, 5, 19, 9, 11, 13, 16, 25, 6, 20, 14, 26, 21, 27, 22, 23) + + private def with2DeBruijn(v: Int) = multiplyDeBruijnBitPosition2((v * 0x077CB531) >>> 27) + + + ////////////////////////////////////////////// + + @Benchmark def withBinSearch(bh: Blackhole) { + for (v <- powersOfTwo) { + val leadingZeros = withBinSearch(v) + // assert (leadingZeros == withLoop(v), s"$leadingZeros != ${withLoop(v)} ($v)") + bh.consume(leadingZeros) + } + } + + private def withBinSearch(v: Int) = + if (v < 65536) if (v < 256) if (v < 16) if (v < 4) if (v == 1) 32 else 31 + else if (v == 4) 30 else 29 + else if (v < 64) if (v == 16) 28 else 27 + else if (v == 64) 26 else 25 + else if (v < 4096) if (v < 1024) if (v == 256) 24 else 23 + else if (v == 1024) 22 else 21 + else if (v < 16384) if (v == 4096) 20 else 19 + else if (v == 16384) 18 else 17 + else if (v < 16777216) if (v < 1048576) if (v < 262144) if (v == 65536) 16 else 15 + else if (v == 262144) 14 else 13 + else if (v < 4194304) if (v == 1048576) 12 else 11 + else if (v == 4194304) 10 else 9 + else if (v < 268435456) if (v < 67108864) if (v == 16777216) 8 else 7 + else if (v == 67108864) 6 else 5 + else if (v < 1073741824) if (v == 268435456) 4 else 3 + else if (v == 1073741824) 2 else 1 + + ////////////////////////////////////////////// + + @Benchmark def withSumBinSearch(bh: Blackhole) { + for (v <- powersOfTwo) { + val leadingZeros = withSumBinSearch(v) + // assert(leadingZeros == withLoop(v), s"$leadingZeros != ${withLoop(v)} ($v)") + bh.consume(leadingZeros) + } + } + + private def withSumBinSearch(v: Int): Int = { + var exponent = Integer.SIZE + var remaining = v + if (remaining >= 65536) { remaining >>>= 16; exponent = 16 } + if (remaining >= 256) { remaining >>>= 8; exponent -= 8 } + if (remaining >= 16) { remaining >>>= 4; exponent -= 4 } + if (remaining >= 4) { remaining >>>= 2; exponent -= 2 } + if (remaining >= 2) exponent - 1 else exponent + } +} \ No newline at end of file -- cgit v1.2.3