summaryrefslogtreecommitdiff
path: root/test/benchmarks
diff options
context:
space:
mode:
authorPerformant Data LLC <performantdata@users.noreply.github.com>2016-05-22 09:40:54 -0700
committerPerformant Data LLC <performantdata@users.noreply.github.com>2016-05-26 20:50:11 -0700
commit100d6374c282d94497ee9f0d4a206d427951c74c (patch)
treee2b231a12914042a5ae74cca8a909b3076bd3f69 /test/benchmarks
parent139f6bf9d709fc18a23530f2f84afa8a1f97b464 (diff)
downloadscala-100d6374c282d94497ee9f0d4a206d427951c74c.tar.gz
scala-100d6374c282d94497ee9f0d4a206d427951c74c.tar.bz2
scala-100d6374c282d94497ee9f0d4a206d427951c74c.zip
SI-9789 use quadratic probing in OpenHashMap
The original probe sequence, taken from Python's hash table code, is exponential, jumping around in the hash table with poor memory locality. This replaces the probe algorithm with the more conventional quadratic probing. This also adds tests to the benchmarking code using AnyRef keys, which have pseudorandom hash codes (unlike Ints, whose hash code is simply the Int itself). The intensity of the benchmarking is reduced to make the tests complete within 9 hours, by removing unnecessary sampling.
Diffstat (limited to 'test/benchmarks')
-rw-r--r--test/benchmarks/src/main/scala/benchmark/KeySeq.scala24
-rw-r--r--test/benchmarks/src/main/scala/benchmark/KeySeqBuilder.scala33
-rw-r--r--test/benchmarks/src/main/scala/scala/collection/mutable/OpenHashMapBenchmark.scala216
-rw-r--r--test/benchmarks/src/main/scala/scala/collection/mutable/OpenHashMapRunner.scala60
4 files changed, 251 insertions, 82 deletions
diff --git a/test/benchmarks/src/main/scala/benchmark/KeySeq.scala b/test/benchmarks/src/main/scala/benchmark/KeySeq.scala
new file mode 100644
index 0000000000..126b92b3b6
--- /dev/null
+++ b/test/benchmarks/src/main/scala/benchmark/KeySeq.scala
@@ -0,0 +1,24 @@
+package benchmark
+
+/** A sequence of keys.
+ *
+ * Tests of maps and sets require a sequence of keys that can be used
+ * to add entries and possibly to find them again.
+ * This type provides such a sequence.
+ *
+ * Note that this needn't be a "sequence" in the full sense of [[collection.Seq]],
+ * particularly in that it needn't extend [[PartialFunction]].
+ *
+ * @tparam K the type of the keys
+ */
+trait KeySeq[K] {
+ /** Selects a key by its index in the sequence.
+ * Repeated calls with the same index return the same key (by reference equality).
+ *
+ * @param idx The index to select. Should be non-negative and less than `size`.
+ */
+ def apply(idx: Int): K
+
+ /** The size of this sequence. */
+ def size: Int
+}
diff --git a/test/benchmarks/src/main/scala/benchmark/KeySeqBuilder.scala b/test/benchmarks/src/main/scala/benchmark/KeySeqBuilder.scala
new file mode 100644
index 0000000000..95f6c7afd7
--- /dev/null
+++ b/test/benchmarks/src/main/scala/benchmark/KeySeqBuilder.scala
@@ -0,0 +1,33 @@
+package benchmark
+
+/** Builder of a [[KeySeq]]
+ *
+ * @tparam K the type of the keys
+ */
+trait KeySeqBuilder[K] {
+ /** Return a [[KeySeq]] having at least the given size. */
+ def build(size: Int): KeySeq[K]
+}
+
+object KeySeqBuilder {
+ /** Builder of a sequence of `Int` keys.
+ * Simply maps the sequence index to itself.
+ */
+ implicit object IntKeySeqBuilder extends KeySeqBuilder[Int] {
+ def build(_size: Int) = new KeySeq[Int] {
+ def apply(idx: Int) = idx
+ def size = _size
+ }
+ }
+
+ /** Builder of a sequence of `AnyRef` keys. */
+ implicit object AnyRefKeySeqBuilder extends KeySeqBuilder[AnyRef] {
+ def build(_size: Int) = new KeySeq[AnyRef] {
+ private[this] val arr = new Array[AnyRef](size)
+ for (i <- 0 until size) arr(i) = new AnyRef()
+
+ def apply(idx: Int) = arr(idx)
+ def size = _size
+ }
+ }
+}
diff --git a/test/benchmarks/src/main/scala/scala/collection/mutable/OpenHashMapBenchmark.scala b/test/benchmarks/src/main/scala/scala/collection/mutable/OpenHashMapBenchmark.scala
index 26e26b3065..64e2244499 100644
--- a/test/benchmarks/src/main/scala/scala/collection/mutable/OpenHashMapBenchmark.scala
+++ b/test/benchmarks/src/main/scala/scala/collection/mutable/OpenHashMapBenchmark.scala
@@ -1,14 +1,12 @@
package scala.collection.mutable;
-import java.util.concurrent.TimeUnit
import org.openjdk.jmh.annotations._
-import org.openjdk.jmh.infra.Blackhole
-import org.openjdk.jmh.infra.BenchmarkParams
-import org.openjdk.jol.info.GraphLayout
-import org.openjdk.jol.info.GraphWalker
-import org.openjdk.jol.info.GraphVisitor
-import org.openjdk.jmh.infra.IterationParams
+import org.openjdk.jmh.infra._
import org.openjdk.jmh.runner.IterationType
+import org.openjdk.jol.info.GraphLayout
+
+import benchmark._
+import java.util.concurrent.TimeUnit
/** Utilities for the [[OpenHashMapBenchmark]].
*
@@ -16,7 +14,8 @@ import org.openjdk.jmh.runner.IterationType
* instead of using the JMH harness, which iterates for a fixed length of time.
*/
private object OpenHashMapBenchmark {
- /** State container for the `put()` bulk calling tests.
+
+ /** Abstract state container for the `put()` bulk calling tests.
*
* Provides an array of adequately-sized, empty maps to each invocation,
* so that hash table allocation won't be done during measurement.
@@ -25,10 +24,11 @@ private object OpenHashMapBenchmark {
* so that only the GCs caused by the invocation contribute to the measurement.
*
* Records the memory used by all the maps in the last invocation of each iteration.
+ *
+ * @tparam K type of the map keys to be used in the test
*/
@State(Scope.Thread)
- @AuxCounters
- class BulkPutState {
+ private[this] abstract class BulkPutState[K](implicit keyBuilder: KeySeqBuilder[K]) {
/** A lower-bound estimate of the number of nanoseconds per `put()` call */
private[this] val nanosPerPut: Double = 5
@@ -39,35 +39,43 @@ private object OpenHashMapBenchmark {
private[this] var size: Int = _
/** Total number of entries in all of the `maps` combined. */
- var mapEntries: Int = _
+ private[this] var _mapEntries: Int = _
+ protected def mapEntries = _mapEntries
/** Number of operations performed in the current invocation. */
- var operations: Int = _
+ private[this] var _operations: Int = _
+ protected def operations = _operations
/** Bytes of memory used in the object graphs of all the maps. */
- var memory: Long = _
+ private[this] var _memory: Long = _
+ protected def memory = _memory
+
+ /** The sequence of keys to store into a map. */
+ private[this] var _keys: KeySeq[K] = _
+ def keys() = _keys
- var maps: Array[OpenHashMap[Int,Int]] = null
+ var maps: Array[OpenHashMap[K,Int]] = null
@Setup
def threadSetup(params: BenchmarkParams) {
size = params.getParam("size").toInt
val n = math.ceil(minNanosPerInvocation / (nanosPerPut * size)).toInt
- mapEntries = size * n
+ _mapEntries = size * n
+ _keys = keyBuilder.build(size)
maps = new Array(n)
}
@Setup(Level.Iteration)
def iterationSetup {
- operations = 0
+ _operations = 0
}
@Setup(Level.Invocation)
def setup(params: IterationParams) {
- for (i <- 0 until maps.length) maps(i) = new OpenHashMap[Int,Int](size)
+ for (i <- 0 until maps.length) maps(i) = new OpenHashMap[K,Int](size)
if (params.getType == IterationType.MEASUREMENT) {
- operations += mapEntries
+ _operations += _mapEntries
System.gc() // clean up after last invocation
}
}
@@ -76,72 +84,124 @@ private object OpenHashMapBenchmark {
def iterationTeardown(params: IterationParams) {
if (params.getType == IterationType.MEASUREMENT) {
// limit to smaller cases to avoid OOM
- memory = if (mapEntries <= 1000000) GraphLayout.parseInstance(maps(0), maps.tail).totalSize else 0
+ _memory =
+ if (_mapEntries <= 1000000) GraphLayout.parseInstance(maps(0), maps.tail).totalSize
+ else 0
}
}
}
- /** State container for the `get()` bulk calling tests.
+ /** Abstract state container for the `get()` bulk calling tests.
*
* Provides a thread-scoped map of the expected size.
* Performs a GC after loading the map.
+ *
+ * @tparam K type of the map keys to be used in the test
*/
@State(Scope.Thread)
- class BulkGetState {
- val map = new OpenHashMap[Int,Int].empty
+ private[this] abstract class BulkGetState[K](implicit keyBuilder: KeySeqBuilder[K]) {
+ /** The sequence of keys to store into a map. */
+ private[this] var _keys: KeySeq[K] = _
+ def keys() = _keys
+
+ val map = new OpenHashMap[K,Int].empty
/** Load the map with keys from `1` to `size`. */
@Setup
def setup(params: BenchmarkParams) {
val size = params.getParam("size").toInt
- put_Int(map, 1, size)
+ _keys = keyBuilder.build(size)
+ put(map, keys, 0, size)
System.gc()
}
}
- /** State container for the `get()` bulk calling tests with deleted entries.
+ /** Abstract state container for the `get()` bulk calling tests with deleted entries.
*
* Provides a thread-scoped map of the expected size, from which entries have been removed.
* Performs a GC after loading the map.
+ *
+ * @tparam K type of the map keys to be used in the test
*/
@State(Scope.Thread)
- class BulkRemovedGetState {
- val map = new OpenHashMap[Int,Int].empty
+ private[this] abstract class BulkRemovedGetState[K](implicit keyBuilder: KeySeqBuilder[K]) {
+ /** The sequence of keys to store into a map. */
+ private[this] var _keys: KeySeq[K] = _
+ def keys() = _keys
+
+ val map = new OpenHashMap[K,Int].empty
/** Load the map with keys from `1` to `size`, removing half of them. */
@Setup
def setup(params: BenchmarkParams) {
val size = params.getParam("size").toInt
- put_remove_Int(map, size)
+ _keys = keyBuilder.build(size)
+ put_remove(map, keys)
System.gc()
}
}
- /** Put elements into the given map. */
- private def put_Int(map: OpenHashMap[Int,Int], from: Int, to: Int) {
+ /* In order to use `@AuxCounters` on a class hierarchy (as of JMH 1.11.3),
+ * it's necessary to place it on the injected (sub)class, and to make the
+ * counters visible as explicit public members of that class. JMH doesn't
+ * scan the ancestor classes for counters.
+ */
+
+ @AuxCounters
+ private class IntBulkPutState extends BulkPutState[Int] {
+ override def mapEntries = super.mapEntries
+ override def operations = super.operations
+ override def memory = super.memory
+ }
+ private class IntBulkGetState extends BulkGetState[Int]
+ private class IntBulkRemovedGetState extends BulkRemovedGetState[Int]
+
+ @AuxCounters
+ private class AnyRefBulkPutState extends BulkPutState[AnyRef] {
+ override def mapEntries = super.mapEntries
+ override def operations = super.operations
+ override def memory = super.memory
+ }
+ private class AnyRefBulkGetState extends BulkGetState[AnyRef]
+ private class AnyRefBulkRemovedGetState extends BulkRemovedGetState[AnyRef]
+
+
+ /** Put entries into the given map.
+ * Adds entries using a range of keys from the given list.
+ *
+ * @param from lowest index in the range of keys to add
+ * @param to highest index in the range of keys to add, plus one
+ */
+ private[this] def put[K](map: OpenHashMap[K,Int], keys: KeySeq[K], from: Int, to: Int) {
var i = from
- while (i <= to) { // using a `for` expression instead adds significant overhead
- map.put(i, i)
+ while (i < to) { // using a `for` expression instead adds significant overhead
+ map.put(keys(i), i)
i += 1
}
}
- /** Put elements into the given map, removing half of them as they're added.
+ /** Put entries into the given map.
+ * Adds entries using all of the keys from the given list.
+ */
+ private def put[K](map: OpenHashMap[K,Int], keys: KeySeq[K]): Unit =
+ put(map, keys, 0, keys.size)
+
+ /** Put entries into the given map, removing half of them as they're added.
*
- * @param size number of entries to leave in the map on return
+ * @param keys list of keys to use
*/
- def put_remove_Int(map: OpenHashMap[Int,Int], size: Int) {
- val blocks = 50 // should be a factor of `size`
- val totalPuts = 2 * size // add twice as many, because we remove half of them
- val blockSize: Int = totalPuts / blocks
+ private def put_remove[K](map: OpenHashMap[K,Int], keys: KeySeq[K]) {
+ val blocks = 25 // should be a non-trivial factor of `size`
+ val size = keys.size
+ val blockSize: Int = size / blocks
var base = 0
- while (base < totalPuts) {
- put_Int(map, base + 1, base + blockSize)
+ while (base < size) {
+ put(map, keys, base, base + blockSize)
// remove every other entry
- var i = base + 1
- while (i <= base + blockSize) {
- map.remove(i)
+ var i = base
+ while (i < base + blockSize) {
+ map.remove(keys(i))
i += 2
}
@@ -150,55 +210,99 @@ private object OpenHashMapBenchmark {
}
/** Get elements from the given map. */
- def get_Int(map: OpenHashMap[Int,Int], size: Int, bh: Blackhole) {
- var i = 1
- while (i <= size) {
- bh.consume(map.get(i).getOrElse(0))
+ private def get[K](map: OpenHashMap[K,Int], keys: KeySeq[K]) = {
+ val size = keys.size
+ var i = 0
+ var sum = 0
+ while (i < size) {
+ sum += map.get(keys(i)).getOrElse(0)
i += 1
}
+ sum
}
}
/** Benchmark for the library's [[OpenHashMap]]. */
@BenchmarkMode(Array(Mode.AverageTime))
-@Fork(6)
+@Fork(5)
@Threads(1)
@Warmup(iterations = 20)
-@Measurement(iterations = 6)
+@Measurement(iterations = 5)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Benchmark)
class OpenHashMapBenchmark {
import OpenHashMapBenchmark._
- @Param(Array("25", "50", "100", "250", "1000", "2500", "10000", "25000", "100000", "250000", "1000000", "2500000",
+ @Param(Array("50", "100", "1000", "10000", "100000", "1000000", "2500000",
"5000000", "7500000", "10000000", "25000000"))
var size: Int = _
+ // Tests with Int keys
+
/** Test putting elements to a map of `Int` to `Int`. */
@Benchmark
- def put_Int(state: BulkPutState) {
+ def put_Int(state: IntBulkPutState) {
var i = 0
while (i < state.maps.length) {
- OpenHashMapBenchmark.put_Int(state.maps(i), 1, size)
+ put(state.maps(i), state.keys)
i += 1
}
}
/** Test putting and removing elements to a growing map of `Int` to `Int`. */
@Benchmark
- def put_remove_Int(state: BulkPutState) {
+ def put_remove_Int(state: IntBulkPutState) {
var i = 0
while (i < state.maps.length) {
- OpenHashMapBenchmark.put_remove_Int(state.maps(i), size)
+ put_remove(state.maps(i), state.keys)
i += 1
}
}
/** Test getting elements from a map of `Int` to `Int`. */
@Benchmark
- def put_get_Int(state: BulkGetState, bh: Blackhole) = OpenHashMapBenchmark.get_Int(state.map, size, bh)
+ def get_Int_after_put(state: IntBulkGetState) =
+ get(state.map, state.keys)
+
+ /** Test getting elements from a map of `Int` to `Int` from which elements have been removed.
+ * Note that half of these queries will fail to find their keys, which have been removed.
+ */
+ @Benchmark
+ def get_Int_after_put_remove(state: IntBulkRemovedGetState) =
+ get(state.map, state.keys)
+
+
+ // Tests with AnyRef keys
+
+ /** Test putting elements to a map of `AnyRef` to `Int`. */
+ @Benchmark
+ def put_AnyRef(state: AnyRefBulkPutState) {
+ var i = 0
+ while (i < state.maps.length) {
+ put(state.maps(i), state.keys)
+ i += 1
+ }
+ }
+
+ /** Test putting and removing elements to a growing map of `AnyRef` to `Int`. */
+ @Benchmark
+ def put_remove_AnyRef(state: AnyRefBulkPutState) {
+ var i = 0
+ while (i < state.maps.length) {
+ put_remove(state.maps(i), state.keys)
+ i += 1
+ }
+ }
+
+ /** Test getting elements from a map of `AnyRef` to `Int`. */
+ @Benchmark
+ def get_AnyRef_after_put(state: AnyRefBulkGetState) =
+ get(state.map, state.keys)
- /** Test getting elements from a map of `Int` to `Int` from which elements have been removed. */
+ /** Test getting elements from a map of `AnyRef` to `Int` from which elements have been removed.
+ * Note that half of these queries will fail to find their keys, which have been removed.
+ */
@Benchmark
- def put_remove_get_Int(state: BulkRemovedGetState, bh: Blackhole) = OpenHashMapBenchmark.get_Int(state.map, size, bh)
+ def get_AnyRef_after_put_remove(state: AnyRefBulkRemovedGetState) =
+ get(state.map, state.keys)
}
diff --git a/test/benchmarks/src/main/scala/scala/collection/mutable/OpenHashMapRunner.scala b/test/benchmarks/src/main/scala/scala/collection/mutable/OpenHashMapRunner.scala
index 1a58b18ee9..b14b733a81 100644
--- a/test/benchmarks/src/main/scala/scala/collection/mutable/OpenHashMapRunner.scala
+++ b/test/benchmarks/src/main/scala/scala/collection/mutable/OpenHashMapRunner.scala
@@ -1,20 +1,18 @@
package scala.collection.mutable
-import java.io.BufferedWriter
import java.io.File
-import java.io.FileOutputStream
-import java.io.OutputStreamWriter
import java.io.PrintWriter
-import scala.collection.JavaConversions
+
import scala.language.existentials
+
+import org.openjdk.jmh.results.Result
import org.openjdk.jmh.results.RunResult
import org.openjdk.jmh.runner.Runner
import org.openjdk.jmh.runner.options.CommandLineOptions
-import org.openjdk.jmh.runner.options.Options
-import benchmark.JmhRunner
import org.openjdk.jmh.runner.options.OptionsBuilder
import org.openjdk.jmh.runner.options.VerboseMode
-import org.openjdk.jmh.results.Result
+
+import benchmark.JmhRunner
/** Replacement JMH application that runs the [[OpenHashMap]] benchmark.
*
@@ -27,6 +25,7 @@ object OpenHashMapRunner extends JmhRunner {
/** Qualifier to add to the name of a memory usage data set. */
private[this] val memoryDatasetQualifier = "-memory"
+ /** Adapter to the JMH result class that simplifies our method calls. */
private[this] implicit class MyRunResult(r: RunResult) {
/** Return the dataset label. */
def label = r.getPrimaryResult.getLabel
@@ -34,13 +33,13 @@ object OpenHashMapRunner extends JmhRunner {
/** Return the value of the JMH parameter for the number of map entries per invocation. */
def size: String = r.getParams.getParam("size")
- /** Return the operation counts. */
+ /** Return the operation counts. Not every test tracks this. */
def operations = Option(r.getSecondaryResults.get("operations"))
/** Return the number of map entries. */
def entries = r.getSecondaryResults.get("mapEntries")
- /** Return the memory usage. */
+ /** Return the memory usage. Only defined if memory usage was measured. */
def memory = Option(r.getSecondaryResults.get("memory"))
}
@@ -50,7 +49,6 @@ object OpenHashMapRunner extends JmhRunner {
def main(args: Array[String]) {
import scala.collection.JavaConversions._
- import scala.language.existentials
val opts = new CommandLineOptions(args: _*)
var builder = new OptionsBuilder().parent(opts).jvmArgsPrepend("-Xmx6000m")
@@ -58,7 +56,12 @@ object OpenHashMapRunner extends JmhRunner {
val results = new Runner(builder.build).run()
- // Sort the results
+ /* Sort the JMH results into "data sets", each representing a complete test of one feature.
+ * Some results only measure CPU performance; while others also measure memory usage, and
+ * thus are split into two data sets. A data set is distinguished by its label, which is
+ * the label of the JMH result, for CPU performance, or that with an added suffix, for memory
+ * usage.
+ */
/** Map from data set name to data set. */
val datasetByName = Map.empty[String, Set[RunResult]]
@@ -83,23 +86,28 @@ object OpenHashMapRunner extends JmhRunner {
val f = new PrintWriter(outputFile, "UTF-8")
try {
- datasetByName.foreach(_ match { case (label: String, dataset: Iterable[RunResult]) => {
- f.println(s"# [$label]")
-
- val isMemoryUsageDataset = label.endsWith(memoryDatasetQualifier)
- dataset.foreach { r =>
- f.println(r.size + " " + (
- if (isMemoryUsageDataset)
- stats(r.entries) + " " + stats(r.memory.get)
- else
- stats(r.operations getOrElse r.getPrimaryResult)
- ))
- }
-
- f.println(); f.println() // data set separator
- }})
+ datasetByName.foreach(_ match {
+ case (label: String, dataset: Iterable[RunResult]) =>
+ outputDataset(f, label, dataset)
+ })
} finally {
f.close()
}
}
+
+ private[this] def outputDataset(f: PrintWriter, label: String, dataset: Iterable[RunResult]) {
+ f.println(s"# [$label]")
+
+ val isMemoryUsageDataset = label.endsWith(memoryDatasetQualifier)
+ dataset.foreach { r =>
+ f.println(r.size + " " + (
+ if (isMemoryUsageDataset && !r.memory.get.getScore.isInfinite)
+ stats(r.entries) + " " + stats(r.memory.get)
+ else
+ stats(r.operations getOrElse r.getPrimaryResult)
+ ))
+ }
+
+ f.println(); f.println() // data set separator
+ }
}