diff options
author | Paul Phillips <paulp@improving.org> | 2011-01-25 19:41:34 +0000 |
---|---|---|
committer | Paul Phillips <paulp@improving.org> | 2011-01-25 19:41:34 +0000 |
commit | 5f905da8b66fe69b09d8b62c917970cde14e23ba (patch) | |
tree | c1670e8ca407b3e3f8a17fc6c42a87364cbd64e9 /src | |
parent | dea65103bf06c0a91527692ac053d0d2cb9a9ddd (diff) | |
download | scala-5f905da8b66fe69b09d8b62c917970cde14e23ba.tar.gz scala-5f905da8b66fe69b09d8b62c917970cde14e23ba.tar.bz2 scala-5f905da8b66fe69b09d8b62c917970cde14e23ba.zip |
A new murmur3 hashcode implementation contribut...
A new murmur3 hashcode implementation contributed by rex kerr. I'm
really happy with it. There is benchmarking code included but you can
use the pudding for proof: the compiler compiling itself is clearly
faster. I deleted the existing murmur2 implementation which has never
left trunk in a release.
There remain some possible points of improvement with inlining and/or
synthetic method generation, but this is a major improvement over the
status quo. Closes #2537, review by prokopec.
Diffstat (limited to 'src')
-rw-r--r-- | src/compiler/scala/tools/nsc/settings/ScalaSettings.scala | 1 | ||||
-rw-r--r-- | src/compiler/scala/tools/nsc/typechecker/SyntheticMethods.scala | 7 | ||||
-rw-r--r-- | src/compiler/scala/tools/nsc/util/BitSet.scala | 14 | ||||
-rw-r--r-- | src/library/scala/collection/Map.scala | 3 | ||||
-rw-r--r-- | src/library/scala/collection/MapLike.scala | 4 | ||||
-rw-r--r-- | src/library/scala/collection/SeqLike.scala | 6 | ||||
-rw-r--r-- | src/library/scala/collection/Set.scala | 2 | ||||
-rw-r--r-- | src/library/scala/collection/SetLike.scala | 3 | ||||
-rw-r--r-- | src/library/scala/runtime/ScalaRunTime.scala | 14 | ||||
-rw-r--r-- | src/library/scala/util/MurmurHash.scala | 263 | ||||
-rw-r--r-- | src/library/scala/xml/Utility.scala | 20 |
11 files changed, 208 insertions, 129 deletions
diff --git a/src/compiler/scala/tools/nsc/settings/ScalaSettings.scala b/src/compiler/scala/tools/nsc/settings/ScalaSettings.scala index e10abf6a0d..57de25b694 100644 --- a/src/compiler/scala/tools/nsc/settings/ScalaSettings.scala +++ b/src/compiler/scala/tools/nsc/settings/ScalaSettings.scala @@ -138,7 +138,6 @@ trait ScalaSettings extends AbsScalaSettings with StandardScalaSettings { withPostSetHook(set => interpreter._debug = true) val Ycompletion = BooleanSetting ("-Ycompletion-debug", "Trace all tab completion activity.") val Ypmatnaive = BooleanSetting ("-Ypmat-naive", "Desugar matches as naively as possible.") - val Ymurmur = BooleanSetting ("-Ymurmur", "Use Murmur hash algorithm for case class generated hashCodes.") val Ynotnull = BooleanSetting ("-Ynotnull", "Enable (experimental and incomplete) scala.NotNull.") val YdepMethTpes = BooleanSetting ("-Ydependent-method-types", "Allow dependent method types.") val YmethodInfer = BooleanSetting ("-Yinfer-argument-types", "Infer types for arguments of overriden methods.") diff --git a/src/compiler/scala/tools/nsc/typechecker/SyntheticMethods.scala b/src/compiler/scala/tools/nsc/typechecker/SyntheticMethods.scala index 8041b7a715..ac259ab7ff 100644 --- a/src/compiler/scala/tools/nsc/typechecker/SyntheticMethods.scala +++ b/src/compiler/scala/tools/nsc/typechecker/SyntheticMethods.scala @@ -135,11 +135,6 @@ trait SyntheticMethods extends ast.TreeDSL { } } - // XXX short term, expecting to make murmur the default as soon as it is put through some paces. - def hashCodeTarget: Name = - if (settings.Ymurmur.value) "hashCodeMurmur" - else nme.hashCode_ - /** The equality method for case modules: * def equals(that: Any) = this eq that */ @@ -257,7 +252,7 @@ trait SyntheticMethods extends ast.TreeDSL { // methods for case classes only def classMethods = List( - Object_hashCode -> (() => forwardingMethod(nme.hashCode_, "_" + hashCodeTarget)), + Object_hashCode -> (() => forwardingMethod(nme.hashCode_, "_" + nme.hashCode_)), Object_toString -> (() => forwardingMethod(nme.toString_, "_" + nme.toString_)), Object_equals -> (() => equalsClassMethod) ) diff --git a/src/compiler/scala/tools/nsc/util/BitSet.scala b/src/compiler/scala/tools/nsc/util/BitSet.scala index 59d6d51219..a8d54c2c84 100644 --- a/src/compiler/scala/tools/nsc/util/BitSet.scala +++ b/src/compiler/scala/tools/nsc/util/BitSet.scala @@ -72,12 +72,20 @@ abstract class BitSet { } override def hashCode: Int = { - var h = hashSeed + import scala.util.MurmurHash._ + var h = startHash(hashSeed) + var c = startMagicA + var k = startMagicB for (idx <- 0 until nwords) { val w = word(idx) - h = (h * 41 + (w >>> 32).toInt) * 41 + w.toInt + h = extendHash(h, (w>>>32).toInt, c, k) + c = nextMagicA(c) + k = nextMagicB(k) + h = extendHash(h, w.toInt, c, k) + c = nextMagicA(c) + k = nextMagicB(k) } - h + finalizeHash(h) } def addString(sb: StringBuilder, start: String, sep: String, end: String) { diff --git a/src/library/scala/collection/Map.scala b/src/library/scala/collection/Map.scala index b3dee2ff8c..c32b04f2ed 100644 --- a/src/library/scala/collection/Map.scala +++ b/src/library/scala/collection/Map.scala @@ -35,6 +35,9 @@ trait Map[A, +B] extends Iterable[(A, B)] with MapLike[A, B, Map[A, B]] { * @define coll map */ object Map extends MapFactory[Map] { + + private[collection] val hashSeed = "Map".hashCode + def empty[A, B]: immutable.Map[A, B] = immutable.Map.empty /** $mapCanBuildFromInfo */ diff --git a/src/library/scala/collection/MapLike.scala b/src/library/scala/collection/MapLike.scala index ce78fb705a..0d012d01b7 100644 --- a/src/library/scala/collection/MapLike.scala +++ b/src/library/scala/collection/MapLike.scala @@ -334,7 +334,9 @@ self => override /*PartialFunction*/ def toString = super[IterableLike].toString - override def hashCode() = this map (_.##) sum + // This hash code must be symmetric in the contents but ought not + // collide trivially. + override def hashCode() = util.MurmurHash.symmetricHash(this,Map.hashSeed) /** Compares two maps structurally; i.e. checks if all mappings * contained in this map are also contained in the other map, diff --git a/src/library/scala/collection/SeqLike.scala b/src/library/scala/collection/SeqLike.scala index 49098da47c..12e4c45c75 100644 --- a/src/library/scala/collection/SeqLike.scala +++ b/src/library/scala/collection/SeqLike.scala @@ -987,7 +987,11 @@ trait SeqLike[+A, +Repr] extends IterableLike[A, Repr] { self => /** Hashcodes for $Coll produce a value from the hashcodes of all the * elements of the $coll. */ - override def hashCode() = (Seq.hashSeed /: this)(_ * 41 + _.##) + override def hashCode() = { + val h = new util.MurmurHash[A](Seq.hashSeed) + this.foreach(h) + h.hash + } /** The equals method for arbitrary sequences. Compares this sequence to * some other object. diff --git a/src/library/scala/collection/Set.scala b/src/library/scala/collection/Set.scala index b012836a78..412ebf0e25 100644 --- a/src/library/scala/collection/Set.scala +++ b/src/library/scala/collection/Set.scala @@ -36,6 +36,8 @@ trait Set[A] extends (A => Boolean) * @define Coll Set */ object Set extends SetFactory[Set] { + private[collection] val hashSeed = "Set".hashCode + def newBuilder[A] = immutable.Set.newBuilder[A] override def empty[A]: Set[A] = immutable.Set.empty[A] implicit def canBuildFrom[A]: CanBuildFrom[Coll, A, Set[A]] = setCanBuildFrom[A] diff --git a/src/library/scala/collection/SetLike.scala b/src/library/scala/collection/SetLike.scala index 71c12be778..73f0b9c839 100644 --- a/src/library/scala/collection/SetLike.scala +++ b/src/library/scala/collection/SetLike.scala @@ -290,7 +290,8 @@ self => // override def hashCode() = this map (_.hashCode) sum // Calling map on a set drops duplicates: any hashcode collisions would // then be dropped before they can be added. - override def hashCode() = this.foldLeft(0)(_ + _.hashCode) + // Hash should be symmetric in set entries, but without trivial collisions. + override def hashCode() = util.MurmurHash.symmetricHash(this,Set.hashSeed) /** Compares this set with another object for equality. * diff --git a/src/library/scala/runtime/ScalaRunTime.scala b/src/library/scala/runtime/ScalaRunTime.scala index 956147ab4a..7a7afbcd88 100644 --- a/src/library/scala/runtime/ScalaRunTime.scala +++ b/src/library/scala/runtime/ScalaRunTime.scala @@ -157,19 +157,21 @@ object ScalaRunTime { def _toString(x: Product): String = x.productIterator.mkString(x.productPrefix + "(", ",", ")") - def _hashCodeMurmur(x: Product): Int = - scala.util.MurmurHash.product(x) - def _hashCode(x: Product): Int = { + import scala.util.MurmurHash._ val arr = x.productArity - var code = arr + var h = startHash(arr) + var c = startMagicA + var k = startMagicB var i = 0 while (i < arr) { val elem = x.productElement(i) - code = code * 41 + (if (elem == null) 0 else elem.##) + h = extendHash(h, if (elem == null) 0 else elem.##, c, k) + c = nextMagicA(c) + k = nextMagicB(k) i += 1 } - code + finalizeHash(h) } /** Fast path equality method for inlining; used when -optimise is set. diff --git a/src/library/scala/util/MurmurHash.scala b/src/library/scala/util/MurmurHash.scala index 8ed91796a1..b6efde6c89 100644 --- a/src/library/scala/util/MurmurHash.scala +++ b/src/library/scala/util/MurmurHash.scala @@ -8,121 +8,188 @@ package scala.util -/** An implementation of MurmurHash 2.0. - * http://sites.google.com/site/murmurhash/ +/** An implementation of Austin Appleby's MurmurHash 3.0 algorithm + * (32 bit version); reference: http://code.google.com/p/smhasher * - * It is here primarily for use by case classes. + * This is the hash used by collections and case classes (including + * tuples). * - * @author archontophoenix + * @author Rex Kerr * @version 2.9 * @since 2.9 */ -object MurmurHash { - private def bytesProvided (v: Any) = v match { - case x: Byte => 1 - case x: Short => 2 - case x: Int => 4 - case x: Long => 8 - case x: Float => 4 - case x: Double => 8 - case x: Boolean => 1 - case x: Char => 2 - case x: Unit => 0 - case _ => 4 +import java.lang.Integer.{ rotateLeft => rotl } + +/** A class designed to generate well-distributed non-cryptographic + * hashes. It is designed to be passed to a collection's foreach method, + * or can take individual hash values with append. Its own hash code is + * set equal to the hash code of whatever it is hashing. + */ +class MurmurHash[@specialized(Int,Long,Float,Double) T](seed: Int) extends (T => Unit) { + import MurmurHash._ + + private var h = startHash(seed) + private var c = hiddenMagicA + private var k = hiddenMagicB + private var hashed = false + private var hashvalue = h + + /** Begin a new hash using the same seed. */ + def reset() { + h = startHash(seed) + c = hiddenMagicA + k = hiddenMagicB + hashed = false + } + + /** Incorporate the hash value of one item. */ + def apply(t: T) { + h = extendHash(h,t.##,c,k) + c = nextMagicA(c) + k = nextMagicB(k) + hashed = false } - private def highInt (v: Any): Int = { - val l = v match { - case x: Long => x - case x: Double => java.lang.Double.doubleToRawLongBits(x) - case _ => throw new IllegalArgumentException("No highInt: " + v) + /** Incorporate a known hash value. */ + def append(i: Int) { + h = extendHash(h,i,c,k) + c = nextMagicA(c) + k = nextMagicB(k) + hashed = false + } + + /** Retrieve the hash value */ + def hash = { + if (!hashed) { + hashvalue = finalizeHash(h) + hashed = true } + hashvalue + } + override def hashCode = hash +} - l >>> 32 toInt +/** An object designed to generate well-distributed non-cryptographic + * hashes. It is designed to hash a collection of integers; along with + * the integers to hash, it generates two magic streams of integers to + * increase the distribution of repetitive input sequences. Thus, + * three methods need to be called at each step (to start and to + * incorporate a new integer) to update the values. Only one method + * needs to be called to finalize the hash. + */ + +object MurmurHash { + // Magic values used for MurmurHash's 32 bit hash. + // Don't change these without consulting a hashing expert! + final private val visibleMagic = 0x971e137b + final private val hiddenMagicA = 0x95543787 + final private val hiddenMagicB = 0x2ad7eb25 + final private val visibleMixer = 0x52dce729 + final private val hiddenMixerA = 0x7b7d159c + final private val hiddenMixerB = 0x6bce6396 + final private val finalMixer1 = 0x85ebca6b + final private val finalMixer2 = 0xc2b2ae35 + + // Arbitrary values used for hashing certain classes + final private val seedString = 0xf7ca7fd2 + final private val seedArray = 0x3c074a61 + + /** The first 23 magic integers from the first stream are stored here */ + val storedMagicA = + Iterator.iterate(hiddenMagicA)(nextMagicA).take(23).toArray + + /** The first 23 magic integers from the second stream are stored here */ + val storedMagicB = + Iterator.iterate(hiddenMagicB)(nextMagicB).take(23).toArray + + /** Begin a new hash with a seed value. */ + def startHash(seed: Int) = seed ^ visibleMagic + + /** The initial magic integers in the first stream. */ + def startMagicA = hiddenMagicA + + /** The initial magic integer in the second stream. */ + def startMagicB = hiddenMagicB + + /** Incorporates a new value into an existing hash. + * + * @param hash the prior hash value + * @param value the new value to incorporate + * @param magicA a magic integer from the stream + * @param magicB a magic integer from a different stream + * @return the updated hash value + */ + def extendHash(hash: Int, value: Int, magicA: Int, magicB: Int) = { + (hash ^ rotl(value*magicA,11)*magicB)*3 + visibleMixer } - final val m = 0x5bd1e995 - final val r = 24 - - private def step (hh: Int, kk: Int): Int = { - var h = hh - var k = kk - k *= m - k ^= k >>> r - k *= m - h *= m - h ^ k + /** Given a magic integer from the first stream, compute the next */ + def nextMagicA(magicA: Int) = magicA*5 + hiddenMixerA + + /** Given a magic integer from the second stream, compute the next */ + def nextMagicB(magicB: Int) = magicB*5 + hiddenMixerB + + /** Once all hashes have been incorporated, this performs a final mixing */ + def finalizeHash(hash: Int) = { + var i = (hash ^ (hash>>>16)) + i *= finalMixer1 + i ^= (i >>> 13) + i *= finalMixer2 + i ^= (i >>> 16) + i } - def product(p: Product): Int = product(p, 0) - def product(p: Product, seed: Int): Int = { - // Initialize the hash to a 'random' value - var n = p.productArity - var h = seed ^ n - var partial = 0 - var nextPartialByte = 0 - while (n > 0) { - n -= 1 - val el: Any = p.productElement(n) - bytesProvided(el) match { - case 0 => - case 1 => - partial |= el.## << (nextPartialByte * 8) - if (nextPartialByte < 3) - nextPartialByte += 1 - else { - h = step(h,partial) - partial = 0 - nextPartialByte = 0 - } - case 2 => - val i = el.## - partial |= i << (nextPartialByte * 8) - if (nextPartialByte < 2) - nextPartialByte += 2 - else { - h = step(h,partial) - nextPartialByte -= 2 - partial = if (nextPartialByte == 0) 0 else i >>> 8 - } - case 4 => - h = step(h, el.##) - case 8 => - h = step(h, el.##) - h = step(h, highInt(el)) - } - } - // Handle the last few bytes of the input array - if (nextPartialByte > 0) { - h ^= partial - h *= m + /** Compute a high-quality hash of an array */ + def arrayHash[@specialized T](a: Array[T]) = { + var h = startHash(a.length * seedArray) + var c = hiddenMagicA + var k = hiddenMagicB + var j = 0 + while (j < a.length) { + h = extendHash(h, a(j).##, c, k) + c = nextMagicA(c) + k = nextMagicB(k) + j += 1 } - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. - h ^= h >>> 13 - h *= m - h ^ (h >>> 15) + finalizeHash(h) } - def string(s: String): Int = { - // Initialize the hash to a 'random' value - var n = s.length - var h = n - var nn = n & ~ 1 - while (nn > 0) { - nn -= 2 - h = step(h, (s(nn + 1) << 16) | s(nn)) + /** Compute a high-quality hash of a string */ + def stringHash(s: String) = { + var h = startHash(s.length * seedString) + var c = hiddenMagicA + var k = hiddenMagicB + var j = 0 + while (j+1 < s.length) { + val i = (s.charAt(j)<<16) + s.charAt(j+1); + h = extendHash(h,i,c,k) + c = nextMagicA(c) + k = nextMagicB(k) + j += 2 } - // Handle the last few bytes of the input array - if ((n & 1) != 0) { - h ^= s(n - 1) - h *= m - } - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. - h ^= h >>> 13 - h *= m - h ^ (h >>> 15) + if (j < s.length) h = extendHash(h,s.charAt(j),c,k) + finalizeHash(h) + } + + /** Compute a hash that is symmetric in its arguments--that is, + * where the order of appearance of elements does not matter. + * This is useful for hashing sets, for example. + */ + def symmetricHash[T](xs: TraversableOnce[T], seed: Int) = { + var a,b,n = 0 + var c = 1 + xs.foreach(i => { + val h = i.## + a += h + b ^= h + if (h != 0) c *= h + n += 1 + }) + var h = startHash(seed * n) + h = extendHash(h, a, storedMagicA(0), storedMagicB(0)) + h = extendHash(h, b, storedMagicA(1), storedMagicB(1)) + h = extendHash(h, c, storedMagicA(2), storedMagicB(2)) + finalizeHash(h) } } diff --git a/src/library/scala/xml/Utility.scala b/src/library/scala/xml/Utility.scala index 02eeb4e84b..2d0682a47d 100644 --- a/src/library/scala/xml/Utility.scala +++ b/src/library/scala/xml/Utility.scala @@ -260,18 +260,14 @@ object Utility extends AnyRef with parsing.TokenTests * @param attribHashCode * @param children */ - def hashCode(pre: String, label: String, attribHashCode: Int, scpeHash: Int, children: Seq[Node]) = ( - ( if(pre ne null) {41 * pre.## % 7} else {0}) - + label.## * 53 - + attribHashCode * 7 - + scpeHash * 31 - + { - var c = 0 - val i = children.iterator - while(i.hasNext) c = c * 41 + i.next.## - c - } - ) + def hashCode(pre: String, label: String, attribHashCode: Int, scpeHash: Int, children: Seq[Node]) = { + val h = new util.MurmurHash[Node]( if (pre ne null) pre.## else 0 ) + h.append(label.##) + h.append(attribHashCode) + h.append(scpeHash) + children.foreach(h) + h.hash + } def appendQuoted(s: String): String = sbToString(appendQuoted(s, _)) |