diff options
author | Paul Phillips <paulp@improving.org> | 2011-09-08 05:43:32 +0000 |
---|---|---|
committer | Paul Phillips <paulp@improving.org> | 2011-09-08 05:43:32 +0000 |
commit | 52c1d019d63d2fffadc8f3f159d654187ac61ece (patch) | |
tree | 6958700cb96b574b0894b3792f76be68d2a7161d | |
parent | c37e8f45cf86d0a3f6cb084e85f9751f5e906bf2 (diff) | |
download | scala-52c1d019d63d2fffadc8f3f159d654187ac61ece.tar.gz scala-52c1d019d63d2fffadc8f3f159d654187ac61ece.tar.bz2 scala-52c1d019d63d2fffadc8f3f159d654187ac61ece.zip |
Refinement of murmurhash implementation.
Integrates recent speed improvements to algorithm.
Contributed by Ruediger Keller, no review.
-rw-r--r-- | src/compiler/scala/tools/nsc/util/BitSet.scala | 21 | ||||
-rw-r--r-- | src/library/scala/collection/GenMapLike.scala | 2 | ||||
-rw-r--r-- | src/library/scala/collection/GenSeqLike.scala | 11 | ||||
-rw-r--r-- | src/library/scala/collection/GenSetLike.scala | 2 | ||||
-rw-r--r-- | src/library/scala/runtime/ScalaRunTime.scala | 25 | ||||
-rw-r--r-- | src/library/scala/util/MurmurHash.scala | 3 | ||||
-rw-r--r-- | src/library/scala/util/MurmurHash3.scala | 178 | ||||
-rw-r--r-- | test/benchmarks/src/scala/util/HashSpeedTest.scala | 148 |
8 files changed, 279 insertions, 111 deletions
diff --git a/src/compiler/scala/tools/nsc/util/BitSet.scala b/src/compiler/scala/tools/nsc/util/BitSet.scala index a8d54c2c84..de0c6fd62a 100644 --- a/src/compiler/scala/tools/nsc/util/BitSet.scala +++ b/src/compiler/scala/tools/nsc/util/BitSet.scala @@ -72,20 +72,15 @@ abstract class BitSet { } override def hashCode: Int = { - import scala.util.MurmurHash._ - var h = startHash(hashSeed) - var c = startMagicA - var k = startMagicB - for (idx <- 0 until nwords) { - val w = word(idx) - h = extendHash(h, (w>>>32).toInt, c, k) - c = nextMagicA(c) - k = nextMagicB(k) - h = extendHash(h, w.toInt, c, k) - c = nextMagicA(c) - k = nextMagicB(k) + import scala.util.MurmurHash3._ + var h = hashSeed + var i = 0; while(i < nwords) { + val w = word(i) + h = mix(h, (w >>> 32).toInt) + h = mix(h, w.toInt) + i += 1 } - finalizeHash(h) + finalizeHash(h, nwords) } def addString(sb: StringBuilder, start: String, sep: String, end: String) { diff --git a/src/library/scala/collection/GenMapLike.scala b/src/library/scala/collection/GenMapLike.scala index 6060087d54..f4a3903c13 100644 --- a/src/library/scala/collection/GenMapLike.scala +++ b/src/library/scala/collection/GenMapLike.scala @@ -32,7 +32,7 @@ trait GenMapLike[A, +B, +Repr] extends GenIterableLike[(A, B), Repr] with Equals // This hash code must be symmetric in the contents but ought not // collide trivially. - override def hashCode() = util.MurmurHash.symmetricHash(seq, Map.hashSeed) + override def hashCode() = util.MurmurHash3.symmetricHash(seq, Map.hashSeed) /** Returns the value associated with a key, or a default value if the key is not contained in the map. * @param key the key. diff --git a/src/library/scala/collection/GenSeqLike.scala b/src/library/scala/collection/GenSeqLike.scala index 57e5105e23..2d50cb18b3 100644 --- a/src/library/scala/collection/GenSeqLike.scala +++ b/src/library/scala/collection/GenSeqLike.scala @@ -440,9 +440,14 @@ trait GenSeqLike[+A, +Repr] extends GenIterableLike[A, Repr] with Equals with Pa * elements of the $coll. */ override def hashCode() = { - val h = new util.MurmurHash[A](Seq.hashSeed) - seq.foreach(h) - h.hash + import util.MurmurHash3._ + var n = 0 + var h = Seq.hashSeed + seq foreach { + x => h = mix(h, x.##) + n += 1 + } + finalizeHash(h, n) } /** The equals method for arbitrary sequences. Compares this sequence to diff --git a/src/library/scala/collection/GenSetLike.scala b/src/library/scala/collection/GenSetLike.scala index 2fc94c2e87..adbb043ecd 100644 --- a/src/library/scala/collection/GenSetLike.scala +++ b/src/library/scala/collection/GenSetLike.scala @@ -143,6 +143,6 @@ extends GenIterableLike[A, Repr] // Calling map on a set drops duplicates: any hashcode collisions would // then be dropped before they can be added. // Hash should be symmetric in set entries, but without trivial collisions. - override def hashCode() = util.MurmurHash.symmetricHash(seq, Set.hashSeed) + override def hashCode() = util.MurmurHash3.symmetricHash(seq, Set.hashSeed) } diff --git a/src/library/scala/runtime/ScalaRunTime.scala b/src/library/scala/runtime/ScalaRunTime.scala index 28a093ce42..9162ae0fc0 100644 --- a/src/library/scala/runtime/ScalaRunTime.scala +++ b/src/library/scala/runtime/ScalaRunTime.scala @@ -174,30 +174,7 @@ object ScalaRunTime { def _toString(x: Product): String = x.productIterator.mkString(x.productPrefix + "(", ",", ")") - def _hashCode(x: Product): Int = { - import scala.util.MurmurHash._ - val arr = x.productArity - // Case objects have the hashCode inlined directly into the - // synthetic hashCode method, but this method should still give - // a correct result if passed a case object. - if (arr == 0) { - x.productPrefix.hashCode - } - else { - var h = startHash(arr) - var c = startMagicA - var k = startMagicB - var i = 0 - while (i < arr) { - val elem = x.productElement(i) - h = extendHash(h, elem.##, c, k) - c = nextMagicA(c) - k = nextMagicB(k) - i += 1 - } - finalizeHash(h) - } - } + def _hashCode(x: Product): Int = scala.util.MurmurHash3.productHash(x) /** A helper for case classes. */ def typedProductIterator[T](x: Product): Iterator[T] = { diff --git a/src/library/scala/util/MurmurHash.scala b/src/library/scala/util/MurmurHash.scala index 71db2536be..029fe095af 100644 --- a/src/library/scala/util/MurmurHash.scala +++ b/src/library/scala/util/MurmurHash.scala @@ -27,6 +27,7 @@ import scala.collection.Iterator * or can take individual hash values with append. Its own hash code is * set equal to the hash code of whatever it is hashing. */ +@deprecated("Use the object MurmurHash3 instead.", "2.10.0") class MurmurHash[@specialized(Int,Long,Float,Double) T](seed: Int) extends (T => Unit) { import MurmurHash._ @@ -79,7 +80,7 @@ class MurmurHash[@specialized(Int,Long,Float,Double) T](seed: Int) extends (T => * incorporate a new integer) to update the values. Only one method * needs to be called to finalize the hash. */ - +@deprecated("Use the object MurmurHash3 instead.", "2.10.0") object MurmurHash { // Magic values used for MurmurHash's 32 bit hash. // Don't change these without consulting a hashing expert! diff --git a/src/library/scala/util/MurmurHash3.scala b/src/library/scala/util/MurmurHash3.scala new file mode 100644 index 0000000000..0fd8b2e123 --- /dev/null +++ b/src/library/scala/util/MurmurHash3.scala @@ -0,0 +1,178 @@ +package scala.util + +import java.lang.Integer.{ rotateLeft => rotl } + + +/** + * An implementation of Austin Appleby's MurmurHash 3 algorithm + * (MurmurHash3_x86_32). + * + * An algorithm designed to generate well-distributed non-cryptographic + * hashes. It is designed to hash data in 32 bit chunks (ints). + * + * The mix method needs to be called at each step to update the intermediate + * hash value. For the last chunk to incorporate into the hash mixLast may + * be used instead, which is slightly faster. Finally finalizeHash needs to + * be called to compute the final hash value. + * + * This is based on the earlier MurmurHash3 code by Rex Kerr, but the + * MurmurHash3 algorithm was since changed by its creator Austin Appleby + * to remedy some weaknesses and improve performance. This represents the + * latest and supposedly final version of the algortihm (revision 136). + * + * @see http://code.google.com/p/smhasher + */ +object MurmurHash3 { + + // Some arbitrary values used as hash seeds + final val arraySeed = 0x3c074a61 + final val stringSeed = 0xf7ca7fd2 + final val productSeed = 0xcafebabe + final val symmetricSeed = 0xb592f7ae + final val traversableSeed = 0xe73a8b15 + + /** Mix in a block of data into an intermediate hash value. */ + final def mix(hash: Int, data: Int): Int = { + var h = mixLast(hash, data) + h = rotl(h, 13) + h * 5 + 0xe6546b64 + } + + /** May optionally be used as the last mixing step. Is a little bit faster than mix, + * as it does no further mixing of the resulting hash. For the last element this is not + * necessary as the hash is thoroughly mixed during finalization anyway. */ + final def mixLast(hash: Int, data: Int): Int = { + var k = data + + k *= 0xcc9e2d51 + k = rotl(k, 15) + k *= 0x1b873593 + + hash ^ k + } + + /** Finalize a hash to incorporate the length and make sure all bits avalanche. */ + final def finalizeHash(hash: Int, length: Int): Int = avalanche(hash ^ length) + + /** Force all bits of the hash to avalanche. Used for finalizing the hash. */ + private final def avalanche(hash: Int): Int = { + var h = hash + + h ^= h >>> 16 + h *= 0x85ebca6b + h ^= h >>> 13 + h *= 0xc2b2ae35 + h ^= h >>> 16 + + h + } + + /** Compute the hash of a product */ + final def productHash(x: Product, seed: Int = productSeed): Int = { + val arr = x.productArity + // Case objects have the hashCode inlined directly into the + // synthetic hashCode method, but this method should still give + // a correct result if passed a case object. + if (arr == 0) { + x.productPrefix.hashCode + } + else { + var h = seed + var i = 0 + while (i < arr) { + h = mix(h, x.productElement(i).##) + i += 1 + } + finalizeHash(h, arr) + } + } + + /** Compute the hash of a string */ + final def stringHash(str: String, seed: Int = stringSeed): Int = { + var h = seed + var i = 0 + while (i + 1 < str.length) { + val data = (str.charAt(i) << 16) + str.charAt(i + 1) + h = mix(h, data) + i += 2 + } + if (i < str.length) h = mixLast(h, str.charAt(i)) + finalizeHash(h, str.length) + } + + /** Compute a hash that is symmetric in its arguments - that is a hash + * where the order of appearance of elements does not matter. + * This is useful for hashing sets, for example. */ + final def symmetricHash[T](xs: collection.GenTraversableOnce[T], seed: Int = symmetricSeed): Int = { + var a, b, n = 0 + var c = 1 + xs.seq.foreach { x => + val h = x.## + a += h + b ^= h + if (h != 0) c *= h + n += 1 + } + + var h = seed + h = mix(h, a) + h = mix(h, b) + h = mixLast(h, c) + finalizeHash(h, n) + } + + /** Compute a hash for a traversable (once). */ + final def traversableHash[T](xs: collection.GenTraversableOnce[T], seed: Int = traversableSeed): Int = { + var n = 0 + var h = seed + xs.seq.foreach { x => + h = mix(h, x.##) + n += 1 + } + finalizeHash(h, n) + } + + /** Compute the hash of an array */ + final def arrayHash[@specialized T](a: Array[T], seed: Int = arraySeed): Int = { + var h = seed + var i = 0 + while (i < a.length) { + h = mix(h, a(i).##) + i += 1 + } + finalizeHash(h, a.length) + } + + /** Compute the hash of a byte array. Faster than arrayHash, because + * it hashes 4 bytes at once. */ + final def bytesHash(data: Array[Byte], seed: Int = arraySeed): Int = { + var len = data.length + var h = seed + + // Body + var i = 0 + while(len >= 4) { + var k = data(i + 0) & 0xFF + k |= (data(i + 1) & 0xFF) << 8 + k |= (data(i + 2) & 0xFF) << 16 + k |= (data(i + 3) & 0xFF) << 24 + + h = mix(h, k) + + i += 4 + len -= 4 + } + + // Tail + var k = 0 + if(len == 3) k ^= (data(i + 2) & 0xFF) << 16 + if(len >= 2) k ^= (data(i + 1) & 0xFF) << 8 + if(len >= 1) { + k ^= (data(i + 0) & 0xFF) + h = mixLast(h, k) + } + + // Finalization + finalizeHash(h, data.length) + } +} diff --git a/test/benchmarks/src/scala/util/HashSpeedTest.scala b/test/benchmarks/src/scala/util/HashSpeedTest.scala index 615bd47aa2..5f6915e4fc 100644 --- a/test/benchmarks/src/scala/util/HashSpeedTest.scala +++ b/test/benchmarks/src/scala/util/HashSpeedTest.scala @@ -1,65 +1,71 @@ object HashSpeedTest { - import System.{nanoTime => now} + + import System.{ nanoTime => now } def time[A](f: => A) = { val t0 = now val ans = f (ans, now - t0) } + def ptime[A](f: => A) = { - val (ans,dt) = time(f) - printf("Elapsed: %.3f\n",dt*1e-9) + val (ans, dt) = time(f) + printf("Elapsed: %.3f\n", dt * 1e-9) ans } object HashHist { var enabled = true - val counts = new collection.mutable.HashMap[Int,Int] - def add (i: Int) { if (enabled) counts(i) = counts.get(i).getOrElse(0)+1 } + val counts = new collection.mutable.HashMap[Int, Int] + def add(i: Int) { if (enabled) counts(i) = counts.get(i).getOrElse(0) + 1 } def resultAndReset = { var s = 0L var o = 0L var m = 0 counts.valuesIterator.foreach(i => { s += i - if (i>0) o += 1 - if (i>m) m = i + if (i > 0) o += 1 + if (i > m) m = i }) counts.clear - (s,o,m) + (s, o, m) } } - def report(s: String,res: (Long,Long,Int)) { - println("Hash quality of "+s) - printf(" %5.2f%% of entries are collisions\n",100*(res._1 - res._2).toDouble/res._1) - printf(" Max of %d entries mapped to the same value\n",res._3) + def report(s: String, res: (Long, Long, Int)) { + println("Hash quality of " + s) + printf(" %5.2f%% of entries are collisions\n", 100 * (res._1 - res._2).toDouble / res._1) + printf(" Max of %d entries mapped to the same value\n", res._3) } // If you have MurmurHash3 installed, uncomment below (and in main) - import scala.util.{MurmurHash3 => MH3} + import scala.util.{ MurmurHash3 => MH3 } + val justCountString: String => Unit = str => { - var s,i = 0 + var s, i = 0 while (i < str.length) { s += str.charAt(i); i += 1 } HashHist.add(s) } + val defaultHashString: String => Unit = str => HashHist.add(str.hashCode) + val murmurHashString: String => Unit = str => HashHist.add(MH3.stringHash(str)) + def makeCharStrings = { val a = new Array[Byte](4) val buffer = new collection.mutable.ArrayBuffer[String] var i: Int = 'A' while (i <= 'Z') { - a(0) = (i&0xFF).toByte + a(0) = (i & 0xFF).toByte var j: Int = 'a' while (j <= 'z') { - a(1) = (j&0xFF).toByte + a(1) = (j & 0xFF).toByte var k: Int = 'A' while (k <= 'z') { - a(2) = (k&0xFF).toByte + a(2) = (k & 0xFF).toByte var l: Int = 'A' while (l <= 'z') { - a(3) = (l&0xFF).toByte + a(3) = (l & 0xFF).toByte buffer += new String(a) l += 1 } @@ -71,6 +77,7 @@ object HashSpeedTest { } buffer.toArray } + def hashCharStrings(ss: Array[String], hash: String => Unit) { var i = 0 while (i < ss.length) { @@ -84,13 +91,15 @@ object HashSpeedTest { lli.foreach(_.foreach(s += _)) HashHist.add(s) } + def defaultHashList: List[List[Int]] => Unit = lli => HashHist.add(lli.hashCode) + def makeBinaryLists = { def singleLists(depth: Int): List[List[Int]] = { if (depth <= 0) List(Nil) else { - val set = singleLists(depth-1) - val longest = set filter (_.length == depth-1) + val set = singleLists(depth - 1) + val longest = set filter (_.length == depth - 1) set ::: (longest.map(0 :: _)) ::: (longest.map(1 :: _)) } } @@ -123,6 +132,7 @@ object HashSpeedTest { } buffer.toArray } + def hashBinaryLists(ls: Array[List[List[Int]]], hash: List[List[Int]] => Unit) { var i = 0 while (i < ls.length) { @@ -136,17 +146,20 @@ object HashSpeedTest { si.foreach(s += _) HashHist.add(s) } + def defaultHashSets: Set[Int] => Unit = si => HashHist.add(si.hashCode) + def makeIntSets = { def sets(depth: Int): List[Set[Int]] = { if (depth <= 0) List(Set.empty[Int]) else { - val set = sets(depth-1) + val set = sets(depth - 1) set ::: set.map(_ + depth) } } sets(20).toArray } + def hashIntSets(ss: Array[Set[Int]], hash: Set[Int] => Unit) { var i = 0 while (i < ss.length) { @@ -156,36 +169,37 @@ object HashSpeedTest { } def defaultHashTuples: (Product with Serializable) => Unit = p => HashHist.add(p.hashCode) + def makeNestedTuples = { val basic = Array( - (0,0), - (0,1), - (1,0), - (1,1), - (0,0,0), - (0,0,1), - (0,1,0), - (1,0,0), - (0,0,0,0), - (0,0,0,0,0), - (false,false), - (true,false), - (false,true), - (true,true), - (0.7,true,"fish"), - ((),true,'c',400,9.2,"galactic") - ) + (0, 0), + (0, 1), + (1, 0), + (1, 1), + (0, 0, 0), + (0, 0, 1), + (0, 1, 0), + (1, 0, 0), + (0, 0, 0, 0), + (0, 0, 0, 0, 0), + (false, false), + (true, false), + (false, true), + (true, true), + (0.7, true, "fish"), + ((), true, 'c', 400, 9.2, "galactic")) basic ++ - (for (i <- basic; j <- basic) yield (i,j)) ++ - (for (i <- basic; j <- basic; k <- basic) yield (i,j,k)) ++ - (for (i <- basic; j <- basic; k <- basic) yield ((i,j),k)) ++ - (for (i <- basic; j <- basic; k <- basic) yield (i,(j,k))) ++ - (for (i <- basic; j <- basic; k <- basic; l <- basic) yield (i,j,k,l)) ++ - (for (i <- basic; j <- basic; k <- basic; l <- basic) yield ((i,j),(k,l))) ++ - (for (i <- basic; j <- basic; k <- basic; l <- basic) yield (i,(j,k,l))) ++ - (for (i <- basic; j <- basic; k <- basic; l <- basic; m <- basic) yield (i,j,k,l,m)) ++ - (for (i <- basic; j <- basic; k <- basic; l <- basic; m <- basic) yield (i,(j,(k,(l,m))))) + (for (i <- basic; j <- basic) yield (i, j)) ++ + (for (i <- basic; j <- basic; k <- basic) yield (i, j, k)) ++ + (for (i <- basic; j <- basic; k <- basic) yield ((i, j), k)) ++ + (for (i <- basic; j <- basic; k <- basic) yield (i, (j, k))) ++ + (for (i <- basic; j <- basic; k <- basic; l <- basic) yield (i, j, k, l)) ++ + (for (i <- basic; j <- basic; k <- basic; l <- basic) yield ((i, j), (k, l))) ++ + (for (i <- basic; j <- basic; k <- basic; l <- basic) yield (i, (j, k, l))) ++ + (for (i <- basic; j <- basic; k <- basic; l <- basic; m <- basic) yield (i, j, k, l, m)) ++ + (for (i <- basic; j <- basic; k <- basic; l <- basic; m <- basic) yield (i, (j, (k, (l, m))))) } + def hashNestedTuples(ts: Array[Product with Serializable], hash: (Product with Serializable) => Unit) { var i = 0 while (i < ts.length) { @@ -194,21 +208,21 @@ object HashSpeedTest { } } - def findSpeed[A](n: Int, h: (Array[A],A=>Unit)=>Unit, aa: Array[A], f: A=>Unit) = { - (time { for (i <- 1 to n) { h(aa,f) } }._2, aa.length.toLong*n) + def findSpeed[A](n: Int, h: (Array[A], A => Unit) => Unit, aa: Array[A], f: A => Unit) = { + (time { for (i <- 1 to n) { h(aa, f) } }._2, aa.length.toLong * n) } - def reportSpeed[A](repeats: Int, xs: List[(String, ()=>(Long,Long))]) { - val tn = Array.fill(xs.length)((0L,0L)) + def reportSpeed[A](repeats: Int, xs: List[(String, () => (Long, Long))]) { + val tn = Array.fill(xs.length)((0L, 0L)) for (j <- 1 to repeats) { - for ((l,i) <- xs zipWithIndex) { + for ((l, i) <- xs zipWithIndex) { val x = l._2() tn(i) = (tn(i)._1 + x._1, tn(i)._2 + x._2) } } - for (((t,n),(title,_)) <- (tn zip xs)) { - val rate = (n*1e-6)/(t*1e-9) - printf("Hash rate for %s: %4.2f million/second\n",title,rate) + for (((t, n), (title, _)) <- (tn zip xs)) { + val rate = (n * 1e-6) / (t * 1e-9) + printf("Hash rate for %s: %4.2f million/second\n", title, rate) } } @@ -218,24 +232,22 @@ object HashSpeedTest { val nt = makeNestedTuples // Uncomment the following for string stats if MurmurHash3 available val cs = makeCharStrings - report("Java String hash for strings",{ hashCharStrings(cs,defaultHashString); HashHist.resultAndReset }) - report("MurmurHash3 for strings",{ hashCharStrings(cs,murmurHashString); HashHist.resultAndReset }) + report("Java String hash for strings", { hashCharStrings(cs, defaultHashString); HashHist.resultAndReset }) + report("MurmurHash3 for strings", { hashCharStrings(cs, murmurHashString); HashHist.resultAndReset }) HashHist.enabled = false reportSpeed(3, List( - ("Java string hash", () => findSpeed[String](30, (x, y) => hashCharStrings(x, y),cs,defaultHashString)), - ("MurmurHash3 string hash", () => findSpeed[String](30,(x, y) => hashCharStrings(x, y),cs,murmurHashString)) - )) + ("Java string hash", () => findSpeed[String](30, (x, y) => hashCharStrings(x, y), cs, defaultHashString)), + ("MurmurHash3 string hash", () => findSpeed[String](30, (x, y) => hashCharStrings(x, y), cs, murmurHashString)))) // reportSpeed("Java string hash",30,hashCharStrings.tupled,cs,defaultHashString) // reportSpeed("MurmurHash3 string hash",30,hashCharStrings.tupled,cs,murmurHashString) HashHist.enabled = true - report("lists of binary int lists",{ hashBinaryLists(bl,defaultHashList); HashHist.resultAndReset }) - report("small integer sets",{ hashIntSets(is,defaultHashSets); HashHist.resultAndReset }) - report("small nested tuples",{ hashNestedTuples(nt,defaultHashTuples); HashHist.resultAndReset }) + report("lists of binary int lists", { hashBinaryLists(bl, defaultHashList); HashHist.resultAndReset }) + report("small integer sets", { hashIntSets(is, defaultHashSets); HashHist.resultAndReset }) + report("small nested tuples", { hashNestedTuples(nt, defaultHashTuples); HashHist.resultAndReset }) HashHist.enabled = false - reportSpeed(3,List( - ("lists of lists of binary ints", () => findSpeed(20,hashBinaryLists,bl,defaultHashList)), - ("small integer sets", () => findSpeed(10,hashIntSets,is,defaultHashSets)), - ("small nested tuples", () => findSpeed(5,hashNestedTuples,nt,defaultHashTuples)) - )) + reportSpeed(3, List( + ("lists of lists of binary ints", () => findSpeed(20, hashBinaryLists, bl, defaultHashList)), + ("small integer sets", () => findSpeed(10, hashIntSets, is, defaultHashSets)), + ("small nested tuples", () => findSpeed(5, hashNestedTuples, nt, defaultHashTuples)))) } } |