diff options
author | Paul Phillips <paulp@improving.org> | 2011-11-24 01:24:28 +0000 |
---|---|---|
committer | Paul Phillips <paulp@improving.org> | 2011-11-24 01:24:28 +0000 |
commit | 60fb9ec19b50cd8059122ccffd72014b3eefc51f (patch) | |
tree | 47f47b7b9ad13fb79df825e4e42864a9f53d7035 /src/library | |
parent | 4cfca8a7f6763fbbaab37a3473d74118b3ec52bc (diff) | |
download | scala-60fb9ec19b50cd8059122ccffd72014b3eefc51f.tar.gz scala-60fb9ec19b50cd8059122ccffd72014b3eefc51f.tar.bz2 scala-60fb9ec19b50cd8059122ccffd72014b3eefc51f.zip |
Refinements of "def seq" and murmurhash.
Trying to make hashcodes faster. Didn't achieve much on that front, so
redirected into structural/consistency issues. The latter was lacking
in terms of how/where "def seq" was defined. The documentation I can
find doesn't give me much hint that the sequential form of my sequential
collection might be a single-use iterator! (As in StringOps, ArrayOps.)
If that's intentional it should be in huge letters. I'm assuming for now
that it wasn't.
Also, there was this:
GenMapLike: def seq: Map[A, B]
GenSetLike: def seq: Set[A]
GenSeqLike: // nothing, returns Traversable
So I added some def seqs where I needed the more specific types for
my hashcode work. Hashcodewise, I broke the MurmurHash3 object into
a reusable class and a collections-specific object, and I deprecated
the methods which took GenTraversableOnce in favor of ones taking
TraversableOnce, because there's no reason the hashcode library should
have to know about things like "make sure to call seq before you
traverse or you'll be sorry." Exclude things by their type and you can
never make a mistake. End transmission.
Diffstat (limited to 'src/library')
19 files changed, 103 insertions, 57 deletions
diff --git a/src/library/scala/collection/GenMapLike.scala b/src/library/scala/collection/GenMapLike.scala index f4a3903c13..12ecbcf140 100644 --- a/src/library/scala/collection/GenMapLike.scala +++ b/src/library/scala/collection/GenMapLike.scala @@ -29,10 +29,9 @@ trait GenMapLike[A, +B, +Repr] extends GenIterableLike[(A, B), Repr] with Equals def +[B1 >: B](kv: (A, B1)): GenMap[A, B1] def - (key: A): Repr - // This hash code must be symmetric in the contents but ought not // collide trivially. - override def hashCode() = util.MurmurHash3.symmetricHash(seq, Map.hashSeed) + override def hashCode() = util.MurmurHash3.mapHash(seq) /** Returns the value associated with a key, or a default value if the key is not contained in the map. * @param key the key. diff --git a/src/library/scala/collection/GenSeqLike.scala b/src/library/scala/collection/GenSeqLike.scala index 2d50cb18b3..b3dd4764a9 100644 --- a/src/library/scala/collection/GenSeqLike.scala +++ b/src/library/scala/collection/GenSeqLike.scala @@ -31,6 +31,7 @@ import annotation.bridge * Unlike iterables, sequences always have a defined order of elements. */ trait GenSeqLike[+A, +Repr] extends GenIterableLike[A, Repr] with Equals with Parallelizable[A, parallel.ParSeq[A]] { + def seq: Seq[A] /** Selects an element by its index in the $coll. * @@ -439,16 +440,7 @@ trait GenSeqLike[+A, +Repr] extends GenIterableLike[A, Repr] with Equals with Pa /** Hashcodes for $Coll produce a value from the hashcodes of all the * elements of the $coll. */ - override def hashCode() = { - import util.MurmurHash3._ - var n = 0 - var h = Seq.hashSeed - seq foreach { - x => h = mix(h, x.##) - n += 1 - } - finalizeHash(h, n) - } + override def hashCode() = util.MurmurHash3.seqHash(seq) /** The equals method for arbitrary sequences. Compares this sequence to * some other object. diff --git a/src/library/scala/collection/GenSetLike.scala b/src/library/scala/collection/GenSetLike.scala index adbb043ecd..f729f82bb4 100644 --- a/src/library/scala/collection/GenSetLike.scala +++ b/src/library/scala/collection/GenSetLike.scala @@ -143,6 +143,5 @@ extends GenIterableLike[A, Repr] // Calling map on a set drops duplicates: any hashcode collisions would // then be dropped before they can be added. // Hash should be symmetric in set entries, but without trivial collisions. - override def hashCode() = util.MurmurHash3.symmetricHash(seq, Set.hashSeed) - + override def hashCode() = util.MurmurHash3.setHash(seq) } diff --git a/src/library/scala/collection/IndexedSeq.scala b/src/library/scala/collection/IndexedSeq.scala index 13c48ec144..4a3586a375 100644 --- a/src/library/scala/collection/IndexedSeq.scala +++ b/src/library/scala/collection/IndexedSeq.scala @@ -20,6 +20,7 @@ trait IndexedSeq[+A] extends Seq[A] with GenericTraversableTemplate[A, IndexedSeq] with IndexedSeqLike[A, IndexedSeq[A]] { override def companion: GenericCompanion[IndexedSeq] = IndexedSeq + override def seq: IndexedSeq[A] = this } /** $factoryInfo diff --git a/src/library/scala/collection/IndexedSeqLike.scala b/src/library/scala/collection/IndexedSeqLike.scala index b743174260..baf5606ab5 100644 --- a/src/library/scala/collection/IndexedSeqLike.scala +++ b/src/library/scala/collection/IndexedSeqLike.scala @@ -6,8 +6,6 @@ ** |/ ** \* */ - - package scala.collection import generic._ @@ -42,6 +40,9 @@ import scala.annotation.tailrec trait IndexedSeqLike[+A, +Repr] extends SeqLike[A, Repr] { self => + def seq: IndexedSeq[A] + override def hashCode() = util.MurmurHash3.seqHash(seq) // TODO - can we get faster via "indexedSeqHash" ? + override protected[this] def thisCollection: IndexedSeq[A] = this.asInstanceOf[IndexedSeq[A]] override protected[this] def toCollection(repr: Repr): IndexedSeq[A] = repr.asInstanceOf[IndexedSeq[A]] @@ -96,4 +97,3 @@ trait IndexedSeqLike[+A, +Repr] extends SeqLike[A, Repr] { result } } - diff --git a/src/library/scala/collection/LinearSeq.scala b/src/library/scala/collection/LinearSeq.scala index dfa0a0ed68..be143cf96b 100644 --- a/src/library/scala/collection/LinearSeq.scala +++ b/src/library/scala/collection/LinearSeq.scala @@ -20,6 +20,7 @@ trait LinearSeq[+A] extends Seq[A] with GenericTraversableTemplate[A, LinearSeq] with LinearSeqLike[A, LinearSeq[A]] { override def companion: GenericCompanion[LinearSeq] = LinearSeq + override def seq: LinearSeq[A] = this } /** $factoryInfo diff --git a/src/library/scala/collection/LinearSeqLike.scala b/src/library/scala/collection/LinearSeqLike.scala index 1b3fa9b26e..75c1edac66 100644 --- a/src/library/scala/collection/LinearSeqLike.scala +++ b/src/library/scala/collection/LinearSeqLike.scala @@ -41,11 +41,16 @@ import scala.util.control.Breaks._ * @tparam A the element type of the $coll * @tparam Repr the type of the actual $coll containing the elements. */ -trait LinearSeqLike[+A, +Repr <: LinearSeqLike[A, Repr]] extends SeqLike[A, Repr] { self: Repr => +trait LinearSeqLike[+A, +Repr <: LinearSeqLike[A, Repr]] extends SeqLike[A, Repr] { + self: Repr => override protected[this] def thisCollection: LinearSeq[A] = this.asInstanceOf[LinearSeq[A]] override protected[this] def toCollection(repr: Repr): LinearSeq[A] = repr.asInstanceOf[LinearSeq[A]] + def seq: LinearSeq[A] + + override def hashCode() = util.MurmurHash3.seqHash(seq) // TODO - can we get faster via "linearSeqHash" ? + override /*IterableLike*/ def iterator: Iterator[A] = new AbstractIterator[A] { var these = self diff --git a/src/library/scala/collection/Map.scala b/src/library/scala/collection/Map.scala index a72bb13b5d..0c07d5bb74 100644 --- a/src/library/scala/collection/Map.scala +++ b/src/library/scala/collection/Map.scala @@ -37,9 +37,6 @@ trait Map[A, +B] extends Iterable[(A, B)] with GenMap[A, B] with MapLike[A, B, M * @define coll map */ object Map extends MapFactory[Map] { - - private[collection] val hashSeed = "Map".hashCode - def empty[A, B]: immutable.Map[A, B] = immutable.Map.empty /** $mapCanBuildFromInfo */ diff --git a/src/library/scala/collection/Seq.scala b/src/library/scala/collection/Seq.scala index cb3fe5ca71..fd03a49af4 100644 --- a/src/library/scala/collection/Seq.scala +++ b/src/library/scala/collection/Seq.scala @@ -6,8 +6,6 @@ ** |/ ** \* */ - - package scala.collection import generic._ @@ -32,9 +30,6 @@ trait Seq[+A] extends PartialFunction[Int, A] * @define Coll Seq */ object Seq extends SeqFactory[Seq] { - - private[collection] val hashSeed = "Seq".hashCode - /** $genericCanBuildFromInfo */ implicit def canBuildFrom[A]: CanBuildFrom[Coll, A, Seq[A]] = ReusableCBF.asInstanceOf[GenericCanBuildFrom[A]] diff --git a/src/library/scala/collection/Set.scala b/src/library/scala/collection/Set.scala index bdfbcd9c74..4c67aad603 100644 --- a/src/library/scala/collection/Set.scala +++ b/src/library/scala/collection/Set.scala @@ -38,8 +38,6 @@ trait Set[A] extends (A => Boolean) * @define Coll Set */ object Set extends SetFactory[Set] { - private[collection] val hashSeed = "Set".hashCode - def newBuilder[A] = immutable.Set.newBuilder[A] override def empty[A]: Set[A] = immutable.Set.empty[A] implicit def canBuildFrom[A]: CanBuildFrom[Coll, A, Set[A]] = setCanBuildFrom[A] diff --git a/src/library/scala/collection/immutable/IndexedSeq.scala b/src/library/scala/collection/immutable/IndexedSeq.scala index 35b93860a3..bbbef158af 100644 --- a/src/library/scala/collection/immutable/IndexedSeq.scala +++ b/src/library/scala/collection/immutable/IndexedSeq.scala @@ -23,6 +23,7 @@ trait IndexedSeq[+A] extends Seq[A] with IndexedSeqLike[A, IndexedSeq[A]] { override def companion: GenericCompanion[IndexedSeq] = IndexedSeq override def toIndexedSeq[B >: A]: IndexedSeq[B] = this + override def seq: IndexedSeq[A] = this } /** $factoryInfo diff --git a/src/library/scala/collection/immutable/LinearSeq.scala b/src/library/scala/collection/immutable/LinearSeq.scala index e2e2d97588..536894c287 100644 --- a/src/library/scala/collection/immutable/LinearSeq.scala +++ b/src/library/scala/collection/immutable/LinearSeq.scala @@ -23,6 +23,7 @@ trait LinearSeq[+A] extends Seq[A] with GenericTraversableTemplate[A, LinearSeq] with LinearSeqLike[A, LinearSeq[A]] { override def companion: GenericCompanion[LinearSeq] = LinearSeq + override def seq: LinearSeq[A] = this } /** $factoryInfo diff --git a/src/library/scala/collection/immutable/Set.scala b/src/library/scala/collection/immutable/Set.scala index fe3ce8fd0b..cd972d6c30 100644 --- a/src/library/scala/collection/immutable/Set.scala +++ b/src/library/scala/collection/immutable/Set.scala @@ -46,8 +46,6 @@ object Set extends ImmutableSetFactory[Set] { implicit def canBuildFrom[A]: CanBuildFrom[Coll, A, Set[A]] = setCanBuildFrom[A] override def empty[A]: Set[A] = EmptySet.asInstanceOf[Set[A]] - private val hashSeed = "Set".hashCode - /** An optimized representation for immutable empty sets */ private object EmptySet extends AbstractSet[Any] with Set[Any] with Serializable { override def size: Int = 0 diff --git a/src/library/scala/collection/immutable/StringOps.scala b/src/library/scala/collection/immutable/StringOps.scala index 5fc71c7259..8612357db9 100644 --- a/src/library/scala/collection/immutable/StringOps.scala +++ b/src/library/scala/collection/immutable/StringOps.scala @@ -48,5 +48,5 @@ final class StringOps(override val repr: String) extends StringLike[String] { override def toString = repr override def length = repr.length - def seq = this.iterator + def seq = new WrappedString(repr) } diff --git a/src/library/scala/collection/mutable/ArrayOps.scala b/src/library/scala/collection/mutable/ArrayOps.scala index 008e7fab9a..5f7c508181 100644 --- a/src/library/scala/collection/mutable/ArrayOps.scala +++ b/src/library/scala/collection/mutable/ArrayOps.scala @@ -93,7 +93,7 @@ abstract class ArrayOps[T] extends ArrayLike[T, Array[T]] with CustomParalleliza bb.result } - def seq = this.iterator + def seq = thisCollection } diff --git a/src/library/scala/collection/mutable/IndexedSeq.scala b/src/library/scala/collection/mutable/IndexedSeq.scala index e55cf0429f..0e2e06df84 100644 --- a/src/library/scala/collection/mutable/IndexedSeq.scala +++ b/src/library/scala/collection/mutable/IndexedSeq.scala @@ -23,6 +23,7 @@ trait IndexedSeq[A] extends Seq[A] with GenericTraversableTemplate[A, IndexedSeq] with IndexedSeqLike[A, IndexedSeq[A]] { override def companion: GenericCompanion[IndexedSeq] = IndexedSeq + override def seq: IndexedSeq[A] = this } /** $factoryInfo diff --git a/src/library/scala/collection/mutable/LinearSeq.scala b/src/library/scala/collection/mutable/LinearSeq.scala index 0994c9e885..e29d6bf3e6 100644 --- a/src/library/scala/collection/mutable/LinearSeq.scala +++ b/src/library/scala/collection/mutable/LinearSeq.scala @@ -25,6 +25,7 @@ trait LinearSeq[A] extends Seq[A] with GenericTraversableTemplate[A, LinearSeq] with LinearSeqLike[A, LinearSeq[A]] { override def companion: GenericCompanion[LinearSeq] = LinearSeq + override def seq: LinearSeq[A] = this } /** $factoryInfo diff --git a/src/library/scala/util/MurmurHash3.scala b/src/library/scala/util/MurmurHash3.scala index 0fd8b2e123..33d9d2f0e5 100644 --- a/src/library/scala/util/MurmurHash3.scala +++ b/src/library/scala/util/MurmurHash3.scala @@ -2,7 +2,6 @@ package scala.util import java.lang.Integer.{ rotateLeft => rotl } - /** * An implementation of Austin Appleby's MurmurHash 3 algorithm * (MurmurHash3_x86_32). @@ -22,15 +21,7 @@ import java.lang.Integer.{ rotateLeft => rotl } * * @see http://code.google.com/p/smhasher */ -object MurmurHash3 { - - // Some arbitrary values used as hash seeds - final val arraySeed = 0x3c074a61 - final val stringSeed = 0xf7ca7fd2 - final val productSeed = 0xcafebabe - final val symmetricSeed = 0xb592f7ae - final val traversableSeed = 0xe73a8b15 - +class MurmurHash3 { /** Mix in a block of data into an intermediate hash value. */ final def mix(hash: Int, data: Int): Int = { var h = mixLast(hash, data) @@ -68,7 +59,7 @@ object MurmurHash3 { } /** Compute the hash of a product */ - final def productHash(x: Product, seed: Int = productSeed): Int = { + final def productHash(x: Product, seed: Int): Int = { val arr = x.productArity // Case objects have the hashCode inlined directly into the // synthetic hashCode method, but this method should still give @@ -88,7 +79,7 @@ object MurmurHash3 { } /** Compute the hash of a string */ - final def stringHash(str: String, seed: Int = stringSeed): Int = { + final def stringHash(str: String, seed: Int): Int = { var h = seed var i = 0 while (i + 1 < str.length) { @@ -102,38 +93,39 @@ object MurmurHash3 { /** Compute a hash that is symmetric in its arguments - that is a hash * where the order of appearance of elements does not matter. - * This is useful for hashing sets, for example. */ - final def symmetricHash[T](xs: collection.GenTraversableOnce[T], seed: Int = symmetricSeed): Int = { + * This is useful for hashing sets, for example. + */ + final def unorderedHash(xs: TraversableOnce[Any], seed: Int): Int = { var a, b, n = 0 var c = 1 - xs.seq.foreach { x => + xs foreach { x => val h = x.## a += h b ^= h if (h != 0) c *= h n += 1 } - var h = seed h = mix(h, a) h = mix(h, b) h = mixLast(h, c) finalizeHash(h, n) } - - /** Compute a hash for a traversable (once). */ - final def traversableHash[T](xs: collection.GenTraversableOnce[T], seed: Int = traversableSeed): Int = { + /** Compute a hash that depends on the order of its arguments. + */ + final def orderedHash(xs: TraversableOnce[Any], seed: Int): Int = { var n = 0 var h = seed - xs.seq.foreach { x => + xs foreach { x => h = mix(h, x.##) n += 1 } finalizeHash(h, n) } - /** Compute the hash of an array */ - final def arrayHash[@specialized T](a: Array[T], seed: Int = arraySeed): Int = { + /** Compute the hash of an array. + */ + final def arrayHash[@specialized T](a: Array[T], seed: Int): Int = { var h = seed var i = 0 while (i < a.length) { @@ -144,8 +136,9 @@ object MurmurHash3 { } /** Compute the hash of a byte array. Faster than arrayHash, because - * it hashes 4 bytes at once. */ - final def bytesHash(data: Array[Byte], seed: Int = arraySeed): Int = { + * it hashes 4 bytes at once. + */ + final def bytesHash(data: Array[Byte], seed: Int): Int = { var len = data.length var h = seed @@ -176,3 +169,67 @@ object MurmurHash3 { finalizeHash(h, data.length) } } + +/** + * An instance of MurmurHash3 with predefined seeds for various + * classes. Used by all the scala collections and case classes. + */ +object MurmurHash3 extends MurmurHash3 { + final val arraySeed = 0x3c074a61 + final val stringSeed = 0xf7ca7fd2 + final val productSeed = 0xcafebabe + final val symmetricSeed = 0xb592f7ae + final val traversableSeed = 0xe73a8b15 + final val seqSeed = "Seq".hashCode + final val mapSeed = "Map".hashCode + final val setSeed = "Set".hashCode + + def arrayHash[@specialized T](a: Array[T]): Int = arrayHash(a, arraySeed) + def bytesHash(data: Array[Byte]): Int = bytesHash(data, arraySeed) + def orderedHash(xs: TraversableOnce[Any]): Int = orderedHash(xs, symmetricSeed) + def productHash(x: Product): Int = productHash(x, productSeed) + def stringHash(x: String): Int = stringHash(x, stringSeed) + def unorderedHash(xs: TraversableOnce[Any]): Int = unorderedHash(xs, traversableSeed) + + /** To offer some potential for optimization. + */ + def seqHash(xs: collection.Seq[_]): Int = orderedHash(xs, seqSeed) + def mapHash(xs: collection.Map[_, _]): Int = unorderedHash(xs, mapSeed) + def setHash(xs: collection.Set[_]): Int = unorderedHash(xs, setSeed) + + /** All this trouble and foreach still appears faster. + * Leaving in place in case someone would like to investigate further. + */ + /** + def linearSeqHash(xs: collection.LinearSeq[_], seed: Int): Int = { + var n = 0 + var h = seed + var elems = xs + while (elems.nonEmpty) { + h = mix(h, elems.head.##) + n += 1 + elems = elems.tail + } + finalizeHash(h, n) + } + + def indexedSeqHash(xs: collection.IndexedSeq[_], seed: Int): Int = { + var n = 0 + var h = seed + val len = xs.length + while (n < len) { + h = mix(h, xs(n).##) + n += 1 + } + finalizeHash(h, n) + } + */ + + @deprecated("Use unorderedHash", "2.10.0") + final def symmetricHash[T](xs: collection.GenTraversableOnce[T], seed: Int = symmetricSeed): Int = + unorderedHash(xs.seq, seed) + + @deprecated("Use orderedHash", "2.10.0") + final def traversableHash[T](xs: collection.GenTraversableOnce[T], seed: Int = traversableSeed): Int = + orderedHash(xs.seq, seed) +} diff --git a/src/library/scala/xml/Utility.scala b/src/library/scala/xml/Utility.scala index 9f230a7866..9b48f4e1bb 100644 --- a/src/library/scala/xml/Utility.scala +++ b/src/library/scala/xml/Utility.scala @@ -259,7 +259,7 @@ object Utility extends AnyRef with parsing.TokenTests { * @param children */ def hashCode(pre: String, label: String, attribHashCode: Int, scpeHash: Int, children: Seq[Node]) = - scala.util.MurmurHash3.traversableHash(label +: attribHashCode +: scpeHash +: children, pre.##) + scala.util.MurmurHash3.orderedHash(label +: attribHashCode +: scpeHash +: children, pre.##) def appendQuoted(s: String): String = sbToString(appendQuoted(s, _)) |