From 24a227d23d651caa121bbe50ee3c81dfef17a337 Mon Sep 17 00:00:00 2001 From: RĂ¼diger Klaehn Date: Tue, 14 Jan 2014 23:37:15 +0100 Subject: Implements specialized subsetOf for HashSet Fixes SI-7326. This also adds a basic test for subsetOf that was missing before. --- .../scala/collection/immutable/HashSet.scala | 85 +++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) (limited to 'src/library') diff --git a/src/library/scala/collection/immutable/HashSet.scala b/src/library/scala/collection/immutable/HashSet.scala index 0546c54826..9ebcf72d24 100644 --- a/src/library/scala/collection/immutable/HashSet.scala +++ b/src/library/scala/collection/immutable/HashSet.scala @@ -12,9 +12,9 @@ package scala package collection package immutable -import scala.annotation.unchecked.{ uncheckedVariance => uV } import generic._ import scala.collection.parallel.immutable.ParHashSet +import scala.collection.GenSet /** This class implements immutable sets using a hash trie. * @@ -54,6 +54,30 @@ class HashSet[A] extends AbstractSet[A] def contains(e: A): Boolean = get0(e, computeHash(e), 0) + override def subsetOf(that: GenSet[A]) = that match { + case that:HashSet[A] => + // call the specialized implementation with a level of 0 since both this and that are top-level hash sets + subsetOf0(that, 0) + case _ => + // call the generic implementation + super.subsetOf(that) + } + + /** + * A specialized implementation of subsetOf for when both this and that are HashSet[A] and we can take advantage + * of the tree structure of both operands and the precalculated hashcodes of the HashSet1 instances. + * @param that the other set + * @param level the level of this and that hashset + * The purpose of level is to keep track of how deep we are in the tree. + * We need this information for when we arrive at a leaf and have to call get0 on that + * The value of level is 0 for a top-level HashSet and grows in increments of 5 + * @return true if all elements of this set are contained in that set + */ + protected def subsetOf0(that: HashSet[A], level: Int) = { + // The default implementation is for the empty set and returns true because the empty set is a subset of all sets + true + } + override def + (e: A): HashSet[A] = updated0(e, computeHash(e), 0) override def + (elem1: A, elem2: A, elems: A*): HashSet[A] = @@ -136,6 +160,14 @@ object HashSet extends ImmutableSetFactory[HashSet] { override def get0(key: A, hash: Int, level: Int): Boolean = (hash == this.hash && key == this.key) + override def subsetOf0(that: HashSet[A], level: Int) = { + // check if that contains this.key + // we use get0 with our key and hash at the correct level instead of calling contains, + // which would not work since that might not be a top-level HashSet + // and in any case would be inefficient because it would require recalculating the hash code + that.get0(key, hash, level) + } + override def updated0(key: A, hash: Int, level: Int): HashSet[A] = if (hash == this.hash && key == this.key) this else { @@ -162,6 +194,14 @@ object HashSet extends ImmutableSetFactory[HashSet] { override def get0(key: A, hash: Int, level: Int): Boolean = if (hash == this.hash) ks.contains(key) else false + override def subsetOf0(that: HashSet[A], level: Int) = { + // we have to check each element + // we use get0 with our hash at the correct level instead of calling contains, + // which would not work since that might not be a top-level HashSet + // and in any case would be inefficient because it would require recalculating the hash code + ks.forall(key => that.get0(key, hash, level)) + } + override def updated0(key: A, hash: Int, level: Int): HashSet[A] = if (hash == this.hash) new HashSetCollision1(hash, ks + key) else makeHashTrieSet(this.hash, this, hash, new HashSet1(key, hash), level) @@ -279,6 +319,49 @@ object HashSet extends ImmutableSetFactory[HashSet] { } } + override def subsetOf0(that: HashSet[A], level: Int): Boolean = if (that eq this) true else that match { + case that: HashTrieSet[A] if this.size0 <= that.size0 => + // create local mutable copies of members + var abm = this.bitmap + val a = this.elems + var ai = 0 + val b = that.elems + var bbm = that.bitmap + var bi = 0 + if ((abm & bbm) == abm) { + // I tried rewriting this using tail recursion, but the generated java byte code was less than optimal + while(abm!=0) { + // highest remaining bit in abm + val alsb = abm ^ (abm & (abm - 1)) + // highest remaining bit in bbm + val blsb = bbm ^ (bbm & (bbm - 1)) + // if both trees have a bit set at the same position, we need to check the subtrees + if (alsb == blsb) { + // we are doing a comparison of a child of this with a child of that, + // so we have to increase the level by 5 to keep track of how deep we are in the tree + if (!a(ai).subsetOf0(b(bi), level + 5)) + return false + // clear lowest remaining one bit in abm and increase the a index + abm &= ~alsb; ai += 1 + } + // clear lowermost remaining one bit in bbm and increase the b index + // we must do this in any case + bbm &= ~blsb; bi += 1 + } + true + } else { + // the bitmap of this contains more one bits than the bitmap of that, + // so this can not possibly be a subset of that + false + } + case _ => + // if the other set is a HashTrieSet but has less elements than this, it can not be a subset + // if the other set is a HashSet1, we can not be a subset of it because we are a HashTrieSet with at least two children (see assertion) + // if the other set is a HashSetCollision1, we can not be a subset of it because we are a HashTrieSet with at least two different hash codes + // if the other set is the empty set, we are not a subset of it because we are not empty + false + } + override def iterator = new TrieIterator[A](elems.asInstanceOf[Array[Iterable[A]]]) { final override def getElem(cc: AnyRef): A = cc.asInstanceOf[HashSet1[A]].key } -- cgit v1.2.3 From b33740f0b46ea8f8f6eb5efa7ecfcf7bcb6b57cd Mon Sep 17 00:00:00 2001 From: RĂ¼diger Klaehn Date: Tue, 14 Jan 2014 23:38:03 +0100 Subject: Improved documentation of HashTrieSet internals Added some more general documentation about how levels work and how the bitmap field encodes the position of elems. --- .../scala/collection/immutable/HashSet.scala | 36 ++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'src/library') diff --git a/src/library/scala/collection/immutable/HashSet.scala b/src/library/scala/collection/immutable/HashSet.scala index 9ebcf72d24..384cb6631f 100644 --- a/src/library/scala/collection/immutable/HashSet.scala +++ b/src/library/scala/collection/immutable/HashSet.scala @@ -237,6 +237,42 @@ object HashSet extends ImmutableSetFactory[HashSet] { } + /** + * A branch node of the HashTrieSet with at least one and up to 32 children. + * + * @param bitmap encodes which element corresponds to which child + * @param elems the up to 32 children of this node. + * the number of children must be identical to the number of 1 bits in bitmap + * @param size0 the total number of elements. This is stored just for performance reasons. + * @tparam A the type of the elements contained in this hash set. + * + * How levels work: + * + * When looking up or adding elements, the part of the hashcode that is used to address the children array depends + * on how deep we are in the tree. This is accomplished by having a level parameter in all internal methods + * that starts at 0 and increases by 5 (32 = 2^5) every time we go deeper into the tree. + * + * hashcode (binary): 00000000000000000000000000000000 + * level=0 (depth=0) ^^^^^ + * level=5 (depth=1) ^^^^^ + * level=10 (depth=2) ^^^^^ + * ... + * + * Be careful: a non-toplevel HashTrieSet is not a self-contained set, so e.g. calling contains on it will not work! + * It relies on its depth in the Trie for which part of a hash to use to address the children, but this information + * (the level) is not stored due to storage efficiency reasons but has to be passed explicitly! + * + * How bitmap and elems correspond: + * + * A naive implementation of a HashTrieSet would always have an array of size 32 for children and leave the unused + * children empty (null). But that would be very wasteful regarding memory. Instead, only non-empty children are + * stored in elems, and the bitmap is used to encode which elem corresponds to which child bucket. The lowest 1 bit + * corresponds to the first element, the second-lowest to the second, etc. + * + * bitmap (binary): 00010000000000000000100000000000 + * elems: [a,b] + * children: ---b----------------a----------- + */ class HashTrieSet[A](private val bitmap: Int, private[collection] val elems: Array[HashSet[A]], private val size0: Int) extends HashSet[A] { assert(Integer.bitCount(bitmap) == elems.length) -- cgit v1.2.3