summaryrefslogtreecommitdiff
path: root/src/library/scala/util/hashing/MurmurHash3.scala
blob: 6a569104519aea340a44afb19d328cfa437d2ed8 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
/*                     __                                               *\
**     ________ ___   / /  ___     Scala API                            **
**    / __/ __// _ | / /  / _ |    (c) 2003-2013, LAMP/EPFL             **
**  __\ \/ /__/ __ |/ /__/ __ |    http://scala-lang.org/               **
** /____/\___/_/ |_/____/_/ | |                                         **
**                          |/                                          **
\*                                                                      */

package scala
package util.hashing

import java.lang.Integer.{ rotateLeft => rotl }

private[hashing] class MurmurHash3 {
  /** Mix in a block of data into an intermediate hash value. */
  final def mix(hash: Int, data: Int): Int = {
    var h = mixLast(hash, data)
    h = rotl(h, 13)
    h * 5 + 0xe6546b64
  }

  /** May optionally be used as the last mixing step. Is a little bit faster than mix,
   *  as it does no further mixing of the resulting hash. For the last element this is not
   *  necessary as the hash is thoroughly mixed during finalization anyway. */
  final def mixLast(hash: Int, data: Int): Int = {
    var k = data

    k *= 0xcc9e2d51
    k = rotl(k, 15)
    k *= 0x1b873593

    hash ^ k
  }

  /** Finalize a hash to incorporate the length and make sure all bits avalanche. */
  final def finalizeHash(hash: Int, length: Int): Int = avalanche(hash ^ length)

  /** Force all bits of the hash to avalanche. Used for finalizing the hash. */
  private final def avalanche(hash: Int): Int = {
    var h = hash

    h ^= h >>> 16
    h *= 0x85ebca6b
    h ^= h >>> 13
    h *= 0xc2b2ae35
    h ^= h >>> 16

    h
  }

  /** Compute the hash of a product */
  final def productHash(x: Product, seed: Int): Int = {
    val arr = x.productArity
    // Case objects have the hashCode inlined directly into the
    // synthetic hashCode method, but this method should still give
    // a correct result if passed a case object.
    if (arr == 0) {
      x.productPrefix.hashCode
    }
    else {
      var h = seed
      var i = 0
      while (i < arr) {
        h = mix(h, x.productElement(i).##)
        i += 1
      }
      finalizeHash(h, arr)
    }
  }

  /** Compute the hash of a string */
  final def stringHash(str: String, seed: Int): Int = {
    var h = seed
    var i = 0
    while (i + 1 < str.length) {
      val data = (str.charAt(i) << 16) + str.charAt(i + 1)
      h = mix(h, data)
      i += 2
    }
    if (i < str.length) h = mixLast(h, str.charAt(i).toInt)
    finalizeHash(h, str.length)
  }

  /** Compute a hash that is symmetric in its arguments - that is a hash
   *  where the order of appearance of elements does not matter.
   *  This is useful for hashing sets, for example.
   */
  final def unorderedHash(xs: TraversableOnce[Any], seed: Int): Int = {
    var a, b, n = 0
    var c = 1
    xs foreach { x =>
      val h = x.##
      a += h
      b ^= h
      if (h != 0) c *= h
      n += 1
    }
    var h = seed
    h = mix(h, a)
    h = mix(h, b)
    h = mixLast(h, c)
    finalizeHash(h, n)
  }
  /** Compute a hash that depends on the order of its arguments.
   */
  final def orderedHash(xs: TraversableOnce[Any], seed: Int): Int = {
    var n = 0
    var h = seed
    xs foreach { x =>
      h = mix(h, x.##)
      n += 1
    }
    finalizeHash(h, n)
  }

  /** Compute the hash of an array.
   */
  final def arrayHash[@specialized T](a: Array[T], seed: Int): Int = {
    var h = seed
    var i = 0
    while (i < a.length) {
      h = mix(h, a(i).##)
      i += 1
    }
    finalizeHash(h, a.length)
  }

  /** Compute the hash of a byte array. Faster than arrayHash, because
   *  it hashes 4 bytes at once.
   */
  final def bytesHash(data: Array[Byte], seed: Int): Int = {
    var len = data.length
    var h = seed

    // Body
    var i = 0
    while(len >= 4) {
      var k = data(i + 0) & 0xFF
      k |= (data(i + 1) & 0xFF) << 8
      k |= (data(i + 2) & 0xFF) << 16
      k |= (data(i + 3) & 0xFF) << 24

      h = mix(h, k)

      i += 4
      len -= 4
    }

    // Tail
    var k = 0
    if(len == 3) k ^= (data(i + 2) & 0xFF) << 16
    if(len >= 2) k ^= (data(i + 1) & 0xFF) << 8
    if(len >= 1) {
      k ^= (data(i + 0) & 0xFF)
      h = mixLast(h, k)
    }

    // Finalization
    finalizeHash(h, data.length)
  }

  final def listHash(xs: scala.collection.immutable.List[_], seed: Int): Int = {
    var n = 0
    var h = seed
    var elems = xs
    while (!elems.isEmpty) {
      val head = elems.head
      val tail = elems.tail
      h = mix(h, head.##)
      n += 1
      elems = tail
    }
    finalizeHash(h, n)
  }
}

/**
 * An implementation of Austin Appleby's MurmurHash 3 algorithm
 * (MurmurHash3_x86_32). This object contains methods that hash
 * values of various types as well as means to construct `Hashing`
 * objects.
 *
 * This algorithm is designed to generate well-distributed non-cryptographic
 * hashes. It is designed to hash data in 32 bit chunks (ints).
 *
 * The mix method needs to be called at each step to update the intermediate
 * hash value. For the last chunk to incorporate into the hash mixLast may
 * be used instead, which is slightly faster. Finally finalizeHash needs to
 * be called to compute the final hash value.
 *
 * This is based on the earlier MurmurHash3 code by Rex Kerr, but the
 * MurmurHash3 algorithm was since changed by its creator Austin Appleby
 * to remedy some weaknesses and improve performance. This represents the
 * latest and supposedly final version of the algorithm (revision 136).
 *
 * @see [[https://github.com/aappleby/smhasher]]
 */
object MurmurHash3 extends MurmurHash3 {
  final val arraySeed       = 0x3c074a61
  final val stringSeed      = 0xf7ca7fd2
  final val productSeed     = 0xcafebabe
  final val symmetricSeed   = 0xb592f7ae
  final val traversableSeed = 0xe73a8b15
  final val seqSeed         = "Seq".hashCode
  final val mapSeed         = "Map".hashCode
  final val setSeed         = "Set".hashCode

  def arrayHash[@specialized T](a: Array[T]): Int  = arrayHash(a, arraySeed)
  def bytesHash(data: Array[Byte]): Int            = bytesHash(data, arraySeed)
  def orderedHash(xs: TraversableOnce[Any]): Int   = orderedHash(xs, symmetricSeed)
  def productHash(x: Product): Int                 = productHash(x, productSeed)
  def stringHash(x: String): Int                   = stringHash(x, stringSeed)
  def unorderedHash(xs: TraversableOnce[Any]): Int = unorderedHash(xs, traversableSeed)

  /** To offer some potential for optimization.
   */
  def seqHash(xs: scala.collection.Seq[_]): Int    = xs match {
    case xs: List[_] => listHash(xs, seqSeed)
    case xs => orderedHash(xs, seqSeed)
  }

  def mapHash(xs: scala.collection.Map[_, _]): Int = unorderedHash(xs, mapSeed)
  def setHash(xs: scala.collection.Set[_]): Int    = unorderedHash(xs, setSeed)

  class ArrayHashing[@specialized T] extends Hashing[Array[T]] {
    def hash(a: Array[T]) = arrayHash(a)
  }

  def arrayHashing[@specialized T] = new ArrayHashing[T]

  def bytesHashing = new Hashing[Array[Byte]] {
    def hash(data: Array[Byte]) = bytesHash(data)
  }

  def orderedHashing = new Hashing[TraversableOnce[Any]] {
    def hash(xs: TraversableOnce[Any]) = orderedHash(xs)
  }

  def productHashing = new Hashing[Product] {
    def hash(x: Product) = productHash(x)
  }

  def stringHashing = new Hashing[String] {
    def hash(x: String) = stringHash(x)
  }

  def unorderedHashing = new Hashing[TraversableOnce[Any]] {
    def hash(xs: TraversableOnce[Any]) = unorderedHash(xs)
  }

  /** All this trouble and foreach still appears faster.
   *  Leaving in place in case someone would like to investigate further.
   */
  /**
  def linearSeqHash(xs: scala.collection.LinearSeq[_], seed: Int): Int = {
    var n = 0
    var h = seed
    var elems = xs
    while (elems.nonEmpty) {
      h = mix(h, elems.head.##)
      n += 1
      elems = elems.tail
    }
    finalizeHash(h, n)
  }

  def indexedSeqHash(xs: scala.collection.IndexedSeq[_], seed: Int): Int = {
    var n = 0
    var h = seed
    val len = xs.length
    while (n < len) {
      h = mix(h, xs(n).##)
      n += 1
    }
    finalizeHash(h, n)
  }
  */
}