summaryrefslogtreecommitdiff
path: root/core/src/main/scala/com/rockymadden/stringmetric/similarity/JaroMetric.scala
blob: 575d67a4e1d9b36b7f127b712d020a727a3ae552 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
package com.rockymadden.stringmetric.similarity

import com.rockymadden.stringmetric.Metric.StringMetric
import scala.Some

/**
 * An implementation of the Jaro metric. One differing detail in this implementation is that if a character is matched
 * in string2, it cannot be matched upon again. This results in a more penalized distance in these scenarios.
 */
case object JaroMetric extends StringMetric[Double] {
	import com.rockymadden.stringmetric.{CompareTuple, MatchTuple}
	import scala.collection.mutable.{ArrayBuffer, HashSet}

	override def compare(a: Array[Char], b: Array[Char]): Option[Double] =
		if (a.length == 0 || b.length == 0) None
		else if (a.sameElements(b)) Some(1d)
		else {
			val mt = `match`(a, b)
			val ms = scoreMatches(mt._1, mt._2)

			if (ms == 0) Some(0d)
			else {
				val ts = scoreTranspositions(mt._1, mt._2)

				Some(((ms.toDouble / a.length) + (ms.toDouble / b.length) + ((ms.toDouble - ts) / ms)) / 3)
			}
		}

	override def compare(a: String, b: String): Option[Double] = compare(a.toCharArray, b.toCharArray)

	private val `match`: (CompareTuple[Char] => MatchTuple[Char]) = (ct) => {
		lazy val window = math.abs((math.max(ct._1.length, ct._2.length) / 2d).floor.toInt - 1)
		val one = ArrayBuffer.empty[Int]
		val two = HashSet.empty[Int]
		var i = 0
		var bi = false

		while (i < ct._1.length && !bi) {
			val start = if (i - window <= 0) 0 else i - window
			val end = if (i + window >= ct._2.length - 1) ct._2.length - 1 else i + window

			if (start > ct._2.length - 1) bi = !bi
			else {
				var ii = start
				var bii = false

				while (ii <= end && !bii) {
					if (!two.contains(ii) && ct._1(i) == ct._2(ii)) {
						one += i
						two += ii
						bii = !bii
					} else ii += 1
				}

				i += 1
			}
		}

		(one.toArray.map(ct._1(_)), two.toArray.sortWith(_ < _).map(ct._2(_)))
	}

	private val scoreMatches: (MatchTuple[Char] => Int) = (mt) => mt._1.length

	private val scoreTranspositions: (MatchTuple[Char] => Int) = (mt) =>
		(mt._1.zip(mt._2).count(t => t._1 != t._2) / 2d).floor.toInt
}