blob: 273d69b54d0684df1cc47d3414d6a9afa19dbf5c (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
|
package com.rockymadden.stringmetric.similarity
import com.rockymadden.stringmetric.{ StringFilter, StringMetric }
/**
 * An implementation of the Jaro-Winkler metric. One differing detail in this implementation is that if a character is
 * matched in string2, it cannot be matched upon again. This results in a more penalized distance in these scenarios
 * (e.g. comparing henka and henkan distance is 0.9666 versus the typical 0.9722).
 *
 * The score is the Jaro score of the filtered inputs, boosted by the length of their common prefix (capped at four
 * characters). Must be instantiated with a [[StringFilter]] mixed in; that filter is applied to both inputs exactly
 * once per comparison.
 */
class JaroWinklerMetric extends StringMetric[Double] {
  this: StringFilter =>

  /**
   * Compares two character arrays.
   *
   * @param charArray1 first sequence of characters; filtered before comparison
   * @param charArray2 second sequence of characters; filtered before comparison
   * @return the Jaro-Winkler score wrapped in an Option; empty whenever the underlying Jaro comparison yields no
   *         result
   */
  final override def compare(charArray1: Array[Char], charArray2: Array[Char]): Option[Double] = {
    val fca1 = filter(charArray1)
    val fca2 = filter(charArray2)

    JaroMetric().compare(fca1, fca2).map {
      // The two extremes are returned untouched: the prefix bonus cannot improve on either of them.
      case 0d => 0d
      case 1d => 1d
      case jaro =>
        val maxPrefixLength = 4
        val scalingFactor = 0.1d

        // Only the first maxPrefixLength characters can contribute to the bonus, so bound the input before zipping
        // rather than pairing up both arrays in full and clamping the count afterwards.
        val prefixLength = fca1
          .take(maxPrefixLength)
          .zip(fca2)
          .takeWhile { case (c1, c2) => c1 == c2 }
          .length

        // Winkler adjustment: jaro + l * p * (1 - jaro).
        jaro + (prefixLength * scalingFactor * (1 - jaro))
    }
  }

  /**
   * Compares two strings by delegating to the character array overload.
   *
   * The inputs are deliberately NOT filtered here: the array overload already applies the filter, and filtering in
   * both places would run the filter twice over the same data.
   *
   * @param string1 first string
   * @param string2 second string
   * @return the Jaro-Winkler score wrapped in an Option; empty whenever the underlying Jaro comparison yields no
   *         result
   */
  final override def compare(string1: String, string2: String): Option[Double] =
    compare(string1.toCharArray, string2.toCharArray)
}
/**
 * Companion for [[JaroWinklerMetric]]: a factory for instances with [[StringFilter]] mixed in, plus convenience
 * `compare` methods that forward to a shared default instance.
 */
object JaroWinklerMetric {
  // Shared default instance backing the convenience methods below; built on first use.
  private lazy val self = apply()

  /** Creates a new metric instance with [[StringFilter]] mixed in. */
  def apply(): JaroWinklerMetric = new JaroWinklerMetric with StringFilter

  /**
   * Compares two character arrays using the shared default instance.
   *
   * @param charArray1 first sequence of characters
   * @param charArray2 second sequence of characters
   * @return the result of the default instance's `compare`
   */
  def compare(charArray1: Array[Char], charArray2: Array[Char]): Option[Double] =
    self.compare(charArray1, charArray2)

  /**
   * Compares two strings using the shared default instance.
   *
   * @param string1 first string
   * @param string2 second string
   * @return the result of the default instance's `compare`
   */
  def compare(string1: String, string2: String): Option[Double] =
    self.compare(string1, string2)
}
|