summaryrefslogtreecommitdiff
path: root/core/src/main/scala/com/rockymadden/stringmetric/similarity/JaroWinklerMetric.scala
blob: e83f73f74fa5c9b3bcaca74d74e62e19a823546f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
package com.rockymadden.stringmetric.similarity

import com.rockymadden.stringmetric.Metric.StringMetric

/**
 * An implementation of the Jaro-Winkler metric: the Jaro similarity of the two inputs, boosted in proportion to the
 * length of their common prefix (capped at four characters).
 *
 * One differing detail in this implementation is that if a character is matched in string2, it cannot be matched
 * upon again. This results in a more penalized distance in these scenarios (e.g. comparing henka and henkan yields
 * 0.9666 versus the typical 0.9722).
 */
case object JaroWinklerMetric extends StringMetric[Double] {
	/** Maximum number of leading characters that may contribute to the prefix boost. */
	private final val MaxPrefixLength = 4

	/** Weight applied per matching prefix character. */
	private final val PrefixScale = 0.1d

	/**
	 * Compares two character arrays.
	 *
	 * @param a the first sequence of characters
	 * @param b the second sequence of characters
	 * @return the Jaro-Winkler similarity wrapped in `Some`, or `None` whenever [[JaroMetric]] yields `None` for the
	 *         same inputs (presumably for empty input; confirm against JaroMetric)
	 */
	override def compare(a: Array[Char], b: Array[Char]): Option[Double] =
		JaroMetric.compare(a, b).map {
			// The extremes cannot be changed by the prefix boost, so short-circuit them.
			case 0d => 0d
			case 1d => 1d
			case jaro =>
				// Only the first MaxPrefixLength positions can matter, so compare by index over that window instead
				// of zipping (and allocating tuples for) the entire inputs.
				val limit = math.min(MaxPrefixLength, math.min(a.length, b.length))
				val prefixLength = (0 until limit).takeWhile(i => a(i) == b(i)).length

				jaro + (prefixLength * PrefixScale * (1 - jaro))
		}

	/**
	 * Compares two strings by delegating to the character-array overload.
	 *
	 * @param a the first string
	 * @param b the second string
	 * @return the same result as comparing `a.toCharArray` with `b.toCharArray`
	 */
	override def compare(a: String, b: String): Option[Double] = compare(a.toCharArray, b.toCharArray)
}