summaryrefslogtreecommitdiff
path: root/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
blob: 2c16bba60db70c7110f3b6dace1b42319b35cc79 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
package com.rockymadden.stringmetric.similarity

import com.rockymadden.stringmetric.Metric.StringMetricLike

/**
 * An implementation of the Dice/Sorensen metric. This implementation differs in that n-gram size is required.
 * Traditionally, the algorithm uses bigrams.
 */
final case class DiceSorensenMetric(private val n: Int) extends StringMetricLike[Double] {
	import com.rockymadden.stringmetric.tokenize.NGramTokenizer
	import com.rockymadden.stringmetric.MatchTuple

	override def compare(a: Array[Char], b: Array[Char]): Option[Double] = {
		if (n <= 0) return None

		if (a.length < n || b.length < n) None // Because length is less than n, it is not possible to compare.
		else if (a.sameElements(b)) Some(1d)
		else NGramTokenizer(n).tokenize(a).flatMap { ca1bg =>
			NGramTokenizer(n).tokenize(b).map { ca2bg =>
				val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString))

				(2d * ms) / (ca1bg.length + ca2bg.length)
			}
		}
	}

	override def compare(a: String, b: String): Option[Double] = compare(a.toCharArray, b.toCharArray)

	private val scoreMatches: (MatchTuple[String] => Int) = (mt) => mt._1.intersect(mt._2).length
}