summaryrefslogtreecommitdiff
path: root/core/src/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
diff options
context:
space:
mode:
Diffstat (limited to 'core/src/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala')
-rwxr-xr-xcore/src/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala27
1 files changed, 27 insertions, 0 deletions
diff --git a/core/src/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala b/core/src/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
new file mode 100755
index 0000000..0ad3915
--- /dev/null
+++ b/core/src/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
@@ -0,0 +1,27 @@
+package com.rockymadden.stringmetric.similarity
+
+import com.rockymadden.stringmetric.Metric.StringMetric
+
+/**
+ * Dice/Sorensen similarity over character n-grams. Unlike the traditional bigram-only algorithm, the n-gram
+ * size `n` must be supplied. Score = 2 * shared n-grams / (n-gram count of a + n-gram count of b).
+ */
+final case class DiceSorensenMetric(n: Int) extends StringMetric[Double] {
+  import com.rockymadden.stringmetric.Tokenize.NGramTokenizer
+  import com.rockymadden.stringmetric.MatchTuple
+
+  private val scoreMatches: (MatchTuple[String] => Int) = (pair) => pair._1.intersect(pair._2).length
+
+  override def compare(a: String, b: String): Option[Double] = compare(a.toCharArray, b.toCharArray)
+
+  override def compare(a: Array[Char], b: Array[Char]): Option[Double] =
+    if (n <= 0 || a.length < n || b.length < n) None // No n-gram can be formed, so nothing can be compared.
+    else if (a.sameElements(b)) Some(1d) // Identical inputs score a perfect 1.0 without tokenizing.
+    else for {
+      left <- NGramTokenizer(n).tokenize(a)
+      right <- NGramTokenizer(n).tokenize(b)
+    } yield {
+      val shared = scoreMatches(left.map(_.mkString), right.map(_.mkString))
+      (2d * shared) / (left.length + right.length)
+    }
+}