summaryrefslogtreecommitdiff
path: root/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
diff options
context:
space:
mode:
Diffstat (limited to 'core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala')
-rwxr-xr-xcore/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala34
1 files changed, 11 insertions, 23 deletions
diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
index 5e01bb1..8381921 100755
--- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
+++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
@@ -1,23 +1,22 @@
package com.rockymadden.stringmetric.similarity
-import com.rockymadden.stringmetric.{StringMetric, MatchTuple, StringFilter}
-import com.rockymadden.stringmetric.tokenization.NGramTokenizer
+import com.rockymadden.stringmetric.Metric.StringMetricLike
/**
* An implementation of the Dice/Sorensen metric. This implementation differs in that n-gram size is required.
* Traditionally, the algorithm uses bigrams.
*/
-class DiceSorensenMetric extends StringMetric[Int, Double] { this: StringFilter =>
- final override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit n: Int): Option[Double] = {
- if (n <= 0) throw new IllegalArgumentException("Expected valid n.")
+final case class DiceSorensenMetric(private val n: Int) extends StringMetricLike[Double] {
+ import com.rockymadden.stringmetric.tokenization.NGramTokenizer
+ import com.rockymadden.stringmetric.MatchTuple
- val fca1 = filter(charArray1)
- lazy val fca2 = filter(charArray2)
+ override def compare(a: Array[Char], b: Array[Char]): Option[Double] = {
+ if (n <= 0) return None
- if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare.
- else if (fca1.sameElements(fca2)) Some(1d)
- else NGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg =>
- NGramTokenizer.tokenize(fca2)(n).map { ca2bg =>
+ if (a.length < n || b.length < n) None // Because length is less than n, it is not possible to compare.
+ else if (a.sameElements(b)) Some(1d)
+ else NGramTokenizer(n).tokenize(a).flatMap { ca1bg =>
+ NGramTokenizer(n).tokenize(b).map { ca2bg =>
val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString))
(2d * ms) / (ca1bg.length + ca2bg.length)
@@ -25,18 +24,7 @@ class DiceSorensenMetric extends StringMetric[Int, Double] { this: StringFilter
}
}
- final override def compare(string1: String, string2: String)(implicit n: Int): Option[Double] =
- compare(string1.toCharArray, string2.toCharArray)(n: Int)
+ override def compare(a: String, b: String): Option[Double] = compare(a.toCharArray, b.toCharArray)
private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length
}
-
-object DiceSorensenMetric {
- private lazy val self = apply()
-
- def apply(): DiceSorensenMetric = new DiceSorensenMetric with StringFilter
-
- def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = self.compare(charArray1, charArray2)(n)
-
- def compare(string1: String, string2: String)(n: Int) = self.compare(string1, string2)(n)
-}