diff options
Diffstat (limited to 'core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala')
-rwxr-xr-x | core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala | 40 |
1 files changed, 40 insertions, 0 deletions
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala new file mode 100755 index 0000000..a543a7e --- /dev/null +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala @@ -0,0 +1,40 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{StringMetric, MatchTuple, StringFilter} +import com.rockymadden.stringmetric.tokenization.NGramTokenizer +import scala.math + +/* An implementation of the overlap metric. */ +class OverlapMetric extends StringMetric[Int, Double] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit n: Int): Option[Double] = { + if (n <= 0) throw new IllegalArgumentException("Expected valid n.") + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. + else if (fca1.sameElements(fca2)) Some(1d) + else NGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + NGramTokenizer.tokenize(fca2)(n).map { ca2bg => + val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString)) + + ms.toDouble / (math.min(ca1bg.length, ca2bg.length)) + } + } + } + + final override def compare(string1: String, string2: String)(implicit n: Int): Option[Double] = + compare(string1.toCharArray, string2.toCharArray)(n: Int) + + private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length +} + +object OverlapMetric { + private lazy val self = apply() + + def apply(): OverlapMetric = new OverlapMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = self.compare(charArray1, charArray2)(n) + + def compare(string1: String, string2: String)(n: Int) = self.compare(string1, string2)(n) +} |