diff options
author | Rocky Madden <git@rockymadden.com> | 2013-03-12 17:53:59 -0600 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2013-03-12 17:53:59 -0600 |
commit | b848cf2945b0be8d9fad0e81d98deda78e7443dd (patch) | |
tree | be4b6e8592c5166f4e9e162241b1e112f626e39d /core | |
parent | cd4919a5093b916e5f568734f8ec2e799aa1037a (diff) | |
download | stringmetric-b848cf2945b0be8d9fad0e81d98deda78e7443dd.tar.gz stringmetric-b848cf2945b0be8d9fad0e81d98deda78e7443dd.tar.bz2 stringmetric-b848cf2945b0be8d9fad0e81d98deda78e7443dd.zip |
Created overlap metric, spec, benchmark, and CLI.
Diffstat (limited to 'core')
4 files changed, 185 insertions, 0 deletions
diff --git a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/OverlapMetricBenchmark.scala b/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/OverlapMetricBenchmark.scala new file mode 100755 index 0000000..f607aff --- /dev/null +++ b/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/OverlapMetricBenchmark.scala @@ -0,0 +1,55 @@ +package com.rockymadden.stringmetric.similarity + +import com.google.caliper.Param +import com.rockymadden.stringmetric.{ CaliperBenchmark, CaliperRunner } +import scala.annotation.tailrec +import scala.util.Random + +final class OverlapMetricBenchmark extends CaliperBenchmark { + import OverlapMetricBenchmark.Metric + + @Param(Array("0", "1", "2", "4", "8", "16")) + var length: Int = _ + + var string1: String = _ + var charArray1: Array[Char] = _ + var string2: String = _ + var charArray2: Array[Char] = _ + + override protected def setUp() { + @tailrec + def random(l: Int, ps: String = null): String = + if (l == 0) "" + else { + val s = Random.alphanumeric.take(l).mkString + + if (ps == null || s != ps) s + else random(l, ps) + } + + string1 = random(length) + string2 = random(length, string1) + charArray1 = string1.toCharArray + charArray2 = string2.toCharArray + } + + def timeCompareWithDifferentCharArrays(reps: Int) = run(reps) { + Metric.compare(charArray1, charArray2)(2) + } + + def timeCompareWithDifferentStrings(reps: Int) = run(reps) { + Metric.compare(string1, string2)(2) + } + + def timeCompareWithIdenticalCharArrays(reps: Int) = run(reps) { + Metric.compare(charArray1, charArray1)(2) + } + + def timeCompareWithIdenticalStrings(reps: Int) = run(reps) { + Metric.compare(string1, string1)(2) + } +} + +object OverlapMetricBenchmark extends CaliperRunner(classOf[OverlapMetricBenchmark]) { + private final val Metric = OverlapMetric() +} diff --git a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala index 0ace668..575134a 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala @@ -14,6 +14,9 @@ object ConfigurableStringMetric { type NGramMetric = com.rockymadden.stringmetric.similarity.NGramMetric val NGramMetric = com.rockymadden.stringmetric.similarity.NGramMetric + type OverlapMetric = com.rockymadden.stringmetric.similarity.OverlapMetric + val OverlapMetric = com.rockymadden.stringmetric.similarity.OverlapMetric + type WeightedLevenshteinMetric = com.rockymadden.stringmetric.similarity.WeightedLevenshteinMetric val WeightedLevenshteinMetric = com.rockymadden.stringmetric.similarity.WeightedLevenshteinMetric @@ -33,6 +36,11 @@ object ConfigurableStringMetric { def compareWithNGram(string1: String, string2: String)(n: Int) = NGramMetric.compare(string1, string2)(n) + def compareWithOverlap(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = + OverlapMetric.compare(charArray1, charArray2)(n) + + def compareWithOverlap(string1: String, string2: String)(n: Int) = OverlapMetric.compare(string1, string2)(n) + def compareWithWeightedLevenshtein(charArray1: Array[Char], charArray2: Array[Char]) (options: (BigDecimal, BigDecimal, BigDecimal)) = diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala new file mode 100755 index 0000000..3bf85ca --- /dev/null +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala @@ -0,0 +1,45 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter } +import scala.math + +/* An implementation of the overlap metric. */ +class OverlapMetric extends ConfigurableStringMetric[Double, Int] { + this: StringFilter => + + final override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit n: Int): Option[Double] = { + if (n <= 0) throw new IllegalArgumentException("Expected valid n.") + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. + else if (fca1.sameElements(fca2)) Some(1d) + else { + val nGramAlgorithm = NGramAlgorithm() + + nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg => + nGramAlgorithm.compute(fca2)(n).map { ca2bg => + val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString))) + + ms.toDouble / (math.min(ca1bg.length, ca2bg.length)) + } + } + } + } + + final override def compare(string1: String, string2: String)(implicit n: Int): Option[Double] = + compare(string1.toCharArray, string2.toCharArray)(n: Int) + + private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length +} + +object OverlapMetric { + private lazy val self = apply() + + def apply(): OverlapMetric = new OverlapMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = self.compare(charArray1, charArray2)(n) + + def compare(string1: String, string2: String)(n: Int) = self.compare(string1, string2)(n) +} diff --git a/core/source/test/scala/com/rockymadden/stringmetric/similarity/OverlapMetricSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/similarity/OverlapMetricSpec.scala new file mode 100755 index 0000000..32c9650 --- /dev/null +++ b/core/source/test/scala/com/rockymadden/stringmetric/similarity/OverlapMetricSpec.scala @@ -0,0 +1,77 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class OverlapMetricSpec extends ScalaTest { + import OverlapMetricSpec.Metric + + "OverlapMetric" should provide { + "compare method" when passed { + "empty arguments" should returns { + "None" in { + Metric.compare("", "")(1).isDefined should be (false) + Metric.compare("abc", "")(1).isDefined should be (false) + Metric.compare("", "xyz")(1).isDefined should be (false) + } + } + "equal arguments" should returns { + "1" in { + Metric.compare("abc", "abc")(1).get should be (1) + Metric.compare("abc", "abc")(2).get should be (1) + Metric.compare("abc", "abc")(3).get should be (1) + } + } + "unequal arguments" should returns { + "0" in { + Metric.compare("abc", "xyz")(1).get should be (0) + Metric.compare("abc", "xyz")(2).get should be (0) + Metric.compare("abc", "xyz")(3).get should be (0) + } + } + "invalid arguments" should returns { + "None" in { + Metric.compare("n", "naght")(2).isDefined should be (false) + Metric.compare("night", "n")(2).isDefined should be (false) + Metric.compare("ni", "naght")(3).isDefined should be (false) + Metric.compare("night", "na")(3).isDefined should be (false) + } + } + "valid arguments" should returns { + "Double indicating distance" in { + Metric.compare("bob", "bobman") (1).get should be (1) + Metric.compare("bob", "manbobman") (1).get should be (1) + Metric.compare("night", "nacht")(1).get should be (0.6) + Metric.compare("night", "naght")(1).get should be (0.8) + Metric.compare("context", "contact")(1).get should be (0.7142857142857143) + + Metric.compare("night", "nacht")(2).get should be (0.25) + Metric.compare("night", "naght")(2).get should be (0.5) + Metric.compare("context", "contact")(2).get should be (0.5) + Metric.compare("contextcontext", "contact")(2).get should be (0.5) + Metric.compare("context", "contactcontact")(2).get should be (0.5) + Metric.compare("ht", "nacht")(2).get should be (1) + Metric.compare("xp", "nacht")(2).get should be (0) + Metric.compare("ht", "hththt")(2).get should be (1) + + Metric.compare("night", "nacht")(3).get should be (0) + Metric.compare("night", "naght")(3).get should be (0.3333333333333333) + Metric.compare("context", "contact")(3).get should be (0.4) + } + } + } + } + "OverlapMetric companion object" should provide { + "pass-through compare method" should returns { + "same value as class" in { + OverlapMetric.compare("context", "contact")(3).get should be (0.4) + } + } + } +} + +object OverlapMetricSpec { + private final val Metric = OverlapMetric() +} |