diff options
author | Rocky Madden <git@rockymadden.com> | 2013-03-12 17:53:59 -0600 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2013-03-12 17:53:59 -0600 |
commit | b848cf2945b0be8d9fad0e81d98deda78e7443dd (patch) | |
tree | be4b6e8592c5166f4e9e162241b1e112f626e39d | |
parent | cd4919a5093b916e5f568734f8ec2e799aa1037a (diff) | |
download | stringmetric-b848cf2945b0be8d9fad0e81d98deda78e7443dd.tar.gz stringmetric-b848cf2945b0be8d9fad0e81d98deda78e7443dd.tar.bz2 stringmetric-b848cf2945b0be8d9fad0e81d98deda78e7443dd.zip |
Created overlap metric, spec, benchmark, and CLI.
9 files changed, 278 insertions, 3 deletions
diff --git a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/diceSorensenMetric.scala b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/diceSorensenMetric.scala index fc69dc7..e0d3f80 100755 --- a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/diceSorensenMetric.scala +++ b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/diceSorensenMetric.scala @@ -38,7 +38,7 @@ object diceSorensenMetric extends Command { tab + "-h, --help" + ls + tab + tab + "Outputs description, syntax, and options." + tab + "--n" + ls + - tab + tab + "The n, traditionally 2." + tab + tab + "The n-gram size, traditionally 2." ) } diff --git a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/jaccardMetric.scala b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/jaccardMetric.scala index d87c84b..b7aa9c7 100755 --- a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/jaccardMetric.scala +++ b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/jaccardMetric.scala @@ -38,7 +38,7 @@ object jaccardMetric extends Command { tab + "-h, --help" + ls + tab + tab + "Outputs description, syntax, and options." + tab + "--n" + ls + - tab + tab + "The n, traditionally 2." + tab + tab + "The n-gram size." ) } diff --git a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetric.scala b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetric.scala new file mode 100755 index 0000000..ebf0aca --- /dev/null +++ b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetric.scala @@ -0,0 +1,51 @@ +package com.rockymadden.stringmetric.cli.similarity + +import com.rockymadden.stringmetric.cli._ +import com.rockymadden.stringmetric.similarity.OverlapMetric + +/** + * The overlapMetric [[com.rockymadden.stringmetric.cli.Command]]. Compares the similarity of two strings using the + * overlap coefficient. + */ +object overlapMetric extends Command { + override def main(args: Array[String]): Unit = { + val options = OptionMap(args) + + try + if (options.contains('h) || options.contains('help)) { + help() + exit(options) + } else if (options.contains('dashless) && (options('dashless): OptionMapArray).length == 2 + && options.contains('n) && (options('n): OptionMapInt) >= 1) { + + execute(options) + exit(options) + } else throw new IllegalArgumentException("Expected valid syntax. See --help.") + catch { + case e: Throwable => error(e, options) + } + } + + override def help(): Unit = { + val ls = sys.props("line.separator") + val tab = " " + + println( + "Compares the similarity of two strings using the overlap coefficient." + ls + ls + + "Syntax:" + ls + + tab + "overlapMetric [Options] string1 string2..." + ls + ls + + "Options:" + ls + + tab + "-h, --help" + ls + + tab + tab + "Outputs description, syntax, and options." + + tab + "--n" + ls + + tab + tab + "The n-gram size." + ) + } + + override def execute(options: OptionMap): Unit = { + val strings: OptionMapArray = options('dashless) + val n: OptionMapInt = options('n) + + println(OverlapMetric.compare(strings(0), strings(1))(n).getOrElse("not comparable")) + } +} diff --git a/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetricSpec.scala b/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetricSpec.scala new file mode 100755 index 0000000..c1bb6b6 --- /dev/null +++ b/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetricSpec.scala @@ -0,0 +1,39 @@ +package com.rockymadden.stringmetric.cli.similarity + +import com.rockymadden.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class overlapMetricSpec extends ScalaTest { + "overlapMetric" should provide { + "main method" when passed { + "valid dashless arguments" should executes { + "print if they are a match" in { + val out = new java.io.ByteArrayOutputStream() + + Console.withOut(out)( + overlapMetric.main(Array("--unitTest", "--debug", "--n=2", "abc", "abc")) + ) + + out.toString should equal ("1.0\n") + out.reset() + + Console.withOut(out)( + overlapMetric.main(Array("--unitTest", "--debug", "--n=2", "abc", "xyz")) + ) + + out.toString should equal ("0.0\n") + out.reset() + } + } + "no dashless arguments" should throws { + "IllegalArgumentException" in { + evaluating { + overlapMetric.main(Array("--unitTest", "--debug")) + } should produce [IllegalArgumentException] + } + } + } + } +} diff --git a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/OverlapMetricBenchmark.scala b/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/OverlapMetricBenchmark.scala new file mode 100755 index 0000000..f607aff --- /dev/null +++ b/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/OverlapMetricBenchmark.scala @@ -0,0 +1,55 @@ +package com.rockymadden.stringmetric.similarity + +import com.google.caliper.Param +import com.rockymadden.stringmetric.{ CaliperBenchmark, CaliperRunner } +import scala.annotation.tailrec +import scala.util.Random + +final class OverlapMetricBenchmark extends CaliperBenchmark { + import OverlapMetricBenchmark.Metric + + @Param(Array("0", "1", "2", "4", "8", "16")) + var length: Int = _ + + var string1: String = _ + var charArray1: Array[Char] = _ + var string2: String = _ + var charArray2: Array[Char] = _ + + override protected def setUp() { + @tailrec + def random(l: Int, ps: String = null): String = + if (l == 0) "" + else { + val s = Random.alphanumeric.take(l).mkString + + if (ps == null || s != ps) s + else random(l, ps) + } + + string1 = random(length) + string2 = random(length, string1) + charArray1 = string1.toCharArray + charArray2 = string2.toCharArray + } + + def timeCompareWithDifferentCharArrays(reps: Int) = run(reps) { + Metric.compare(charArray1, charArray2)(2) + } + + def timeCompareWithDifferentStrings(reps: Int) = run(reps) { + Metric.compare(string1, string2)(2) + } + + def timeCompareWithIdenticalCharArrays(reps: Int) = run(reps) { + Metric.compare(charArray1, charArray1)(2) + } + + def timeCompareWithIdenticalStrings(reps: Int) = run(reps) { + Metric.compare(string1, string1)(2) + } +} + +object OverlapMetricBenchmark extends CaliperRunner(classOf[OverlapMetricBenchmark]) { + private final val Metric = OverlapMetric() +} diff --git a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala index 0ace668..575134a 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala @@ -14,6 +14,9 @@ object ConfigurableStringMetric { type NGramMetric = com.rockymadden.stringmetric.similarity.NGramMetric val NGramMetric = com.rockymadden.stringmetric.similarity.NGramMetric + type OverlapMetric = com.rockymadden.stringmetric.similarity.OverlapMetric + val OverlapMetric = com.rockymadden.stringmetric.similarity.OverlapMetric + type WeightedLevenshteinMetric = com.rockymadden.stringmetric.similarity.WeightedLevenshteinMetric val WeightedLevenshteinMetric = com.rockymadden.stringmetric.similarity.WeightedLevenshteinMetric @@ -33,6 +36,11 @@ object ConfigurableStringMetric { def compareWithNGram(string1: String, string2: String)(n: Int) = NGramMetric.compare(string1, string2)(n) + def compareWithOverlap(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = + OverlapMetric.compare(charArray1, charArray2)(n) + + def compareWithOverlap(string1: String, string2: String)(n: Int) = OverlapMetric.compare(string1, string2)(n) + def compareWithWeightedLevenshtein(charArray1: Array[Char], charArray2: Array[Char]) (options: (BigDecimal, BigDecimal, BigDecimal)) = diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala new file mode 100755 index 0000000..3bf85ca --- /dev/null +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala @@ -0,0 +1,45 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter } +import scala.math + +/* An implementation of the overlap metric. */ +class OverlapMetric extends ConfigurableStringMetric[Double, Int] { + this: StringFilter => + + final override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit n: Int): Option[Double] = { + if (n <= 0) throw new IllegalArgumentException("Expected valid n.") + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. + else if (fca1.sameElements(fca2)) Some(1d) + else { + val nGramAlgorithm = NGramAlgorithm() + + nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg => + nGramAlgorithm.compute(fca2)(n).map { ca2bg => + val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString))) + + ms.toDouble / (math.min(ca1bg.length, ca2bg.length)) + } + } + } + } + + final override def compare(string1: String, string2: String)(implicit n: Int): Option[Double] = + compare(string1.toCharArray, string2.toCharArray)(n: Int) + + private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length +} + +object OverlapMetric { + private lazy val self = apply() + + def apply(): OverlapMetric = new OverlapMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = self.compare(charArray1, charArray2)(n) + + def compare(string1: String, string2: String)(n: Int) = self.compare(string1, string2)(n) +} diff --git a/core/source/test/scala/com/rockymadden/stringmetric/similarity/OverlapMetricSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/similarity/OverlapMetricSpec.scala new file mode 100755 index 0000000..32c9650 --- /dev/null +++ b/core/source/test/scala/com/rockymadden/stringmetric/similarity/OverlapMetricSpec.scala @@ -0,0 +1,77 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class OverlapMetricSpec extends ScalaTest { + import OverlapMetricSpec.Metric + + "OverlapMetric" should provide { + "compare method" when passed { + "empty arguments" should returns { + "None" in { + Metric.compare("", "")(1).isDefined should be (false) + Metric.compare("abc", "")(1).isDefined should be (false) + Metric.compare("", "xyz")(1).isDefined should be (false) + } + } + "equal arguments" should returns { + "1" in { + Metric.compare("abc", "abc")(1).get should be (1) + Metric.compare("abc", "abc")(2).get should be (1) + Metric.compare("abc", "abc")(3).get should be (1) + } + } + "unequal arguments" should returns { + "0" in { + Metric.compare("abc", "xyz")(1).get should be (0) + Metric.compare("abc", "xyz")(2).get should be (0) + Metric.compare("abc", "xyz")(3).get should be (0) + } + } + "invalid arguments" should returns { + "None" in { + Metric.compare("n", "naght")(2).isDefined should be (false) + Metric.compare("night", "n")(2).isDefined should be (false) + Metric.compare("ni", "naght")(3).isDefined should be (false) + Metric.compare("night", "na")(3).isDefined should be (false) + } + } + "valid arguments" should returns { + "Double indicating distance" in { + Metric.compare("bob", "bobman") (1).get should be (1) + Metric.compare("bob", "manbobman") (1).get should be (1) + Metric.compare("night", "nacht")(1).get should be (0.6) + Metric.compare("night", "naght")(1).get should be (0.8) + Metric.compare("context", "contact")(1).get should be (0.7142857142857143) + + Metric.compare("night", "nacht")(2).get should be (0.25) + Metric.compare("night", "naght")(2).get should be (0.5) + Metric.compare("context", "contact")(2).get should be (0.5) + Metric.compare("contextcontext", "contact")(2).get should be (0.5) + Metric.compare("context", "contactcontact")(2).get should be (0.5) + Metric.compare("ht", "nacht")(2).get should be (1) + Metric.compare("xp", "nacht")(2).get should be (0) + Metric.compare("ht", "hththt")(2).get should be (1) + + Metric.compare("night", "nacht")(3).get should be (0) + Metric.compare("night", "naght")(3).get should be (0.3333333333333333) + Metric.compare("context", "contact")(3).get should be (0.4) + } + } + } + } + "OverlapMetric companion object" should provide { + "pass-through compare method" should returns { + "same value as class" in { + OverlapMetric.compare("context", "contact")(3).get should be (0.4) + } + } + } +} + +object OverlapMetricSpec { + private final val Metric = OverlapMetric() +} @@ -15,7 +15,7 @@ String metrics and phonetic algorithms for Scala. The library provides facilitie * __[Needleman-Wunch](http://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm)__ (Queued similarity metric) * __[N-Gram](http://en.wikipedia.org/wiki/N-gram)__ (Similarity metric and algorithm) * __[NYSIIS](http://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System)__ (Phonetic metric and algorithm) -* __[Overlap](http://en.wikipedia.org/wiki/Overlap_coefficient)__ (Queued similarity metric) +* __[Overlap](http://en.wikipedia.org/wiki/Overlap_coefficient)__ (Similarity metric) * __[Ratcliff-Obershelp](http://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html)__ (Similarity metric) * __[Refined NYSIIS](http://www.markcrocker.com/rexxtipsntricks/rxtt28.2.0482.html)__ (Phonetic metric and algorithm) * __[Refined Soundex](http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html)__ (Phonetic metric and algorithm) |