summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2013-03-12 17:53:59 -0600
committerRocky Madden <git@rockymadden.com>2013-03-12 17:53:59 -0600
commitb848cf2945b0be8d9fad0e81d98deda78e7443dd (patch)
treebe4b6e8592c5166f4e9e162241b1e112f626e39d
parentcd4919a5093b916e5f568734f8ec2e799aa1037a (diff)
downloadstringmetric-b848cf2945b0be8d9fad0e81d98deda78e7443dd.tar.gz
stringmetric-b848cf2945b0be8d9fad0e81d98deda78e7443dd.tar.bz2
stringmetric-b848cf2945b0be8d9fad0e81d98deda78e7443dd.zip
Created overlap metric, spec, benchmark, and CLI.
-rwxr-xr-xcli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/diceSorensenMetric.scala2
-rwxr-xr-xcli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/jaccardMetric.scala2
-rwxr-xr-xcli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetric.scala51
-rwxr-xr-xcli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetricSpec.scala39
-rwxr-xr-xcore/source/benchmark/scala/com/rockymadden/stringmetric/similarity/OverlapMetricBenchmark.scala55
-rwxr-xr-xcore/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala8
-rwxr-xr-xcore/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala45
-rwxr-xr-xcore/source/test/scala/com/rockymadden/stringmetric/similarity/OverlapMetricSpec.scala77
-rwxr-xr-xreadme.md2
9 files changed, 278 insertions, 3 deletions
diff --git a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/diceSorensenMetric.scala b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/diceSorensenMetric.scala
index fc69dc7..e0d3f80 100755
--- a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/diceSorensenMetric.scala
+++ b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/diceSorensenMetric.scala
@@ -38,7 +38,7 @@ object diceSorensenMetric extends Command {
tab + "-h, --help" + ls +
tab + tab + "Outputs description, syntax, and options." +
tab + "--n" + ls +
- tab + tab + "The n, traditionally 2."
+ tab + tab + "The n-gram size, traditionally 2."
)
}
diff --git a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/jaccardMetric.scala b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/jaccardMetric.scala
index d87c84b..b7aa9c7 100755
--- a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/jaccardMetric.scala
+++ b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/jaccardMetric.scala
@@ -38,7 +38,7 @@ object jaccardMetric extends Command {
tab + "-h, --help" + ls +
tab + tab + "Outputs description, syntax, and options." +
tab + "--n" + ls +
- tab + tab + "The n, traditionally 2."
+ tab + tab + "The n-gram size."
)
}
diff --git a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetric.scala b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetric.scala
new file mode 100755
index 0000000..ebf0aca
--- /dev/null
+++ b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetric.scala
@@ -0,0 +1,51 @@
+package com.rockymadden.stringmetric.cli.similarity
+
+import com.rockymadden.stringmetric.cli._
+import com.rockymadden.stringmetric.similarity.OverlapMetric
+
+/**
+ * The overlapMetric [[com.rockymadden.stringmetric.cli.Command]]. Compares the similarity of two strings using the
+ * overlap coefficient.
+ */
+object overlapMetric extends Command {
+ override def main(args: Array[String]): Unit = {
+ val options = OptionMap(args)
+
+ try
+ if (options.contains('h) || options.contains('help)) {
+ help()
+ exit(options)
+ } else if (options.contains('dashless) && (options('dashless): OptionMapArray).length == 2
+ && options.contains('n) && (options('n): OptionMapInt) >= 1) {
+
+ execute(options)
+ exit(options)
+ } else throw new IllegalArgumentException("Expected valid syntax. See --help.")
+ catch {
+ case e: Throwable => error(e, options)
+ }
+ }
+
+ override def help(): Unit = {
+ val ls = sys.props("line.separator")
+ val tab = " "
+
+ println(
+ "Compares the similarity of two strings using the overlap coefficient." + ls + ls +
+ "Syntax:" + ls +
+ tab + "overlapMetric [Options] string1 string2..." + ls + ls +
+ "Options:" + ls +
+ tab + "-h, --help" + ls +
+ tab + tab + "Outputs description, syntax, and options." +
+ tab + "--n" + ls +
+ tab + tab + "The n-gram size."
+ )
+ }
+
+ override def execute(options: OptionMap): Unit = {
+ val strings: OptionMapArray = options('dashless)
+ val n: OptionMapInt = options('n)
+
+ println(OverlapMetric.compare(strings(0), strings(1))(n).getOrElse("not comparable"))
+ }
+}
diff --git a/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetricSpec.scala b/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetricSpec.scala
new file mode 100755
index 0000000..c1bb6b6
--- /dev/null
+++ b/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/overlapMetricSpec.scala
@@ -0,0 +1,39 @@
+package com.rockymadden.stringmetric.cli.similarity
+
+import com.rockymadden.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class overlapMetricSpec extends ScalaTest {
+ "overlapMetric" should provide {
+ "main method" when passed {
+ "valid dashless arguments" should executes {
+ "print if they are a match" in {
+ val out = new java.io.ByteArrayOutputStream()
+
+ Console.withOut(out)(
+ overlapMetric.main(Array("--unitTest", "--debug", "--n=2", "abc", "abc"))
+ )
+
+ out.toString should equal ("1.0\n")
+ out.reset()
+
+ Console.withOut(out)(
+ overlapMetric.main(Array("--unitTest", "--debug", "--n=2", "abc", "xyz"))
+ )
+
+ out.toString should equal ("0.0\n")
+ out.reset()
+ }
+ }
+ "no dashless arguments" should throws {
+ "IllegalArgumentException" in {
+ evaluating {
+ overlapMetric.main(Array("--unitTest", "--debug"))
+ } should produce [IllegalArgumentException]
+ }
+ }
+ }
+ }
+}
diff --git a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/OverlapMetricBenchmark.scala b/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/OverlapMetricBenchmark.scala
new file mode 100755
index 0000000..f607aff
--- /dev/null
+++ b/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/OverlapMetricBenchmark.scala
@@ -0,0 +1,55 @@
+package com.rockymadden.stringmetric.similarity
+
+import com.google.caliper.Param
+import com.rockymadden.stringmetric.{ CaliperBenchmark, CaliperRunner }
+import scala.annotation.tailrec
+import scala.util.Random
+
+final class OverlapMetricBenchmark extends CaliperBenchmark {
+ import OverlapMetricBenchmark.Metric
+
+ @Param(Array("0", "1", "2", "4", "8", "16"))
+ var length: Int = _
+
+ var string1: String = _
+ var charArray1: Array[Char] = _
+ var string2: String = _
+ var charArray2: Array[Char] = _
+
+ override protected def setUp() {
+ @tailrec
+ def random(l: Int, ps: String = null): String =
+ if (l == 0) ""
+ else {
+ val s = Random.alphanumeric.take(l).mkString
+
+ if (ps == null || s != ps) s
+ else random(l, ps)
+ }
+
+ string1 = random(length)
+ string2 = random(length, string1)
+ charArray1 = string1.toCharArray
+ charArray2 = string2.toCharArray
+ }
+
+ def timeCompareWithDifferentCharArrays(reps: Int) = run(reps) {
+ Metric.compare(charArray1, charArray2)(2)
+ }
+
+ def timeCompareWithDifferentStrings(reps: Int) = run(reps) {
+ Metric.compare(string1, string2)(2)
+ }
+
+ def timeCompareWithIdenticalCharArrays(reps: Int) = run(reps) {
+ Metric.compare(charArray1, charArray1)(2)
+ }
+
+ def timeCompareWithIdenticalStrings(reps: Int) = run(reps) {
+ Metric.compare(string1, string1)(2)
+ }
+}
+
+object OverlapMetricBenchmark extends CaliperRunner(classOf[OverlapMetricBenchmark]) {
+ private final val Metric = OverlapMetric()
+}
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala
index 0ace668..575134a 100755
--- a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala
+++ b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala
@@ -14,6 +14,9 @@ object ConfigurableStringMetric {
type NGramMetric = com.rockymadden.stringmetric.similarity.NGramMetric
val NGramMetric = com.rockymadden.stringmetric.similarity.NGramMetric
+ type OverlapMetric = com.rockymadden.stringmetric.similarity.OverlapMetric
+ val OverlapMetric = com.rockymadden.stringmetric.similarity.OverlapMetric
+
type WeightedLevenshteinMetric = com.rockymadden.stringmetric.similarity.WeightedLevenshteinMetric
val WeightedLevenshteinMetric = com.rockymadden.stringmetric.similarity.WeightedLevenshteinMetric
@@ -33,6 +36,11 @@ object ConfigurableStringMetric {
def compareWithNGram(string1: String, string2: String)(n: Int) = NGramMetric.compare(string1, string2)(n)
+ def compareWithOverlap(charArray1: Array[Char], charArray2: Array[Char])(n: Int) =
+ OverlapMetric.compare(charArray1, charArray2)(n)
+
+ def compareWithOverlap(string1: String, string2: String)(n: Int) = OverlapMetric.compare(string1, string2)(n)
+
def compareWithWeightedLevenshtein(charArray1: Array[Char], charArray2: Array[Char])
(options: (BigDecimal, BigDecimal, BigDecimal)) =
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala
new file mode 100755
index 0000000..3bf85ca
--- /dev/null
+++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala
@@ -0,0 +1,45 @@
+package com.rockymadden.stringmetric.similarity
+
+import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter }
+import scala.math
+
+/* An implementation of the overlap metric. */
+class OverlapMetric extends ConfigurableStringMetric[Double, Int] {
+ this: StringFilter =>
+
+ final override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit n: Int): Option[Double] = {
+ if (n <= 0) throw new IllegalArgumentException("Expected valid n.")
+
+ val fca1 = filter(charArray1)
+ lazy val fca2 = filter(charArray2)
+
+ if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare.
+ else if (fca1.sameElements(fca2)) Some(1d)
+ else {
+ val nGramAlgorithm = NGramAlgorithm()
+
+ nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg =>
+ nGramAlgorithm.compute(fca2)(n).map { ca2bg =>
+ val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString)))
+
+ ms.toDouble / (math.min(ca1bg.length, ca2bg.length))
+ }
+ }
+ }
+ }
+
+ final override def compare(string1: String, string2: String)(implicit n: Int): Option[Double] =
+ compare(string1.toCharArray, string2.toCharArray)(n: Int)
+
+ private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length
+}
+
+object OverlapMetric {
+ private lazy val self = apply()
+
+ def apply(): OverlapMetric = new OverlapMetric with StringFilter
+
+ def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = self.compare(charArray1, charArray2)(n)
+
+ def compare(string1: String, string2: String)(n: Int) = self.compare(string1, string2)(n)
+}
diff --git a/core/source/test/scala/com/rockymadden/stringmetric/similarity/OverlapMetricSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/similarity/OverlapMetricSpec.scala
new file mode 100755
index 0000000..32c9650
--- /dev/null
+++ b/core/source/test/scala/com/rockymadden/stringmetric/similarity/OverlapMetricSpec.scala
@@ -0,0 +1,77 @@
+package com.rockymadden.stringmetric.similarity
+
+import com.rockymadden.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class OverlapMetricSpec extends ScalaTest {
+ import OverlapMetricSpec.Metric
+
+ "OverlapMetric" should provide {
+ "compare method" when passed {
+ "empty arguments" should returns {
+ "None" in {
+ Metric.compare("", "")(1).isDefined should be (false)
+ Metric.compare("abc", "")(1).isDefined should be (false)
+ Metric.compare("", "xyz")(1).isDefined should be (false)
+ }
+ }
+ "equal arguments" should returns {
+ "1" in {
+ Metric.compare("abc", "abc")(1).get should be (1)
+ Metric.compare("abc", "abc")(2).get should be (1)
+ Metric.compare("abc", "abc")(3).get should be (1)
+ }
+ }
+ "unequal arguments" should returns {
+ "0" in {
+ Metric.compare("abc", "xyz")(1).get should be (0)
+ Metric.compare("abc", "xyz")(2).get should be (0)
+ Metric.compare("abc", "xyz")(3).get should be (0)
+ }
+ }
+ "invalid arguments" should returns {
+ "None" in {
+ Metric.compare("n", "naght")(2).isDefined should be (false)
+ Metric.compare("night", "n")(2).isDefined should be (false)
+ Metric.compare("ni", "naght")(3).isDefined should be (false)
+ Metric.compare("night", "na")(3).isDefined should be (false)
+ }
+ }
+ "valid arguments" should returns {
+ "Double indicating distance" in {
+ Metric.compare("bob", "bobman") (1).get should be (1)
+ Metric.compare("bob", "manbobman") (1).get should be (1)
+ Metric.compare("night", "nacht")(1).get should be (0.6)
+ Metric.compare("night", "naght")(1).get should be (0.8)
+ Metric.compare("context", "contact")(1).get should be (0.7142857142857143)
+
+ Metric.compare("night", "nacht")(2).get should be (0.25)
+ Metric.compare("night", "naght")(2).get should be (0.5)
+ Metric.compare("context", "contact")(2).get should be (0.5)
+ Metric.compare("contextcontext", "contact")(2).get should be (0.5)
+ Metric.compare("context", "contactcontact")(2).get should be (0.5)
+ Metric.compare("ht", "nacht")(2).get should be (1)
+ Metric.compare("xp", "nacht")(2).get should be (0)
+ Metric.compare("ht", "hththt")(2).get should be (1)
+
+ Metric.compare("night", "nacht")(3).get should be (0)
+ Metric.compare("night", "naght")(3).get should be (0.3333333333333333)
+ Metric.compare("context", "contact")(3).get should be (0.4)
+ }
+ }
+ }
+ }
+ "OverlapMetric companion object" should provide {
+ "pass-through compare method" should returns {
+ "same value as class" in {
+ OverlapMetric.compare("context", "contact")(3).get should be (0.4)
+ }
+ }
+ }
+}
+
+object OverlapMetricSpec {
+ private final val Metric = OverlapMetric()
+}
diff --git a/readme.md b/readme.md
index b82399a..7d3a4d5 100755
--- a/readme.md
+++ b/readme.md
@@ -15,7 +15,7 @@ String metrics and phonetic algorithms for Scala. The library provides facilitie
* __[Needleman-Wunch](http://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm)__ (Queued similarity metric)
* __[N-Gram](http://en.wikipedia.org/wiki/N-gram)__ (Similarity metric and algorithm)
* __[NYSIIS](http://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System)__ (Phonetic metric and algorithm)
-* __[Overlap](http://en.wikipedia.org/wiki/Overlap_coefficient)__ (Queued similarity metric)
+* __[Overlap](http://en.wikipedia.org/wiki/Overlap_coefficient)__ (Similarity metric)
* __[Ratcliff-Obershelp](http://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html)__ (Similarity metric)
* __[Refined NYSIIS](http://www.markcrocker.com/rexxtipsntricks/rxtt28.2.0482.html)__ (Phonetic metric and algorithm)
* __[Refined Soundex](http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html)__ (Phonetic metric and algorithm)