summaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-11-04 19:30:17 -0700
committerRocky Madden <git@rockymadden.com>2012-11-04 19:30:17 -0700
commit46902b79562e7df92b05c73346be79b6e80d8c03 (patch)
tree681c6f991371fb8dead13949c87d830dd03b715d /core
parent72d88de258e6665fcfebabe6be200a27524fc016 (diff)
downloadstringmetric-46902b79562e7df92b05c73346be79b6e80d8c03.tar.gz
stringmetric-46902b79562e7df92b05c73346be79b6e80d8c03.tar.bz2
stringmetric-46902b79562e7df92b05c73346be79b6e80d8c03.zip
Created WeightedLevenshtein metric, command, specs, and supporting code.
Diffstat (limited to 'core')
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/OptionMetric.scala6
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/StringOptionMetric.scala21
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetric.scala51
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetricSpec.scala49
4 files changed, 127 insertions, 0 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/OptionMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/OptionMetric.scala
new file mode 100755
index 0000000..1a942e0
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/OptionMetric.scala
@@ -0,0 +1,6 @@
+package org.hashtree.stringmetric
+
+/** Marks those which leverage traits of an option metric. */
+trait OptionMetric[T, O, F <: Filter[T]] {
+ def compare(t1: T, t2: T)(o: O)(implicit f: F): Option[AnyVal]
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringOptionMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/StringOptionMetric.scala
new file mode 100755
index 0000000..a40273f
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/StringOptionMetric.scala
@@ -0,0 +1,21 @@
+package org.hashtree.stringmetric
+
+import org.hashtree.stringmetric.similarity.WeightedLevenshteinMetric
+
+/** Marks those which leverage traits of a string based [[org.hashtree.stringmetric.OptionMetric]]. */
+trait StringOptionMetric[O] extends OptionMetric[String, O, StringFilter] {
+ def compare(charArray1: Array[Char], charArray2: Array[Char])(o: O)(implicit stringFilter: StringFilter): Option[AnyVal]
+}
+
+/** Convenience object for those extending [[org.hashtree.stringmetric.StringOptionMetric]]. */
+object StringOptionMetric {
+ def compareWeightedLevenshtein(charArray1: Array[Char], charArray2: Array[Char])
+ (options: Tuple3[BigDecimal, BigDecimal, BigDecimal])(implicit stringFilter: StringFilter): Option[Double] =
+ WeightedLevenshteinMetric.compare(charArray1, charArray2)(options)(stringFilter)
+
+ def compareWeightedLevenshtein(string1: String, string2: String)
+ (options: Tuple3[BigDecimal, BigDecimal, BigDecimal])(implicit stringFilter: StringFilter): Option[Double] =
+ WeightedLevenshteinMetric.compare(string1, string2)(options)(stringFilter)
+
+ def weightedLevenshtein: WeightedLevenshteinMetric.type = WeightedLevenshteinMetric
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetric.scala
new file mode 100755
index 0000000..545f0ee
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetric.scala
@@ -0,0 +1,51 @@
+package org.hashtree.stringmetric.similarity
+
+import org.hashtree.stringmetric.{ CompareTuple, StringFilter, StringFilterDelegate, StringOptionMetric }
+import scala.math.BigDecimal
+
+/** An implementation of a weighted Levenshtein [[org.hashtree.stringmetric.StringOptionMetric]]. */
+object WeightedLevenshteinMetric extends StringOptionMetric[Tuple3[BigDecimal, BigDecimal, BigDecimal]] {
+ /** Options order is delete, insert, then substitute weight. */
+ def compare(charArray1: Array[Char], charArray2: Array[Char])(options: Tuple3[BigDecimal, BigDecimal, BigDecimal])
+ (implicit stringFilter: StringFilter): Option[Double] = {
+ val ca1 = stringFilter.filter(charArray1)
+ val ca2 = stringFilter.filter(charArray2)
+
+ if (ca1.length == 0 && ca2.length == 0) None
+ else if (ca1.length == 0) Some((options._2 * ca2.length).toDouble)
+ else if (ca2.length == 0) Some((options._1 * ca1.length).toDouble)
+ else if (ca1.sameElements(ca2)) Some(0d)
+ else Some(weightedLevenshtein((ca1, ca2), options).toDouble)
+ }
+
+ /** Options order is delete, insert, then substitute weight. */
+ override def compare(string1: String, string2: String)(options: Tuple3[BigDecimal, BigDecimal, BigDecimal])
+ (implicit stringFilter: StringFilter): Option[Double] =
+ compare(
+ stringFilter.filter(string1.toCharArray),
+ stringFilter.filter(string2.toCharArray)
+ )(options)(new StringFilterDelegate)
+
+ private[this] def weightedLevenshtein(ct: CompareTuple[Char], w: Tuple3[BigDecimal, BigDecimal, BigDecimal]) = {
+ val m = Array.ofDim[BigDecimal](ct._1.length + 1, ct._2.length + 1)
+
+ for (r <- 0 to ct._1.length) m(r)(0) = w._1 * r
+ for (c <- 0 to ct._2.length) m(0)(c) = w._2 * c
+
+ for (r <- 1 to ct._1.length; c <- 1 to ct._2.length) {
+ m(r)(c) = {
+ if (ct._1(r - 1) == ct._2(c - 1))
+ m(r - 1)(c - 1)
+ else {
+ (m(r - 1)(c) + w._1).min( // Delete (left).
+ (m(r)(c - 1) + w._2).min( // Insert (up).
+ m(r - 1)(c - 1) + w._3 // Substitute (left-up).
+ )
+ )
+ }
+ }
+ }
+
+ m(ct._1.length)(ct._2.length)
+ }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetricSpec.scala
new file mode 100755
index 0000000..cca3a82
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetricSpec.scala
@@ -0,0 +1,49 @@
+package org.hashtree.stringmetric.similarity
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class WeightedLevenshteinMetricSpec extends ScalaTest {
+ private final val Options = Tuple3[BigDecimal, BigDecimal, BigDecimal](10, 0.1, 1)
+
+ "WeightedLevenshteinMetric" should provide {
+ "compare method" when passed {
+ "empty arguments" should returns {
+ "None" in {
+ WeightedLevenshteinMetric.compare("", "")(Options).isDefined should be (false)
+ }
+ }
+ "equal arguments" should returns {
+ "0" in {
+ WeightedLevenshteinMetric.compare("abc", "abc")(Options).get should be (0)
+ }
+ }
+ "unequal arguments" should returns {
+ "Double indicating distance" in {
+ WeightedLevenshteinMetric.compare("abc", "")(Options).get should be (30)
+ WeightedLevenshteinMetric.compare("", "xyz")(Options).get should be (0.3)
+ }
+ }
+ "valid arguments" should returns {
+ "Double indicating distance" in {
+ WeightedLevenshteinMetric.compare("az", "z")(Options).get should be (10)
+ WeightedLevenshteinMetric.compare("z", "az")(Options).get should be (0.1)
+ WeightedLevenshteinMetric.compare("a", "z")(Options).get should be (1)
+ WeightedLevenshteinMetric.compare("z", "a")(Options).get should be (1)
+ WeightedLevenshteinMetric.compare("ab", "yz")(Options).get should be (2)
+ WeightedLevenshteinMetric.compare("yz", "ab")(Options).get should be (2)
+ WeightedLevenshteinMetric.compare("0", "0123456789")(Options).get should be (0.9)
+ WeightedLevenshteinMetric.compare("0123456789", "0")(Options).get should be (90)
+ WeightedLevenshteinMetric.compare("book", "back")(Options).get should be (2)
+ WeightedLevenshteinMetric.compare("back", "book")(Options).get should be (2)
+ WeightedLevenshteinMetric.compare("hosp", "hospital")(Options).get should be (0.4)
+ WeightedLevenshteinMetric.compare("hospital", "hosp")(Options).get should be (40)
+ WeightedLevenshteinMetric.compare("clmbs blvd", "columbus boulevard")(Options).get should be (0.8)
+ WeightedLevenshteinMetric.compare("columbus boulevard", "clmbs blvd")(Options).get should be (80)
+ }
+ }
+ }
+ }
+} \ No newline at end of file