summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-11-04 19:30:17 -0700
committerRocky Madden <git@rockymadden.com>2012-11-04 19:30:17 -0700
commit46902b79562e7df92b05c73346be79b6e80d8c03 (patch)
tree681c6f991371fb8dead13949c87d830dd03b715d
parent72d88de258e6665fcfebabe6be200a27524fc016 (diff)
downloadstringmetric-46902b79562e7df92b05c73346be79b6e80d8c03.tar.gz
stringmetric-46902b79562e7df92b05c73346be79b6e80d8c03.tar.bz2
stringmetric-46902b79562e7df92b05c73346be79b6e80d8c03.zip
Created WeightedLevenshtein metric, command, specs, and supporting code.
-rwxr-xr-xcli/source/core/scala/org/hashtree/stringmetric/cli/ParseUtility.scala14
-rwxr-xr-xcli/source/core/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetric.scala74
-rwxr-xr-xcli/source/test/scala/org/hashtree/stringmetric/cli/ParseUtilitySpec.scala60
-rwxr-xr-xcli/source/test/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetricSpec.scala123
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/OptionMetric.scala6
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/StringOptionMetric.scala21
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetric.scala51
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetricSpec.scala49
-rwxr-xr-xreadme.md3
9 files changed, 401 insertions, 0 deletions
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/ParseUtility.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/ParseUtility.scala
new file mode 100755
index 0000000..94f22c6
--- /dev/null
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/ParseUtility.scala
@@ -0,0 +1,14 @@
+package org.hashtree.stringmetric.cli
+
+import scala.math.BigDecimal
+
+/** Utility standalone for parse based operations. */
+object ParseUtility {
+ def parseBigDecimal(string: String): Option[BigDecimal] = try { Some(BigDecimal(string)) } catch { case _ => None }
+
+ def parseDouble(string: String): Option[Double] = try { Some(string.toDouble) } catch { case _ => None }
+
+ def parseFloat(string: String): Option[Float] = try { Some(string.toFloat) } catch { case _ => None }
+
+ def parseInt(string: String): Option[Int] = try { Some(string.toInt) } catch { case _ => None }
+} \ No newline at end of file
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetric.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetric.scala
new file mode 100755
index 0000000..ac10754
--- /dev/null
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetric.scala
@@ -0,0 +1,74 @@
+package org.hashtree.stringmetric.cli.similarity
+
+import org.hashtree.stringmetric.StringFilterDelegate
+import org.hashtree.stringmetric.cli._
+import org.hashtree.stringmetric.filter.AsciiLetterCaseStringFilter
+import org.hashtree.stringmetric.similarity.WeightedLevenshteinMetric
+import scala.math.BigDecimal
+
+/**
+ * The weightedLevenshteinMetric [[org.hashtree.stringmetric.cli.Command]]. Compares the number of characters that two
+ * strings are different from one another via insertion, deletion, and substitution. Allows the invoker to indicate
+ * the weight each operation takes.
+ */
+object weightedLevenshteinMetric extends Command {
+ override def main(args: Array[String]): Unit = {
+ val options = OptionMapUtility.toOptionMap(args)
+
+ try {
+ // Help.
+ if (options.contains('h) || options.contains('help)) {
+ help()
+ exit(options)
+ // Execute.
+ } else if (options.contains('dashless) && options('dashless).count(_ == ' ') == 1 &&
+ options.contains('deleteWeight) && ParseUtility.parseDouble(options('deleteWeight)).isDefined &&
+ options.contains('insertWeight) && ParseUtility.parseDouble(options('insertWeight)).isDefined &&
+ options.contains('substituteWeight) && ParseUtility.parseDouble(options('substituteWeight)).isDefined
+ ) {
+ execute(options)
+ exit(options)
+ // Invalid syntax.
+ } else throw new IllegalArgumentException("Expected valid syntax. See --help.")
+ } catch {
+ case e => error(e, options)
+ }
+ }
+
+ override def help(): Unit = {
+ val ls = sys.props("line.separator")
+ val tab = " "
+
+ println(
+ "Compares the number of characters that two strings are different from one another via insertion, deletion, " +
+ "and substitution. Allows the invoker to indicate the weight each operation takes." + ls + ls +
+ "Syntax:" + ls +
+ tab + "weightedLevenshteinMetric [Options] --deleteWeight=[double] --insertWeight=[double] --substituteWeight=[double] string1 string2..." + ls + ls +
+ "Options:" + ls +
+ tab + "--deleteWeight" + ls +
+ tab + tab + "The weight given to delete operations." +
+ tab + "-h, --help" + ls +
+ tab + tab + "Outputs description, syntax, and options." +
+ tab + "--insertWeight" + ls +
+ tab + tab + "The weight given to insert operations." +
+ tab + "--substituteWeight" + ls +
+ tab + tab + "The weight given to substitute operations."
+ )
+ }
+
+ override def execute(options: OptionMap): Unit = {
+ val strings = options('dashless).split(" ")
+ val weights = Tuple3[BigDecimal, BigDecimal, BigDecimal](
+ ParseUtility.parseBigDecimal(options('deleteWeight)).get,
+ ParseUtility.parseBigDecimal(options('insertWeight)).get,
+ ParseUtility.parseBigDecimal(options('substituteWeight)).get
+ )
+
+ println(
+ WeightedLevenshteinMetric.compare(
+ strings(0),
+ strings(1)
+ )(weights)(new StringFilterDelegate with AsciiLetterCaseStringFilter).getOrElse("not comparable").toString
+ )
+ }
+} \ No newline at end of file
diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/ParseUtilitySpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/ParseUtilitySpec.scala
new file mode 100755
index 0000000..686f3da
--- /dev/null
+++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/ParseUtilitySpec.scala
@@ -0,0 +1,60 @@
+package org.hashtree.stringmetric.cli
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+import scala.math.BigDecimal
+
+@RunWith(classOf[JUnitRunner])
+final class ParseUtilitySpec extends ScalaTest {
+ "ParseUtility" should provide {
+ "parseBigDecimal method" when passed {
+ "invalid argument" should returns {
+ "None" in {
+ ParseUtility.parseBigDecimal("one").isDefined should be (false)
+ }
+ }
+ "valid argument" should returns {
+ "Some(BigDecimal)" in {
+ ParseUtility.parseBigDecimal("1").get should equal (BigDecimal(1))
+ }
+ }
+ }
+ "parseDouble method" when passed {
+ "invalid argument" should returns {
+ "None" in {
+ ParseUtility.parseDouble("one").isDefined should be (false)
+ }
+ }
+ "valid argument" should returns {
+ "Some(Double)" in {
+ ParseUtility.parseDouble("1").get should be (1d)
+ }
+ }
+ }
+ "parseFloat method" when passed {
+ "invalid argument" should returns {
+ "None" in {
+ ParseUtility.parseFloat("one").isDefined should be (false)
+ }
+ }
+ "valid argument" should returns {
+ "Some(Float)" in {
+ ParseUtility.parseFloat("1").get should be (1f)
+ }
+ }
+ }
+ "parseInt method" when passed {
+ "invalid argument" should returns {
+ "None" in {
+ ParseUtility.parseInt("one").isDefined should be (false)
+ }
+ }
+ "valid argument" should returns {
+ "Some(Int)" in {
+ ParseUtility.parseInt("1").get should be (1)
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetricSpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetricSpec.scala
new file mode 100755
index 0000000..43e22c4
--- /dev/null
+++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetricSpec.scala
@@ -0,0 +1,123 @@
+package org.hashtree.stringmetric.cli.similarity
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class weightedLevenshteinMetricSpec extends ScalaTest {
+ "weightedLevenshteinMetric" should provide {
+ "main method" when passed {
+ "valid dashless arguments and valid weight arguments" should executes {
+ "print if they are a match" in {
+ val out = new java.io.ByteArrayOutputStream()
+
+ Console.withOut(out)(
+ weightedLevenshteinMetric.main(
+ Array(
+ "--unitTest",
+ "--debug",
+ "--deleteWeight=1",
+ "--insertWeight=1",
+ "--substituteWeight=1",
+ "aBc",
+ "abc"
+ )
+ )
+ )
+
+ out.toString should equal ("0.0\n")
+ out.reset()
+
+ Console.withOut(out)(
+ weightedLevenshteinMetric.main(
+ Array(
+ "--unitTest",
+ "--debug",
+ "--deleteWeight=2",
+ "--insertWeight=2",
+ "--substituteWeight=1",
+ "aBc",
+ "xyz"
+ )
+ )
+ )
+
+ out.toString should equal ("3.0\n")
+ out.reset()
+
+ Console.withOut(out)(
+ weightedLevenshteinMetric.main(
+ Array(
+ "--unitTest",
+ "--debug",
+ "--deleteWeight=2",
+ "--insertWeight=1",
+ "--substituteWeight=2",
+ "xyz",
+ "xyzxyz"
+ )
+ )
+ )
+
+ out.toString should equal ("3.0\n")
+ out.reset()
+
+ Console.withOut(out)(
+ weightedLevenshteinMetric.main(
+ Array(
+ "--unitTest",
+ "--debug",
+ "--deleteWeight=1",
+ "--insertWeight=2",
+ "--substituteWeight=2",
+ "xyzxyz",
+ "xyz"
+ )
+ )
+ )
+
+ out.toString should equal ("3.0\n")
+ out.reset()
+ }
+ }
+ "valid dashless arguments and invalid weight arguments" should throws {
+ "IllegalArgumentException" in {
+ evaluating {
+ weightedLevenshteinMetric.main(
+ Array(
+ "--unitTest",
+ "--debug",
+ "--deleteWeight=1",
+ "--substituteWeight=1",
+ "aBc",
+ "abc"
+ )
+ )
+ } should produce [IllegalArgumentException]
+
+ evaluating {
+ weightedLevenshteinMetric.main(
+ Array(
+ "--unitTest",
+ "--debug",
+ "--deleteWeight=1",
+ "--insertWeight=q",
+ "--substituteWeight=1",
+ "aBc",
+ "abc"
+ )
+ )
+ } should produce [IllegalArgumentException]
+ }
+ }
+ "no dashless arguments" should throws {
+ "IllegalArgumentException" in {
+ evaluating {
+ weightedLevenshteinMetric.main(Array("--unitTest", "--debug"))
+ } should produce [IllegalArgumentException]
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/OptionMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/OptionMetric.scala
new file mode 100755
index 0000000..1a942e0
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/OptionMetric.scala
@@ -0,0 +1,6 @@
+package org.hashtree.stringmetric
+
+/** Marks those which leverage traits of an option metric. */
+trait OptionMetric[T, O, F <: Filter[T]] {
+ def compare(t1: T, t2: T)(o: O)(implicit f: F): Option[AnyVal]
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringOptionMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/StringOptionMetric.scala
new file mode 100755
index 0000000..a40273f
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/StringOptionMetric.scala
@@ -0,0 +1,21 @@
+package org.hashtree.stringmetric
+
+import org.hashtree.stringmetric.similarity.WeightedLevenshteinMetric
+
+/** Marks those which leverage traits of a string based [[org.hashtree.stringmetric.OptionMetric]]. */
+trait StringOptionMetric[O] extends OptionMetric[String, O, StringFilter] {
+ def compare(charArray1: Array[Char], charArray2: Array[Char])(o: O)(implicit stringFilter: StringFilter): Option[AnyVal]
+}
+
+/** Convenience object for those extending [[org.hashtree.stringmetric.StringOptionMetric]]. */
+object StringOptionMetric {
+ def compareWeightedLevenshtein(charArray1: Array[Char], charArray2: Array[Char])
+ (options: Tuple3[BigDecimal, BigDecimal, BigDecimal])(implicit stringFilter: StringFilter): Option[Double] =
+ WeightedLevenshteinMetric.compare(charArray1, charArray2)(options)(stringFilter)
+
+ def compareWeightedLevenshtein(string1: String, string2: String)
+ (options: Tuple3[BigDecimal, BigDecimal, BigDecimal])(implicit stringFilter: StringFilter): Option[Double] =
+ WeightedLevenshteinMetric.compare(string1, string2)(options)(stringFilter)
+
+ def weightedLevenshtein: WeightedLevenshteinMetric.type = WeightedLevenshteinMetric
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetric.scala
new file mode 100755
index 0000000..545f0ee
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetric.scala
@@ -0,0 +1,51 @@
+package org.hashtree.stringmetric.similarity
+
+import org.hashtree.stringmetric.{ CompareTuple, StringFilter, StringFilterDelegate, StringOptionMetric }
+import scala.math.BigDecimal
+
+/** An implementation of a weighted Levenshtein [[org.hashtree.stringmetric.StringOptionMetric]]. */
+object WeightedLevenshteinMetric extends StringOptionMetric[Tuple3[BigDecimal, BigDecimal, BigDecimal]] {
+ /** Options order is delete, insert, then substitute weight. */
+ def compare(charArray1: Array[Char], charArray2: Array[Char])(options: Tuple3[BigDecimal, BigDecimal, BigDecimal])
+ (implicit stringFilter: StringFilter): Option[Double] = {
+ val ca1 = stringFilter.filter(charArray1)
+ val ca2 = stringFilter.filter(charArray2)
+
+ if (ca1.length == 0 && ca2.length == 0) None
+ else if (ca1.length == 0) Some((options._2 * ca2.length).toDouble)
+ else if (ca2.length == 0) Some((options._1 * ca1.length).toDouble)
+ else if (ca1.sameElements(ca2)) Some(0d)
+ else Some(weightedLevenshtein((ca1, ca2), options).toDouble)
+ }
+
+ /** Options order is delete, insert, then substitute weight. */
+ override def compare(string1: String, string2: String)(options: Tuple3[BigDecimal, BigDecimal, BigDecimal])
+ (implicit stringFilter: StringFilter): Option[Double] =
+ compare(
+ stringFilter.filter(string1.toCharArray),
+ stringFilter.filter(string2.toCharArray)
+ )(options)(new StringFilterDelegate)
+
+ private[this] def weightedLevenshtein(ct: CompareTuple[Char], w: Tuple3[BigDecimal, BigDecimal, BigDecimal]) = {
+ val m = Array.ofDim[BigDecimal](ct._1.length + 1, ct._2.length + 1)
+
+ for (r <- 0 to ct._1.length) m(r)(0) = w._1 * r
+ for (c <- 0 to ct._2.length) m(0)(c) = w._2 * c
+
+ for (r <- 1 to ct._1.length; c <- 1 to ct._2.length) {
+ m(r)(c) = {
+ if (ct._1(r - 1) == ct._2(c - 1))
+ m(r - 1)(c - 1)
+ else {
+ (m(r - 1)(c) + w._1).min( // Delete (left).
+ (m(r)(c - 1) + w._2).min( // Insert (up).
+ m(r - 1)(c - 1) + w._3 // Substitute (left-up).
+ )
+ )
+ }
+ }
+ }
+
+ m(ct._1.length)(ct._2.length)
+ }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetricSpec.scala
new file mode 100755
index 0000000..cca3a82
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetricSpec.scala
@@ -0,0 +1,49 @@
+package org.hashtree.stringmetric.similarity
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class WeightedLevenshteinMetricSpec extends ScalaTest {
+ private final val Options = Tuple3[BigDecimal, BigDecimal, BigDecimal](10, 0.1, 1)
+
+ "WeightedLevenshteinMetric" should provide {
+ "compare method" when passed {
+ "empty arguments" should returns {
+ "None" in {
+ WeightedLevenshteinMetric.compare("", "")(Options).isDefined should be (false)
+ }
+ }
+ "equal arguments" should returns {
+ "0" in {
+ WeightedLevenshteinMetric.compare("abc", "abc")(Options).get should be (0)
+ }
+ }
+ "unequal arguments" should returns {
+ "Double indicating distance" in {
+ WeightedLevenshteinMetric.compare("abc", "")(Options).get should be (30)
+ WeightedLevenshteinMetric.compare("", "xyz")(Options).get should be (0.3)
+ }
+ }
+ "valid arguments" should returns {
+ "Double indicating distance" in {
+ WeightedLevenshteinMetric.compare("az", "z")(Options).get should be (10)
+ WeightedLevenshteinMetric.compare("z", "az")(Options).get should be (0.1)
+ WeightedLevenshteinMetric.compare("a", "z")(Options).get should be (1)
+ WeightedLevenshteinMetric.compare("z", "a")(Options).get should be (1)
+ WeightedLevenshteinMetric.compare("ab", "yz")(Options).get should be (2)
+ WeightedLevenshteinMetric.compare("yz", "ab")(Options).get should be (2)
+ WeightedLevenshteinMetric.compare("0", "0123456789")(Options).get should be (0.9)
+ WeightedLevenshteinMetric.compare("0123456789", "0")(Options).get should be (90)
+ WeightedLevenshteinMetric.compare("book", "back")(Options).get should be (2)
+ WeightedLevenshteinMetric.compare("back", "book")(Options).get should be (2)
+ WeightedLevenshteinMetric.compare("hosp", "hospital")(Options).get should be (0.4)
+ WeightedLevenshteinMetric.compare("hospital", "hosp")(Options).get should be (40)
+ WeightedLevenshteinMetric.compare("clmbs blvd", "columbus boulevard")(Options).get should be (0.8)
+ WeightedLevenshteinMetric.compare("columbus boulevard", "clmbs blvd")(Options).get should be (80)
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/readme.md b/readme.md
index 92cc9d6..38100a2 100755
--- a/readme.md
+++ b/readme.md
@@ -29,6 +29,9 @@ A collection of string metrics and phonetic algorithms implemented in Scala. All
* __[Soundex](http://en.wikipedia.org/wiki/Soundex)__
* API: org.hashtree.stringmetric.phonetic.SoundexMetric and org.hashtree.stringmetric.phonetic.SoundexAlgorithm
* CLI: soundexMetric and soundexAlgorithm
+* __Weighted Levenshtein__
+ * API: org.hashtree.stringmetric.similarity.WeightedLevenshteinMetric
+ * CLI: weightedLevenshteinMetric
## Filters
Filters, which can optionally be applied, clean up arguments prior to evaluation. Filtering rules can be composed via trait decoration.