diff options
author | Rocky Madden <git@rockymadden.com> | 2012-11-04 19:30:17 -0700 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2012-11-04 19:30:17 -0700 |
commit | 46902b79562e7df92b05c73346be79b6e80d8c03 (patch) | |
tree | 681c6f991371fb8dead13949c87d830dd03b715d | |
parent | 72d88de258e6665fcfebabe6be200a27524fc016 (diff) | |
download | stringmetric-46902b79562e7df92b05c73346be79b6e80d8c03.tar.gz stringmetric-46902b79562e7df92b05c73346be79b6e80d8c03.tar.bz2 stringmetric-46902b79562e7df92b05c73346be79b6e80d8c03.zip |
Created WeightedLevenshtein metric, command, specs, and supporting code.
9 files changed, 401 insertions, 0 deletions
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/ParseUtility.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/ParseUtility.scala new file mode 100755 index 0000000..94f22c6 --- /dev/null +++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/ParseUtility.scala @@ -0,0 +1,14 @@ +package org.hashtree.stringmetric.cli + +import scala.math.BigDecimal + +/** Utility standalone for parse based operations. */ +object ParseUtility { + def parseBigDecimal(string: String): Option[BigDecimal] = try { Some(BigDecimal(string)) } catch { case _ => None } + + def parseDouble(string: String): Option[Double] = try { Some(string.toDouble) } catch { case _ => None } + + def parseFloat(string: String): Option[Float] = try { Some(string.toFloat) } catch { case _ => None } + + def parseInt(string: String): Option[Int] = try { Some(string.toInt) } catch { case _ => None } +}
\ No newline at end of file diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetric.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetric.scala new file mode 100755 index 0000000..ac10754 --- /dev/null +++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetric.scala @@ -0,0 +1,74 @@ +package org.hashtree.stringmetric.cli.similarity + +import org.hashtree.stringmetric.StringFilterDelegate +import org.hashtree.stringmetric.cli._ +import org.hashtree.stringmetric.filter.AsciiLetterCaseStringFilter +import org.hashtree.stringmetric.similarity.WeightedLevenshteinMetric +import scala.math.BigDecimal + +/** + * The weightedLevenshteinMetric [[org.hashtree.stringmetric.cli.Command]]. Compares the number of characters that two + * strings are different from one another via insertion, deletion, and substitution. Allows the invoker to indicate + * the weight each operation takes. + */ +object weightedLevenshteinMetric extends Command { + override def main(args: Array[String]): Unit = { + val options = OptionMapUtility.toOptionMap(args) + + try { + // Help. + if (options.contains('h) || options.contains('help)) { + help() + exit(options) + // Execute. + } else if (options.contains('dashless) && options('dashless).count(_ == ' ') == 1 && + options.contains('deleteWeight) && ParseUtility.parseDouble(options('deleteWeight)).isDefined && + options.contains('insertWeight) && ParseUtility.parseDouble(options('insertWeight)).isDefined && + options.contains('substituteWeight) && ParseUtility.parseDouble(options('substituteWeight)).isDefined + ) { + execute(options) + exit(options) + // Invalid syntax. + } else throw new IllegalArgumentException("Expected valid syntax. See --help.") + } catch { + case e => error(e, options) + } + } + + override def help(): Unit = { + val ls = sys.props("line.separator") + val tab = " " + + println( + "Compares the number of characters that two strings are different from one another via insertion, deletion, " + + "and substitution. Allows the invoker to indicate the weight each operation takes." + ls + ls + + "Syntax:" + ls + + tab + "weightedLevenshteinMetric [Options] --deleteWeight=[double] --insertWeight=[double] --substituteWeight=[double] string1 string2..." + ls + ls + + "Options:" + ls + + tab + "--deleteWeight" + ls + + tab + tab + "The weight given to delete operations." + + tab + "-h, --help" + ls + + tab + tab + "Outputs description, syntax, and options." + + tab + "--insertWeight" + ls + + tab + tab + "The weight given to insert operations." + + tab + "--substituteWeight" + ls + + tab + tab + "The weight given to substitute operations." + ) + } + + override def execute(options: OptionMap): Unit = { + val strings = options('dashless).split(" ") + val weights = Tuple3[BigDecimal, BigDecimal, BigDecimal]( + ParseUtility.parseBigDecimal(options('deleteWeight)).get, + ParseUtility.parseBigDecimal(options('insertWeight)).get, + ParseUtility.parseBigDecimal(options('substituteWeight)).get + ) + + println( + WeightedLevenshteinMetric.compare( + strings(0), + strings(1) + )(weights)(new StringFilterDelegate with AsciiLetterCaseStringFilter).getOrElse("not comparable").toString + ) + } +}
\ No newline at end of file diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/ParseUtilitySpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/ParseUtilitySpec.scala new file mode 100755 index 0000000..686f3da --- /dev/null +++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/ParseUtilitySpec.scala @@ -0,0 +1,60 @@ +package org.hashtree.stringmetric.cli + +import org.hashtree.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import scala.math.BigDecimal + +@RunWith(classOf[JUnitRunner]) +final class ParseUtilitySpec extends ScalaTest { + "ParseUtility" should provide { + "parseBigDecimal method" when passed { + "invalid argument" should returns { + "None" in { + ParseUtility.parseBigDecimal("one").isDefined should be (false) + } + } + "valid argument" should returns { + "Some(BigDecimal)" in { + ParseUtility.parseBigDecimal("1").get should equal (BigDecimal(1)) + } + } + } + "parseDouble method" when passed { + "invalid argument" should returns { + "None" in { + ParseUtility.parseDouble("one").isDefined should be (false) + } + } + "valid argument" should returns { + "Some(Double)" in { + ParseUtility.parseDouble("1").get should be (1d) + } + } + } + "parseFloat method" when passed { + "invalid argument" should returns { + "None" in { + ParseUtility.parseFloat("one").isDefined should be (false) + } + } + "valid argument" should returns { + "Some(Float)" in { + ParseUtility.parseFloat("1").get should be (1f) + } + } + } + "parseInt method" when passed { + "invalid argument" should returns { + "None" in { + ParseUtility.parseInt("one").isDefined should be (false) + } + } + "valid argument" should returns { + "Some(Int)" in { + ParseUtility.parseInt("1").get should be (1) + } + } + } + } +}
\ No newline at end of file diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetricSpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetricSpec.scala new file mode 100755 index 0000000..43e22c4 --- /dev/null +++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/similarity/weightedLevenshteinMetricSpec.scala @@ -0,0 +1,123 @@ +package org.hashtree.stringmetric.cli.similarity + +import org.hashtree.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class weightedLevenshteinMetricSpec extends ScalaTest { + "weightedLevenshteinMetric" should provide { + "main method" when passed { + "valid dashless arguments and valid weight arguments" should executes { + "print if they are a match" in { + val out = new java.io.ByteArrayOutputStream() + + Console.withOut(out)( + weightedLevenshteinMetric.main( + Array( + "--unitTest", + "--debug", + "--deleteWeight=1", + "--insertWeight=1", + "--substituteWeight=1", + "aBc", + "abc" + ) + ) + ) + + out.toString should equal ("0.0\n") + out.reset() + + Console.withOut(out)( + weightedLevenshteinMetric.main( + Array( + "--unitTest", + "--debug", + "--deleteWeight=2", + "--insertWeight=2", + "--substituteWeight=1", + "aBc", + "xyz" + ) + ) + ) + + out.toString should equal ("3.0\n") + out.reset() + + Console.withOut(out)( + weightedLevenshteinMetric.main( + Array( + "--unitTest", + "--debug", + "--deleteWeight=2", + "--insertWeight=1", + "--substituteWeight=2", + "xyz", + "xyzxyz" + ) + ) + ) + + out.toString should equal ("3.0\n") + out.reset() + + Console.withOut(out)( + weightedLevenshteinMetric.main( + Array( + "--unitTest", + "--debug", + "--deleteWeight=1", + "--insertWeight=2", + "--substituteWeight=2", + "xyzxyz", + "xyz" + ) + ) + ) + + out.toString should equal ("3.0\n") + out.reset() + } + } + "valid dashless arguments and invalid weight arguments" should throws { + "IllegalArgumentException" in { + evaluating { + weightedLevenshteinMetric.main( + Array( + "--unitTest", + "--debug", + "--deleteWeight=1", + "--substituteWeight=1", + "aBc", + "abc" + ) + ) + } should produce [IllegalArgumentException] + + evaluating { + weightedLevenshteinMetric.main( + Array( + "--unitTest", + "--debug", + "--deleteWeight=1", + "--insertWeight=q", + "--substituteWeight=1", + "aBc", + "abc" + ) + ) + } should produce [IllegalArgumentException] + } + } + "no dashless arguments" should throws { + "IllegalArgumentException" in { + evaluating { + weightedLevenshteinMetric.main(Array("--unitTest", "--debug")) + } should produce [IllegalArgumentException] + } + } + } + } +}
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/OptionMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/OptionMetric.scala new file mode 100755 index 0000000..1a942e0 --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/OptionMetric.scala @@ -0,0 +1,6 @@ +package org.hashtree.stringmetric + +/** Marks those which leverage traits of an option metric. */ +trait OptionMetric[T, O, F <: Filter[T]] { + def compare(t1: T, t2: T)(o: O)(implicit f: F): Option[AnyVal] +}
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringOptionMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/StringOptionMetric.scala new file mode 100755 index 0000000..a40273f --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/StringOptionMetric.scala @@ -0,0 +1,21 @@ +package org.hashtree.stringmetric + +import org.hashtree.stringmetric.similarity.WeightedLevenshteinMetric + +/** Marks those which leverage traits of a string based [[org.hashtree.stringmetric.OptionMetric]]. */ +trait StringOptionMetric[O] extends OptionMetric[String, O, StringFilter] { + def compare(charArray1: Array[Char], charArray2: Array[Char])(o: O)(implicit stringFilter: StringFilter): Option[AnyVal] +} + +/** Convenience object for those extending [[org.hashtree.stringmetric.StringOptionMetric]]. */ +object StringOptionMetric { + def compareWeightedLevenshtein(charArray1: Array[Char], charArray2: Array[Char]) + (options: Tuple3[BigDecimal, BigDecimal, BigDecimal])(implicit stringFilter: StringFilter): Option[Double] = + WeightedLevenshteinMetric.compare(charArray1, charArray2)(options)(stringFilter) + + def compareWeightedLevenshtein(string1: String, string2: String) + (options: Tuple3[BigDecimal, BigDecimal, BigDecimal])(implicit stringFilter: StringFilter): Option[Double] = + WeightedLevenshteinMetric.compare(string1, string2)(options)(stringFilter) + + def weightedLevenshtein: WeightedLevenshteinMetric.type = WeightedLevenshteinMetric +}
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetric.scala new file mode 100755 index 0000000..545f0ee --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetric.scala @@ -0,0 +1,51 @@ +package org.hashtree.stringmetric.similarity + +import org.hashtree.stringmetric.{ CompareTuple, StringFilter, StringFilterDelegate, StringOptionMetric } +import scala.math.BigDecimal + +/** An implementation of a weighted Levenshtein [[org.hashtree.stringmetric.StringOptionMetric]]. */ +object WeightedLevenshteinMetric extends StringOptionMetric[Tuple3[BigDecimal, BigDecimal, BigDecimal]] { + /** Options order is delete, insert, then substitute weight. */ + def compare(charArray1: Array[Char], charArray2: Array[Char])(options: Tuple3[BigDecimal, BigDecimal, BigDecimal]) + (implicit stringFilter: StringFilter): Option[Double] = { + val ca1 = stringFilter.filter(charArray1) + val ca2 = stringFilter.filter(charArray2) + + if (ca1.length == 0 && ca2.length == 0) None + else if (ca1.length == 0) Some((options._2 * ca2.length).toDouble) + else if (ca2.length == 0) Some((options._1 * ca1.length).toDouble) + else if (ca1.sameElements(ca2)) Some(0d) + else Some(weightedLevenshtein((ca1, ca2), options).toDouble) + } + + /** Options order is delete, insert, then substitute weight. */ + override def compare(string1: String, string2: String)(options: Tuple3[BigDecimal, BigDecimal, BigDecimal]) + (implicit stringFilter: StringFilter): Option[Double] = + compare( + stringFilter.filter(string1.toCharArray), + stringFilter.filter(string2.toCharArray) + )(options)(new StringFilterDelegate) + + private[this] def weightedLevenshtein(ct: CompareTuple[Char], w: Tuple3[BigDecimal, BigDecimal, BigDecimal]) = { + val m = Array.ofDim[BigDecimal](ct._1.length + 1, ct._2.length + 1) + + for (r <- 0 to ct._1.length) m(r)(0) = w._1 * r + for (c <- 0 to ct._2.length) m(0)(c) = w._2 * c + + for (r <- 1 to ct._1.length; c <- 1 to ct._2.length) { + m(r)(c) = { + if (ct._1(r - 1) == ct._2(c - 1)) + m(r - 1)(c - 1) + else { + (m(r - 1)(c) + w._1).min( // Delete (left). + (m(r)(c - 1) + w._2).min( // Insert (up). + m(r - 1)(c - 1) + w._3 // Substitute (left-up). + ) + ) + } + } + } + + m(ct._1.length)(ct._2.length) + } +}
\ No newline at end of file diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetricSpec.scala new file mode 100755 index 0000000..cca3a82 --- /dev/null +++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/WeightedLevenshteinMetricSpec.scala @@ -0,0 +1,49 @@ +package org.hashtree.stringmetric.similarity + +import org.hashtree.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class WeightedLevenshteinMetricSpec extends ScalaTest { + private final val Options = Tuple3[BigDecimal, BigDecimal, BigDecimal](10, 0.1, 1) + + "WeightedLevenshteinMetric" should provide { + "compare method" when passed { + "empty arguments" should returns { + "None" in { + WeightedLevenshteinMetric.compare("", "")(Options).isDefined should be (false) + } + } + "equal arguments" should returns { + "0" in { + WeightedLevenshteinMetric.compare("abc", "abc")(Options).get should be (0) + } + } + "unequal arguments" should returns { + "Double indicating distance" in { + WeightedLevenshteinMetric.compare("abc", "")(Options).get should be (30) + WeightedLevenshteinMetric.compare("", "xyz")(Options).get should be (0.3) + } + } + "valid arguments" should returns { + "Double indicating distance" in { + WeightedLevenshteinMetric.compare("az", "z")(Options).get should be (10) + WeightedLevenshteinMetric.compare("z", "az")(Options).get should be (0.1) + WeightedLevenshteinMetric.compare("a", "z")(Options).get should be (1) + WeightedLevenshteinMetric.compare("z", "a")(Options).get should be (1) + WeightedLevenshteinMetric.compare("ab", "yz")(Options).get should be (2) + WeightedLevenshteinMetric.compare("yz", "ab")(Options).get should be (2) + WeightedLevenshteinMetric.compare("0", "0123456789")(Options).get should be (0.9) + WeightedLevenshteinMetric.compare("0123456789", "0")(Options).get should be (90) + WeightedLevenshteinMetric.compare("book", "back")(Options).get should be (2) + WeightedLevenshteinMetric.compare("back", "book")(Options).get should be (2) + WeightedLevenshteinMetric.compare("hosp", "hospital")(Options).get should be (0.4) + WeightedLevenshteinMetric.compare("hospital", "hosp")(Options).get should be (40) + WeightedLevenshteinMetric.compare("clmbs blvd", "columbus boulevard")(Options).get should be (0.8) + WeightedLevenshteinMetric.compare("columbus boulevard", "clmbs blvd")(Options).get should be (80) + } + } + } + } +}
\ No newline at end of file @@ -29,6 +29,9 @@ A collection of string metrics and phonetic algorithms implemented in Scala. All * __[Soundex](http://en.wikipedia.org/wiki/Soundex)__ * API: org.hashtree.stringmetric.phonetic.SoundexMetric and org.hashtree.stringmetric.phonetic.SoundexAlgorithm * CLI: soundexMetric and soundexAlgorithm +* __Weighted Levenshtein__ + * API: org.hashtree.stringmetric.similarity.WeightedLevenshteinMetric + * CLI: weightedLevenshteinMetric ## Filters Filters, which can optionally be applied, clean up arguments prior to evaluation. Filtering rules can be composed via trait decoration. |