summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-10-16 17:33:23 -0600
committerRocky Madden <git@rockymadden.com>2012-10-16 17:33:23 -0600
commitdfa1b11cf4d2269f5661763b2b8e0298f5216a55 (patch)
tree8e8a9298dd5f0a5566252aa80de10bcd27782c16
parent631a5bdd72e2825bf97994043506f87eecb1f055 (diff)
downloadstringmetric-dfa1b11cf4d2269f5661763b2b8e0298f5216a55.tar.gz
stringmetric-dfa1b11cf4d2269f5661763b2b8e0298f5216a55.tar.bz2
stringmetric-dfa1b11cf4d2269f5661763b2b8e0298f5216a55.zip
Created LevenshteinMetric, spec, and command.
-rwxr-xr-xcli/source/core/scala/org/hashtree/stringmetric/cli/command/levenshteinMetric.scala57
-rwxr-xr-xcli/source/test/scala/org/hashtree/stringmetric/cli/command/levenshteinMetricSpec.scala39
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/LevenshteinMetric.scala60
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/LevenshteinMetricSpec.scala30
-rwxr-xr-xreadme.md1
5 files changed, 187 insertions, 0 deletions
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/command/levenshteinMetric.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/levenshteinMetric.scala
new file mode 100755
index 0000000..5a2d2d6
--- /dev/null
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/levenshteinMetric.scala
@@ -0,0 +1,57 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.{ CaseStringCleaner, LevenshteinMetric, StringCleanerDelegate }
+import org.hashtree.stringmetric.cli._
+import org.hashtree.stringmetric.cli.command._
+
+/**
+ * The levenshteinMetric [[org.hashtree.stringmetric.cli.command.Command]]. Compares the number of characters that two
+ * strings are different from one another via insertion, deletion, and substitution.
+ */
+object levenshteinMetric extends Command {
+ override def main(args: Array[String]): Unit = {
+ val options = OptionMapUtility.toOptionMap(args)
+
+ try {
+ // Help.
+ if (options.contains('h) || options.contains('help)) {
+ help()
+ exit(options)
+ // Execute.
+ } else if (options.contains('dashless) && options('dashless).count(_ == ' ') == 1) {
+ execute(options)
+ exit(options)
+ // Invalid syntax.
+ } else {
+ throw new IllegalArgumentException("Expected valid syntax. See --help.")
+ }
+ } catch {
+ case e => error(e)(options)
+ }
+ }
+
+ override def help(): Unit = {
+ val ls = sys.props("line.separator")
+ val tab = " "
+
+ println(
+ "Compares the number of characters that two strings are different from one another via insertion, deletion, " +
+ "and substitution." + ls + ls +
+ "Syntax:" + ls +
+ tab + "levenshteinMetric [Options] string1 string2..." + ls + ls +
+ "Options:" + ls +
+ tab + "-h, --help" + ls +
+ tab + tab + "Outputs description, syntax, and options."
+ )
+ }
+
+ override def execute(options: OptionMap): Unit = {
+ val strings = options('dashless).split(" ")
+
+ println(
+ LevenshteinMetric.compare(strings(0),
+ strings(1))(new StringCleanerDelegate with CaseStringCleaner
+ ).getOrElse("not comparable").toString
+ )
+ }
+} \ No newline at end of file
diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/command/levenshteinMetricSpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/levenshteinMetricSpec.scala
new file mode 100755
index 0000000..77d7187
--- /dev/null
+++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/levenshteinMetricSpec.scala
@@ -0,0 +1,39 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class levenshteinMetricSpec extends ScalaTest {
+ "levenshteinMetric" should provide {
+ "main method" when passed {
+ "valid dashless arguments" should executes {
+ "print if they are a match" in {
+ val out = new java.io.ByteArrayOutputStream()
+
+ Console.withOut(out)(
+ levenshteinMetric.main(Array("--unitTest", "--debug", "aBc", "abc"))
+ )
+
+ out.toString should equal ("0\n")
+ out.reset()
+
+ Console.withOut(out)(
+ levenshteinMetric.main(Array("--unitTest", "--debug", "aBc", "xyz"))
+ )
+
+ out.toString should equal ("3\n")
+ out.reset()
+ }
+ }
+ "no dashless arguments" should throws {
+ "IllegalArgumentException" in {
+ evaluating {
+ levenshteinMetric.main(Array("--unitTest", "--debug"))
+ } should produce [IllegalArgumentException]
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/LevenshteinMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/LevenshteinMetric.scala
new file mode 100755
index 0000000..9465c66
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/LevenshteinMetric.scala
@@ -0,0 +1,60 @@
+package org.hashtree.stringmetric
+
+import scala.math
+
+/** An implementation of the Levenshtein [[org.hashtree.stringmetric.StringMetric]]. */
+object LevenshteinMetric extends StringMetric {
+ implicit val stringCleaner = new StringCleanerDelegate with CaseStringCleaner
+
+ override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringCleaner: StringCleaner): Option[Int] = {
+ val ca1 = stringCleaner.clean(charArray1)
+ val ca2 = stringCleaner.clean(charArray2)
+
+ if (ca1.length == 0 && ca2.length == 0)
+ None
+ else {
+ val levenshteinMemoize = Memoize.Y(levenshtein)
+
+ Some(levenshteinMemoize(ca1, ca2))
+ }
+ }
+
+ override def compare(string1: String, string2: String)(implicit stringCleaner: StringCleaner): Option[Int] = {
+ compare(
+ stringCleaner.clean(string1.toCharArray),
+ stringCleaner.clean(string2.toCharArray)
+ )(new StringCleanerDelegate)
+ }
+
+ private[this] def levenshtein(f: CompareTuple => Int)(ct: CompareTuple): Int = {
+ if (ct._1.length == 0)
+ ct._2.length
+ else if (ct._2.length == 0)
+ ct._1.length
+ else {
+ math.min(
+ math.min(
+ f(ct._1.tail, ct._2) + 1,
+ f(ct._1, ct._2.tail) + 1
+ ),
+ f(ct._1.tail, ct._2.tail) + (if (ct._1.head != ct._2.head) 1 else 0)
+ )
+ }
+ }
+
+ private[this] final class Memoize[-T, +R](f: T => R) extends (T => R) {
+ private[this] val map = scala.collection.mutable.Map[T, R]()
+
+ def apply(k: T): R = map.getOrElseUpdate(k, f(k))
+ }
+
+ private[this] object Memoize {
+ def apply[T, R](f: T => R) = new Memoize(f)
+
+ def Y[T, R](f: (T => R) => T => R): (T => R) = {
+ lazy val yf: T => R = Memoize(f(yf)(_))
+
+ yf
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/LevenshteinMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/LevenshteinMetricSpec.scala
new file mode 100755
index 0000000..01c9082
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/LevenshteinMetricSpec.scala
@@ -0,0 +1,30 @@
+package org.hashtree.stringmetric
+
+import org.hashtree.stringmetric.LevenshteinMetric.stringCleaner
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class LevenshteinMetricSpec extends ScalaTest {
+ "LevenshteinMetric" should provide {
+ "compare method" when passed {
+ "valid arguments" should returns {
+ "Int indicating distance" in {
+ LevenshteinMetric.compare("", "").isDefined should be (false)
+
+ LevenshteinMetric.compare("abc", "").get should be (3)
+ LevenshteinMetric.compare("", "xyz").get should be (3)
+ LevenshteinMetric.compare("abc", "abc").get should be (0)
+ LevenshteinMetric.compare("abc", "xyz").get should be (3)
+ LevenshteinMetric.compare("abc", "a").get should be (2)
+ LevenshteinMetric.compare("a", "abc").get should be (2)
+ LevenshteinMetric.compare("abc", "c").get should be (2)
+ LevenshteinMetric.compare("c", "abc").get should be (2)
+
+ LevenshteinMetric.compare("kitten", "sitting").get should be (3)
+ LevenshteinMetric.compare("drake", "cake").get should be (2)
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/readme.md b/readme.md
index e0e2be9..395d155 100755
--- a/readme.md
+++ b/readme.md
@@ -4,6 +4,7 @@ A collection of string metrics implemented in Scala. Includes a light-weight cor
* Hamming
* Jaro
* Jaro-Winkler
+* Levenshtein
* Soundex
## Building the API