From c91fe035e4427f4ca071c55455e19b011a33a1d5 Mon Sep 17 00:00:00 2001
From: Rocky Madden
Date: Sat, 20 Oct 2012 15:44:53 -0600
Subject: Created DiceSorensen metric, spec, and command.

---
 .../cli/command/diceSorensenMetric.scala           | 62 ++++++++++++++++++++++
 .../cli/command/diceSorensenMetricSpec.scala       | 39 ++++++++++++++
 .../stringmetric/distance/DiceSorensenMetric.scala | 62 ++++++++++++++++++++++
 .../distance/DiceSorensenMetricSpec.scala          | 42 +++++++++++++++
 readme.md                                          |  1 +
 5 files changed, 206 insertions(+)
 create mode 100755 cli/source/core/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetric.scala
 create mode 100755 cli/source/test/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetricSpec.scala
 create mode 100755 core/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala
 create mode 100755 core/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala

diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetric.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetric.scala
new file mode 100755
index 0000000..1e21a2d
--- /dev/null
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetric.scala
@@ -0,0 +1,62 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.{ CaseStringCleaner, StringCleanerDelegate }
+import org.hashtree.stringmetric.cli._
+import org.hashtree.stringmetric.cli.command._
+import org.hashtree.stringmetric.distance.DiceSorensenMetric
+
+/**
+ * The diceSorensenMetric [[org.hashtree.stringmetric.cli.command.Command]]. Compares the similarity of two strings
+ * using the Dice coefficient / Sorensen similarity index.
+ */
+object diceSorensenMetric extends Command {
+  /** Parses the arguments, then prints help or runs the comparison; any failure is handed to error. */
+  override def main(args: Array[String]): Unit = {
+    val options = OptionMapUtility.toOptionMap(args)
+
+    try {
+      // Help.
+      if (options.contains('h) || options.contains('help)) {
+        help()
+        exit(options)
+      // Execute.
+      } else if (options.contains('dashless) && options('dashless).count(_ == ' ') == 1) {
+        execute(options)
+        exit(options)
+      // Invalid syntax.
+      } else {
+        throw new IllegalArgumentException("Expected valid syntax. See --help.")
+      }
+    } catch {
+      // Typed explicitly; still catches every Throwable, exactly as the untyped pattern did.
+      case e: Throwable => error(e)(options)
+    }
+  }
+
+  /** Prints the description, syntax, and options of this command. */
+  override def help(): Unit = {
+    val ls = sys.props("line.separator")
+    val tab = " "
+
+    println(
+      "Compares the similarity of two strings using the Dice coefficient / Sorensen similarity index." + ls + ls +
+      "Syntax:" + ls +
+      tab + "diceSorensenMetric [Options] string1 string2..." + ls + ls +
+      "Options:" + ls +
+      tab + "-h, --help" + ls +
+      tab + tab + "Outputs description, syntax, and options."
+    )
+  }
+
+  /** Prints the similarity of the two space separated strings held under 'dashless, or "not comparable". */
+  override def execute(options: OptionMap): Unit = {
+    val strings = options('dashless).split(" ")
+
+    println(
+      DiceSorensenMetric.compare(
+        strings(0),
+        strings(1)
+      )(new StringCleanerDelegate with CaseStringCleaner).getOrElse("not comparable").toString
+    )
+  }
+}
\ No newline at end of file
diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetricSpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetricSpec.scala
new file mode 100755
index 0000000..33ea7dd
--- /dev/null
+++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetricSpec.scala
@@ -0,0 +1,39 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class diceSorensenMetricSpec extends ScalaTest {
+  "diceSorensenMetric" should provide {
+    "main method" when passed {
+      "valid dashless arguments" should executes {
+        "print if they are a match" in {
+          val out = new java.io.ByteArrayOutputStream()
+
+          Console.withOut(out)(
+            diceSorensenMetric.main(Array("--unitTest", "--debug", "aBc", "abc"))
+          )
+
+          out.toString should equal ("1.0\n")
+          out.reset()
+
+          Console.withOut(out)(
+            diceSorensenMetric.main(Array("--unitTest", "--debug", "aBc", "xyz"))
+          )
+
+          out.toString should equal ("0.0\n")
+          out.reset()
+        }
+      }
+      "no dashless arguments" should throws {
+        "IllegalArgumentException" in {
+          evaluating {
+            diceSorensenMetric.main(Array("--unitTest", "--debug"))
+          } should produce [IllegalArgumentException]
+        }
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala
new file mode 100755
index 0000000..efec67c
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala
@@ -0,0 +1,62 @@
+package org.hashtree.stringmetric.distance
+
+import org.hashtree.stringmetric.{ CompareTuple, MatchTuple, StringCleaner, StringCleanerDelegate, StringMetric }
+import scala.annotation.tailrec
+
+/**
+ * An implementation of the Dice / Sorensen [[org.hashtree.stringmetric.StringMetric]]. The similarity is twice the
+ * number of character bigrams shared by both inputs divided by the total number of bigrams in the two inputs.
+ */
+object DiceSorensenMetric extends StringMetric {
+  /**
+   * Compares two character arrays after passing each through the implicit cleaner.
+   *
+   * @return None if either cleaned array is empty, or if the arrays differ and neither holds a bigram; Some(1) if the
+   * cleaned arrays are identical; otherwise Some(2 * shared bigrams / total bigrams).
+   */
+  override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringCleaner: StringCleaner): Option[Float] = {
+    val ca1 = stringCleaner.clean(charArray1)
+    val ca2 = stringCleaner.clean(charArray2)
+
+    if (ca1.length == 0 || ca2.length == 0) None
+    // Identical input scores 1 even when it is too short to hold a bigram, as the String overload already does.
+    else if (ca1.sameElements(ca2)) Some(1f)
+    else {
+      val b = bigrams(ca1, ca2)
+      val total = b._1.length + b._2.length
+
+      // Two different single characters produce no bigrams; dividing 0 by 0 would surface as Some(NaN).
+      if (total == 0) None
+      else Some((2f * scoreMatches(b)) / total)
+    }
+  }
+
+  /**
+   * Compares two strings. Equal, non-empty strings short-circuit to Some(1); anything else is cleaned once here and
+   * handed to the character array overload.
+   */
+  override def compare(string1: String, string2: String)(implicit stringCleaner: StringCleaner): Option[Float] = {
+    if (string1.length > 0 && string1 == string2) Some(1f)
+    else
+      compare(
+        stringCleaner.clean(string1.toCharArray),
+        stringCleaner.clean(string2.toCharArray)
+      )(new StringCleanerDelegate) // Presumably a pass-through cleaner, so the input is not cleaned twice.
+  }
+
+  /** Builds, for each side of the tuple, its adjacent character pairs in order (none for fewer than two characters). */
+  private[this] def bigrams(ct: CompareTuple[Char]): MatchTuple[String] = {
+    def set(ca: Array[Char]): Array[String] =
+      Array.tabulate(math.max(ca.length - 1, 0))(i => "" + ca(i) + ca(i + 1))
+
+    (set(ct._1), set(ct._2))
+  }
+
+  /**
+   * Counts the bigrams common to both sides. intersect is a multiset intersection, so a repeated bigram is counted
+   * once per occurrence present on both sides.
+   */
+  private[this] def scoreMatches(mt: MatchTuple[String]): Int = {
+    mt._1.intersect(mt._2).length
+  }
+}
\ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala
new file mode 100755
index 0000000..d07673b
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala
@@ -0,0 +1,42 @@
+package org.hashtree.stringmetric.distance
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class DiceSorensenMetricSpec extends ScalaTest {
+  "DiceSorensenMetric" should provide {
+    "compare method" when passed {
+      "empty arguments" should returns {
+        "None" in {
+          DiceSorensenMetric.compare("", "").isDefined should be (false)
+          DiceSorensenMetric.compare("abc", "").isDefined should be (false)
+          DiceSorensenMetric.compare("", "xyz").isDefined should be (false)
+        }
+      }
+      "equal arguments" should returns {
+        "1" in {
+          DiceSorensenMetric.compare("abc", "abc").get should be (1.0f)
+        }
+      }
+      "unequal arguments" should returns {
+        "0" in {
+          DiceSorensenMetric.compare("abc", "xyz").get should be (0.0f)
+        }
+      }
+      "single character arguments" should returns {
+        "1 if equal, None if unequal" in {
+          DiceSorensenMetric.compare("a", "a").get should be (1.0f)
+          DiceSorensenMetric.compare("a".toCharArray, "a".toCharArray).get should be (1.0f)
+          DiceSorensenMetric.compare("a", "b").isDefined should be (false)
+        }
+      }
+      "valid arguments" should returns {
+        "Float indicating distance" in {
+          DiceSorensenMetric.compare("night", "nacht").get should be (0.25f)
+        }
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/readme.md b/readme.md
index 56174bf..0840cb5 100755
--- a/readme.md
+++ b/readme.md
@@ -1,6 +1,7 @@
 #stringmetric
 A collection of string metrics implemented in Scala. Includes a light-weight core API and CLI for each string metric. The following string metrics are currently supported:
 
+* Dice / Sorensen
 * Hamming
 * Jaro
 * Jaro-Winkler
-- 
cgit v1.2.3