summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-10-20 15:44:53 -0600
committerRocky Madden <git@rockymadden.com>2012-10-20 15:44:53 -0600
commitc91fe035e4427f4ca071c55455e19b011a33a1d5 (patch)
tree9b220424f76b42dd9cdc61c50c28f97077673665
parent9fbe4868503142a7bd2e502b393d3bd60fdb441c (diff)
downloadstringmetric-c91fe035e4427f4ca071c55455e19b011a33a1d5.tar.gz
stringmetric-c91fe035e4427f4ca071c55455e19b011a33a1d5.tar.bz2
stringmetric-c91fe035e4427f4ca071c55455e19b011a33a1d5.zip
Created DiceSorensen metric, spec, and command.
-rwxr-xr-xcli/source/core/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetric.scala58
-rwxr-xr-xcli/source/test/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetricSpec.scala39
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala44
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala35
-rwxr-xr-xreadme.md1
5 files changed, 177 insertions, 0 deletions
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetric.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetric.scala
new file mode 100755
index 0000000..1e21a2d
--- /dev/null
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetric.scala
@@ -0,0 +1,58 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.{ CaseStringCleaner, StringCleanerDelegate }
+import org.hashtree.stringmetric.cli._
+import org.hashtree.stringmetric.cli.command._
+import org.hashtree.stringmetric.distance.DiceSorensenMetric
+
+/**
+ * Command line front end for the Dice coefficient / Sorensen similarity index, implemented as a
+ * [[org.hashtree.stringmetric.cli.command.Command]]. Prints the similarity of two strings, or usage information
+ * when help is requested.
+ */
+object diceSorensenMetric extends Command {
+  /**
+   * Converts the raw arguments into an option map and dispatches on it: help when `h` or `help` is present,
+   * otherwise a comparison when the dashless portion contains exactly one space. Anything else is rejected with
+   * an IllegalArgumentException. Every failure raised during dispatch is handed to `error`.
+   */
+  override def main(args: Array[String]): Unit = {
+    val options = OptionMapUtility.toOptionMap(args)
+
+    // Evaluated lazily so the dashless entry is only inspected when help was not requested.
+    def hasOneSeparator = options.contains('dashless) && options('dashless).count(_ == ' ') == 1
+
+    try {
+      if (options.contains('h) || options.contains('help)) {
+        help()
+        exit(options)
+      } else if (hasOneSeparator) {
+        execute(options)
+        exit(options)
+      } else throw new IllegalArgumentException("Expected valid syntax. See --help.")
+    } catch {
+      case e: Throwable => error(e)(options)
+    }
+  }
+
+  /** Prints the description, syntax, and options of this command via println. */
+  override def help(): Unit = {
+    val ls = sys.props("line.separator")
+    val tab = " "
+    // One entry per output line; empty entries produce the blank separator lines.
+    val lines = Seq(
+      "Compares the similarity of two strings using the Dice coefficient / Sorensen similarity index.",
+      "",
+      "Syntax:",
+      tab + "diceSorensenMetric [Options] string1 string2...",
+      "",
+      "Options:",
+      tab + "-h, --help",
+      tab + tab + "Outputs description, syntax, and options."
+    )
+
+    println(lines.mkString(ls))
+  }
+
+  /**
+   * Splits the dashless entry on single spaces, compares the first two pieces with
+   * [[org.hashtree.stringmetric.distance.DiceSorensenMetric]], and prints the result. Prints
+   * "not comparable" when the metric yields no value.
+   */
+  override def execute(options: OptionMap): Unit = {
+    val pieces = options('dashless).split(" ")
+    val first = pieces(0)
+    val second = pieces(1)
+    val cleaner = new StringCleanerDelegate with CaseStringCleaner
+    val result = DiceSorensenMetric.compare(first, second)(cleaner)
+
+    println(result.getOrElse("not comparable").toString)
+  }
+} \ No newline at end of file
diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetricSpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetricSpec.scala
new file mode 100755
index 0000000..33ea7dd
--- /dev/null
+++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/diceSorensenMetricSpec.scala
@@ -0,0 +1,39 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class diceSorensenMetricSpec extends ScalaTest {
+  /** Runs the command's main method with the given arguments and returns everything it printed through Console. */
+  private def printed(args: String*): String = {
+    val captured = new java.io.ByteArrayOutputStream()
+
+    Console.withOut(captured)(diceSorensenMetric.main(args.toArray))
+    captured.toString
+  }
+
+  "diceSorensenMetric" should provide {
+    "main method" when passed {
+      "valid dashless arguments" should executes {
+        "print if they are a match" in {
+          // The two strings differ only by letter case and are expected to score as a full match.
+          printed("--unitTest", "--debug", "aBc", "abc") should equal ("1.0\n")
+
+          // The two strings have no characters in common and are expected to score zero.
+          printed("--unitTest", "--debug", "aBc", "xyz") should equal ("0.0\n")
+        }
+      }
+      "no dashless arguments" should throws {
+        "IllegalArgumentException" in {
+          // Only flags are supplied, so there is nothing to compare.
+          evaluating {
+            diceSorensenMetric.main(Array("--unitTest", "--debug"))
+          } should produce [IllegalArgumentException]
+        }
+      }
+    }
+  }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala
new file mode 100755
index 0000000..efec67c
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala
@@ -0,0 +1,44 @@
+package org.hashtree.stringmetric.distance
+
+import org.hashtree.stringmetric.{ CompareTuple, MatchTuple, StringCleaner, StringCleanerDelegate, StringMetric }
+import scala.annotation.tailrec
+
+/**
+ * An implementation of the Dice, and Sorensen, [[org.hashtree.stringmetric.StringMetric]]. Similarity is computed
+ * over adjacent character pairs (bigrams) as twice the number of shared bigrams divided by the total number of
+ * bigrams in both arguments.
+ */
+object DiceSorensenMetric extends StringMetric {
+  /**
+   * Compares two character arrays after passing each through the supplied cleaner.
+   *
+   * @param charArray1 the first argument
+   * @param charArray2 the second argument
+   * @param stringCleaner cleaner applied to both arguments before comparison
+   * @return None when either cleaned argument is empty, or when neither cleaned argument is long enough to
+   * contain a bigram; otherwise the similarity as a Float
+   */
+  override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringCleaner: StringCleaner): Option[Float] = {
+    val ca1 = stringCleaner.clean(charArray1)
+    val ca2 = stringCleaner.clean(charArray2)
+
+    if (ca1.length == 0 || ca2.length == 0) None
+    else {
+      val b = bigrams(ca1, ca2)
+      val total = b._1.length + b._2.length
+
+      // Two single-character arguments yield no bigrams at all; dividing by that zero total would produce NaN
+      // rather than a meaningful score, so the pair is reported as not comparable.
+      if (total == 0) None
+      else Some((2f * scoreMatches(b)) / total)
+    }
+  }
+
+  /**
+   * Compares two strings. Non-empty strings that are already equal score 1 without further work; otherwise both
+   * are cleaned here and handed to the character array overload.
+   *
+   * @param string1 the first argument
+   * @param string2 the second argument
+   * @param stringCleaner cleaner applied to both arguments before comparison
+   * @return the result of the character array comparison, or Some(1f) for equal non-empty strings
+   */
+  override def compare(string1: String, string2: String)(implicit stringCleaner: StringCleaner): Option[Float] = {
+    if (string1.length > 0 && string1 == string2) Some(1f)
+    else
+      compare(
+        stringCleaner.clean(string1.toCharArray),
+        stringCleaner.clean(string2.toCharArray)
+      )(new StringCleanerDelegate) // Cleaning already happened above; presumably the bare delegate is a pass-through.
+  }
+
+  /** Produces, for each side of the tuple, every pair of adjacent characters as a two-character string. */
+  private[this] def bigrams(ct: CompareTuple[Char]): MatchTuple[String] = {
+    def set(ca: Array[Char]): Array[String] = {
+      // Guard first: sliding over a single character would yield one undersized window.
+      if (ca.length <= 1) Array.empty[String]
+      else ca.sliding(2).map(pair => "" + pair(0) + pair(1)).toArray
+    }
+
+    (set(ct._1), set(ct._2))
+  }
+
+  /** Counts the bigrams common to both sides; repeated bigrams count once per occurrence present on both sides. */
+  private[this] def scoreMatches(mt: MatchTuple[String]) = {
+    mt._1.intersect(mt._2).length
+  }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala
new file mode 100755
index 0000000..d07673b
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala
@@ -0,0 +1,35 @@
+package org.hashtree.stringmetric.distance
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class DiceSorensenMetricSpec extends ScalaTest {
+  /** Shorthand for the string comparison under test. */
+  private def similarity(first: String, second: String): Option[Float] =
+    DiceSorensenMetric.compare(first, second)
+
+  "DiceSorensenMetric" should provide {
+    "compare method" when passed {
+      "empty arguments" should returns {
+        "None" in {
+          // Both empty, second empty, first empty.
+          Seq(("", ""), ("abc", ""), ("", "xyz")).foreach { case (first, second) =>
+            similarity(first, second).isDefined should be (false)
+          }
+        }
+      }
+      "equal arguments" should returns {
+        "1" in {
+          similarity("abc", "abc").get should be (1.0f)
+        }
+      }
+      "unequal arguments" should returns {
+        "0" in {
+          similarity("abc", "xyz").get should be (0.0f)
+        }
+      }
+      "valid arguments" should returns {
+        "Float indicating distance" in {
+          // Only "ht" is shared among the four bigrams on each side: 2 * 1 / (4 + 4).
+          similarity("night", "nacht").get should be (0.25f)
+        }
+      }
+    }
+  }
+} \ No newline at end of file
diff --git a/readme.md b/readme.md
index 56174bf..0840cb5 100755
--- a/readme.md
+++ b/readme.md
@@ -1,6 +1,7 @@
#stringmetric
A collection of string metrics implemented in Scala. Includes a light-weight core API and CLI for each string metric. The following string metrics are currently supported:
+* Dice / Sorensen
* Hamming
* Jaro
* Jaro-Winkler