summaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-10-20 15:44:53 -0600
committerRocky Madden <git@rockymadden.com>2012-10-20 15:44:53 -0600
commitc91fe035e4427f4ca071c55455e19b011a33a1d5 (patch)
tree9b220424f76b42dd9cdc61c50c28f97077673665 /core
parent9fbe4868503142a7bd2e502b393d3bd60fdb441c (diff)
downloadstringmetric-c91fe035e4427f4ca071c55455e19b011a33a1d5.tar.gz
stringmetric-c91fe035e4427f4ca071c55455e19b011a33a1d5.tar.bz2
stringmetric-c91fe035e4427f4ca071c55455e19b011a33a1d5.zip
Created DiceSorensen metric, spec, and command.
Diffstat (limited to 'core')
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala44
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala35
2 files changed, 79 insertions, 0 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala
new file mode 100755
index 0000000..efec67c
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala
@@ -0,0 +1,44 @@
+package org.hashtree.stringmetric.distance
+
+import org.hashtree.stringmetric.{ CompareTuple, MatchTuple, StringCleaner, StringCleanerDelegate, StringMetric }
+import scala.annotation.tailrec
+
+/**
+ * An implementation of the Dice, and Sorensen, [[org.hashtree.stringmetric.StringMetric]].
+ *
+ * The score is computed over overlapping character bigrams as
+ * `2 * shared / (bigrams(first).length + bigrams(second).length)`.
+ */
+object DiceSorensenMetric extends StringMetric {
+  /**
+   * Compares two character arrays after passing each of them through `stringCleaner`.
+   *
+   * @param charArray1 the first sequence of characters
+   * @param charArray2 the second sequence of characters
+   * @param stringCleaner cleaner applied to both arrays before bigrams are extracted
+   * @return `None` when either cleaned array is empty, or when neither cleaned array is long
+   *         enough to form a bigram (the coefficient is undefined for a zero denominator);
+   *         otherwise the coefficient wrapped in `Some`
+   */
+  override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringCleaner: StringCleaner): Option[Float] = {
+    val ca1 = stringCleaner.clean(charArray1)
+    val ca2 = stringCleaner.clean(charArray2)
+
+    if (ca1.length == 0 || ca2.length == 0) None
+    else {
+      val b = bigrams((ca1, ca2))
+      val total = b._1.length + b._2.length
+
+      // Two single-character inputs produce no bigrams at all; dividing by the zero total
+      // would yield Some(NaN), so report the comparison as not possible instead.
+      if (total == 0) None
+      else Some((2f * scoreMatches(b)) / total)
+    }
+  }
+
+  /**
+   * Compares two strings.
+   *
+   * @param string1 the first string
+   * @param string2 the second string
+   * @param stringCleaner cleaner applied to both strings when they are not identical
+   * @return `Some(1f)` when the strings are non-empty and equal (checked before any cleaning);
+   *         otherwise the result of the character array overload on the cleaned input
+   */
+  override def compare(string1: String, string2: String)(implicit stringCleaner: StringCleaner): Option[Float] = {
+    // String equality already implies equal length, so no separate length comparison is needed.
+    if (string1.length > 0 && string1 == string2) Some(1f)
+    else
+      // The input is cleaned here, and the array overload is handed a fresh
+      // StringCleanerDelegate (presumably a pass-through cleaner -- confirm against its definition).
+      compare(
+        stringCleaner.clean(string1.toCharArray),
+        stringCleaner.clean(string2.toCharArray)
+      )(new StringCleanerDelegate)
+  }
+
+  /** Splits both arrays into their overlapping two-character substrings, keeping order and duplicates. */
+  private[this] def bigrams(ct: CompareTuple[Char]): MatchTuple[String] = {
+    // sliding(2) emits one short window for a single-element array, hence the explicit guard.
+    def pairs(ca: Array[Char]): Array[String] =
+      if (ca.length < 2) Array.empty[String]
+      else ca.sliding(2).map(_.mkString).toArray
+
+    (pairs(ct._1), pairs(ct._2))
+  }
+
+  /** Counts the bigrams both sides share; `intersect` is a multiset intersection, so duplicates are honoured. */
+  private[this] def scoreMatches(mt: MatchTuple[String]): Int =
+    mt._1.intersect(mt._2).length
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala
new file mode 100755
index 0000000..d07673b
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala
@@ -0,0 +1,35 @@
+package org.hashtree.stringmetric.distance
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+/** Specification of the behaviour of `DiceSorensenMetric.compare` for String arguments. */
+@RunWith(classOf[JUnitRunner])
+final class DiceSorensenMetricSpec extends ScalaTest {
+  "DiceSorensenMetric" should provide {
+    "compare method" when passed {
+      "empty arguments" should returns {
+        "None" in {
+          // An empty string on either side, or on both, must not produce a score.
+          Seq(("", ""), ("abc", ""), ("", "xyz")).foreach { case (first, second) =>
+            DiceSorensenMetric.compare(first, second).isDefined should be (false)
+          }
+        }
+      }
+      "equal arguments" should returns {
+        "1" in {
+          val identical = DiceSorensenMetric.compare("abc", "abc")
+
+          identical.get should be (1.0f)
+        }
+      }
+      "unequal arguments" should returns {
+        "0" in {
+          // "abc" and "xyz" have no bigram in common.
+          val disjoint = DiceSorensenMetric.compare("abc", "xyz")
+
+          disjoint.get should be (0.0f)
+        }
+      }
+      "valid arguments" should returns {
+        "Float indicating distance" in {
+          // Only "ht" is shared: 2 * 1 / (4 + 4) = 0.25.
+          val partial = DiceSorensenMetric.compare("night", "nacht")
+
+          partial.get should be (0.25f)
+        }
+      }
+    }
+  }
+} \ No newline at end of file