diff options
author | Rocky Madden <git@rockymadden.com> | 2012-10-20 15:44:53 -0600 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2012-10-20 15:44:53 -0600 |
commit | c91fe035e4427f4ca071c55455e19b011a33a1d5 (patch) | |
tree | 9b220424f76b42dd9cdc61c50c28f97077673665 /core | |
parent | 9fbe4868503142a7bd2e502b393d3bd60fdb441c (diff) | |
download | stringmetric-c91fe035e4427f4ca071c55455e19b011a33a1d5.tar.gz stringmetric-c91fe035e4427f4ca071c55455e19b011a33a1d5.tar.bz2 stringmetric-c91fe035e4427f4ca071c55455e19b011a33a1d5.zip |
Created DiceSorensen metric, spec, and command.
Diffstat (limited to 'core')
-rwxr-xr-x | core/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala | 44 | ||||
-rwxr-xr-x | core/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala | 35 |
2 files changed, 79 insertions, 0 deletions
// core/source/core/scala/org/hashtree/stringmetric/distance/DiceSorensenMetric.scala
package org.hashtree.stringmetric.distance

import org.hashtree.stringmetric.{ CompareTuple, MatchTuple, StringCleaner, StringCleanerDelegate, StringMetric }
import scala.annotation.tailrec

/**
 * An implementation of the Dice, and Sorensen, [[org.hashtree.stringmetric.StringMetric]].
 *
 * Both inputs are split into bigrams (adjacent character pairs) and scored as
 * `2 * shared / (bigrams1 + bigrams2)`, where `shared` is the number of bigrams
 * the two inputs have in common.
 */
object DiceSorensenMetric extends StringMetric {
  /**
   * Compares two character arrays after passing each through `stringCleaner`.
   *
   * @param charArray1 first input
   * @param charArray2 second input
   * @param stringCleaner cleaner applied to both inputs before scoring
   * @return `None` when either cleaned input is empty, or when the inputs differ but
   *         neither is long enough to form a bigram (the coefficient would be 0 / 0);
   *         `Some(1f)` when the cleaned inputs are identical; otherwise the coefficient.
   */
  override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringCleaner: StringCleaner): Option[Float] = {
    val ca1 = stringCleaner.clean(charArray1)
    val ca2 = stringCleaner.clean(charArray2)

    if (ca1.length == 0 || ca2.length == 0) None
    // Identical inputs score 1 by definition. Without this guard a pair of identical
    // single-character inputs produced 0f / 0 == NaN, disagreeing with the String overload.
    else if (ca1.sameElements(ca2)) Some(1f)
    else {
      val b = bigrams((ca1, ca2))
      val total = b._1.length + b._2.length

      // Two different single-character inputs yield no bigrams at all; report the
      // undefined result as None rather than Some(NaN).
      if (total == 0) None
      else Some((2f * scoreMatches(b)) / total)
    }
  }

  /**
   * Compares two strings.
   *
   * Non-empty equal strings short-circuit to `Some(1f)` before any cleaning is done.
   * Otherwise both strings are cleaned here and handed to the character array overload
   * together with a fresh `StringCleanerDelegate`.
   *
   * @param string1 first input
   * @param string2 second input
   * @param stringCleaner cleaner applied to both inputs before scoring
   * @return see the character array overload
   */
  override def compare(string1: String, string2: String)(implicit stringCleaner: StringCleaner): Option[Float] = {
    if (string1.length > 0 && string1.length == string2.length && string1 == string2) Some(1f)
    else
      compare(
        stringCleaner.clean(string1.toCharArray),
        stringCleaner.clean(string2.toCharArray)
      )(new StringCleanerDelegate) // NOTE(review): presumably a pass-through cleaner so input is not cleaned twice - confirm.
  }

  /** Builds the bigram sequence of each side of `ct`, preserving order and duplicates. */
  private[this] def bigrams(ct: CompareTuple[Char]): MatchTuple[String] = {
    // Single indexed pass; inputs shorter than two characters yield an empty array.
    def set(ca: Array[Char]): Array[String] =
      Array.tabulate(math.max(ca.length - 1, 0))(i => "" + ca(i) + ca(i + 1))

    (set(ct._1), set(ct._2))
  }

  /** Counts the bigrams common to both sides; `intersect` honours duplicate occurrences. */
  private[this] def scoreMatches(mt: MatchTuple[String]): Int =
    mt._1.intersect(mt._2).length
}
// core/source/test/scala/org/hashtree/stringmetric/distance/DiceSorensenMetricSpec.scala
package org.hashtree.stringmetric.distance

import org.hashtree.stringmetric.ScalaTest
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

/** Specification of the results returned by `DiceSorensenMetric.compare` for String input. */
@RunWith(classOf[JUnitRunner])
final class DiceSorensenMetricSpec extends ScalaTest {
  "DiceSorensenMetric" should provide {
    "compare method" when passed {
      // An empty string on either side means there is nothing to compare.
      "empty arguments" should returns {
        "None" in {
          DiceSorensenMetric.compare("", "") should equal (None)
          DiceSorensenMetric.compare("abc", "") should equal (None)
          DiceSorensenMetric.compare("", "xyz") should equal (None)
        }
      }
      "equal arguments" should returns {
        "1" in {
          DiceSorensenMetric.compare("abc", "abc") should equal (Some(1.0f))
        }
      }
      // No bigram in common.
      "unequal arguments" should returns {
        "0" in {
          DiceSorensenMetric.compare("abc", "xyz") should equal (Some(0.0f))
        }
      }
      // "night" and "nacht" have four bigrams each and share only "ht": 2 * 1 / 8.
      "valid arguments" should returns {
        "Float indicating distance" in {
          DiceSorensenMetric.compare("night", "nacht") should equal (Some(0.25f))
        }
      }
    }
  }
}
\ No newline at end of file |