summary | refs | log | tree | commit | diff
path: root/core
diff options
context:
space:
mode:
Diffstat (limited to 'core')
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala 8
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala 21
-rwxr-xr-x core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala 44
3 files changed, 45 insertions, 28 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
index 9cabb82..078361b 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
@@ -10,11 +10,11 @@ trait StringMetric extends Metric[String] {
/** Convenience object for those extending [[org.hashtree.stringmetric.StringMetric]]. */
object StringMetric {
- def compareDiceSorensen(charArray1: Array[Char], charArray2: Array[Char])(implicit stringFilter: StringFilter): Option[Double] =
- DiceSorensenMetric.compare(charArray1, charArray2)(stringFilter)
+ def compareDiceSorensen(charArray1: Array[Char], charArray2: Array[Char])(n: Int)(implicit stringFilter: StringFilter): Option[Double] =
+ DiceSorensenMetric.compare(charArray1, charArray2)(n)(stringFilter)
- def compareDiceSorensen(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Double] =
- DiceSorensenMetric.compare(string1, string2)(stringFilter)
+ def compareDiceSorensen(string1: String, string2: String)(n: Int)(implicit stringFilter: StringFilter): Option[Double] =
+ DiceSorensenMetric.compare(string1, string2)(n)(stringFilter)
def compareHamming(charArray1: Array[Char], charArray2: Array[Char])(implicit stringFilter: StringFilter): Option[Int] =
HammingMetric.compare(charArray1, charArray2)(stringFilter)
diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala
index cc13fa3..d77c8e7 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala
@@ -1,30 +1,33 @@
package org.hashtree.stringmetric.similarity
-import org.hashtree.stringmetric.{ FilterableStringMetric, MatchTuple, StringFilter, StringMetric, StringFilterDelegate }
+import org.hashtree.stringmetric.{ FilterableConfigurableStringMetric, MatchTuple, StringFilter, StringMetric, StringFilterDelegate }
-/** An implementation of the Dice, and Sorensen, [[org.hashtree.stringmetric.StringMetric]]. */
-object DiceSorensenMetric extends StringMetric with FilterableStringMetric {
- override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringFilter: StringFilter): Option[Double] = {
+/**
+ * An implementation of the Dice, and lesser known Sorensen, [[org.hashtree.stringmetric.StringMetric]]. This
+ * implementation differs in that n-gram size is required. Traditionally, the algorithm uses bigrams.
+ */
+object DiceSorensenMetric extends StringMetric with FilterableConfigurableStringMetric[Int] {
+ override def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int)(implicit stringFilter: StringFilter): Option[Double] = {
val ca1 = stringFilter.filter(charArray1)
val ca2 = stringFilter.filter(charArray2)
if (ca1.length == 0 || ca2.length == 0) None
- else if (ca1.length < 2 || ca2.length < 2) Some(0d) // Because length is less than that of bigram, it will always be 0.
+ else if (ca1.length < n || ca2.length < n) Some(0d) // Because length is less than n, it will always be 0.
else if (ca1.sameElements(ca2)) Some(1d)
else {
- val ca1bg = NGramAlgorithm.compute(ca1)(2).get
- val ca2bg = NGramAlgorithm.compute(ca2)(2).get
+ val ca1bg = NGramAlgorithm.compute(ca1)(n).get
+ val ca2bg = NGramAlgorithm.compute(ca2)(n).get
val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString)))
Some((2d * ms) / (ca1bg.length + ca2bg.length))
}
}
- override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Double] =
+ override def compare(string1: String, string2: String)(n: Int)(implicit stringFilter: StringFilter): Option[Double] =
compare(
stringFilter.filter(string1.toCharArray),
stringFilter.filter(string2.toCharArray)
- )(new StringFilterDelegate)
+ )(n: Int)(new StringFilterDelegate)
private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length
}
\ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala
index 0e8b3a7..163bab5 100755
--- a/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala
+++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala
@@ -10,37 +10,51 @@ final class DiceSorensenMetricSpec extends ScalaTest {
"compare method" when passed {
"empty arguments" should returns {
"None" in {
- DiceSorensenMetric.compare("", "").isDefined should be (false)
- DiceSorensenMetric.compare("abc", "").isDefined should be (false)
- DiceSorensenMetric.compare("", "xyz").isDefined should be (false)
+ DiceSorensenMetric.compare("", "")(1).isDefined should be (false)
+ DiceSorensenMetric.compare("abc", "")(1).isDefined should be (false)
+ DiceSorensenMetric.compare("", "xyz")(1).isDefined should be (false)
}
}
"equal arguments" should returns {
"1" in {
- DiceSorensenMetric.compare("abc", "abc").get should be (1)
+ DiceSorensenMetric.compare("abc", "abc")(1).get should be (1)
+ DiceSorensenMetric.compare("abc", "abc")(2).get should be (1)
+ DiceSorensenMetric.compare("abc", "abc")(3).get should be (1)
}
}
"unequal arguments" should returns {
"0" in {
- DiceSorensenMetric.compare("abc", "xyz").get should be (0)
+ DiceSorensenMetric.compare("abc", "xyz")(1).get should be (0)
+ DiceSorensenMetric.compare("abc", "xyz")(2).get should be (0)
+ DiceSorensenMetric.compare("abc", "xyz")(3).get should be (0)
}
}
"invalid arguments" should returns {
"Double indicating distance" in {
- DiceSorensenMetric.compare("n", "naght").get should be (0)
- DiceSorensenMetric.compare("night", "n").get should be (0)
+ DiceSorensenMetric.compare("n", "naght")(2).get should be (0)
+ DiceSorensenMetric.compare("night", "n")(2).get should be (0)
+ DiceSorensenMetric.compare("ni", "naght")(3).get should be (0)
+ DiceSorensenMetric.compare("night", "na")(3).get should be (0)
}
}
"valid arguments" should returns {
"Double indicating distance" in {
- DiceSorensenMetric.compare("night", "nacht").get should be (0.25)
- DiceSorensenMetric.compare("night", "naght").get should be (0.5)
- DiceSorensenMetric.compare("context", "contact").get should be (0.5)
- DiceSorensenMetric.compare("contextcontext", "contact").get should be (0.3157894736842105)
- DiceSorensenMetric.compare("context", "contactcontact").get should be (0.3157894736842105)
- DiceSorensenMetric.compare("ht", "nacht").get should be (0.4)
- DiceSorensenMetric.compare("xp", "nacht").get should be (0)
- DiceSorensenMetric.compare("ht", "hththt").get should be (0.3333333333333333)
+ DiceSorensenMetric.compare("night", "nacht")(1).get should be (0.6)
+ DiceSorensenMetric.compare("night", "naght")(1).get should be (0.8)
+ DiceSorensenMetric.compare("context", "contact")(1).get should be (0.7142857142857143)
+
+ DiceSorensenMetric.compare("night", "nacht")(2).get should be (0.25)
+ DiceSorensenMetric.compare("night", "naght")(2).get should be (0.5)
+ DiceSorensenMetric.compare("context", "contact")(2).get should be (0.5)
+ DiceSorensenMetric.compare("contextcontext", "contact")(2).get should be (0.3157894736842105)
+ DiceSorensenMetric.compare("context", "contactcontact")(2).get should be (0.3157894736842105)
+ DiceSorensenMetric.compare("ht", "nacht")(2).get should be (0.4)
+ DiceSorensenMetric.compare("xp", "nacht")(2).get should be (0)
+ DiceSorensenMetric.compare("ht", "hththt")(2).get should be (0.3333333333333333)
+
+ DiceSorensenMetric.compare("night", "nacht")(3).get should be (0)
+ DiceSorensenMetric.compare("night", "naght")(3).get should be (0.3333333333333333)
+ DiceSorensenMetric.compare("context", "contact")(3).get should be (0.4)
}
}
}