summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-11-09 15:36:59 -0700
committerRocky Madden <git@rockymadden.com>2012-11-09 15:36:59 -0700
commit60cbc08776285e5bef6aae41b7a323cb556406ff (patch)
treeec65a429d0271a8b3a51684d9cbfab8ac590a803
parent9f8194dc286ee89275cf37b87910dca240945b3e (diff)
downloadstringmetric-60cbc08776285e5bef6aae41b7a323cb556406ff.tar.gz
stringmetric-60cbc08776285e5bef6aae41b7a323cb556406ff.tar.bz2
stringmetric-60cbc08776285e5bef6aae41b7a323cb556406ff.zip
Changed compare implementation. It is now required to specify the size of the n-gram. Typically, this was 2.
-rwxr-xr-xcli/source/core/scala/org/hashtree/stringmetric/cli/similarity/diceSorensenMetric.scala11
-rwxr-xr-xcli/source/test/scala/org/hashtree/stringmetric/cli/similarity/diceSorensenMetricSpec.scala4
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/StringMetric.scala8
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala21
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala44
5 files changed, 55 insertions, 33 deletions
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/similarity/diceSorensenMetric.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/similarity/diceSorensenMetric.scala
index 229e989..de0b302 100755
--- a/cli/source/core/scala/org/hashtree/stringmetric/cli/similarity/diceSorensenMetric.scala
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/similarity/diceSorensenMetric.scala
@@ -19,7 +19,9 @@ object diceSorensenMetric extends Command {
help()
exit(options)
// Execute.
- } else if (options.contains('dashless) && options('dashless).count(_ == ' ') == 1) {
+ } else if (options.contains('dashless) && options('dashless).count(_ == ' ') == 1 &&
+ options.contains('n) && ParseUtility.parseInt(options('n)).isDefined
+ ) {
execute(options)
exit(options)
// Invalid syntax.
@@ -39,18 +41,21 @@ object diceSorensenMetric extends Command {
tab + "diceSorensenMetric [Options] string1 string2..." + ls + ls +
"Options:" + ls +
tab + "-h, --help" + ls +
- tab + tab + "Outputs description, syntax, and options."
+ tab + tab + "Outputs description, syntax, and options." +
+ tab + "--n" + ls +
+ tab + tab + "The n, traditionally 2."
)
}
override def execute(options: OptionMap): Unit = {
val strings = options('dashless).split(" ")
+ val n = ParseUtility.parseInt(options('n)).get
println(
DiceSorensenMetric.compare(
strings(0),
strings(1)
- )(new StringFilterDelegate with AsciiLetterCaseStringFilter).getOrElse("not comparable").toString
+ )(n)(new StringFilterDelegate with AsciiLetterCaseStringFilter).getOrElse("not comparable").toString
)
}
} \ No newline at end of file
diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/similarity/diceSorensenMetricSpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/similarity/diceSorensenMetricSpec.scala
index d9e2cb0..1e2c286 100755
--- a/cli/source/test/scala/org/hashtree/stringmetric/cli/similarity/diceSorensenMetricSpec.scala
+++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/similarity/diceSorensenMetricSpec.scala
@@ -13,14 +13,14 @@ final class diceSorensenMetricSpec extends ScalaTest {
val out = new java.io.ByteArrayOutputStream()
Console.withOut(out)(
- diceSorensenMetric.main(Array("--unitTest", "--debug", "aBc", "abc"))
+ diceSorensenMetric.main(Array("--unitTest", "--debug", "--n=2", "aBc", "abc"))
)
out.toString should equal ("1.0\n")
out.reset()
Console.withOut(out)(
- diceSorensenMetric.main(Array("--unitTest", "--debug", "aBc", "xyz"))
+ diceSorensenMetric.main(Array("--unitTest", "--debug", "--n=2", "aBc", "xyz"))
)
out.toString should equal ("0.0\n")
diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
index 9cabb82..078361b 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
@@ -10,11 +10,11 @@ trait StringMetric extends Metric[String] {
/** Convenience object for those extending [[org.hashtree.stringmetric.StringMetric]]. */
object StringMetric {
- def compareDiceSorensen(charArray1: Array[Char], charArray2: Array[Char])(implicit stringFilter: StringFilter): Option[Double] =
- DiceSorensenMetric.compare(charArray1, charArray2)(stringFilter)
+ def compareDiceSorensen(charArray1: Array[Char], charArray2: Array[Char])(n: Int)(implicit stringFilter: StringFilter): Option[Double] =
+ DiceSorensenMetric.compare(charArray1, charArray2)(n)(stringFilter)
- def compareDiceSorensen(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Double] =
- DiceSorensenMetric.compare(string1, string2)(stringFilter)
+ def compareDiceSorensen(string1: String, string2: String)(n: Int)(implicit stringFilter: StringFilter): Option[Double] =
+ DiceSorensenMetric.compare(string1, string2)(n)(stringFilter)
def compareHamming(charArray1: Array[Char], charArray2: Array[Char])(implicit stringFilter: StringFilter): Option[Int] =
HammingMetric.compare(charArray1, charArray2)(stringFilter)
diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala
index cc13fa3..d77c8e7 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala
@@ -1,30 +1,33 @@
package org.hashtree.stringmetric.similarity
-import org.hashtree.stringmetric.{ FilterableStringMetric, MatchTuple, StringFilter, StringMetric, StringFilterDelegate }
+import org.hashtree.stringmetric.{ FilterableConfigurableStringMetric, MatchTuple, StringFilter, StringMetric, StringFilterDelegate }
-/** An implementation of the Dice, and Sorensen, [[org.hashtree.stringmetric.StringMetric]]. */
-object DiceSorensenMetric extends StringMetric with FilterableStringMetric {
- override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringFilter: StringFilter): Option[Double] = {
+/**
+ * An implementation of the Dice, and lesser known Sorensen, [[org.hashtree.stringmetric.StringMetric]]. This
+ * implementation differs in that n-gram size is required. Traditionally, the algorithm uses bigrams.
+ */
+object DiceSorensenMetric extends StringMetric with FilterableConfigurableStringMetric[Int] {
+ override def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int)(implicit stringFilter: StringFilter): Option[Double] = {
val ca1 = stringFilter.filter(charArray1)
val ca2 = stringFilter.filter(charArray2)
if (ca1.length == 0 || ca2.length == 0) None
- else if (ca1.length < 2 || ca2.length < 2) Some(0d) // Because length is less than that of bigram, it will always be 0.
+ else if (ca1.length < n || ca2.length < n) Some(0d) // Because length is less than n, it will always be 0.
else if (ca1.sameElements(ca2)) Some(1d)
else {
- val ca1bg = NGramAlgorithm.compute(ca1)(2).get
- val ca2bg = NGramAlgorithm.compute(ca2)(2).get
+ val ca1bg = NGramAlgorithm.compute(ca1)(n).get
+ val ca2bg = NGramAlgorithm.compute(ca2)(n).get
val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString)))
Some((2d * ms) / (ca1bg.length + ca2bg.length))
}
}
- override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Double] =
+ override def compare(string1: String, string2: String)(n: Int)(implicit stringFilter: StringFilter): Option[Double] =
compare(
stringFilter.filter(string1.toCharArray),
stringFilter.filter(string2.toCharArray)
- )(new StringFilterDelegate)
+ )(n: Int)(new StringFilterDelegate)
private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length
} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala
index 0e8b3a7..163bab5 100755
--- a/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala
+++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala
@@ -10,37 +10,51 @@ final class DiceSorensenMetricSpec extends ScalaTest {
"compare method" when passed {
"empty arguments" should returns {
"None" in {
- DiceSorensenMetric.compare("", "").isDefined should be (false)
- DiceSorensenMetric.compare("abc", "").isDefined should be (false)
- DiceSorensenMetric.compare("", "xyz").isDefined should be (false)
+ DiceSorensenMetric.compare("", "")(1).isDefined should be (false)
+ DiceSorensenMetric.compare("abc", "")(1).isDefined should be (false)
+ DiceSorensenMetric.compare("", "xyz")(1).isDefined should be (false)
}
}
"equal arguments" should returns {
"1" in {
- DiceSorensenMetric.compare("abc", "abc").get should be (1)
+ DiceSorensenMetric.compare("abc", "abc")(1).get should be (1)
+ DiceSorensenMetric.compare("abc", "abc")(2).get should be (1)
+ DiceSorensenMetric.compare("abc", "abc")(3).get should be (1)
}
}
"unequal arguments" should returns {
"0" in {
- DiceSorensenMetric.compare("abc", "xyz").get should be (0)
+ DiceSorensenMetric.compare("abc", "xyz")(1).get should be (0)
+ DiceSorensenMetric.compare("abc", "xyz")(2).get should be (0)
+ DiceSorensenMetric.compare("abc", "xyz")(3).get should be (0)
}
}
"invalid arguments" should returns {
"Double indicating distance" in {
- DiceSorensenMetric.compare("n", "naght").get should be (0)
- DiceSorensenMetric.compare("night", "n").get should be (0)
+ DiceSorensenMetric.compare("n", "naght")(2).get should be (0)
+ DiceSorensenMetric.compare("night", "n")(2).get should be (0)
+ DiceSorensenMetric.compare("ni", "naght")(3).get should be (0)
+ DiceSorensenMetric.compare("night", "na")(3).get should be (0)
}
}
"valid arguments" should returns {
"Double indicating distance" in {
- DiceSorensenMetric.compare("night", "nacht").get should be (0.25)
- DiceSorensenMetric.compare("night", "naght").get should be (0.5)
- DiceSorensenMetric.compare("context", "contact").get should be (0.5)
- DiceSorensenMetric.compare("contextcontext", "contact").get should be (0.3157894736842105)
- DiceSorensenMetric.compare("context", "contactcontact").get should be (0.3157894736842105)
- DiceSorensenMetric.compare("ht", "nacht").get should be (0.4)
- DiceSorensenMetric.compare("xp", "nacht").get should be (0)
- DiceSorensenMetric.compare("ht", "hththt").get should be (0.3333333333333333)
+ DiceSorensenMetric.compare("night", "nacht")(1).get should be (0.6)
+ DiceSorensenMetric.compare("night", "naght")(1).get should be (0.8)
+ DiceSorensenMetric.compare("context", "contact")(1).get should be (0.7142857142857143)
+
+ DiceSorensenMetric.compare("night", "nacht")(2).get should be (0.25)
+ DiceSorensenMetric.compare("night", "naght")(2).get should be (0.5)
+ DiceSorensenMetric.compare("context", "contact")(2).get should be (0.5)
+ DiceSorensenMetric.compare("contextcontext", "contact")(2).get should be (0.3157894736842105)
+ DiceSorensenMetric.compare("context", "contactcontact")(2).get should be (0.3157894736842105)
+ DiceSorensenMetric.compare("ht", "nacht")(2).get should be (0.4)
+ DiceSorensenMetric.compare("xp", "nacht")(2).get should be (0)
+ DiceSorensenMetric.compare("ht", "hththt")(2).get should be (0.3333333333333333)
+
+ DiceSorensenMetric.compare("night", "nacht")(3).get should be (0)
+ DiceSorensenMetric.compare("night", "naght")(3).get should be (0.3333333333333333)
+ DiceSorensenMetric.compare("context", "contact")(3).get should be (0.4)
}
}
}