summaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-11-09 14:02:55 -0700
committerRocky Madden <git@rockymadden.com>2012-11-09 14:02:55 -0700
commiteb7b65e7e8b9d1ca5405e25c3780a7336a999ac5 (patch)
tree90943c70f88e810974a9f1ea264cf0d7e96ba763 /core
parent7b9741921c3858aacfbfe625b237118b1d029873 (diff)
downloadstringmetric-eb7b65e7e8b9d1ca5405e25c3780a7336a999ac5.tar.gz
stringmetric-eb7b65e7e8b9d1ca5405e25c3780a7336a999ac5.tar.bz2
stringmetric-eb7b65e7e8b9d1ca5405e25c3780a7336a999ac5.zip
Created NGramMetric and spec.
Diffstat (limited to 'core')
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/similarity/NGramMetric.scala31
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala5
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/similarity/NGramMetricSpec.scala62
3 files changed, 98 insertions, 0 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/NGramMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/NGramMetric.scala
new file mode 100755
index 0000000..3ae0052
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/NGramMetric.scala
@@ -0,0 +1,31 @@
+package org.hashtree.stringmetric.similarity
+
+import org.hashtree.stringmetric.{ FilterableConfigurableStringMetric, MatchTuple, StringFilter, StringMetric, StringFilterDelegate }
+import scala.math
+
+/** An implementation of the N-Gram [[org.hashtree.stringmetric.StringMetric]]: matched n-grams divided by the larger of the two n-gram counts. */
+object NGramMetric extends StringMetric with FilterableConfigurableStringMetric[Int] {
+ override def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int)(implicit stringFilter: StringFilter): Option[Double] = { // n is the n-gram size.
+ val ca1 = stringFilter.filter(charArray1)
+ val ca2 = stringFilter.filter(charArray2)
+
+ if (ca1.length == 0 || ca2.length == 0) None // Nothing is left to compare after filtering.
+ else if (ca1.length < n || ca2.length < n) Some(0d) // An input shorter than n is scored 0; this check runs before the equality check, so equal inputs shorter than n also score 0.
+ else if (ca1.sameElements(ca2)) Some(1d) // Identical filtered inputs score 1 without computing n-grams.
+ else {
+ val ca1bg = NGramAlgorithm.compute(ca1)(n).get // NOTE(review): .get assumes compute is defined when length >= n; its behaviour for n <= 0 is not visible here - confirm.
+ val ca2bg = NGramAlgorithm.compute(ca2)(n).get
+ val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString))) // Each n-gram is turned into a String before matching.
+
+ Some(ms.toDouble / math.max(ca1bg.length, ca2bg.length)) // Normalise by the larger n-gram count.
+ }
+ }
+
+ override def compare(string1: String, string2: String)(n: Int)(implicit stringFilter: StringFilter): Option[Double] =
+ compare(
+ stringFilter.filter(string1.toCharArray), // The caller's filter is applied here.
+ stringFilter.filter(string2.toCharArray)
+ )(n)(new StringFilterDelegate) // NOTE(review): presumably a pass-through filter so the input is not filtered a second time - confirm.
+
+ private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length // Size of the intersection of the two n-gram collections.
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala
index d121ea0..0e8b3a7 100755
--- a/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala
+++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala
@@ -36,6 +36,11 @@ final class DiceSorensenMetricSpec extends ScalaTest {
DiceSorensenMetric.compare("night", "nacht").get should be (0.25)
DiceSorensenMetric.compare("night", "naght").get should be (0.5)
DiceSorensenMetric.compare("context", "contact").get should be (0.5)
+ DiceSorensenMetric.compare("contextcontext", "contact").get should be (0.3157894736842105)
+ DiceSorensenMetric.compare("context", "contactcontact").get should be (0.3157894736842105)
+ DiceSorensenMetric.compare("ht", "nacht").get should be (0.4)
+ DiceSorensenMetric.compare("xp", "nacht").get should be (0)
+ DiceSorensenMetric.compare("ht", "hththt").get should be (0.3333333333333333)
}
}
}
diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/NGramMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/NGramMetricSpec.scala
new file mode 100755
index 0000000..ca4fdd5
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/NGramMetricSpec.scala
@@ -0,0 +1,62 @@
+package org.hashtree.stringmetric.similarity
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class NGramMetricSpec extends ScalaTest { // Exercises NGramMetric.compare with n = 1, 2 and 3.
+ "NGramMetric" should provide {
+ "compare method" when passed {
+ "empty arguments" should returns {
+ "None" in {
+ NGramMetric.compare("", "")(1).isDefined should be (false)
+ NGramMetric.compare("abc", "")(1).isDefined should be (false)
+ NGramMetric.compare("", "xyz")(1).isDefined should be (false)
+ }
+ }
+ "equal arguments" should returns {
+ "1" in {
+ NGramMetric.compare("abc", "abc")(1).get should be (1)
+ NGramMetric.compare("abc", "abc")(2).get should be (1)
+ NGramMetric.compare("abc", "abc")(3).get should be (1) // n equals the input length, the largest n for which equal inputs score 1.
+ }
+ }
+ "unequal arguments" should returns {
+ "0" in {
+ NGramMetric.compare("abc", "xyz")(1).get should be (0)
+ NGramMetric.compare("abc", "xyz")(2).get should be (0)
+ NGramMetric.compare("abc", "xyz")(3).get should be (0)
+ }
+ }
+ "invalid arguments" should returns {
+ "Double indicating distance" in { // "Invalid" here means one input is shorter than n; the expected score is 0.
+ NGramMetric.compare("n", "naght")(2).get should be (0)
+ NGramMetric.compare("night", "n")(2).get should be (0)
+ NGramMetric.compare("ni", "naght")(3).get should be (0)
+ NGramMetric.compare("night", "na")(3).get should be (0)
+ }
+ }
+ "valid arguments" should returns {
+ "Double indicating distance" in { // Expected values are matched n-grams / larger n-gram count; 1 means equal inputs.
+ NGramMetric.compare("night", "nacht")(1).get should be (0.6) // n, h, t shared: 3 / 5.
+ NGramMetric.compare("night", "naght")(1).get should be (0.8)
+ NGramMetric.compare("context", "contact")(1).get should be (0.7142857142857143)
+
+ NGramMetric.compare("night", "nacht")(2).get should be (0.25) // Only "ht" shared: 1 / 4.
+ NGramMetric.compare("night", "naght")(2).get should be (0.5)
+ NGramMetric.compare("context", "contact")(2).get should be (0.5)
+ NGramMetric.compare("contextcontext", "contact")(2).get should be (0.23076923076923078) // co, on, nt shared: 3 / 13.
+ NGramMetric.compare("context", "contactcontact")(2).get should be (0.23076923076923078)
+ NGramMetric.compare("ht", "nacht")(2).get should be (0.25) // 1 shared bigram / max(1, 4).
+ NGramMetric.compare("xp", "nacht")(2).get should be (0)
+ NGramMetric.compare("ht", "hththt")(2).get should be (0.2) // 1 shared bigram / max(1, 5).
+
+ NGramMetric.compare("night", "nacht")(3).get should be (0)
+ NGramMetric.compare("night", "naght")(3).get should be (0.3333333333333333) // Only "ght" shared: 1 / 3.
+ NGramMetric.compare("context", "contact")(3).get should be (0.4) // con, ont shared: 2 / 5.
+ }
+ }
+ }
+ }
+} \ No newline at end of file