From eb7b65e7e8b9d1ca5405e25c3780a7336a999ac5 Mon Sep 17 00:00:00 2001 From: Rocky Madden Date: Fri, 9 Nov 2012 14:02:55 -0700 Subject: Created NGramMetric and spec. --- .../stringmetric/similarity/NGramMetric.scala | 31 +++++++++++ .../similarity/DiceSorensenMetricSpec.scala | 5 ++ .../stringmetric/similarity/NGramMetricSpec.scala | 62 ++++++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100755 core/source/core/scala/org/hashtree/stringmetric/similarity/NGramMetric.scala create mode 100755 core/source/test/scala/org/hashtree/stringmetric/similarity/NGramMetricSpec.scala (limited to 'core') diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/NGramMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/NGramMetric.scala new file mode 100755 index 0000000..3ae0052 --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/NGramMetric.scala @@ -0,0 +1,31 @@ +package org.hashtree.stringmetric.similarity + +import org.hashtree.stringmetric.{ FilterableConfigurableStringMetric, MatchTuple, StringFilter, StringMetric, StringFilterDelegate } +import scala.math + +/** An implementation of the N-Gram [[org.hashtree.stringmetric.StringMetric]]. The score is the size of the intersection of the two inputs' n-gram sequences divided by the larger of the two n-gram counts. Yields None when either filtered input is empty, 0 when either filtered input is shorter than n, and 1 when the filtered inputs hold the same elements. NOTE(review): n <= 0 is not checked here; the outcome then depends on NGramAlgorithm.compute, which is not shown - confirm. */ +object NGramMetric extends StringMetric with FilterableConfigurableStringMetric[Int] { + /** Filters both arrays with stringFilter, then scores the filtered arrays by n-gram overlap. */ override def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int)(implicit stringFilter: StringFilter): Option[Double] = { + val ca1 = stringFilter.filter(charArray1) + val ca2 = stringFilter.filter(charArray2) + + if (ca1.length == 0 || ca2.length == 0) None + else if (ca1.length < n || ca2.length < n) Some(0d) // At least one filtered input is shorter than n, so the score is always 0. 
+ else if (ca1.sameElements(ca2)) Some(1d) + else { + val ca1bg = NGramAlgorithm.compute(ca1)(n).get + val ca2bg = NGramAlgorithm.compute(ca2)(n).get + val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString))) + + Some(ms.toDouble / math.max(ca1bg.length, ca2bg.length)) + } + } + + /** String overload: applies stringFilter to both strings' characters here, then delegates to the array overload with a new StringFilterDelegate. NOTE(review): StringFilterDelegate is presumably a pass-through so the input is not filtered twice - confirm. */ override def compare(string1: String, string2: String)(n: Int)(implicit stringFilter: StringFilter): Option[Double] = + compare( + stringFilter.filter(string1.toCharArray), + stringFilter.filter(string2.toCharArray) + )(n)(new StringFilterDelegate) + + /** Number of elements in the intersection of the tuple's first and second n-gram sequences. */ private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length +} \ No newline at end of file diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala index d121ea0..0e8b3a7 100755 --- a/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala +++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala @@ -36,6 +36,11 @@ final class DiceSorensenMetricSpec extends ScalaTest { DiceSorensenMetric.compare("night", "nacht").get should be (0.25) DiceSorensenMetric.compare("night", "naght").get should be (0.5) DiceSorensenMetric.compare("context", "contact").get should be (0.5) + DiceSorensenMetric.compare("contextcontext", "contact").get should be (0.3157894736842105) + DiceSorensenMetric.compare("context", "contactcontact").get should be (0.3157894736842105) + DiceSorensenMetric.compare("ht", "nacht").get should be (0.4) + DiceSorensenMetric.compare("xp", "nacht").get should be (0) + DiceSorensenMetric.compare("ht", "hththt").get should be (0.3333333333333333) } } } diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/NGramMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/NGramMetricSpec.scala new file mode 100755 index 0000000..ca4fdd5 --- /dev/null +++ 
b/core/source/test/scala/org/hashtree/stringmetric/similarity/NGramMetricSpec.scala @@ -0,0 +1,62 @@ +package org.hashtree.stringmetric.similarity + +import org.hashtree.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +/** Spec for NGramMetric.compare on String inputs: empty, equal, fully different, shorter-than-n, and partially matching arguments, with n between 1 and 3. */ @RunWith(classOf[JUnitRunner]) +final class NGramMetricSpec extends ScalaTest { + "NGramMetric" should provide { + "compare method" when passed { + "empty arguments" should returns { + "None" in { + NGramMetric.compare("", "")(1).isDefined should be (false) + NGramMetric.compare("abc", "")(1).isDefined should be (false) + NGramMetric.compare("", "xyz")(1).isDefined should be (false) + } + } + "equal arguments" should returns { + "1" in { + NGramMetric.compare("abc", "abc")(1).get should be (1) + NGramMetric.compare("abc", "abc")(2).get should be (1) + NGramMetric.compare("abc", "abc")(3).get should be (1) + } + } + "unequal arguments" should returns { + "0" in { + NGramMetric.compare("abc", "xyz")(1).get should be (0) + NGramMetric.compare("abc", "xyz")(2).get should be (0) + NGramMetric.compare("abc", "xyz")(3).get should be (0) + } + } + "invalid arguments" should returns { + "Double indicating distance" in { + NGramMetric.compare("n", "naght")(2).get should be (0) + NGramMetric.compare("night", "n")(2).get should be (0) + NGramMetric.compare("ni", "naght")(3).get should be (0) + NGramMetric.compare("night", "na")(3).get should be (0) + } + } + "valid arguments" should returns { + "Double indicating distance" in { + NGramMetric.compare("night", "nacht")(1).get should be (0.6) + NGramMetric.compare("night", "naght")(1).get should be (0.8) + NGramMetric.compare("context", "contact")(1).get should be (0.7142857142857143) + + NGramMetric.compare("night", "nacht")(2).get should be (0.25) + NGramMetric.compare("night", "naght")(2).get should be (0.5) + NGramMetric.compare("context", "contact")(2).get should be (0.5) + NGramMetric.compare("contextcontext", "contact")(2).get should be (0.23076923076923078) 
+ NGramMetric.compare("context", "contactcontact")(2).get should be (0.23076923076923078) + NGramMetric.compare("ht", "nacht")(2).get should be (0.25) + NGramMetric.compare("xp", "nacht")(2).get should be (0) + NGramMetric.compare("ht", "hththt")(2).get should be (0.2) + + NGramMetric.compare("night", "nacht")(3).get should be (0) + NGramMetric.compare("night", "naght")(3).get should be (0.3333333333333333) + NGramMetric.compare("context", "contact")(3).get should be (0.4) + } + } + } + } +} \ No newline at end of file -- cgit v1.2.3