diff options
Diffstat (limited to 'core/source')
4 files changed, 184 insertions, 0 deletions
diff --git a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/JaccardMetricBenchmark.scala b/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/JaccardMetricBenchmark.scala new file mode 100755 index 0000000..62be6bc --- /dev/null +++ b/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/JaccardMetricBenchmark.scala @@ -0,0 +1,55 @@ +package com.rockymadden.stringmetric.similarity + +import com.google.caliper.Param +import com.rockymadden.stringmetric.{ CaliperBenchmark, CaliperRunner } +import scala.annotation.tailrec +import scala.util.Random + +final class JaccardMetricBenchmark extends CaliperBenchmark { + import JaccardMetricBenchmark.Metric + + @Param(Array("0", "1", "2", "4", "8", "16")) + var length: Int = _ + + var string1: String = _ + var charArray1: Array[Char] = _ + var string2: String = _ + var charArray2: Array[Char] = _ + + override protected def setUp() { + @tailrec + def random(l: Int, ps: String = null): String = + if (l == 0) "" + else { + val s = Random.alphanumeric.take(l).mkString + + if (ps == null || s != ps) s + else random(l, ps) + } + + string1 = random(length) + string2 = random(length, string1) + charArray1 = string1.toCharArray + charArray2 = string2.toCharArray + } + + def timeCompareWithDifferentCharArrays(reps: Int) = run(reps) { + Metric.compare(charArray1, charArray2)(2) + } + + def timeCompareWithDifferentStrings(reps: Int) = run(reps) { + Metric.compare(string1, string2)(2) + } + + def timeCompareWithIdenticalCharArrays(reps: Int) = run(reps) { + Metric.compare(charArray1, charArray1)(2) + } + + def timeCompareWithIdenticalStrings(reps: Int) = run(reps) { + Metric.compare(string1, string1)(2) + } +} + +object JaccardMetricBenchmark extends CaliperRunner(classOf[JaccardMetricBenchmark]) { + private final val Metric = JaccardMetric() +} diff --git a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala index 2ccce55..0ace668 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala @@ -8,6 +8,9 @@ object ConfigurableStringMetric { type DiceSorensenMetric = com.rockymadden.stringmetric.similarity.DiceSorensenMetric val DiceSorensenMetric = com.rockymadden.stringmetric.similarity.DiceSorensenMetric + type JaccardMetric = com.rockymadden.stringmetric.similarity.JaccardMetric + val JaccardMetric = com.rockymadden.stringmetric.similarity.JaccardMetric + type NGramMetric = com.rockymadden.stringmetric.similarity.NGramMetric val NGramMetric = com.rockymadden.stringmetric.similarity.NGramMetric @@ -20,6 +23,11 @@ object ConfigurableStringMetric { def compareWithDiceSorensen(string1: String, string2: String)(n: Int) = DiceSorensenMetric.compare(string1, string2)(n) + def compareWithJaccard(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = + JaccardMetric.compare(charArray1, charArray2)(n) + + def compareWithJaccard(string1: String, string2: String)(n: Int) = JaccardMetric.compare(string1, string2)(n) + def compareWithNGram(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = NGramMetric.compare(charArray1, charArray2)(n) diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala new file mode 100755 index 0000000..ccf5f29 --- /dev/null +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala @@ -0,0 +1,44 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter } + +/* An implementation of the Jaccard metric. */ +class JaccardMetric extends ConfigurableStringMetric[Double, Int] { + this: StringFilter => + + final override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit n: Int): Option[Double] = { + if (n <= 0) throw new IllegalArgumentException("Expected valid n.") + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. + else if (fca1.sameElements(fca2)) Some(1d) + else { + val nGramAlgorithm = NGramAlgorithm() + + nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg => + nGramAlgorithm.compute(fca2)(n).map { ca2bg => + val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString))) + + ms.toDouble / (ca1bg.length + ca2bg.length) + } + } + } + } + + final override def compare(string1: String, string2: String)(implicit n: Int): Option[Double] = + compare(filter(string1.toCharArray), filter(string2.toCharArray))(n: Int) + + private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length +} + +object JaccardMetric { + private lazy val self = apply() + + def apply(): JaccardMetric = new JaccardMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = self.compare(charArray1, charArray2)(n) + + def compare(string1: String, string2: String)(n: Int) = self.compare(string1, string2)(n) +} diff --git a/core/source/test/scala/com/rockymadden/stringmetric/similarity/JaccardMetricSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/similarity/JaccardMetricSpec.scala new file mode 100755 index 0000000..a3aeb5e --- /dev/null +++ b/core/source/test/scala/com/rockymadden/stringmetric/similarity/JaccardMetricSpec.scala @@ -0,0 +1,77 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class JaccardMetricSpec extends ScalaTest { + import JaccardMetricSpec.Metric + + "JaccardMetric" should provide { + "compare method" when passed { + "empty arguments" should returns { + "None" in { + Metric.compare("", "")(1).isDefined should be (false) + Metric.compare("abc", "")(1).isDefined should be (false) + Metric.compare("", "xyz")(1).isDefined should be (false) + } + } + "equal arguments" should returns { + "1" in { + Metric.compare("abc", "abc")(1).get should be (1) + Metric.compare("abc", "abc")(2).get should be (1) + Metric.compare("abc", "abc")(3).get should be (1) + } + } + "unequal arguments" should returns { + "0" in { + Metric.compare("abc", "xyz")(1).get should be (0) + Metric.compare("abc", "xyz")(2).get should be (0) + Metric.compare("abc", "xyz")(3).get should be (0) + } + } + "invalid arguments" should returns { + "None" in { + Metric.compare("n", "naght")(2).isDefined should be (false) + Metric.compare("night", "n")(2).isDefined should be (false) + Metric.compare("ni", "naght")(3).isDefined should be (false) + Metric.compare("night", "na")(3).isDefined should be (false) + } + } + "valid arguments" should returns { + "Double indicating distance" in { + Metric.compare("night", "nacht")(1).get should be (0.3) + Metric.compare("night", "naght")(1).get should be (0.4) + Metric.compare("context", "contact")(1).get should be (0.35714285714285715) + + Metric.compare("night", "nacht")(2).get should be (0.125) + Metric.compare("night", "naght")(2).get should be (0.25) + Metric.compare("context", "contact")(2).get should be (0.25) + Metric.compare("contextcontext", "contact")(2).get should be (0.15789473684210525) + Metric.compare("context", "contactcontact")(2).get should be (0.15789473684210525) + Metric.compare("ht", "nacht")(2).get should be (0.2) + Metric.compare("xp", "nacht")(2).get should be (0) + Metric.compare("ht", "hththt")(2).get should be (0.16666666666666666) + + Metric.compare("night", "nacht")(3).get should be (0) + Metric.compare("night", "naght")(3).get should be (0.16666666666666666) + Metric.compare("context", "contact")(3).get should be (0.2) + } + } + } + } + "JaccardMetric companion object" should provide { + "pass-through compare method" should returns { + "same value as class" in { + JaccardMetric.compare("context", "contact")(3).get should be (0.2) + } + } + } +} + +object JaccardMetricSpec { + private final val Metric = JaccardMetric() +} + + |