summaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2013-03-09 15:01:48 -0700
committerRocky Madden <git@rockymadden.com>2013-03-09 15:01:48 -0700
commit532568e38856f277eb947c46239ed39b98216947 (patch)
treedbed1f847de666babb0a40f94b7c00ccab5d93b2 /core
parent2a775ee2a8279b458b84ef35c458c58b19aa98c9 (diff)
downloadstringmetric-532568e38856f277eb947c46239ed39b98216947.tar.gz
stringmetric-532568e38856f277eb947c46239ed39b98216947.tar.bz2
stringmetric-532568e38856f277eb947c46239ed39b98216947.zip
Created Jaccard metric, spec, benchmark, and CLI.
Diffstat (limited to 'core')
-rwxr-xr-xcore/source/benchmark/scala/com/rockymadden/stringmetric/similarity/JaccardMetricBenchmark.scala55
-rwxr-xr-xcore/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala8
-rwxr-xr-xcore/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala44
-rwxr-xr-xcore/source/test/scala/com/rockymadden/stringmetric/similarity/JaccardMetricSpec.scala77
4 files changed, 184 insertions, 0 deletions
diff --git a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/JaccardMetricBenchmark.scala b/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/JaccardMetricBenchmark.scala
new file mode 100755
index 0000000..62be6bc
--- /dev/null
+++ b/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/JaccardMetricBenchmark.scala
@@ -0,0 +1,55 @@
+package com.rockymadden.stringmetric.similarity
+
+import com.google.caliper.Param
+import com.rockymadden.stringmetric.{ CaliperBenchmark, CaliperRunner }
+import scala.annotation.tailrec
+import scala.util.Random
+
+final class JaccardMetricBenchmark extends CaliperBenchmark {
+ import JaccardMetricBenchmark.Metric
+
+ @Param(Array("0", "1", "2", "4", "8", "16"))
+ var length: Int = _
+
+ var string1: String = _
+ var charArray1: Array[Char] = _
+ var string2: String = _
+ var charArray2: Array[Char] = _
+
+ override protected def setUp() {
+ @tailrec
+ def random(l: Int, ps: String = null): String =
+ if (l == 0) ""
+ else {
+ val s = Random.alphanumeric.take(l).mkString
+
+ if (ps == null || s != ps) s
+ else random(l, ps)
+ }
+
+ string1 = random(length)
+ string2 = random(length, string1)
+ charArray1 = string1.toCharArray
+ charArray2 = string2.toCharArray
+ }
+
+ def timeCompareWithDifferentCharArrays(reps: Int) = run(reps) {
+ Metric.compare(charArray1, charArray2)(2)
+ }
+
+ def timeCompareWithDifferentStrings(reps: Int) = run(reps) {
+ Metric.compare(string1, string2)(2)
+ }
+
+ def timeCompareWithIdenticalCharArrays(reps: Int) = run(reps) {
+ Metric.compare(charArray1, charArray1)(2)
+ }
+
+ def timeCompareWithIdenticalStrings(reps: Int) = run(reps) {
+ Metric.compare(string1, string1)(2)
+ }
+}
+
+object JaccardMetricBenchmark extends CaliperRunner(classOf[JaccardMetricBenchmark]) {
+ private final val Metric = JaccardMetric()
+}
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala
index 2ccce55..0ace668 100755
--- a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala
+++ b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringMetric.scala
@@ -8,6 +8,9 @@ object ConfigurableStringMetric {
type DiceSorensenMetric = com.rockymadden.stringmetric.similarity.DiceSorensenMetric
val DiceSorensenMetric = com.rockymadden.stringmetric.similarity.DiceSorensenMetric
+ type JaccardMetric = com.rockymadden.stringmetric.similarity.JaccardMetric
+ val JaccardMetric = com.rockymadden.stringmetric.similarity.JaccardMetric
+
type NGramMetric = com.rockymadden.stringmetric.similarity.NGramMetric
val NGramMetric = com.rockymadden.stringmetric.similarity.NGramMetric
@@ -20,6 +23,11 @@ object ConfigurableStringMetric {
def compareWithDiceSorensen(string1: String, string2: String)(n: Int) =
DiceSorensenMetric.compare(string1, string2)(n)
+ def compareWithJaccard(charArray1: Array[Char], charArray2: Array[Char])(n: Int) =
+ JaccardMetric.compare(charArray1, charArray2)(n)
+
+ def compareWithJaccard(string1: String, string2: String)(n: Int) = JaccardMetric.compare(string1, string2)(n)
+
def compareWithNGram(charArray1: Array[Char], charArray2: Array[Char])(n: Int) =
NGramMetric.compare(charArray1, charArray2)(n)
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala
new file mode 100755
index 0000000..ccf5f29
--- /dev/null
+++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala
@@ -0,0 +1,44 @@
+package com.rockymadden.stringmetric.similarity
+
+import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter }
+
+/* An implementation of the Jaccard metric. */
+class JaccardMetric extends ConfigurableStringMetric[Double, Int] {
+ this: StringFilter =>
+
+ final override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit n: Int): Option[Double] = {
+ if (n <= 0) throw new IllegalArgumentException("Expected valid n.")
+
+ val fca1 = filter(charArray1)
+ lazy val fca2 = filter(charArray2)
+
+ if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare.
+ else if (fca1.sameElements(fca2)) Some(1d)
+ else {
+ val nGramAlgorithm = NGramAlgorithm()
+
+ nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg =>
+ nGramAlgorithm.compute(fca2)(n).map { ca2bg =>
+ val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString)))
+
+ ms.toDouble / (ca1bg.length + ca2bg.length)
+ }
+ }
+ }
+ }
+
+ final override def compare(string1: String, string2: String)(implicit n: Int): Option[Double] =
+ compare(filter(string1.toCharArray), filter(string2.toCharArray))(n: Int)
+
+ private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length
+}
+
+object JaccardMetric {
+ private lazy val self = apply()
+
+ def apply(): JaccardMetric = new JaccardMetric with StringFilter
+
+ def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = self.compare(charArray1, charArray2)(n)
+
+ def compare(string1: String, string2: String)(n: Int) = self.compare(string1, string2)(n)
+}
diff --git a/core/source/test/scala/com/rockymadden/stringmetric/similarity/JaccardMetricSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/similarity/JaccardMetricSpec.scala
new file mode 100755
index 0000000..a3aeb5e
--- /dev/null
+++ b/core/source/test/scala/com/rockymadden/stringmetric/similarity/JaccardMetricSpec.scala
@@ -0,0 +1,77 @@
+package com.rockymadden.stringmetric.similarity
+
+import com.rockymadden.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class JaccardMetricSpec extends ScalaTest {
+ import JaccardMetricSpec.Metric
+
+ "JaccardMetric" should provide {
+ "compare method" when passed {
+ "empty arguments" should returns {
+ "None" in {
+ Metric.compare("", "")(1).isDefined should be (false)
+ Metric.compare("abc", "")(1).isDefined should be (false)
+ Metric.compare("", "xyz")(1).isDefined should be (false)
+ }
+ }
+ "equal arguments" should returns {
+ "1" in {
+ Metric.compare("abc", "abc")(1).get should be (1)
+ Metric.compare("abc", "abc")(2).get should be (1)
+ Metric.compare("abc", "abc")(3).get should be (1)
+ }
+ }
+ "unequal arguments" should returns {
+ "0" in {
+ Metric.compare("abc", "xyz")(1).get should be (0)
+ Metric.compare("abc", "xyz")(2).get should be (0)
+ Metric.compare("abc", "xyz")(3).get should be (0)
+ }
+ }
+ "invalid arguments" should returns {
+ "None" in {
+ Metric.compare("n", "naght")(2).isDefined should be (false)
+ Metric.compare("night", "n")(2).isDefined should be (false)
+ Metric.compare("ni", "naght")(3).isDefined should be (false)
+ Metric.compare("night", "na")(3).isDefined should be (false)
+ }
+ }
+ "valid arguments" should returns {
+ "Double indicating distance" in {
+ Metric.compare("night", "nacht")(1).get should be (0.3)
+ Metric.compare("night", "naght")(1).get should be (0.4)
+ Metric.compare("context", "contact")(1).get should be (0.35714285714285715)
+
+ Metric.compare("night", "nacht")(2).get should be (0.125)
+ Metric.compare("night", "naght")(2).get should be (0.25)
+ Metric.compare("context", "contact")(2).get should be (0.25)
+ Metric.compare("contextcontext", "contact")(2).get should be (0.15789473684210525)
+ Metric.compare("context", "contactcontact")(2).get should be (0.15789473684210525)
+ Metric.compare("ht", "nacht")(2).get should be (0.2)
+ Metric.compare("xp", "nacht")(2).get should be (0)
+ Metric.compare("ht", "hththt")(2).get should be (0.16666666666666666)
+
+ Metric.compare("night", "nacht")(3).get should be (0)
+ Metric.compare("night", "naght")(3).get should be (0.16666666666666666)
+ Metric.compare("context", "contact")(3).get should be (0.2)
+ }
+ }
+ }
+ }
+ "JaccardMetric companion object" should provide {
+ "pass-through compare method" should returns {
+ "same value as class" in {
+ JaccardMetric.compare("context", "contact")(3).get should be (0.2)
+ }
+ }
+ }
+}
+
+object JaccardMetricSpec {
+ private final val Metric = JaccardMetric()
+}
+
+