summaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-11-09 13:13:44 -0700
committerRocky Madden <git@rockymadden.com>2012-11-09 13:13:44 -0700
commit32476de74b4e311c7a0944bbb962f99750e0269f (patch)
tree5e807aef3bf98fd173039f74075d191626411b2a /core
parentc5165d9fee36d1b2c8b9ed38998fc728e55cdb8d (diff)
downloadstringmetric-32476de74b4e311c7a0944bbb962f99750e0269f.tar.gz
stringmetric-32476de74b4e311c7a0944bbb962f99750e0269f.tar.bz2
stringmetric-32476de74b4e311c7a0944bbb962f99750e0269f.zip
Moved bigram code into it's own algorithm. Will be used in other metrics.
Diffstat (limited to 'core')
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/FilterableAlgorithm.scala2
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/FilterableConfigurableAlgorithm.scala2
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/FilterableConfigurableStringAlgorithm.scala2
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/FilterableStringAlgorithm.scala2
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala19
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/similarity/NGramAlgorithm.scala31
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala8
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/similarity/NGramAlgorithmSpec.scala53
8 files changed, 101 insertions, 18 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/FilterableAlgorithm.scala b/core/source/core/scala/org/hashtree/stringmetric/FilterableAlgorithm.scala
index f59781d..b58a2c3 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/FilterableAlgorithm.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/FilterableAlgorithm.scala
@@ -2,5 +2,5 @@ package org.hashtree.stringmetric
/** Marks those which leverage traits of a filterable [[org.hashtree.stringmetric.Algorithm]]. */
trait FilterableAlgorithm[T, F <: Filter[T]] extends Algorithm[T] {
- def compute(t: T)(implicit f: F): Option[T]
+ def compute(t: T)(implicit f: F): Option[AnyRef]
} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/FilterableConfigurableAlgorithm.scala b/core/source/core/scala/org/hashtree/stringmetric/FilterableConfigurableAlgorithm.scala
index 2f6287e..53cb917 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/FilterableConfigurableAlgorithm.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/FilterableConfigurableAlgorithm.scala
@@ -2,5 +2,5 @@ package org.hashtree.stringmetric
/** Marks those which leverage traits of a filterable and configurable [[org.hashtree.stringmetric.Algorithm]]. */
trait FilterableConfigurableAlgorithm[T, O, F <: Filter[T]] extends Algorithm[T] {
- def compute(t: T)(o: O)(implicit f: F): Option[T]
+ def compute(t: T)(o: O)(implicit f: F): Option[AnyRef]
} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/FilterableConfigurableStringAlgorithm.scala b/core/source/core/scala/org/hashtree/stringmetric/FilterableConfigurableStringAlgorithm.scala
index af9af1a..f223fd3 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/FilterableConfigurableStringAlgorithm.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/FilterableConfigurableStringAlgorithm.scala
@@ -2,5 +2,5 @@ package org.hashtree.stringmetric
/** Marks those which leverage traits of a string based [[org.hashtree.stringmetric.FilterableConfigurableAlgorithm]]. */
trait FilterableConfigurableStringAlgorithm[O] extends FilterableConfigurableAlgorithm[String, O, StringFilter] with StringAlgorithm {
- def compute(charArray: Array[Char])(o: O)(implicit stringFilter: StringFilter): Option[Array[Char]]
+ def compute(charArray: Array[Char])(o: O)(implicit stringFilter: StringFilter): Option[AnyRef]
} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/FilterableStringAlgorithm.scala b/core/source/core/scala/org/hashtree/stringmetric/FilterableStringAlgorithm.scala
index f6cff27..ff6827d 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/FilterableStringAlgorithm.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/FilterableStringAlgorithm.scala
@@ -2,5 +2,5 @@ package org.hashtree.stringmetric
/** Marks those which leverage traits of a string based [[org.hashtree.stringmetric.FilterableAlgorithm]]. */
trait FilterableStringAlgorithm extends FilterableAlgorithm[String, StringFilter] with StringAlgorithm {
- def compute(charArray: Array[Char])(implicit stringFilter: StringFilter): Option[Array[Char]]
+ def compute(charArray: Array[Char])(implicit stringFilter: StringFilter): Option[AnyRef]
} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala
index 64447d6..08e0419 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala
@@ -10,12 +10,14 @@ object DiceSorensenMetric extends StringMetric with FilterableStringMetric {
val ca2 = stringFilter.filter(charArray2)
if (ca1.length == 0 || ca2.length == 0) None
+ else if (ca1.length < 2 || ca2.length < 2) Some(0d) // Because length is less than that of bigram, it will always be 0.
else if (ca1.sameElements(ca2)) Some(1d)
else {
- val b = bigrams(ca1, ca2)
- val ms = scoreMatches(b)
+ val ca1bg = NGramAlgorithm.compute(ca1)(2).get
+ val ca2bg = NGramAlgorithm.compute(ca2)(2).get
+ val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString)))
- Some((2d * ms) / (b._1.length + b._2.length))
+ Some((2d * ms) / (ca1bg.length + ca2bg.length))
}
}
@@ -25,16 +27,5 @@ object DiceSorensenMetric extends StringMetric with FilterableStringMetric {
stringFilter.filter(string2.toCharArray)
)(new StringFilterDelegate)
- private[this] def bigrams(ct: CompareTuple[Char]): MatchTuple[String] = {
- @tailrec
- def set(ca: Array[Char], sa: Array[String]): Array[String] = {
- if (ca.length <= 1) sa
- else
- set(ca.tail, sa :+ "" + ca.head + ca(1))
- }
-
- (set(ct._1, Array.empty[String]), set(ct._2, Array.empty[String]))
- }
-
private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length
} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/NGramAlgorithm.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/NGramAlgorithm.scala
new file mode 100755
index 0000000..89a6d68
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/NGramAlgorithm.scala
@@ -0,0 +1,31 @@
+package org.hashtree.stringmetric.similarity
+
+import org.hashtree.stringmetric.{ FilterableConfigurableStringAlgorithm, StringAlgorithm, StringFilter, StringFilterDelegate }
+import scala.annotation.tailrec
+
+/** An implementation of the N-Gram [[org.hashtree.stringmetric.StringAlgorithm]]. */
+object NGramAlgorithm extends StringAlgorithm with FilterableConfigurableStringAlgorithm[Int] {
+ override def compute(charArray: Array[Char])(n: Int)(implicit stringFilter: StringFilter): Option[Array[Array[Char]]] = {
+ if (n <= 0)
+ throw new IllegalArgumentException("Expected valid n.")
+
+ val ca = stringFilter.filter(charArray)
+
+ if (ca.length == 0) None
+ else
+ Some(sequence(ca, Array.empty[Array[Char]], n))
+ }
+
+ override def compute(string: String)(n: Int)(implicit stringFilter: StringFilter): Option[Array[String]] =
+ compute(stringFilter.filter(string.toCharArray))(n)(new StringFilterDelegate) match {
+ case Some(mp) => Some(mp.map(_.mkString))
+ case None => None
+ }
+
+ @tailrec
+ private[this] def sequence(i: Array[Char], o: Array[Array[Char]], n: Int): Array[Array[Char]] = {
+ if (i.length <= n) o :+ i
+ else
+ sequence(i.tail, o :+ i.take(n), n)
+ }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala
index 70d2c25..d121ea0 100755
--- a/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala
+++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetricSpec.scala
@@ -25,9 +25,17 @@ final class DiceSorensenMetricSpec extends ScalaTest {
DiceSorensenMetric.compare("abc", "xyz").get should be (0)
}
}
+ "invalid arguments" should returns {
+ "Double indicating distance" in {
+ DiceSorensenMetric.compare("n", "naght").get should be (0)
+ DiceSorensenMetric.compare("night", "n").get should be (0)
+ }
+ }
"valid arguments" should returns {
"Double indicating distance" in {
DiceSorensenMetric.compare("night", "nacht").get should be (0.25)
+ DiceSorensenMetric.compare("night", "naght").get should be (0.5)
+ DiceSorensenMetric.compare("context", "contact").get should be (0.5)
}
}
}
diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/NGramAlgorithmSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/NGramAlgorithmSpec.scala
new file mode 100755
index 0000000..f76a01e
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/NGramAlgorithmSpec.scala
@@ -0,0 +1,53 @@
+package org.hashtree.stringmetric.similarity
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class NGramAlgorithmSpec extends ScalaTest {
+ "NGramAlgorithm" should provide {
+ "compute method" when passed {
+ "empty argument" should returns {
+ "None" in {
+ NGramAlgorithm.compute("")(1).isDefined should be (false)
+ }
+ }
+ "invalid n argument" should throws {
+ "IllegalArgumentException" in {
+ evaluating {
+ NGramAlgorithm.compute("")(0).isDefined should be (false)
+ } should produce [IllegalArgumentException]
+
+ evaluating {
+ NGramAlgorithm.compute("")(-1).isDefined should be (false)
+ } should produce [IllegalArgumentException]
+ }
+ }
+ "valid argument" should returns {
+ "Array[String]" in {
+ NGramAlgorithm.compute("abcdefghijklmnopqrstuvwxyz")(1).get should equal (
+ Array(
+ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
+ "s", "t", "u", "v", "w", "x", "y", "z"
+ )
+ )
+
+ NGramAlgorithm.compute("abcdefghijklmnopqrstuvwxyz")(2).get should equal (
+ Array(
+ "ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl", "lm", "mn", "no", "op",
+ "pq", "qr", "rs", "st", "tu", "uv", "vw", "wx", "xy", "yz"
+ )
+ )
+
+ NGramAlgorithm.compute("abcdefghijklmnopqrstuvwxyz")(3).get should equal (
+ Array(
+ "abc", "bcd", "cde", "def", "efg", "fgh", "ghi", "hij", "ijk", "jkl", "klm", "lmn", "mno",
+ "nop", "opq", "pqr", "qrs", "rst", "stu", "tuv", "uvw", "vwx", "wxy", "xyz"
+ )
+ )
+ }
+ }
+ }
+ }
+} \ No newline at end of file