summary | refs | log | tree | commit | diff
path: root/core
diff options
context:
space:
mode:
author Rocky Madden <git@rockymadden.com> 2012-10-08 22:29:42 -0600
committer Rocky Madden <git@rockymadden.com> 2012-10-08 22:29:42 -0600
commit f380762362860d432d27d9861d817be393f19da6 (patch)
tree 59da9b12cc59452a1b2a774a54f508be36e73d42 /core
parent 2b6d745e54ed5a4c9f3253f9c97188861aef4452 (diff)
download stringmetric-f380762362860d432d27d9861d817be393f19da6.tar.gz
stringmetric-f380762362860d432d27d9861d817be393f19da6.tar.bz2
stringmetric-f380762362860d432d27d9861d817be393f19da6.zip
Created cleaners to allow implementors to determine the amount of cleaning performed on inputs prior to metric computations, if desired. Cleaners are applied implicitly to the metric compare methods via a second (curried) parameter list. The previous behavior of ignoring case and spacing is kept intact, while allowing for further customization via arguments if needed.
Diffstat (limited to 'core')
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/CaseStringCleaner.scala 20
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/Cleaner.scala 6
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala 23
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala 15
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/Metric.scala 4
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/SpaceStringCleaner.scala 12
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/StringCleaner.scala 6
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/StringCleanerDelegate.scala 11
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala 6
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/package.scala 2
-rwxr-xr-x core/source/test/scala/org/hashtree/stringmetric/CaseStringCleanerSpec.scala 36
-rwxr-xr-x core/source/test/scala/org/hashtree/stringmetric/JaroMetricSpec.scala 1
-rwxr-xr-x core/source/test/scala/org/hashtree/stringmetric/SpaceStringCleanerSpec.scala 32
-rwxr-xr-x core/source/test/scala/org/hashtree/stringmetric/StringCleanerDelegateSpec.scala 24
14 files changed, 176 insertions, 22 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/CaseStringCleaner.scala b/core/source/core/scala/org/hashtree/stringmetric/CaseStringCleaner.scala
new file mode 100755
index 0000000..b3663b2
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/CaseStringCleaner.scala
@@ -0,0 +1,20 @@
+package org.hashtree.stringmetric
+
+/** A decorator [[org.hashtree.stringmetric.StringCleaner]]. Ensures the input case-sensitivity does not matter. */
+trait CaseStringCleaner extends StringCleaner {
+ abstract override def clean(charArray: Array[Char]): Array[Char] = {
+ super.clean(
+ charArray.map { c =>
+ if (c >= 65 && c <= 90) {
+ (c + 32).toChar
+ } else {
+ c
+ }
+ }
+ )
+ }
+
+ abstract override def clean(string: String): String = {
+ super.clean(string.toLowerCase)
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/Cleaner.scala b/core/source/core/scala/org/hashtree/stringmetric/Cleaner.scala
new file mode 100755
index 0000000..1ce970a
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/Cleaner.scala
@@ -0,0 +1,6 @@
+package org.hashtree.stringmetric
+
+/** Marks those which leverage traits of a cleaner. */
+trait Cleaner[T] {
+ def clean(t: T): T
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala
index bcea174..ad566bc 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala
@@ -3,17 +3,21 @@ package org.hashtree.stringmetric
import scala.collection.mutable.ArrayBuffer
import scala.math
import scala.util.control.Breaks.{ break, breakable }
+import org.hashtree.stringmetric._
/**
- * An implementation of the Jaro string metric. One differing detail in this implementation is that if a character is
+ * An implementation of the Jaro [[org.hashtree.stringmetric.StringMetric]]. One differing detail in this implementation is that if a character is
* matched in string2, it cannot be matched upon again. This results in a more penalized distance in these scenarios.
*/
object JaroMetric extends StringMetric {
- override def compare(charArray1: Array[Char], charArray2: Array[Char]): Float = {
+ override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringCleaner: StringCleaner): Float = {
+ val ca1 = stringCleaner.clean(charArray1)
+ val ca2 = stringCleaner.clean(charArray2)
+
// Return 0 if either character array lacks length.
- if (charArray1.length == 0 || charArray2.length == 0) return 0f
+ if (ca1.length == 0 || ca2.length == 0) return 0f
- val mt = `match`((charArray1, charArray2))
+ val mt = `match`((ca1, ca2))
val ms = scoreMatches((mt._1, mt._2))
val ts = scoreTranspositions((mt._1, mt._2))
@@ -23,12 +27,11 @@ object JaroMetric extends StringMetric {
((ms.toFloat / charArray1.length) + (ms.toFloat / charArray2.length) + ((ms.toFloat - ts) / ms)) / 3
}
- override def compare(string1: String, string2: String): Float = {
- compare(string1.replaceAllLiterally(" ", "").toLowerCase.toCharArray,
- string2.replaceAllLiterally(" ", "").toLowerCase.toCharArray)
+ override def compare(string1: String, string2: String)(implicit stringCleaner: StringCleaner): Float = {
+ compare(stringCleaner.clean(string1.toCharArray), stringCleaner.clean(string2.toCharArray))(new StringCleanerDelegate)
}
- private[this] def `match`(ct: CompareTuple): MatchTuple = {
+ private[this] def `match`(ct: CompareTuple) = {
val window = math.abs((math.max(ct._1.length, ct._2.length) / 2f).floor.toInt - 1)
val ab1 = ArrayBuffer[Int]()
val ab2 = ArrayBuffer[Int]()
@@ -56,13 +59,13 @@ object JaroMetric extends StringMetric {
(ab1.map(ct._1(_)).toArray, ab2.sortWith(_ < _).map(ct._2(_)).toArray)
}
- private[this] def scoreMatches(mt: MatchTuple): Int = {
+ private[this] def scoreMatches(mt: MatchTuple) = {
require(mt._1.length == mt._2.length)
mt._1.length
}
- private[this] def scoreTranspositions(mt: MatchTuple): Int = {
+ private[this] def scoreTranspositions(mt: MatchTuple) = {
require(mt._1.length == mt._2.length)
(mt._1.zip(mt._2).filter(t => t._1 != t._2).length / 2f).floor.toInt
diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
index 01e56f8..b2b3627 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
@@ -1,20 +1,21 @@
package org.hashtree.stringmetric
/**
- * An implementation of the Jaro-Winkler string metric. One differing detail in this implementation is that if a
+ * An implementation of the Jaro-Winkler [[org.hashtree.stringmetric.StringMetric]]. One differing detail in this implementation is that if a
* character is matched in string2, it cannot be matched upon again. This results in a more penalized distance in these
* scenarios (e.g. comparing henka and henkan distance is 0.9666 versus the typical 0.9722).
*/
object JaroWinklerMetric extends StringMetric {
- override def compare(charArray1: Array[Char], charArray2: Array[Char]): Float = {
- val jaro = JaroMetric.compare(charArray1, charArray2)
- val prefix = charArray1.zip(charArray2).takeWhile(t => t._1 == t._2).map(_._1)
+ override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringCleaner: StringCleaner): Float = {
+ val ca1 = stringCleaner.clean(charArray1)
+ val ca2 = stringCleaner.clean(charArray2)
+ val jaro = JaroMetric.compare(ca1, ca2)(new StringCleanerDelegate)
+ val prefix = ca1.zip(ca2).takeWhile(t => t._1 == t._2).map(_._1)
jaro + ((if (prefix.length <= 4) prefix.length else 4) * (0.1f * (1 - jaro)))
}
- override def compare(string1: String, string2: String): Float = {
- compare(string1.replaceAllLiterally(" ", "").toLowerCase.toCharArray,
- string2.replaceAllLiterally(" ", "").toLowerCase.toCharArray)
+ override def compare(string1: String, string2: String)(implicit stringCleaner: StringCleaner): Float = {
+ compare(stringCleaner.clean(string1.toCharArray), stringCleaner.clean(string2.toCharArray))(new StringCleanerDelegate)
}
} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/Metric.scala b/core/source/core/scala/org/hashtree/stringmetric/Metric.scala
index 2d570c2..86017b3 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/Metric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/Metric.scala
@@ -1,6 +1,6 @@
package org.hashtree.stringmetric
/** Marks those which leverage traits of a metric. */
-trait Metric[T] {
- def compare(t1: T, t2: T): AnyVal
+trait Metric[T, C] {
+ def compare(t1: T, t2: T)(implicit c: C): AnyVal
} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/SpaceStringCleaner.scala b/core/source/core/scala/org/hashtree/stringmetric/SpaceStringCleaner.scala
new file mode 100755
index 0000000..50e2287
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/SpaceStringCleaner.scala
@@ -0,0 +1,12 @@
+package org.hashtree.stringmetric
+
+/** A decorator [[org.hashtree.stringmetric.StringCleaner]]. Ensures the input spacing does not matter. */
+trait SpaceStringCleaner extends StringCleaner {
+ abstract override def clean(charArray: Array[Char]): Array[Char] = {
+ super.clean(charArray.filter(_ != ' '))
+ }
+
+ abstract override def clean(string: String): String = {
+ super.clean(string.replaceAllLiterally(" ", ""))
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringCleaner.scala b/core/source/core/scala/org/hashtree/stringmetric/StringCleaner.scala
new file mode 100755
index 0000000..b6d2671
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/StringCleaner.scala
@@ -0,0 +1,6 @@
+package org.hashtree.stringmetric
+
+/** Marks those which leverage traits of a string based [[org.hashtree.stringmetric.Cleaner]]. */
+trait StringCleaner extends Cleaner[String] {
+ def clean(charArray: Array[Char]): Array[Char]
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringCleanerDelegate.scala b/core/source/core/scala/org/hashtree/stringmetric/StringCleanerDelegate.scala
new file mode 100755
index 0000000..70d5397
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/StringCleanerDelegate.scala
@@ -0,0 +1,11 @@
+package org.hashtree.stringmetric
+
+class StringCleanerDelegate extends StringCleaner {
+ override def clean(charArray: Array[Char]): Array[Char] = {
+ charArray
+ }
+
+ override def clean(string: String): String = {
+ string
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
index 2e92292..960540e 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
@@ -1,6 +1,6 @@
package org.hashtree.stringmetric
-/** Marks those which leverage traits of a string based Metric. */
-trait StringMetric extends Metric[String] {
- def compare(ca1: Array[Char], ca2: Array[Char]): AnyVal
+/** Marks those which leverage traits of a string based [[org.hashtree.stringmetric.Metric]]. */
+trait StringMetric extends Metric[String, StringCleaner] {
+ def compare(ca1: Array[Char], ca2: Array[Char])(implicit stringCleaner: StringCleaner): AnyVal
} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/package.scala b/core/source/core/scala/org/hashtree/stringmetric/package.scala
index 3be74fb..ce63c52 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/package.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/package.scala
@@ -5,4 +5,6 @@ package object stringmetric {
type CompareTuple = Tuple2[Array[Char], Array[Char]]
type MatchTuple = CompareTuple
+
+ implicit val stringCleaner = new StringCleanerDelegate with CaseStringCleaner with SpaceStringCleaner
} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/CaseStringCleanerSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/CaseStringCleanerSpec.scala
new file mode 100755
index 0000000..a628700
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/CaseStringCleanerSpec.scala
@@ -0,0 +1,36 @@
+package org.hashtree.stringmetric
+
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class CaseStringCleanerSpec extends ScalaTest {
+ private final val Cleaner = new StringCleanerDelegate with CaseStringCleaner
+
+ "CaseStringCleaner" should provide {
+ "overloaded clean method" when passed {
+ "String with mixed case" should returns {
+ "String with the same case" in {
+ Cleaner.clean("HelloWorld") should (equal ("helloworld") or equal ("HELLOWORLD"))
+ Cleaner.clean("Hello World") should (equal ("hello world") or equal ("HELLO WORLD"))
+ Cleaner.clean("H e l l o W o r l d") should
+ (equal ("h e l l o w o r l d") or equal ("H E L L O W O R L D"))
+ Cleaner.clean("H e l l o W o r l d") should
+ (equal ("h e l l o w o r l d") or equal ("H E L L O W O R L D"))
+ }
+ }
+ "character array with mixed case" should returns {
+ "character array with the same case" in {
+ Cleaner.clean("HelloWorld".toCharArray) should
+ (equal ("helloworld".toCharArray) or equal ("HELLOWORLD".toCharArray))
+ Cleaner.clean("Hello World".toCharArray) should
+ (equal ("hello world".toCharArray) or equal ("HELLO WORLD".toCharArray))
+ Cleaner.clean("H e l l o W o r l d".toCharArray) should
+ (equal ("h e l l o w o r l d".toCharArray) or equal ("H E L L O W O R L D".toCharArray))
+ Cleaner.clean("H e l l o W o r l d".toCharArray) should
+ (equal ("h e l l o w o r l d".toCharArray) or equal ("H E L L O W O R L D".toCharArray))
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/JaroMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/JaroMetricSpec.scala
index 5d164e2..a7059a3 100755
--- a/core/source/test/scala/org/hashtree/stringmetric/JaroMetricSpec.scala
+++ b/core/source/test/scala/org/hashtree/stringmetric/JaroMetricSpec.scala
@@ -36,6 +36,7 @@ final class JaroMetricSpec extends ScalaTest {
JaroMetric.compare("brittney spears", "brittney startzman")
JaroMetric.compare("m a r t h a", "m a r h t a") should be (0.9444444f)
+
JaroMetric.compare("d w a y n e", "d u a n e") should be (0.82222223f)
JaroMetric.compare("d i x o n", "d i c k s o n x") should be (0.76666665f)
JaroMetric.compare("a b c v w x y z", "c a b v w x y z") should be (0.9583333f)
diff --git a/core/source/test/scala/org/hashtree/stringmetric/SpaceStringCleanerSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/SpaceStringCleanerSpec.scala
new file mode 100755
index 0000000..f58a4df
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/SpaceStringCleanerSpec.scala
@@ -0,0 +1,32 @@
+package org.hashtree.stringmetric
+
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class SpaceStringCleanerSpec extends ScalaTest {
+ private final val Cleaner = new StringCleanerDelegate with SpaceStringCleaner
+
+ "SpaceStringCleaner" should provide {
+ "overloaded clean method" when passed {
+ "String with spaces" should returns {
+ "String with spaces removed" in {
+ Cleaner.clean("HelloWorld") should equal ("HelloWorld")
+ Cleaner.clean(" HelloWorld ") should equal ("HelloWorld")
+ Cleaner.clean("Hello World") should equal ("HelloWorld")
+ Cleaner.clean("H e l l o W o r l d") should equal ("HelloWorld")
+ Cleaner.clean("H e l l o W o r l d") should equal ("HelloWorld")
+ }
+ }
+ "character array with spaces" should returns {
+ "character array with spaces removed" in {
+ Cleaner.clean("HelloWorld".toCharArray) should equal ("HelloWorld".toCharArray)
+ Cleaner.clean(" HelloWorld ".toCharArray) should equal ("HelloWorld".toCharArray)
+ Cleaner.clean("Hello World".toCharArray) should equal ("HelloWorld".toCharArray)
+ Cleaner.clean("H e l l o W o r l d".toCharArray) should equal ("HelloWorld".toCharArray)
+ Cleaner.clean("H e l l o W o r l d".toCharArray) should equal ("HelloWorld".toCharArray)
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/StringCleanerDelegateSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/StringCleanerDelegateSpec.scala
new file mode 100755
index 0000000..5016500
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/StringCleanerDelegateSpec.scala
@@ -0,0 +1,24 @@
+package org.hashtree.stringmetric
+
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class StringCleanerDelegateSpec extends ScalaTest {
+ private final val Cleaner = new StringCleanerDelegate
+
+ "StringCleanerDelegate" should provide {
+ "overloaded clean method" when passed {
+ "String" should returns {
+ "the same String" in {
+ Cleaner.clean("Hello World") should equal ("Hello World")
+ }
+ }
+ "character array" should returns {
+ "the same character array" in {
+ Cleaner.clean("Hello World".toCharArray) should equal ("Hello World".toCharArray)
+ }
+ }
+ }
+ }
+} \ No newline at end of file