diff options
author | Rocky Madden <git@rockymadden.com> | 2012-10-07 03:00:06 -0600 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2012-10-07 03:00:06 -0600 |
commit | ef31d1cc8391803d4a9991bbbbf91b45d1add14c (patch) | |
tree | 497d0e7c28bfb6876b6d02be0222f61a1a7399ec /core | |
parent | 7530433968effe6a4f5b20898ded7290db9f995f (diff) | |
download | stringmetric-ef31d1cc8391803d4a9991bbbbf91b45d1add14c.tar.gz stringmetric-ef31d1cc8391803d4a9991bbbbf91b45d1add14c.tar.bz2 stringmetric-ef31d1cc8391803d4a9991bbbbf91b45d1add14c.zip |
Refactored StringMetric trait to force implementors to provide a compare method which accepts character arrays. Character array arguments passed to the compare method are assumed to already be clean. This allowed for the removal of duplicate calls to string cleaning methods.
Diffstat (limited to 'core')
3 files changed, 20 insertions, 15 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala index bd5f850..bcea174 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala @@ -9,21 +9,23 @@ import scala.util.control.Breaks.{ break, breakable } * matched in string2, it cannot be matched upon again. This results in a more penalized distance in these scenarios. */ object JaroMetric extends StringMetric { - override def compare(string1: String, string2: String): Float = { - val ca1 = string1.replaceAllLiterally(" ", "").toLowerCase.toCharArray - val ca2 = string2.replaceAllLiterally(" ", "").toLowerCase.toCharArray - + override def compare(charArray1: Array[Char], charArray2: Array[Char]): Float = { // Return 0 if either character array lacks length. - if (ca1.length == 0 || ca2.length == 0) return 0f + if (charArray1.length == 0 || charArray2.length == 0) return 0f - val mt = `match`(ca1, ca2) - val ms = scoreMatches(mt._1, mt._2) - val ts = scoreTranspositions(mt._1, mt._2) + val mt = `match`((charArray1, charArray2)) + val ms = scoreMatches((mt._1, mt._2)) + val ts = scoreTranspositions((mt._1, mt._2)) // Return 0 if matches score is 0. 
if (ms == 0) return 0f - ((ms.toFloat / ca1.length) + (ms.toFloat / ca2.length) + ((ms.toFloat - ts) / ms)) / 3 + ((ms.toFloat / charArray1.length) + (ms.toFloat / charArray2.length) + ((ms.toFloat - ts) / ms)) / 3 + } + + override def compare(string1: String, string2: String): Float = { + compare(string1.replaceAllLiterally(" ", "").toLowerCase.toCharArray, + string2.replaceAllLiterally(" ", "").toLowerCase.toCharArray) } private[this] def `match`(ct: CompareTuple): MatchTuple = { diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala index 545dc42..dec2cbe 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala @@ -10,12 +10,15 @@ import scala.util.control.Breaks.{ break, breakable } * scenarios (e.g. comparing henka and henkan distance is 0.9666 versus the typical 0.9722). */ object JaroWinklerMetric extends StringMetric { - override def compare(string1: String, string2: String): Float = { - val ca1 = string1.replaceAllLiterally(" ", "").toLowerCase.toCharArray - val ca2 = string2.replaceAllLiterally(" ", "").toLowerCase.toCharArray - val prefix = ca1.zip(ca2).takeWhile(t => t._1 == t._2).map(_._1) - val jaro = JaroMetric.compare(string1, string2) + override def compare(charArray1: Array[Char], charArray2: Array[Char]): Float = { + val prefix = charArray1.zip(charArray2).takeWhile(t => t._1 == t._2).map(_._1) + val jaro = JaroMetric.compare(charArray1, charArray2) jaro + ((if (prefix.length <= 4) prefix.length else 4) * (0.1f * (1 - jaro))) } + + override def compare(string1: String, string2: String): Float = { + compare(string1.replaceAllLiterally(" ", "").toLowerCase.toCharArray, + string2.replaceAllLiterally(" ", "").toLowerCase.toCharArray) + } }
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala index 792aeba..2e92292 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala @@ -2,5 +2,5 @@ package org.hashtree.stringmetric /** Marks those which leverage traits of a string based Metric. */ trait StringMetric extends Metric[String] { - + def compare(ca1: Array[Char], ca2: Array[Char]): AnyVal }
\ No newline at end of file |