diff options
author | Rocky Madden <git@rockymadden.com> | 2012-10-07 01:43:51 -0600 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2012-10-07 01:43:51 -0600 |
commit | ab8d0a077598e2adb19255d1c9df476031db0441 (patch) | |
tree | 0b116ea26e0cf5f43b63699b97b4ae82201062a3 /core | |
parent | 05e26b8c8baf488d0207faadfa31d30fdb5622e6 (diff) | |
download | stringmetric-ab8d0a077598e2adb19255d1c9df476031db0441.tar.gz stringmetric-ab8d0a077598e2adb19255d1c9df476031db0441.tar.bz2 stringmetric-ab8d0a077598e2adb19255d1c9df476031db0441.zip |
Added types to help clarify purposes.
Diffstat (limited to 'core')
-rwxr-xr-x | core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala | 50 |
1 files changed, 27 insertions, 23 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala index b59972b..5848026 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala @@ -10,45 +10,49 @@ import scala.util.control.Breaks.{ break, breakable } * scenarios (e.g. comparing henka and henkan distance is 0.9666 versus the typical 0.9722). */ object JaroWinklerMetric extends StringMetric { + type CompareTuple = Tuple2[Array[Char], Array[Char]] + type MatchTuple = CompareTuple + override def compare(s1: String, s2: String): Float = { - val ca1 = s1.replaceAllLiterally(" ", "").toLowerCase.toCharArray - val ca2 = s2.replaceAllLiterally(" ", "").toLowerCase.toCharArray + val charArray1 = s1.replaceAllLiterally(" ", "").toLowerCase.toCharArray + val charArray2 = s2.replaceAllLiterally(" ", "").toLowerCase.toCharArray // Return 0 if either character array lacks length. - if (ca1.length == 0 || ca2.length == 0) return 0f + if (charArray1.length == 0 || charArray2.length == 0) return 0f - val (mca1, mca2) = matchChars(ca1, ca2) - val matchesScore = scoreMatches(mca1, mca2) - val transpositionsScore = scoreTranspositions(mca1, mca2) + val matchTuple = `match`(charArray1, charArray2) + val matchesScore = scoreMatches(matchTuple._1, matchTuple._2) + val transpositionsScore = scoreTranspositions(matchTuple._1, matchTuple._2) // Return 0 if matches score is 0. if (matchesScore == 0) return 0f - val prefix = ca1.zip(ca2).takeWhile(t => t._1 == t._2).map(_._1) + val prefix = charArray1.zip(charArray2).takeWhile(t => t._1 == t._2).map(_._1) val jaro = ( - (matchesScore.toFloat / ca1.length) + - (matchesScore.toFloat / ca2.length) + + (matchesScore.toFloat / charArray1.length) + + (matchesScore.toFloat / charArray2.length) + ((matchesScore.toFloat - transpositionsScore) / matchesScore) ) / 3 - jaro + ((if (prefix.length <= 4) prefix.length else 4) * (.1f * (1 - jaro))) + // Add Winkler. + jaro + ((if (prefix.length <= 4) prefix.length else 4) * (0.1f * (1 - jaro))) } - private[this] def matchChars(ca1: Array[Char], ca2: Array[Char]): Tuple2[Array[Char], Array[Char]] = { - val window = math.abs((math.max(ca1.length, ca2.length) / 2f).floor.toInt - 1) + private[this] def `match`(ct: CompareTuple): MatchTuple = { + val window = math.abs((math.max(ct._1.length, ct._2.length) / 2f).floor.toInt - 1) val a1Indices = ArrayBuffer[Int]() val a2Indices = ArrayBuffer[Int]() breakable { - for (i <- 0 until ca1.length) { + for (i <- 0 until ct._1.length) { val start = if (i - window <= 0) 0 else i - window - val end = if (i + window >= ca2.length - 1) ca2.length - 1 else i + window + val end = if (i + window >= ct._2.length - 1) ct._2.length - 1 else i + window - if (start > ca2.length - 1) break() + if (start > ct._2.length - 1) break() breakable { for (ii <- start to end if ! a2Indices.contains(ii)) { - if (ca1(i) == ca2(ii)) { + if (ct._1(i) == ct._2(ii)) { a1Indices.append(i) a2Indices.append(ii) @@ -59,18 +63,18 @@ object JaroWinklerMetric extends StringMetric { } } - (a1Indices.map(ca1(_)).toArray, a2Indices.sortWith(_ < _).map(ca2(_)).toArray) + (a1Indices.map(ct._1(_)).toArray, a2Indices.sortWith(_ < _).map(ct._2(_)).toArray) } - private[this] def scoreMatches(mca1: Array[Char], mca2: Array[Char]): Int = { - require(mca1.length == mca2.length) + private[this] def scoreMatches(mt: MatchTuple): Int = { + require(mt._1.length == mt._2.length) - mca1.length + mt._1.length } - private[this] def scoreTranspositions(mca1: Array[Char], mca2: Array[Char]): Int = { - require(mca1.length == mca2.length) + private[this] def scoreTranspositions(mt: MatchTuple): Int = { + require(mt._1.length == mt._2.length) - (mca1.zip(mca2).filter(t => t._1 != t._2).length / 2f).floor.toInt + (mt._1.zip(mt._2).filter(t => t._1 != t._2).length / 2f).floor.toInt } }
\ No newline at end of file |