summary refs log tree commit diff
path: root/core
diff options
context:
space:
mode:
author Rocky Madden <git@rockymadden.com> 2012-10-07 01:43:51 -0600
committer Rocky Madden <git@rockymadden.com> 2012-10-07 01:43:51 -0600
commit ab8d0a077598e2adb19255d1c9df476031db0441 (patch)
tree 0b116ea26e0cf5f43b63699b97b4ae82201062a3 /core
parent 05e26b8c8baf488d0207faadfa31d30fdb5622e6 (diff)
download stringmetric-ab8d0a077598e2adb19255d1c9df476031db0441.tar.gz
stringmetric-ab8d0a077598e2adb19255d1c9df476031db0441.tar.bz2
stringmetric-ab8d0a077598e2adb19255d1c9df476031db0441.zip
Added types to help clarify purposes.
Diffstat (limited to 'core')
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala 50
1 files changed, 27 insertions, 23 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
index b59972b..5848026 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
@@ -10,45 +10,49 @@ import scala.util.control.Breaks.{ break, breakable }
* scenarios (e.g. comparing henka and henkan distance is 0.9666 versus the typical 0.9722).
*/
object JaroWinklerMetric extends StringMetric {
+ type CompareTuple = Tuple2[Array[Char], Array[Char]]
+ type MatchTuple = CompareTuple
+
override def compare(s1: String, s2: String): Float = {
- val ca1 = s1.replaceAllLiterally(" ", "").toLowerCase.toCharArray
- val ca2 = s2.replaceAllLiterally(" ", "").toLowerCase.toCharArray
+ val charArray1 = s1.replaceAllLiterally(" ", "").toLowerCase.toCharArray
+ val charArray2 = s2.replaceAllLiterally(" ", "").toLowerCase.toCharArray
// Return 0 if either character array lacks length.
- if (ca1.length == 0 || ca2.length == 0) return 0f
+ if (charArray1.length == 0 || charArray2.length == 0) return 0f
- val (mca1, mca2) = matchChars(ca1, ca2)
- val matchesScore = scoreMatches(mca1, mca2)
- val transpositionsScore = scoreTranspositions(mca1, mca2)
+ val matchTuple = `match`(charArray1, charArray2)
+ val matchesScore = scoreMatches(matchTuple._1, matchTuple._2)
+ val transpositionsScore = scoreTranspositions(matchTuple._1, matchTuple._2)
// Return 0 if matches score is 0.
if (matchesScore == 0) return 0f
- val prefix = ca1.zip(ca2).takeWhile(t => t._1 == t._2).map(_._1)
+ val prefix = charArray1.zip(charArray2).takeWhile(t => t._1 == t._2).map(_._1)
val jaro = (
- (matchesScore.toFloat / ca1.length) +
- (matchesScore.toFloat / ca2.length) +
+ (matchesScore.toFloat / charArray1.length) +
+ (matchesScore.toFloat / charArray2.length) +
((matchesScore.toFloat - transpositionsScore) / matchesScore)
) / 3
- jaro + ((if (prefix.length <= 4) prefix.length else 4) * (.1f * (1 - jaro)))
+ // Add Winkler.
+ jaro + ((if (prefix.length <= 4) prefix.length else 4) * (0.1f * (1 - jaro)))
}
- private[this] def matchChars(ca1: Array[Char], ca2: Array[Char]): Tuple2[Array[Char], Array[Char]] = {
- val window = math.abs((math.max(ca1.length, ca2.length) / 2f).floor.toInt - 1)
+ private[this] def `match`(ct: CompareTuple): MatchTuple = {
+ val window = math.abs((math.max(ct._1.length, ct._2.length) / 2f).floor.toInt - 1)
val a1Indices = ArrayBuffer[Int]()
val a2Indices = ArrayBuffer[Int]()
breakable {
- for (i <- 0 until ca1.length) {
+ for (i <- 0 until ct._1.length) {
val start = if (i - window <= 0) 0 else i - window
- val end = if (i + window >= ca2.length - 1) ca2.length - 1 else i + window
+ val end = if (i + window >= ct._2.length - 1) ct._2.length - 1 else i + window
- if (start > ca2.length - 1) break()
+ if (start > ct._2.length - 1) break()
breakable {
for (ii <- start to end if ! a2Indices.contains(ii)) {
- if (ca1(i) == ca2(ii)) {
+ if (ct._1(i) == ct._2(ii)) {
a1Indices.append(i)
a2Indices.append(ii)
@@ -59,18 +63,18 @@ object JaroWinklerMetric extends StringMetric {
}
}
- (a1Indices.map(ca1(_)).toArray, a2Indices.sortWith(_ < _).map(ca2(_)).toArray)
+ (a1Indices.map(ct._1(_)).toArray, a2Indices.sortWith(_ < _).map(ct._2(_)).toArray)
}
- private[this] def scoreMatches(mca1: Array[Char], mca2: Array[Char]): Int = {
- require(mca1.length == mca2.length)
+ private[this] def scoreMatches(mt: MatchTuple): Int = {
+ require(mt._1.length == mt._2.length)
- mca1.length
+ mt._1.length
}
- private[this] def scoreTranspositions(mca1: Array[Char], mca2: Array[Char]): Int = {
- require(mca1.length == mca2.length)
+ private[this] def scoreTranspositions(mt: MatchTuple): Int = {
+ require(mt._1.length == mt._2.length)
- (mca1.zip(mca2).filter(t => t._1 != t._2).length / 2f).floor.toInt
+ (mt._1.zip(mt._2).filter(t => t._1 != t._2).length / 2f).floor.toInt
}
} \ No newline at end of file