diff options
author | Rocky Madden <git@rockymadden.com> | 2012-10-06 23:19:20 -0600 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2012-10-06 23:19:20 -0600 |
commit | dadd1221ec7c1301b3cc2dfc178dba2091e1f9b8 (patch) | |
tree | ce698016721cdc2636d66742d705b232ad1c9fe9 /core | |
download | stringmetric-dadd1221ec7c1301b3cc2dfc178dba2091e1f9b8.tar.gz stringmetric-dadd1221ec7c1301b3cc2dfc178dba2091e1f9b8.tar.bz2 stringmetric-dadd1221ec7c1301b3cc2dfc178dba2091e1f9b8.zip |
Created repository.v0.0.0
Diffstat (limited to 'core')
6 files changed, 202 insertions, 0 deletions
diff --git a/core/build.gradle b/core/build.gradle new file mode 100755 index 0000000..55fc38f --- /dev/null +++ b/core/build.gradle @@ -0,0 +1,42 @@ +apply plugin: 'eclipse' +apply plugin: 'scala' + +dependencies { + compile 'org.scala-lang:scala-compiler:2.9.2' + compile 'org.scala-lang:scala-library:2.9.2' + + scalaTools 'org.scala-lang:scala-compiler:2.9.2' + scalaTools 'org.scala-lang:scala-library:2.9.2' + + testCompile 'junit:junit:4.10' + testCompile 'org.scalatest:scalatest_2.9.2:1.8' +} + +sourceSets { + main { + output.resourcesDir "${project.buildDir}/classes/main" + + java { + srcDir 'source/core/java' + } + resources { + srcDir 'source/core/resource' + } + scala { + srcDir 'source/core/scala' + } + } + test { + output.resourcesDir "${project.buildDir}/classes/test" + + java { + srcDir 'source/test/java' + } + resources { + srcDir 'source/test/resource' + } + scala { + srcDir 'source/test/scala' + } + } +}
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala new file mode 100755 index 0000000..4311379 --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala @@ -0,0 +1,76 @@ +package org.hashtree.stringmetric + +import scala.collection.mutable.ArrayBuffer +import scala.math +import scala.util.control.Breaks.{ break, breakable } + +/** + * An implementation of the Jaro-Winkler string metric. One differing detail in this implementation is that if a + * character is matched in string2, it cannot be matched upon again. This results in a more penalized distance in these + * scenarios (e.g. comparing henka and henkan distance is 0.9666 versus the typical 0.9722). + */ +object JaroWinklerMetric extends StringMetric { + override def compare(s1: String, s2: String): Float = { + val ca1 = s1.replaceAllLiterally(" ", "").toLowerCase.toCharArray + val ca2 = s2.replaceAllLiterally(" ", "").toLowerCase.toCharArray + + // Return 0 if either character array lacks length. + if (ca1.length == 0 || ca2.length == 0) return 0f + + val (m1, m2) = matchChars(ca1, ca2) + val matchesScore = scoreMatches(m1, m2) + val transpositionsScore = scoreTranspositions(m1, m2) + + // Return 0 if matches score is 0. + if (matchesScore == 0) return 0f + + val prefix = ca1.zip(ca2).takeWhile(t => t._1 == t._2).map(_._1).mkString + val jaro = ( + (matchesScore.toFloat / ca1.length) + + (matchesScore.toFloat / ca2.length) + + ((matchesScore.toFloat - transpositionsScore) / matchesScore.toFloat) + ) / 3 + + jaro + ((if (prefix.length <= 4) prefix.length else 4) * (.1f * (1 - jaro))) + } + + private[this] def matchChars(ca1: Array[Char], ca2: Array[Char]): Tuple2[Array[Char], Array[Char]] = { + val window = math.abs((math.max(ca1.length, ca2.length) / 2f).floor.toInt - 1) + val a1Indices = ArrayBuffer[Int]() + val a2Indices = ArrayBuffer[Int]() + + breakable { + for (i <- 0 until ca1.length) { + val start = if (i - window <= 0) 0 else i - window + val end = if (i + window >= ca2.length - 1) ca2.length - 1 else i + window + + if (start > ca2.length) break + + breakable { + for (ii <- start to end if ! a2Indices.contains(ii)) { + if (ca1(i) == ca2(ii)) { + a1Indices.append(i) + a2Indices.append(ii) + + break + } + } + } + } + } + + (a1Indices.map(i => ca1(i)).toArray, a2Indices.sortWith(_ < _).map(i => ca2(i)).toArray) + } + + private[this] def scoreMatches(mca1: Array[Char], mca2: Array[Char]): Int = { + require(mca1.length == mca2.length) + + mca1.length + } + + private[this] def scoreTranspositions(mca1: Array[Char], mca2: Array[Char]): Int = { + require(mca1.length == mca2.length) + + (mca1.zip(mca2).filter(t => t._1 != t._2).length / 2f).floor.toInt + } +}
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/Metric.scala b/core/source/core/scala/org/hashtree/stringmetric/Metric.scala new file mode 100755 index 0000000..2d570c2 --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/Metric.scala @@ -0,0 +1,6 @@ +package org.hashtree.stringmetric + +/** Marks those which leverage traits of a metric. */ +trait Metric[T] { + def compare(t1: T, t2: T): AnyVal +}
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala new file mode 100755 index 0000000..792aeba --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala @@ -0,0 +1,6 @@ +package org.hashtree.stringmetric + +/** Marks those which leverage traits of a string based Metric. */ +trait StringMetric extends Metric[String] { + +}
\ No newline at end of file diff --git a/core/source/test/scala/org/hashtree/stringmetric/JaroWinklerMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/JaroWinklerMetricSpec.scala new file mode 100755 index 0000000..6e044a0 --- /dev/null +++ b/core/source/test/scala/org/hashtree/stringmetric/JaroWinklerMetricSpec.scala @@ -0,0 +1,54 @@ +package org.hashtree.stringmetric + +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class JaroWinklerMetricSpec extends ScalaTest { + "JaroWinklerMetric" should provide { + "compare method" when passed { + "valid arguments" should returns { + "Float indicating distance" in { + JaroWinklerMetric.compare("abc", "abc") should be (1.0f) + JaroWinklerMetric.compare("abc", "xyz") should be (0.0f) + JaroWinklerMetric.compare("abc", "") should be (0.0f) + JaroWinklerMetric.compare("", "xyz") should be (0.0f) + JaroWinklerMetric.compare("", "") should be (0.0f) + JaroWinklerMetric.compare("a", "a") should be (1.0f) + + JaroWinklerMetric.compare("aa", "a") should be (0.84999996f) + JaroWinklerMetric.compare("a", "aa") should be (0.84999996f) + + JaroWinklerMetric.compare("veryveryverylong", "v") should be (0.71875f) + JaroWinklerMetric.compare("v", "veryveryverylong") should be (0.71875f) + + JaroWinklerMetric.compare("martha", "marhta") should be (0.96111107f) + JaroWinklerMetric.compare("dwayne", "duane") should be (0.84000003f) + JaroWinklerMetric.compare("dixon", "dicksonx") should be (0.81333333f) + JaroWinklerMetric.compare("abcvwxyz", "cabvwxyz") should be (0.9583333f) + JaroWinklerMetric.compare("jones", "johnson") should be (0.8323809f) + JaroWinklerMetric.compare("henka", "henkan") should be (0.96666664f) + JaroWinklerMetric.compare("fvie", "ten") should be (0.0f) + + JaroWinklerMetric.compare("zac ephron", "zac efron") should be > + JaroWinklerMetric.compare("zac ephron", "kai ephron") + JaroWinklerMetric.compare("brittney spears", "britney spears") should be > + JaroWinklerMetric.compare("brittney spears", "brittney startzman") + + JaroWinklerMetric.compare("m a r t h a", "m a r h t a") should be (0.96111107f) + JaroWinklerMetric.compare("d w a y n e", "d u a n e") should be (0.84000003f) + JaroWinklerMetric.compare("d i x o n", "d i c k s o n x") should be (0.81333333f) + JaroWinklerMetric.compare("a b c v w x y z", "c a b v w x y z") should be (0.9583333f) + JaroWinklerMetric.compare("j o n e s", "j o h n s o n") should be (0.8323809f) + JaroWinklerMetric.compare("h e n k a", "h e n k a n") should be (0.96666664f) + JaroWinklerMetric.compare("f v i e", "t e n") should be (0.0f) + + JaroWinklerMetric.compare("z a c e p h r o n", "z a c e f r o n") should be > + JaroWinklerMetric.compare("z a c e p h r o n", "k a i e p h r o n") + JaroWinklerMetric.compare("b r i t t n e y s p e a r s", "b r i t n e y s p e a r s") should be > + JaroWinklerMetric.compare("b r i t t n e y s p e a r s", "b r i t t n e y s t a r t z m a n") + } + } + } + } +}
\ No newline at end of file diff --git a/core/source/test/scala/org/hashtree/stringmetric/ScalaTest.scala b/core/source/test/scala/org/hashtree/stringmetric/ScalaTest.scala new file mode 100755 index 0000000..a9cd5e1 --- /dev/null +++ b/core/source/test/scala/org/hashtree/stringmetric/ScalaTest.scala @@ -0,0 +1,18 @@ +package org.hashtree.stringmetric + +import org.scalatest.WordSpec +import org.scalatest.matchers.ShouldMatchers + +trait ScalaTest extends WordSpec with ShouldMatchers { + def allows = afterWord("allow") + + def executes = afterWord("execute") + + def passed = afterWord("passed") + + def provide = afterWord("provide") + + def returns = afterWord("return") + + def throws = afterWord("throw") +}
\ No newline at end of file |