summaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-10-06 23:19:20 -0600
committerRocky Madden <git@rockymadden.com>2012-10-06 23:19:20 -0600
commitdadd1221ec7c1301b3cc2dfc178dba2091e1f9b8 (patch)
treece698016721cdc2636d66742d705b232ad1c9fe9 /core
downloadstringmetric-dadd1221ec7c1301b3cc2dfc178dba2091e1f9b8.tar.gz
stringmetric-dadd1221ec7c1301b3cc2dfc178dba2091e1f9b8.tar.bz2
stringmetric-dadd1221ec7c1301b3cc2dfc178dba2091e1f9b8.zip
Created repository.v0.0.0
Diffstat (limited to 'core')
-rwxr-xr-xcore/build.gradle42
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala76
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/Metric.scala6
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/StringMetric.scala6
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/JaroWinklerMetricSpec.scala54
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/ScalaTest.scala18
6 files changed, 202 insertions, 0 deletions
diff --git a/core/build.gradle b/core/build.gradle
new file mode 100755
index 0000000..55fc38f
--- /dev/null
+++ b/core/build.gradle
@@ -0,0 +1,42 @@
+apply plugin: 'eclipse'
+apply plugin: 'scala'
+
+dependencies {
+ compile 'org.scala-lang:scala-compiler:2.9.2'
+ compile 'org.scala-lang:scala-library:2.9.2'
+
+ scalaTools 'org.scala-lang:scala-compiler:2.9.2'
+ scalaTools 'org.scala-lang:scala-library:2.9.2'
+
+ testCompile 'junit:junit:4.10'
+ testCompile 'org.scalatest:scalatest_2.9.2:1.8'
+}
+
+sourceSets {
+ main {
+ output.resourcesDir "${project.buildDir}/classes/main"
+
+ java {
+ srcDir 'source/core/java'
+ }
+ resources {
+ srcDir 'source/core/resource'
+ }
+ scala {
+ srcDir 'source/core/scala'
+ }
+ }
+ test {
+ output.resourcesDir "${project.buildDir}/classes/test"
+
+ java {
+ srcDir 'source/test/java'
+ }
+ resources {
+ srcDir 'source/test/resource'
+ }
+ scala {
+ srcDir 'source/test/scala'
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
new file mode 100755
index 0000000..4311379
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
@@ -0,0 +1,76 @@
+package org.hashtree.stringmetric
+
+import scala.collection.mutable.ArrayBuffer
+import scala.math
+import scala.util.control.Breaks.{ break, breakable }
+
+/**
+ * An implementation of the Jaro-Winkler string metric. One differing detail in this implementation is that if a
+ * character is matched in string2, it cannot be matched upon again. This results in a more penalized distance in these
+ * scenarios (e.g. comparing henka and henkan distance is 0.9666 versus the typical 0.9722).
+ */
+object JaroWinklerMetric extends StringMetric {
+ override def compare(s1: String, s2: String): Float = {
+ val ca1 = s1.replaceAllLiterally(" ", "").toLowerCase.toCharArray
+ val ca2 = s2.replaceAllLiterally(" ", "").toLowerCase.toCharArray
+
+ // Return 0 if either character array lacks length.
+ if (ca1.length == 0 || ca2.length == 0) return 0f
+
+ val (m1, m2) = matchChars(ca1, ca2)
+ val matchesScore = scoreMatches(m1, m2)
+ val transpositionsScore = scoreTranspositions(m1, m2)
+
+ // Return 0 if matches score is 0.
+ if (matchesScore == 0) return 0f
+
+ val prefix = ca1.zip(ca2).takeWhile(t => t._1 == t._2).map(_._1).mkString
+ val jaro = (
+ (matchesScore.toFloat / ca1.length) +
+ (matchesScore.toFloat / ca2.length) +
+ ((matchesScore.toFloat - transpositionsScore) / matchesScore.toFloat)
+ ) / 3
+
+ jaro + ((if (prefix.length <= 4) prefix.length else 4) * (.1f * (1 - jaro)))
+ }
+
+ private[this] def matchChars(ca1: Array[Char], ca2: Array[Char]): Tuple2[Array[Char], Array[Char]] = {
+ val window = math.abs((math.max(ca1.length, ca2.length) / 2f).floor.toInt - 1)
+ val a1Indices = ArrayBuffer[Int]()
+ val a2Indices = ArrayBuffer[Int]()
+
+ breakable {
+ for (i <- 0 until ca1.length) {
+ val start = if (i - window <= 0) 0 else i - window
+ val end = if (i + window >= ca2.length - 1) ca2.length - 1 else i + window
+
+ if (start > ca2.length) break
+
+ breakable {
+ for (ii <- start to end if ! a2Indices.contains(ii)) {
+ if (ca1(i) == ca2(ii)) {
+ a1Indices.append(i)
+ a2Indices.append(ii)
+
+ break
+ }
+ }
+ }
+ }
+ }
+
+ (a1Indices.map(i => ca1(i)).toArray, a2Indices.sortWith(_ < _).map(i => ca2(i)).toArray)
+ }
+
+ private[this] def scoreMatches(mca1: Array[Char], mca2: Array[Char]): Int = {
+ require(mca1.length == mca2.length)
+
+ mca1.length
+ }
+
+ private[this] def scoreTranspositions(mca1: Array[Char], mca2: Array[Char]): Int = {
+ require(mca1.length == mca2.length)
+
+ (mca1.zip(mca2).filter(t => t._1 != t._2).length / 2f).floor.toInt
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/Metric.scala b/core/source/core/scala/org/hashtree/stringmetric/Metric.scala
new file mode 100755
index 0000000..2d570c2
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/Metric.scala
@@ -0,0 +1,6 @@
+package org.hashtree.stringmetric
+
+/** Marks those which leverage traits of a metric. */
+trait Metric[T] {
+ def compare(t1: T, t2: T): AnyVal
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
new file mode 100755
index 0000000..792aeba
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
@@ -0,0 +1,6 @@
+package org.hashtree.stringmetric
+
+/** Marks those which leverage traits of a string based Metric. */
+trait StringMetric extends Metric[String] {
+
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/JaroWinklerMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/JaroWinklerMetricSpec.scala
new file mode 100755
index 0000000..6e044a0
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/JaroWinklerMetricSpec.scala
@@ -0,0 +1,54 @@
+package org.hashtree.stringmetric
+
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class JaroWinklerMetricSpec extends ScalaTest {
+ "JaroWinklerMetric" should provide {
+ "compare method" when passed {
+ "valid arguments" should returns {
+ "Float indicating distance" in {
+ JaroWinklerMetric.compare("abc", "abc") should be (1.0f)
+ JaroWinklerMetric.compare("abc", "xyz") should be (0.0f)
+ JaroWinklerMetric.compare("abc", "") should be (0.0f)
+ JaroWinklerMetric.compare("", "xyz") should be (0.0f)
+ JaroWinklerMetric.compare("", "") should be (0.0f)
+ JaroWinklerMetric.compare("a", "a") should be (1.0f)
+
+ JaroWinklerMetric.compare("aa", "a") should be (0.84999996f)
+ JaroWinklerMetric.compare("a", "aa") should be (0.84999996f)
+
+ JaroWinklerMetric.compare("veryveryverylong", "v") should be (0.71875f)
+ JaroWinklerMetric.compare("v", "veryveryverylong") should be (0.71875f)
+
+ JaroWinklerMetric.compare("martha", "marhta") should be (0.96111107f)
+ JaroWinklerMetric.compare("dwayne", "duane") should be (0.84000003f)
+ JaroWinklerMetric.compare("dixon", "dicksonx") should be (0.81333333f)
+ JaroWinklerMetric.compare("abcvwxyz", "cabvwxyz") should be (0.9583333f)
+ JaroWinklerMetric.compare("jones", "johnson") should be (0.8323809f)
+ JaroWinklerMetric.compare("henka", "henkan") should be (0.96666664f)
+ JaroWinklerMetric.compare("fvie", "ten") should be (0.0f)
+
+ JaroWinklerMetric.compare("zac ephron", "zac efron") should be >
+ JaroWinklerMetric.compare("zac ephron", "kai ephron")
+ JaroWinklerMetric.compare("brittney spears", "britney spears") should be >
+ JaroWinklerMetric.compare("brittney spears", "brittney startzman")
+
+ JaroWinklerMetric.compare("m a r t h a", "m a r h t a") should be (0.96111107f)
+ JaroWinklerMetric.compare("d w a y n e", "d u a n e") should be (0.84000003f)
+ JaroWinklerMetric.compare("d i x o n", "d i c k s o n x") should be (0.81333333f)
+ JaroWinklerMetric.compare("a b c v w x y z", "c a b v w x y z") should be (0.9583333f)
+ JaroWinklerMetric.compare("j o n e s", "j o h n s o n") should be (0.8323809f)
+ JaroWinklerMetric.compare("h e n k a", "h e n k a n") should be (0.96666664f)
+ JaroWinklerMetric.compare("f v i e", "t e n") should be (0.0f)
+
+ JaroWinklerMetric.compare("z a c e p h r o n", "z a c e f r o n") should be >
+ JaroWinklerMetric.compare("z a c e p h r o n", "k a i e p h r o n")
+ JaroWinklerMetric.compare("b r i t t n e y s p e a r s", "b r i t n e y s p e a r s") should be >
+ JaroWinklerMetric.compare("b r i t t n e y s p e a r s", "b r i t t n e y s t a r t z m a n")
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/ScalaTest.scala b/core/source/test/scala/org/hashtree/stringmetric/ScalaTest.scala
new file mode 100755
index 0000000..a9cd5e1
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/ScalaTest.scala
@@ -0,0 +1,18 @@
+package org.hashtree.stringmetric
+
+import org.scalatest.WordSpec
+import org.scalatest.matchers.ShouldMatchers
+
+trait ScalaTest extends WordSpec with ShouldMatchers {
+ def allows = afterWord("allow")
+
+ def executes = afterWord("execute")
+
+ def passed = afterWord("passed")
+
+ def provide = afterWord("provide")
+
+ def returns = afterWord("return")
+
+ def throws = afterWord("throw")
+} \ No newline at end of file