diff options
7 files changed, 217 insertions, 60 deletions
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/command/jaroMetric.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/jaroMetric.scala new file mode 100755 index 0000000..ba061c7 --- /dev/null +++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/jaroMetric.scala @@ -0,0 +1,52 @@ +package org.hashtree.stringmetric.cli.command + +import org.hashtree.stringmetric.JaroMetric +import org.hashtree.stringmetric.cli._ +import org.hashtree.stringmetric.cli.command._ + +/** + * The jaroMetric [[org.hashtree.stringmetric.cli.command.Command]]. Compares two strings to calculate the + * Jaro distance. + */ +object jaroMetric extends Command { + override def main(args: Array[String]): Unit = { + val options = OptionMapUtility.toOptionMap(args) + + try { + // Help. + if (options.contains('h) || options.contains('help)) { + help() + exit(options) + // Execute. + } else if (options.contains('dashless) && options('dashless).count(_ == ' ') == 1) { + execute(options) + exit(options) + // Invalid syntax. + } else { + throw new IllegalArgumentException("Expected valid syntax. See --help.") + } + } catch { + case e => error(e)(options) + } + } + + override def help(): Unit = { + val ls = sys.props("line.separator") + val tab = " " + + println( + "Compares two strings to calculate the Jaro distance." + ls + ls + + "Syntax:" + ls + + tab + "jaroMetric [Options] string1 string2..." + ls + ls + + "Options:" + ls + + tab + "-h, --help" + ls + + tab + tab + "Outputs description, syntax, and options." + ) + } + + override def execute(options: OptionMap): Unit = { + val strings = options('dashless).split(" ") + + println(JaroMetric.compare(strings(0), strings(1)).toString) + } +}
\ No newline at end of file diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/command/jaroMetricSpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/jaroMetricSpec.scala new file mode 100755 index 0000000..d3eabe0 --- /dev/null +++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/jaroMetricSpec.scala @@ -0,0 +1,39 @@ +package org.hashtree.stringmetric.cli.command + +import org.hashtree.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class jaroMetricSpec extends ScalaTest { + "jaroMetric" should provide { + "main method" when passed { + "valid dashless arguments" should executes { + "print the distance" in { + val out = new java.io.ByteArrayOutputStream() + + Console.withOut(out)( + jaroMetric.main(Array("--unitTest", "--debug", "abc", "abc")) + ) + + out.toString should equal ("1.0\n") + out.reset() + + Console.withOut(out)( + jaroMetric.main(Array("--unitTest", "--debug", "abc", "xyz")) + ) + + out.toString should equal ("0.0\n") + out.reset() + } + } + "no dashless arguments" should throws { + "IllegalArgumentException" in { + evaluating { + jaroMetric.main(Array("--unitTest", "--debug")) + } should produce [IllegalArgumentException] + } + } + } + } +}
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala new file mode 100755 index 0000000..bd5f850 --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala @@ -0,0 +1,68 @@ +package org.hashtree.stringmetric + +import scala.collection.mutable.ArrayBuffer +import scala.math +import scala.util.control.Breaks.{ break, breakable } + +/** + * An implementation of the Jaro string metric. One differing detail in this implementation is that if a character is + * matched in string2, it cannot be matched upon again. This results in a more penalized distance in these scenarios. + */ +object JaroMetric extends StringMetric { + override def compare(string1: String, string2: String): Float = { + val ca1 = string1.replaceAllLiterally(" ", "").toLowerCase.toCharArray + val ca2 = string2.replaceAllLiterally(" ", "").toLowerCase.toCharArray + + // Return 0 if either character array lacks length. + if (ca1.length == 0 || ca2.length == 0) return 0f + + val mt = `match`(ca1, ca2) + val ms = scoreMatches(mt._1, mt._2) + val ts = scoreTranspositions(mt._1, mt._2) + + // Return 0 if matches score is 0. + if (ms == 0) return 0f + + ((ms.toFloat / ca1.length) + (ms.toFloat / ca2.length) + ((ms.toFloat - ts) / ms)) / 3 + } + + private[this] def `match`(ct: CompareTuple): MatchTuple = { + val window = math.abs((math.max(ct._1.length, ct._2.length) / 2f).floor.toInt - 1) + val ab1 = ArrayBuffer[Int]() + val ab2 = ArrayBuffer[Int]() + + breakable { + for (i <- 0 until ct._1.length) { + val start = if (i - window <= 0) 0 else i - window + val end = if (i + window >= ct._2.length - 1) ct._2.length - 1 else i + window + + if (start > ct._2.length - 1) break() + + breakable { + for (ii <- start to end if !ab2.contains(ii)) { + if (ct._1(i) == ct._2(ii)) { + ab1.append(i) + ab2.append(ii) + + break() + } + } + } + } + } + + (ab1.map(ct._1(_)).toArray, ab2.sortWith(_ < _).map(ct._2(_)).toArray) + } + + private[this] def scoreMatches(mt: MatchTuple): Int = { + require(mt._1.length == mt._2.length) + + mt._1.length + } + + private[this] def scoreTranspositions(mt: MatchTuple): Int = { + require(mt._1.length == mt._2.length) + + (mt._1.zip(mt._2).filter(t => t._1 != t._2).length / 2f).floor.toInt + } +}
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala index 3e54881..545dc42 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala @@ -10,71 +10,12 @@ import scala.util.control.Breaks.{ break, breakable } * scenarios (e.g. comparing henka and henkan distance is 0.9666 versus the typical 0.9722). */ object JaroWinklerMetric extends StringMetric { - type CompareTuple = Tuple2[Array[Char], Array[Char]] - type MatchTuple = CompareTuple - override def compare(string1: String, string2: String): Float = { val ca1 = string1.replaceAllLiterally(" ", "").toLowerCase.toCharArray val ca2 = string2.replaceAllLiterally(" ", "").toLowerCase.toCharArray - - // Return 0 if either character array lacks length. - if (ca1.length == 0 || ca2.length == 0) return 0f - - val mt = `match`(ca1, ca2) - val ms = scoreMatches(mt._1, mt._2) - val ts = scoreTranspositions(mt._1, mt._2) - - // Return 0 if matches score is 0. - if (ms == 0) return 0f - val prefix = ca1.zip(ca2).takeWhile(t => t._1 == t._2).map(_._1) - val jaro = ( - (ms.toFloat / ca1.length) + - (ms.toFloat / ca2.length) + - ((ms.toFloat - ts) / ms) - ) / 3 + val jaro = JaroMetric.compare(string1, string2) - // Add Winkler. jaro + ((if (prefix.length <= 4) prefix.length else 4) * (0.1f * (1 - jaro))) } - - private[this] def `match`(ct: CompareTuple): MatchTuple = { - val window = math.abs((math.max(ct._1.length, ct._2.length) / 2f).floor.toInt - 1) - val ab1 = ArrayBuffer[Int]() - val ab2 = ArrayBuffer[Int]() - - breakable { - for (i <- 0 until ct._1.length) { - val start = if (i - window <= 0) 0 else i - window - val end = if (i + window >= ct._2.length - 1) ct._2.length - 1 else i + window - - if (start > ct._2.length - 1) break() - - breakable { - for (ii <- start to end if !ab2.contains(ii)) { - if (ct._1(i) == ct._2(ii)) { - ab1.append(i) - ab2.append(ii) - - break() - } - } - } - } - } - - (ab1.map(ct._1(_)).toArray, ab2.sortWith(_ < _).map(ct._2(_)).toArray) - } - - private[this] def scoreMatches(mt: MatchTuple): Int = { - require(mt._1.length == mt._2.length) - - mt._1.length - } - - private[this] def scoreTranspositions(mt: MatchTuple): Int = { - require(mt._1.length == mt._2.length) - - (mt._1.zip(mt._2).filter(t => t._1 != t._2).length / 2f).floor.toInt - } }
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/package.scala b/core/source/core/scala/org/hashtree/stringmetric/package.scala index b0671c7..3be74fb 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/package.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/package.scala @@ -2,5 +2,7 @@ package org.hashtree /** Provides core string metric functionality. */ package object stringmetric { + type CompareTuple = Tuple2[Array[Char], Array[Char]] + type MatchTuple = CompareTuple }
\ No newline at end of file diff --git a/core/source/test/scala/org/hashtree/stringmetric/JaroMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/JaroMetricSpec.scala new file mode 100755 index 0000000..5d164e2 --- /dev/null +++ b/core/source/test/scala/org/hashtree/stringmetric/JaroMetricSpec.scala @@ -0,0 +1,54 @@ +package org.hashtree.stringmetric + +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class JaroMetricSpec extends ScalaTest { + "JaroMetric" should provide { + "compare method" when passed { + "valid arguments" should returns { + "Float indicating distance" in { + JaroMetric.compare("abc", "abc") should be (1.0f) + JaroMetric.compare("abc", "xyz") should be (0.0f) + JaroMetric.compare("abc", "") should be (0.0f) + JaroMetric.compare("", "xyz") should be (0.0f) + JaroMetric.compare("", "") should be (0.0f) + JaroMetric.compare("a", "a") should be (1.0f) + + JaroMetric.compare("aa", "a") should be (0.8333333f) + JaroMetric.compare("a", "aa") should be (0.8333333f) + + JaroMetric.compare("veryveryverylong", "v") should be (0.6875f) + JaroMetric.compare("v", "veryveryverylong") should be (0.6875f) + + JaroMetric.compare("martha", "marhta") should be (0.9444444f) + JaroMetric.compare("dwayne", "duane") should be (0.82222223f) + JaroMetric.compare("dixon", "dicksonx") should be (0.76666665f) + JaroMetric.compare("abcvwxyz", "cabvwxyz") should be (0.9583333f) + JaroMetric.compare("jones", "johnson") should be (0.79047614f) + JaroMetric.compare("henka", "henkan") should be (0.9444444f) + JaroMetric.compare("fvie", "ten") should be (0.0f) + + JaroMetric.compare("zac ephron", "zac efron") should be > + JaroMetric.compare("zac ephron", "kai ephron") + JaroMetric.compare("brittney spears", "britney spears") should be > + JaroMetric.compare("brittney spears", "brittney startzman") + + JaroMetric.compare("m a r t h a", "m a r h t a") should be (0.9444444f) + JaroMetric.compare("d w a y n e", "d u a n e") should be (0.82222223f) + JaroMetric.compare("d i x o n", "d i c k s o n x") should be (0.76666665f) + JaroMetric.compare("a b c v w x y z", "c a b v w x y z") should be (0.9583333f) + JaroMetric.compare("j o n e s", "j o h n s o n") should be (0.79047614f) + JaroMetric.compare("h e n k a", "h e n k a n") should be (0.9444444f) + JaroMetric.compare("f v i e", "t e n") should be (0.0f) + + JaroMetric.compare("z a c e p h r o n", "z a c e f r o n") should be > + JaroMetric.compare("z a c e p h r o n", "k a i e p h r o n") + JaroMetric.compare("b r i t t n e y s p e a r s", "b r i t n e y s p e a r s") should be > + JaroMetric.compare("b r i t t n e y s p e a r s", "b r i t t n e y s t a r t z m a n") + } + } + } + } +}
\ No newline at end of file @@ -1,6 +1,7 @@ #stringmetric A collection of string metrics implemented in Scala. Includes a light-weight core API and CLI for each string metric. The following string metrics are currently supported: +* Jaro * Jaro-Winkler ## Building the API |