summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-10-07 02:40:07 -0600
committerRocky Madden <git@rockymadden.com>2012-10-07 02:40:07 -0600
commit2a5145f7280fac663a07c0585be51b7b03d485b9 (patch)
tree79f88ea4cee9887db9145783759dc229f6ab9086
parent3fca0f2f4c3ca58a8e043ae56752fd90f550ac9e (diff)
downloadstringmetric-2a5145f7280fac663a07c0585be51b7b03d485b9.tar.gz
stringmetric-2a5145f7280fac663a07c0585be51b7b03d485b9.tar.bz2
stringmetric-2a5145f7280fac663a07c0585be51b7b03d485b9.zip
Added JaroMetric and spec. Also created CLI and spec for metric.
-rwxr-xr-xcli/source/core/scala/org/hashtree/stringmetric/cli/command/jaroMetric.scala52
-rwxr-xr-xcli/source/test/scala/org/hashtree/stringmetric/cli/command/jaroMetricSpec.scala39
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala68
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala61
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/package.scala2
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/JaroMetricSpec.scala54
-rwxr-xr-xreadme.md1
7 files changed, 217 insertions, 60 deletions
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/command/jaroMetric.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/jaroMetric.scala
new file mode 100755
index 0000000..ba061c7
--- /dev/null
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/jaroMetric.scala
@@ -0,0 +1,52 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.JaroMetric
+import org.hashtree.stringmetric.cli._
+import org.hashtree.stringmetric.cli.command._
+
+/**
+ * The jaroMetric [[org.hashtree.stringmetric.cli.command.Command]]. Compares two strings to calculate the
+ * Jaro distance.
+ */
+object jaroMetric extends Command {
+ override def main(args: Array[String]): Unit = {
+ val options = OptionMapUtility.toOptionMap(args)
+
+ try {
+ // Help.
+ if (options.contains('h) || options.contains('help)) {
+ help()
+ exit(options)
+ // Execute.
+ } else if (options.contains('dashless) && options('dashless).count(_ == ' ') == 1) {
+ execute(options)
+ exit(options)
+ // Invalid syntax.
+ } else {
+ throw new IllegalArgumentException("Expected valid syntax. See --help.")
+ }
+ } catch {
+ case e => error(e)(options)
+ }
+ }
+
+ override def help(): Unit = {
+ val ls = sys.props("line.separator")
+ val tab = " "
+
+ println(
+ "Compares two strings to calculate the Jaro distance." + ls + ls +
+ "Syntax:" + ls +
+ tab + "jaroMetric [Options] string1 string2..." + ls + ls +
+ "Options:" + ls +
+ tab + "-h, --help" + ls +
+ tab + tab + "Outputs description, syntax, and options."
+ )
+ }
+
+ override def execute(options: OptionMap): Unit = {
+ val strings = options('dashless).split(" ")
+
+ println(JaroMetric.compare(strings(0), strings(1)).toString)
+ }
+} \ No newline at end of file
diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/command/jaroMetricSpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/jaroMetricSpec.scala
new file mode 100755
index 0000000..d3eabe0
--- /dev/null
+++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/jaroMetricSpec.scala
@@ -0,0 +1,39 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class jaroMetricSpec extends ScalaTest {
+ "jaroMetric" should provide {
+ "main method" when passed {
+ "valid dashless arguments" should executes {
+ "print the distance" in {
+ val out = new java.io.ByteArrayOutputStream()
+
+ Console.withOut(out)(
+ jaroMetric.main(Array("--unitTest", "--debug", "abc", "abc"))
+ )
+
+ out.toString should equal ("1.0\n")
+ out.reset()
+
+ Console.withOut(out)(
+ jaroMetric.main(Array("--unitTest", "--debug", "abc", "xyz"))
+ )
+
+ out.toString should equal ("0.0\n")
+ out.reset()
+ }
+ }
+ "no dashless arguments" should throws {
+ "IllegalArgumentException" in {
+ evaluating {
+ jaroMetric.main(Array("--unitTest", "--debug"))
+ } should produce [IllegalArgumentException]
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala
new file mode 100755
index 0000000..bd5f850
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/JaroMetric.scala
@@ -0,0 +1,68 @@
+package org.hashtree.stringmetric
+
+import scala.collection.mutable.ArrayBuffer
+import scala.math
+import scala.util.control.Breaks.{ break, breakable }
+
+/**
+ * An implementation of the Jaro string metric. One differing detail in this implementation is that if a character is
+ * matched in string2, it cannot be matched upon again. This results in a more penalized distance in these scenarios.
+ */
+object JaroMetric extends StringMetric {
+ override def compare(string1: String, string2: String): Float = {
+ val ca1 = string1.replaceAllLiterally(" ", "").toLowerCase.toCharArray
+ val ca2 = string2.replaceAllLiterally(" ", "").toLowerCase.toCharArray
+
+ // Return 0 if either character array lacks length.
+ if (ca1.length == 0 || ca2.length == 0) return 0f
+
+ val mt = `match`(ca1, ca2)
+ val ms = scoreMatches(mt._1, mt._2)
+ val ts = scoreTranspositions(mt._1, mt._2)
+
+ // Return 0 if matches score is 0.
+ if (ms == 0) return 0f
+
+ ((ms.toFloat / ca1.length) + (ms.toFloat / ca2.length) + ((ms.toFloat - ts) / ms)) / 3
+ }
+
+ private[this] def `match`(ct: CompareTuple): MatchTuple = {
+ val window = math.abs((math.max(ct._1.length, ct._2.length) / 2f).floor.toInt - 1)
+ val ab1 = ArrayBuffer[Int]()
+ val ab2 = ArrayBuffer[Int]()
+
+ breakable {
+ for (i <- 0 until ct._1.length) {
+ val start = if (i - window <= 0) 0 else i - window
+ val end = if (i + window >= ct._2.length - 1) ct._2.length - 1 else i + window
+
+ if (start > ct._2.length - 1) break()
+
+ breakable {
+ for (ii <- start to end if !ab2.contains(ii)) {
+ if (ct._1(i) == ct._2(ii)) {
+ ab1.append(i)
+ ab2.append(ii)
+
+ break()
+ }
+ }
+ }
+ }
+ }
+
+ (ab1.map(ct._1(_)).toArray, ab2.sortWith(_ < _).map(ct._2(_)).toArray)
+ }
+
+ private[this] def scoreMatches(mt: MatchTuple): Int = {
+ require(mt._1.length == mt._2.length)
+
+ mt._1.length
+ }
+
+ private[this] def scoreTranspositions(mt: MatchTuple): Int = {
+ require(mt._1.length == mt._2.length)
+
+ (mt._1.zip(mt._2).filter(t => t._1 != t._2).length / 2f).floor.toInt
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
index 3e54881..545dc42 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
@@ -10,71 +10,12 @@ import scala.util.control.Breaks.{ break, breakable }
* scenarios (e.g. comparing henka and henkan distance is 0.9666 versus the typical 0.9722).
*/
object JaroWinklerMetric extends StringMetric {
- type CompareTuple = Tuple2[Array[Char], Array[Char]]
- type MatchTuple = CompareTuple
-
override def compare(string1: String, string2: String): Float = {
val ca1 = string1.replaceAllLiterally(" ", "").toLowerCase.toCharArray
val ca2 = string2.replaceAllLiterally(" ", "").toLowerCase.toCharArray
-
- // Return 0 if either character array lacks length.
- if (ca1.length == 0 || ca2.length == 0) return 0f
-
- val mt = `match`(ca1, ca2)
- val ms = scoreMatches(mt._1, mt._2)
- val ts = scoreTranspositions(mt._1, mt._2)
-
- // Return 0 if matches score is 0.
- if (ms == 0) return 0f
-
val prefix = ca1.zip(ca2).takeWhile(t => t._1 == t._2).map(_._1)
- val jaro = (
- (ms.toFloat / ca1.length) +
- (ms.toFloat / ca2.length) +
- ((ms.toFloat - ts) / ms)
- ) / 3
+ val jaro = JaroMetric.compare(string1, string2)
- // Add Winkler.
jaro + ((if (prefix.length <= 4) prefix.length else 4) * (0.1f * (1 - jaro)))
}
-
- private[this] def `match`(ct: CompareTuple): MatchTuple = {
- val window = math.abs((math.max(ct._1.length, ct._2.length) / 2f).floor.toInt - 1)
- val ab1 = ArrayBuffer[Int]()
- val ab2 = ArrayBuffer[Int]()
-
- breakable {
- for (i <- 0 until ct._1.length) {
- val start = if (i - window <= 0) 0 else i - window
- val end = if (i + window >= ct._2.length - 1) ct._2.length - 1 else i + window
-
- if (start > ct._2.length - 1) break()
-
- breakable {
- for (ii <- start to end if !ab2.contains(ii)) {
- if (ct._1(i) == ct._2(ii)) {
- ab1.append(i)
- ab2.append(ii)
-
- break()
- }
- }
- }
- }
- }
-
- (ab1.map(ct._1(_)).toArray, ab2.sortWith(_ < _).map(ct._2(_)).toArray)
- }
-
- private[this] def scoreMatches(mt: MatchTuple): Int = {
- require(mt._1.length == mt._2.length)
-
- mt._1.length
- }
-
- private[this] def scoreTranspositions(mt: MatchTuple): Int = {
- require(mt._1.length == mt._2.length)
-
- (mt._1.zip(mt._2).filter(t => t._1 != t._2).length / 2f).floor.toInt
- }
} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/package.scala b/core/source/core/scala/org/hashtree/stringmetric/package.scala
index b0671c7..3be74fb 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/package.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/package.scala
@@ -2,5 +2,7 @@ package org.hashtree
/** Provides core string metric functionality. */
package object stringmetric {
+ type CompareTuple = Tuple2[Array[Char], Array[Char]]
+ type MatchTuple = CompareTuple
} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/JaroMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/JaroMetricSpec.scala
new file mode 100755
index 0000000..5d164e2
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/JaroMetricSpec.scala
@@ -0,0 +1,54 @@
+package org.hashtree.stringmetric
+
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class JaroMetricSpec extends ScalaTest {
+ "JaroMetric" should provide {
+ "compare method" when passed {
+ "valid arguments" should returns {
+ "Float indicating distance" in {
+ JaroMetric.compare("abc", "abc") should be (1.0f)
+ JaroMetric.compare("abc", "xyz") should be (0.0f)
+ JaroMetric.compare("abc", "") should be (0.0f)
+ JaroMetric.compare("", "xyz") should be (0.0f)
+ JaroMetric.compare("", "") should be (0.0f)
+ JaroMetric.compare("a", "a") should be (1.0f)
+
+ JaroMetric.compare("aa", "a") should be (0.8333333f)
+ JaroMetric.compare("a", "aa") should be (0.8333333f)
+
+ JaroMetric.compare("veryveryverylong", "v") should be (0.6875f)
+ JaroMetric.compare("v", "veryveryverylong") should be (0.6875f)
+
+ JaroMetric.compare("martha", "marhta") should be (0.9444444f)
+ JaroMetric.compare("dwayne", "duane") should be (0.82222223f)
+ JaroMetric.compare("dixon", "dicksonx") should be (0.76666665f)
+ JaroMetric.compare("abcvwxyz", "cabvwxyz") should be (0.9583333f)
+ JaroMetric.compare("jones", "johnson") should be (0.79047614f)
+ JaroMetric.compare("henka", "henkan") should be (0.9444444f)
+ JaroMetric.compare("fvie", "ten") should be (0.0f)
+
+ JaroMetric.compare("zac ephron", "zac efron") should be >
+ JaroMetric.compare("zac ephron", "kai ephron")
+ JaroMetric.compare("brittney spears", "britney spears") should be >
+ JaroMetric.compare("brittney spears", "brittney startzman")
+
+ JaroMetric.compare("m a r t h a", "m a r h t a") should be (0.9444444f)
+ JaroMetric.compare("d w a y n e", "d u a n e") should be (0.82222223f)
+ JaroMetric.compare("d i x o n", "d i c k s o n x") should be (0.76666665f)
+ JaroMetric.compare("a b c v w x y z", "c a b v w x y z") should be (0.9583333f)
+ JaroMetric.compare("j o n e s", "j o h n s o n") should be (0.79047614f)
+ JaroMetric.compare("h e n k a", "h e n k a n") should be (0.9444444f)
+ JaroMetric.compare("f v i e", "t e n") should be (0.0f)
+
+ JaroMetric.compare("z a c e p h r o n", "z a c e f r o n") should be >
+ JaroMetric.compare("z a c e p h r o n", "k a i e p h r o n")
+ JaroMetric.compare("b r i t t n e y s p e a r s", "b r i t n e y s p e a r s") should be >
+ JaroMetric.compare("b r i t t n e y s p e a r s", "b r i t t n e y s t a r t z m a n")
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/readme.md b/readme.md
index 399014f..56b6010 100755
--- a/readme.md
+++ b/readme.md
@@ -1,6 +1,7 @@
#stringmetric
A collection of string metrics implemented in Scala. Includes a light-weight core API and CLI for each string metric. The following string metrics are currently supported:
+* Jaro
* Jaro-Winkler
## Building the API