summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-10-06 23:19:20 -0600
committerRocky Madden <git@rockymadden.com>2012-10-06 23:19:20 -0600
commitdadd1221ec7c1301b3cc2dfc178dba2091e1f9b8 (patch)
treece698016721cdc2636d66742d705b232ad1c9fe9
downloadstringmetric-dadd1221ec7c1301b3cc2dfc178dba2091e1f9b8.tar.gz
stringmetric-dadd1221ec7c1301b3cc2dfc178dba2091e1f9b8.tar.bz2
stringmetric-dadd1221ec7c1301b3cc2dfc178dba2091e1f9b8.zip
Created repository.v0.0.0
-rwxr-xr-x.gitignore29
-rwxr-xr-xbuild.gradle8
-rwxr-xr-xcli/build.gradle106
-rwxr-xr-xcli/source/core/scala/org/hashtree/stringmetric/cli/OptionMapUtility.scala55
-rwxr-xr-xcli/source/core/scala/org/hashtree/stringmetric/cli/command/Command.scala34
-rwxr-xr-xcli/source/core/scala/org/hashtree/stringmetric/cli/command/jaroWinklerMetric.scala52
-rwxr-xr-xcli/source/core/scala/org/hashtree/stringmetric/cli/command/package.scala6
-rwxr-xr-xcli/source/core/scala/org/hashtree/stringmetric/cli/package.scala6
-rwxr-xr-xcli/source/test/scala/org/hashtree/stringmetric/cli/OptionMapUtilitySpec.scala151
-rwxr-xr-xcli/source/test/scala/org/hashtree/stringmetric/cli/command/jaroWinklerMetricSpec.scala39
-rwxr-xr-xcore/build.gradle42
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala76
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/Metric.scala6
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/StringMetric.scala6
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/JaroWinklerMetricSpec.scala54
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/ScalaTest.scala18
-rwxr-xr-xreadme.md29
-rwxr-xr-xsettings.gradle6
18 files changed, 723 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..7f0a79e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,29 @@
+# Core ignores
+*.diff
+*.err
+*.orig
+*.log
+*.rej
+*.swo
+*.swp
+*.vi
+*~
+*#
+.git/
+
+# OS ignores
+.DS_Store
+thumbs.db
+
+# Editor ignores
+.cache
+.project
+.settings
+.settings/
+bin/
+
+# Build ignores
+build/
+
+# Project ignores
+.classpath \ No newline at end of file
diff --git a/build.gradle b/build.gradle
new file mode 100755
index 0000000..3275a88
--- /dev/null
+++ b/build.gradle
@@ -0,0 +1,8 @@
+group = 'org.hashtree.stringmetric'
+version = '0.0.0'
+
+allprojects {
+ repositories {
+ mavenCentral()
+ }
+} \ No newline at end of file
diff --git a/cli/build.gradle b/cli/build.gradle
new file mode 100755
index 0000000..6cd8aea
--- /dev/null
+++ b/cli/build.gradle
@@ -0,0 +1,106 @@
+apply plugin: 'eclipse'
+apply plugin: 'scala'
+
+dependencies {
+ compile project(':stringmetric-core')
+ compile 'org.scala-lang:scala-compiler:2.9.2'
+ compile 'org.scala-lang:scala-library:2.9.2'
+
+ scalaTools 'org.scala-lang:scala-compiler:2.9.2'
+ scalaTools 'org.scala-lang:scala-library:2.9.2'
+
+ testCompile 'junit:junit:4.10'
+ testCompile 'org.scalatest:scalatest_2.9.2:1.8'
+}
+
+sourceSets {
+ main {
+ output.resourcesDir "${project.buildDir}/classes/main"
+
+ java {
+ srcDir 'source/core/java'
+ }
+ resources {
+ srcDir 'source/core/resource'
+ }
+ scala {
+ srcDir 'source/core/scala'
+ }
+ }
+ test {
+ output.resourcesDir "${project.buildDir}/classes/test"
+
+ java {
+ srcDir 'source/test/java'
+ }
+ resources {
+ srcDir 'source/test/resource'
+ }
+ scala {
+ srcDir 'source/test/scala'
+ }
+ }
+}
+
+task tar(description: 'Assembles a compressed tar archive of all core source files.', dependsOn: [':stringmetric-cli:jar', ':stringmetric-core:jar']) {
+ ext.sourcePath = "${project.projectDir}/source/core/scala"
+ ext.outputPath = "${project.buildDir}"
+ ext.workingPath = "${project.buildDir}/${project.name}"
+
+ inputs.dir new File(sourcePath)
+ outputs.dir new File(outputPath, 'generated')
+ outputs.upToDateWhen {
+ new File(workingPath).isDirectory()
+ }
+
+ doLast {
+ // Clean up working directory and tar from last execution, should they exist.
+ ant.delete(dir: workingPath, failOnError: false)
+ ant.delete(file: "${project.buildDir}/${project.name}.tar.gz", failOnError: false)
+
+ // Create project working directory.
+ ant.mkdir(dir: workingPath)
+
+ // Create scalascript header file.
+ ant.echo(file: "${workingPath}/scalascript.sh", message: '#!/bin/bash\ndir="`dirname \\"$0\\"`"\ndir="`( cd \\"$dir\\" && pwd )`"\ncp=`echo $dir/*.jar|sed \'s/ /:/g\'`\nexec scala -classpath "$cp" -savecompiled "$0" "$@"\n!#\n//')
+
+ // Copy source files to working directory.
+ ant.copy(toDir: workingPath, force: true, overwrite: true) {
+ fileset(dir: sourcePath) {
+ exclude(name: '**/cli/*.scala')
+ exclude(name: '**/package.scala')
+ }
+ filterchain {
+ concatfilter(prepend: "${workingPath}/scalascript.sh")
+ }
+ }
+
+ // Delete scalascript header file.
+ ant.delete(file: "${workingPath}/scalascript.sh")
+
+ // Flatten and remove file extension.
+ ant.move(toDir: workingPath) {
+ fileset(dir: workingPath)
+ chainedmapper {
+ mapper(type: 'flatten')
+ mapper(from: '*.scala', to: '*', type: 'glob')
+ }
+ }
+
+ // Clean up empty folder(s) from flatten.
+ ant.delete(dir: "${workingPath}/org", includeEmptyDirs: true)
+
+ // Copy project jars into place.
+ ant.copy(toDir: workingPath, force: true, overwrite: true) {
+ fileset(dir: "${project.buildDir}/libs")
+ }
+ ant.copy(toDir: workingPath, force: true, overwrite: true) {
+ fileset(dir: "${project(':stringmetric-core').buildDir}/libs")
+ }
+
+ // Assemble compressed tar.
+ ant.tar(compression: 'gzip', destFile: "${project.buildDir}/${project.name}.tar.gz") {
+ tarfileset(dir: workingPath, fileMode: 755, prefix: project.name)
+ }
+ }
+} \ No newline at end of file
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/OptionMapUtility.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/OptionMapUtility.scala
new file mode 100755
index 0000000..926ba8b
--- /dev/null
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/OptionMapUtility.scala
@@ -0,0 +1,55 @@
+package org.hashtree.stringmetric.cli
+
+import scala.collection.immutable.HashMap
+
+/** Utility standalone for OptionMap based operations. */
+object OptionMapUtility {
+ // The patterns are compiled once for the object instead of on every recursive step of next.
+ /** Matches --name and --name=value. */
+ private[this] val double = """^(--[a-zA-Z0-9]+)(\=[a-zA-Z0-9\.\-\_]+)?""".r
+ /** Matches -name and -name=value. */
+ private[this] val single = """^(-[a-zA-Z0-9]+)(\=[a-zA-Z0-9\.\-\_]+)?""".r
+ /** Matches an argument made only of the characters allowed for dashless values. */
+ private[this] val less = """([a-zA-Z0-9\/\-\_\$\.]+)""".r
+
+ /** Converts command line arguments into an OptionMap. Delegates to the List overload. */
+ def toOptionMap(arguments: Array[String]): OptionMap = {
+ toOptionMap(arguments.toList)
+ }
+
+ /**
+ * Converts command line arguments into an OptionMap. Dashed options are keyed by their name without the dashes,
+ * all dashless arguments are joined with single spaces under the 'dashless key, and anything else is ignored.
+ */
+ def toOptionMap(arguments: List[String]): OptionMap = {
+ next(new HashMap[Symbol, String](), arguments)
+ }
+
+ /** Consumes the head of arguments, folds it into optionMap and recurses on the tail. */
+ private[this] def next(optionMap: OptionMap, arguments: List[String]): OptionMap = {
+ arguments match {
+ // Empty List, return OptionMap.
+ case Nil => optionMap
+ // Double dash options, without value. An unmatched optional group is extracted as null.
+ case double(name, null) :: tail => {
+ next(optionMap + (Symbol(name.tail.tail) -> ""), tail)
+ }
+ // Double dash options, with value. The leading '=' is dropped from the value.
+ case double(name, value) :: tail => {
+ next(optionMap + (Symbol(name.tail.tail) -> value.tail), tail)
+ }
+ // Single dash options, with or without value. Any value is discarded.
+ case single(name, _) :: tail => {
+ next(optionMap + (Symbol(name.tail) -> ""), tail)
+ }
+ // Dashless options.
+ case less(value) :: tail if value.head != '-' => {
+ // Adding an existing key replaces its value, so no explicit removal is needed.
+ val dashless = optionMap.get('dashless).map(_ + " " + value.trim).getOrElse(value.trim)
+
+ next(optionMap + ('dashless -> dashless), tail)
+ }
+ // Invalid option, ignore.
+ case _ :: tail => {
+ next(optionMap, tail)
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/command/Command.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/Command.scala
new file mode 100755
index 0000000..bac7786
--- /dev/null
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/Command.scala
@@ -0,0 +1,34 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.cli.OptionMap
+
+/** Defines the traits and provides basic implementations of a command. Commands are always implemented as objects. */
+trait Command {
+ /** Entry point of the command; receives the raw command line arguments. */
+ def main(args: Array[String]): Unit
+
+ /** Outputs the help of the command. */
+ def help(): Unit
+
+ /**
+ * Reports a failure. Outside of unit tests the message is printed and the JVM exits with status 1; under unit
+ * test the error is rethrown instead so the caller can observe it.
+ */
+ final def error(error: Throwable)(implicit options: OptionMap): Unit = {
+ if (! isUnitTest(options)) {
+ println(error.getMessage)
+ sys.exit(1)
+ } else {
+ throw error
+ }
+ }
+
+ /** Runs the command with already parsed options. */
+ def execute(options: OptionMap): Unit
+
+ /** Exits the JVM with status 0, unless running under unit test. */
+ final def exit(implicit options: OptionMap): Unit = {
+ if (! isUnitTest(options)) sys.exit(0)
+ }
+
+ /**
+ * True when 'ut is present, or when 'unitTest is present with any value other than "false". The value is
+ * inspected inside the Option: comparing the Option returned by get directly against a String is always
+ * unequal, which made --unitTest=false count as enabled.
+ */
+ protected[this] def isUnitTest(options: OptionMap): Boolean = {
+ options.contains('ut) || options.get('unitTest).exists(_ != "false")
+ }
+
+ /** True when 'd is present, or when 'debug is present with any value other than "false". */
+ protected[this] def isDebug(options: OptionMap): Boolean = {
+ options.contains('d) || options.get('debug).exists(_ != "false")
+ }
+}
+
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/command/jaroWinklerMetric.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/jaroWinklerMetric.scala
new file mode 100755
index 0000000..63ab69c
--- /dev/null
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/jaroWinklerMetric.scala
@@ -0,0 +1,52 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.JaroWinklerMetric
+import org.hashtree.stringmetric.cli._
+import org.hashtree.stringmetric.cli.command._
+
+/**
+ * The jaroWinklerMetric [[org.hashtree.stringmetric.cli.command.Command]]. Compares two strings to calculate the
+ * Jaro-Winkler distance.
+ */
+object jaroWinklerMetric extends Command {
+ /** Parses the arguments, then outputs help, runs the comparison, or reports a syntax error. */
+ override def main(args: Array[String]): Unit = {
+ val options = OptionMapUtility.toOptionMap(args)
+
+ try {
+ // Help.
+ if (options.contains('h) || options.contains('help)) {
+ help()
+ exit(options)
+ // Execute.
+ // Dashless arguments are stored space separated under one key, so one space means two strings were given.
+ } else if (options.contains('dashless) && options('dashless).count(_ == ' ') == 1) {
+ execute(options)
+ exit(options)
+ // Invalid syntax.
+ } else {
+ throw new IllegalArgumentException("Expected valid syntax. See --help.")
+ }
+ } catch {
+ // Everything thrown above, including the syntax error, is handed to error together with the parsed options.
+ case e => error(e)(options)
+ }
+ }
+
+ /** Prints the description, syntax, and options of the command to standard output. */
+ override def help(): Unit = {
+ val ls = sys.props("line.separator")
+ val tab = " "
+
+ println(
+ "Compares two strings to calculate the Jaro-Winkler distance." + ls + ls +
+ "Syntax:" + ls +
+ tab + "jaroWinklerMetric [Options] string1 string2..." + ls + ls +
+ "Options:" + ls +
+ tab + "-h, --help" + ls +
+ tab + tab + "Outputs description, syntax, and options."
+ )
+ }
+
+ /**
+ * Splits the 'dashless value on spaces and prints the comparison of the first two strings. Expects 'dashless to
+ * hold at least two values; main only calls this after checking for exactly two.
+ */
+ override def execute(options: OptionMap): Unit = {
+ val strings = options('dashless).split(" ")
+
+ println(JaroWinklerMetric.compare(strings(0), strings(1)).toString)
+ }
+} \ No newline at end of file
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/command/package.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/package.scala
new file mode 100755
index 0000000..b0610ba
--- /dev/null
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/package.scala
@@ -0,0 +1,6 @@
+package org.hashtree.stringmetric.cli
+
+/** Provides core command functionality. */
+package object command {
+ implicit val optionMap: OptionMap = OptionMapUtility.toOptionMap(Array("--unitTest=false"))
+} \ No newline at end of file
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/package.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/package.scala
new file mode 100755
index 0000000..a8c1c01
--- /dev/null
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/package.scala
@@ -0,0 +1,6 @@
+package org.hashtree.stringmetric
+
+/** Provides core CLI functionality. */
+package object cli {
+ type OptionMap = Map[Symbol, String]
+} \ No newline at end of file
diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/OptionMapUtilitySpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/OptionMapUtilitySpec.scala
new file mode 100755
index 0000000..a5a8eb1
--- /dev/null
+++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/OptionMapUtilitySpec.scala
@@ -0,0 +1,151 @@
+package org.hashtree.stringmetric.cli
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class OptionMapUtilitySpec extends ScalaTest {
+ "OptionMapUtility" should provide {
+ "overloaded toOptionMap method" when passed {
+ "Array with a single valid double dashed option" should returns {
+ "populated Map" in {
+ val options = OptionMapUtility.toOptionMap(Array("--help"))
+
+ options('help) should equal ("")
+ }
+ }
+ "List with a single valid double dashed option" should returns {
+ "populated Map" in {
+ val options = OptionMapUtility.toOptionMap(List("--help"))
+
+ options('help) should equal ("")
+ }
+ }
+ "Array with a multiple valid double dashed options" should returns {
+ "populated Map" in {
+ val options = OptionMapUtility.toOptionMap(Array("--help", "--test=test"))
+
+ options('help) should equal ("")
+ options('test) should equal ("test")
+ }
+ }
+ "List with a multiple valid double dashed options" should returns {
+ "populated Map" in {
+ val options = OptionMapUtility.toOptionMap(List("--help", "--test=test"))
+
+ options('help) should equal ("")
+ options('test) should equal ("test")
+ }
+ }
+ "Array with invalid double dashed options" should returns {
+ "empty Map" in {
+ val options = OptionMapUtility.toOptionMap(Array("--help#", "--test%=test"))
+
+ options.keysIterator.length should be (0)
+ }
+ }
+ "List with invalid double dashed options" should returns {
+ "empty Map" in {
+ val options = OptionMapUtility.toOptionMap(List("--help#", "--test%=test"))
+
+ options.keysIterator.length should be (0)
+ }
+ }
+ "Array with a single valid single dashed option" should returns {
+ "populated Map" in {
+ val options = OptionMapUtility.toOptionMap(Array("-h"))
+
+ options('h) should equal ("")
+ }
+ }
+ "List with a single valid single dashed option" should returns {
+ "populated Map" in {
+ val options = OptionMapUtility.toOptionMap(List("-h"))
+
+ options('h) should equal ("")
+ }
+ }
+ "Array with multiple valid single dashed options" should returns {
+ "populated Map" in {
+ val options = OptionMapUtility.toOptionMap(Array("-h", "-i"))
+
+ options('h) should equal ("")
+ options('i) should equal ("")
+ }
+ }
+ "List with multiple valid single dashed options" should returns {
+ "populated Map" in {
+ val options = OptionMapUtility.toOptionMap(List("-h", "-i"))
+
+ options('h) should equal ("")
+ options('i) should equal ("")
+ }
+ }
+ "Array with an invalid single dashed options" should returns {
+ "empty Map" in {
+ val options = OptionMapUtility.toOptionMap(Array("-h-i", "-i#gloo"))
+
+ options.keysIterator.length should be (0)
+ }
+ }
+ "List with an invalid single dashed options" should returns {
+ "empty Map" in {
+ val options = OptionMapUtility.toOptionMap(List("-h-i", "-i#gloo"))
+
+ options.keysIterator.length should be (0)
+ }
+ }
+ "Array with a single nameless option" should returns {
+ "single key populated Map" in {
+ val options = OptionMapUtility.toOptionMap(Array("filename0"))
+
+ options('dashless).count(_ == ' ') should be (0)
+ }
+ }
+ "List with a single nameless option" should returns {
+ "single key populated Map" in {
+ val options = OptionMapUtility.toOptionMap(List("filename0"))
+
+ options('dashless).count(_ == ' ') should be (0)
+ }
+ }
+ "Array with multiple single nameless options" should returns {
+ "single key populated Map" in {
+ val options = OptionMapUtility.toOptionMap(Array("filename0", "filename1", "filename2"))
+
+ options('dashless).count(_ == ' ') should be (2)
+ }
+ }
+ "List with multiple single nameless options" should returns {
+ "single key populated Map" in {
+ val options = OptionMapUtility.toOptionMap(List("filename0", "filename1", "filename2"))
+
+ options('dashless).count(_ == ' ') should be (2)
+ }
+ }
+ "Array with mixed options" should returns {
+ "populated Map" in {
+ val options = OptionMapUtility.toOptionMap(Array("-q", "--help", "--test=test", "-go", "filename0", "filename1", "filename2"))
+
+ options('q) should equal ("")
+ options('help) should equal ("")
+ options('test) should equal ("test")
+ options('go) should equal ("")
+ options('dashless).count(_ == ' ') should be (2)
+ }
+ }
+ "List with mixed options" should returns {
+ "populated Map" in {
+ val options = OptionMapUtility.toOptionMap(List("-q", "--help", "--test=test", "-go", "filename0", "filename1", "filename2"))
+
+ options('q) should equal ("")
+ options('help) should equal ("")
+ options('test) should equal ("test")
+ options('go) should equal ("")
+ options('dashless).count(_ == ' ') should be (2)
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/command/jaroWinklerMetricSpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/jaroWinklerMetricSpec.scala
new file mode 100755
index 0000000..071454a
--- /dev/null
+++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/jaroWinklerMetricSpec.scala
@@ -0,0 +1,39 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class jaroWinklerMetricSpec extends ScalaTest {
+ "jaroWinklerMetric" should provide {
+ "main method" when passed {
+ "valid dashless arguments" should executes {
+ "print the distance" in {
+ val out = new java.io.ByteArrayOutputStream()
+
+ Console.withOut(out)(
+ jaroWinklerMetric.main(Array("--unitTest", "--debug", "abc", "abc"))
+ )
+
+ out.toString should equal ("1.0\n")
+ out.reset()
+
+ Console.withOut(out)(
+ jaroWinklerMetric.main(Array("--unitTest", "--debug", "abc", "xyz"))
+ )
+
+ out.toString should equal ("0.0\n")
+ out.reset()
+ }
+ }
+ "no dashless arguments" should throws {
+ "IllegalArgumentException" in {
+ evaluating {
+ jaroWinklerMetric.main(Array("--unitTest", "--debug"))
+ } should produce [IllegalArgumentException]
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/core/build.gradle b/core/build.gradle
new file mode 100755
index 0000000..55fc38f
--- /dev/null
+++ b/core/build.gradle
@@ -0,0 +1,42 @@
+apply plugin: 'eclipse'
+apply plugin: 'scala'
+
+dependencies {
+ compile 'org.scala-lang:scala-compiler:2.9.2'
+ compile 'org.scala-lang:scala-library:2.9.2'
+
+ scalaTools 'org.scala-lang:scala-compiler:2.9.2'
+ scalaTools 'org.scala-lang:scala-library:2.9.2'
+
+ testCompile 'junit:junit:4.10'
+ testCompile 'org.scalatest:scalatest_2.9.2:1.8'
+}
+
+sourceSets {
+ main {
+ output.resourcesDir "${project.buildDir}/classes/main"
+
+ java {
+ srcDir 'source/core/java'
+ }
+ resources {
+ srcDir 'source/core/resource'
+ }
+ scala {
+ srcDir 'source/core/scala'
+ }
+ }
+ test {
+ output.resourcesDir "${project.buildDir}/classes/test"
+
+ java {
+ srcDir 'source/test/java'
+ }
+ resources {
+ srcDir 'source/test/resource'
+ }
+ scala {
+ srcDir 'source/test/scala'
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
new file mode 100755
index 0000000..4311379
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/JaroWinklerMetric.scala
@@ -0,0 +1,76 @@
+package org.hashtree.stringmetric
+
+import scala.collection.mutable.ArrayBuffer
+import scala.math
+import scala.util.control.Breaks.{ break, breakable }
+
+/**
+ * An implementation of the Jaro-Winkler string metric. One differing detail in this implementation is that if a
+ * character is matched in string2, it cannot be matched upon again. This results in a more penalized distance in these
+ * scenarios (e.g. comparing henka and henkan distance is 0.9666 versus the typical 0.9722).
+ */
+object JaroWinklerMetric extends StringMetric {
+ /**
+ * Compares two strings after removing spaces and lowercasing both. Returns 0 when either string is empty or no
+ * characters match; otherwise returns the Jaro score boosted by the length of the common prefix (capped at 4).
+ */
+ override def compare(s1: String, s2: String): Float = {
+ val ca1 = s1.replaceAllLiterally(" ", "").toLowerCase.toCharArray
+ val ca2 = s2.replaceAllLiterally(" ", "").toLowerCase.toCharArray
+
+ // Return 0 if either character array lacks length.
+ if (ca1.length == 0 || ca2.length == 0) return 0f
+
+ val (m1, m2) = matchChars(ca1, ca2)
+ val matchesScore = scoreMatches(m1, m2)
+ val transpositionsScore = scoreTranspositions(m1, m2)
+
+ // Return 0 if matches score is 0.
+ if (matchesScore == 0) return 0f
+
+ // Leading characters that are equal at the same position in both arrays.
+ val prefix = ca1.zip(ca2).takeWhile(t => t._1 == t._2).map(_._1).mkString
+ // Mean of: matches over each length, and the share of matches that are not transposed.
+ val jaro = (
+ (matchesScore.toFloat / ca1.length) +
+ (matchesScore.toFloat / ca2.length) +
+ ((matchesScore.toFloat - transpositionsScore) / matchesScore.toFloat)
+ ) / 3
+
+ // Prefix bonus: at most 4 prefix characters count, each scaled by 0.1 of the remaining distance to 1.
+ jaro + ((if (prefix.length <= 4) prefix.length else 4) * (.1f * (1 - jaro)))
+ }
+
+ /**
+ * Pairs each character of ca1 with the first equal, not yet used character of ca2 found within the match window.
+ * Returns the matched characters of ca1 in ca1 order, and the matched characters of ca2 in ca2 order.
+ */
+ private[this] def matchChars(ca1: Array[Char], ca2: Array[Char]): Tuple2[Array[Char], Array[Char]] = {
+ // Half the longer length, rounded down, minus one; abs keeps it non-negative for very short inputs.
+ val window = math.abs((math.max(ca1.length, ca2.length) / 2f).floor.toInt - 1)
+ val a1Indices = ArrayBuffer[Int]()
+ val a2Indices = ArrayBuffer[Int]()
+
+ breakable {
+ for (i <- 0 until ca1.length) {
+ // Window bounds around i, clamped to the valid indices of ca2.
+ val start = if (i - window <= 0) 0 else i - window
+ val end = if (i + window >= ca2.length - 1) ca2.length - 1 else i + window
+
+ // Leaves the outer loop once the window starts beyond ca2.
+ if (start > ca2.length) break
+
+ breakable {
+ // Indices of ca2 that were already matched are skipped.
+ for (ii <- start to end if ! a2Indices.contains(ii)) {
+ if (ca1(i) == ca2(ii)) {
+ a1Indices.append(i)
+ a2Indices.append(ii)
+
+ // First match wins; leaves only the inner loop.
+ break
+ }
+ }
+ }
+ }
+ }
+
+ // ca2 indices are sorted so that its matched characters come back in their order of appearance in ca2.
+ (a1Indices.map(i => ca1(i)).toArray, a2Indices.sortWith(_ < _).map(i => ca2(i)).toArray)
+ }
+
+ /** Returns the number of matches, which is the shared length of both matched arrays. */
+ private[this] def scoreMatches(mca1: Array[Char], mca2: Array[Char]): Int = {
+ require(mca1.length == mca2.length)
+
+ mca1.length
+ }
+
+ /** Returns half the number of positions at which the matched arrays differ, rounded down. */
+ private[this] def scoreTranspositions(mca1: Array[Char], mca2: Array[Char]): Int = {
+ require(mca1.length == mca2.length)
+
+ (mca1.zip(mca2).filter(t => t._1 != t._2).length / 2f).floor.toInt
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/Metric.scala b/core/source/core/scala/org/hashtree/stringmetric/Metric.scala
new file mode 100755
index 0000000..2d570c2
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/Metric.scala
@@ -0,0 +1,6 @@
+package org.hashtree.stringmetric
+
+/** Marks those which leverage traits of a metric. */
+trait Metric[T] {
+ def compare(t1: T, t2: T): AnyVal
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
new file mode 100755
index 0000000..792aeba
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/StringMetric.scala
@@ -0,0 +1,6 @@
+package org.hashtree.stringmetric
+
+/** Marks those which leverage traits of a string based Metric. */
+trait StringMetric extends Metric[String] {
+
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/JaroWinklerMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/JaroWinklerMetricSpec.scala
new file mode 100755
index 0000000..6e044a0
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/JaroWinklerMetricSpec.scala
@@ -0,0 +1,54 @@
+package org.hashtree.stringmetric
+
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class JaroWinklerMetricSpec extends ScalaTest {
+ "JaroWinklerMetric" should provide {
+ "compare method" when passed {
+ "valid arguments" should returns {
+ "Float indicating distance" in {
+ JaroWinklerMetric.compare("abc", "abc") should be (1.0f)
+ JaroWinklerMetric.compare("abc", "xyz") should be (0.0f)
+ JaroWinklerMetric.compare("abc", "") should be (0.0f)
+ JaroWinklerMetric.compare("", "xyz") should be (0.0f)
+ JaroWinklerMetric.compare("", "") should be (0.0f)
+ JaroWinklerMetric.compare("a", "a") should be (1.0f)
+
+ JaroWinklerMetric.compare("aa", "a") should be (0.84999996f)
+ JaroWinklerMetric.compare("a", "aa") should be (0.84999996f)
+
+ JaroWinklerMetric.compare("veryveryverylong", "v") should be (0.71875f)
+ JaroWinklerMetric.compare("v", "veryveryverylong") should be (0.71875f)
+
+ JaroWinklerMetric.compare("martha", "marhta") should be (0.96111107f)
+ JaroWinklerMetric.compare("dwayne", "duane") should be (0.84000003f)
+ JaroWinklerMetric.compare("dixon", "dicksonx") should be (0.81333333f)
+ JaroWinklerMetric.compare("abcvwxyz", "cabvwxyz") should be (0.9583333f)
+ JaroWinklerMetric.compare("jones", "johnson") should be (0.8323809f)
+ JaroWinklerMetric.compare("henka", "henkan") should be (0.96666664f)
+ JaroWinklerMetric.compare("fvie", "ten") should be (0.0f)
+
+ JaroWinklerMetric.compare("zac ephron", "zac efron") should be >
+ JaroWinklerMetric.compare("zac ephron", "kai ephron")
+ JaroWinklerMetric.compare("brittney spears", "britney spears") should be >
+ JaroWinklerMetric.compare("brittney spears", "brittney startzman")
+
+ JaroWinklerMetric.compare("m a r t h a", "m a r h t a") should be (0.96111107f)
+ JaroWinklerMetric.compare("d w a y n e", "d u a n e") should be (0.84000003f)
+ JaroWinklerMetric.compare("d i x o n", "d i c k s o n x") should be (0.81333333f)
+ JaroWinklerMetric.compare("a b c v w x y z", "c a b v w x y z") should be (0.9583333f)
+ JaroWinklerMetric.compare("j o n e s", "j o h n s o n") should be (0.8323809f)
+ JaroWinklerMetric.compare("h e n k a", "h e n k a n") should be (0.96666664f)
+ JaroWinklerMetric.compare("f v i e", "t e n") should be (0.0f)
+
+ JaroWinklerMetric.compare("z a c e p h r o n", "z a c e f r o n") should be >
+ JaroWinklerMetric.compare("z a c e p h r o n", "k a i e p h r o n")
+ JaroWinklerMetric.compare("b r i t t n e y s p e a r s", "b r i t n e y s p e a r s") should be >
+ JaroWinklerMetric.compare("b r i t t n e y s p e a r s", "b r i t t n e y s t a r t z m a n")
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/ScalaTest.scala b/core/source/test/scala/org/hashtree/stringmetric/ScalaTest.scala
new file mode 100755
index 0000000..a9cd5e1
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/ScalaTest.scala
@@ -0,0 +1,18 @@
+package org.hashtree.stringmetric
+
+import org.scalatest.WordSpec
+import org.scalatest.matchers.ShouldMatchers
+
+trait ScalaTest extends WordSpec with ShouldMatchers {
+ def allows = afterWord("allow")
+
+ def executes = afterWord("execute")
+
+ def passed = afterWord("passed")
+
+ def provide = afterWord("provide")
+
+ def returns = afterWord("return")
+
+ def throws = afterWord("throw")
+} \ No newline at end of file
diff --git a/readme.md b/readme.md
new file mode 100755
index 0000000..492a179
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,29 @@
+# stringmetric
+A collection of string metrics built with Scala. Includes a light-weight core API and CLI based interface for each string metric. The following string metrics are currently supported:
+
+* Jaro-Winkler
+
+## Building the API
+gradle jar
+
+## Building the CLI
+gradle tar
+
+## Using the API
+`// Import the metric of choice.`
+`import org.hashtree.stringmetric.JaroWinklerMetric`
+
+`// Invoke the compare method on the metric.`
+`val distance = JaroWinklerMetric.compare("string1", "string2")`
+
+`// Do something. In this case, distance is between 1.0f and 0.0f.`
+`if (distance >= 0.9) println("It's likely you're a match!")`
+
+## Using the CLI
+Uncompress the built tar and ensure you have the ability to execute the commands. Execute the metric of choice via the command line:
+
+`jaroWinklerMetric --help`
+`jaroWinklerMetric abc xyz`
+
+## Requirements
+* Scala 2.9.2 \ No newline at end of file
diff --git a/settings.gradle b/settings.gradle
new file mode 100755
index 0000000..e99d74b
--- /dev/null
+++ b/settings.gradle
@@ -0,0 +1,6 @@
+include 'cli'
+include 'core'
+
+rootProject.name = 'stringmetric'
+project(':cli').name = 'stringmetric-cli'
+project(':core').name = 'stringmetric-core' \ No newline at end of file