diff options
author | Rocky Madden <git@rockymadden.com> | 2013-01-10 12:59:59 -0700 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2013-01-10 12:59:59 -0700 |
commit | 2b1ee6b4f3d548414e0513149c727a640f4548fe (patch) | |
tree | 97963f86cb3563b801fd5c6de5f6d9b259e35334 | |
parent | 5ba0d76e0266b6ad7ac9b1d3db4e04f04550ef50 (diff) | |
download | stringmetric-2b1ee6b4f3d548414e0513149c727a640f4548fe.tar.gz stringmetric-2b1ee6b4f3d548414e0513149c727a640f4548fe.tar.bz2 stringmetric-2b1ee6b4f3d548414e0513149c727a640f4548fe.zip |
Created Ratcliff/Obershelp metric, command, and specs.
5 files changed, 196 insertions, 5 deletions
diff --git a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetric.scala b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetric.scala new file mode 100755 index 0000000..fa31794 --- /dev/null +++ b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetric.scala @@ -0,0 +1,54 @@ +package com.rockymadden.stringmetric.cli.similarity + +import com.rockymadden.stringmetric.cli._ +import com.rockymadden.stringmetric.similarity.RatcliffObershelpMetric + +/** + * The ratcliffObershelpMetric [[com.rockymadden.stringmetric.cli.Command]]. Compares the similarity of two strings + * using the Ratcliff / Obershelp similarity index. + */ +object ratcliffObershelpMetric extends Command { + override def main(args: Array[String]): Unit = { + val options = OptionMap(args) + + try { + // Help. + if (options.contains('h) || options.contains('help)) { + help() + exit(options) + // Execute. + } else if (options.contains('dashless) && (options('dashless): OptionMapArray).length == 2) { + execute(options) + exit(options) + // Invalid syntax. + } else throw new IllegalArgumentException("Expected valid syntax. See --help.") + } catch { + case e: Throwable => error(e, options) + } + } + + override def help(): Unit = { + val ls = sys.props("line.separator") + val tab = " " + + println( + "Compares the similarity of two strings using the Ratcliff / Obershelp similarity index." + ls + ls + + "Syntax:" + ls + + tab + "ratcliffObershelpMetric [Options] string1 string2..." + ls + ls + + "Options:" + ls + + tab + "-h, --help" + ls + + tab + tab + "Outputs description, syntax, and options." + ) + } + + override def execute(options: OptionMap): Unit = { + val strings: OptionMapArray = options('dashless) + + println( + RatcliffObershelpMetric.compare( + strings(0), + strings(1) + ).getOrElse("not comparable") + ) + } +} diff --git a/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetricSpec.scala b/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetricSpec.scala new file mode 100755 index 0000000..cf7a0bf --- /dev/null +++ b/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetricSpec.scala @@ -0,0 +1,39 @@ +package com.rockymadden.stringmetric.cli.similarity + +import com.rockymadden.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class ratcliffObershelpMetricSpec extends ScalaTest { + "ratcliffObershelpMetric" should provide { + "main method" when passed { + "valid dashless arguments" should executes { + "print if they are a match" in { + val out = new java.io.ByteArrayOutputStream() + + Console.withOut(out)( + ratcliffObershelpMetric.main(Array("--unitTest", "--debug", "abc", "abc")) + ) + + out.toString should equal ("1.0\n") + out.reset() + + Console.withOut(out)( + ratcliffObershelpMetric.main(Array("--unitTest", "--debug", "abc", "xyz")) + ) + + out.toString should equal ("0.0\n") + out.reset() + } + } + "no dashless arguments" should throws { + "IllegalArgumentException" in { + evaluating { + ratcliffObershelpMetric.main(Array("--unitTest", "--debug")) + } should produce [IllegalArgumentException] + } + } + } + } +} diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetric.scala new file mode 100755 index 0000000..94e44d0 --- /dev/null +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetric.scala @@ -0,0 +1,55 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{ CompareTuple, FilterableStringMetric, StringFilter, StringMetric } + +/** An implementation of the Ratcliff/Obershelp [[com.rockymadden.stringmetric.StringMetric]]. */ +object RatcliffObershelpMetric extends StringMetric with FilterableStringMetric { + type CompareReturn = Double + + override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit stringFilter: StringFilter): Option[CompareReturn] = { + + val fca1 = stringFilter.filter(charArray1) + lazy val fca2 = stringFilter.filter(charArray2) + + if (fca1.length == 0 || fca2.length == 0) None + else if (fca1.sameElements(fca2)) Some(1d) + else Some(2d * commonSequences(fca1, fca2).foldLeft(0)(_ + _.length) / (fca1.length + fca2.length)) + } + + override def compare(string1: String, string2: String) + (implicit stringFilter: StringFilter): Option[CompareReturn] = + + compare( + stringFilter.filter(string1.toCharArray), + stringFilter.filter(string2.toCharArray) + ) + + private[this] def longestCommonSubsequence(ct: CompareTuple[Char]) = { + val m = Array.ofDim[Int](ct._1.length + 1, ct._2.length + 1) + var lrc = (0, 0, 0) // Length, row, column. + + for (r <- 0 to ct._1.length - 1; c <- 0 to ct._2.length - 1) { + if (ct._1(r) == ct._2(c)) { + val l = m(r)(c) + 1 + m(r + 1)(c + 1) = l + + if (l > lrc._1) lrc = (l, r + 1, c + 1) + } + } + + lrc + } + + private[this] def commonSequences(ct: CompareTuple[Char]): Array[Array[Char]] = { + val lcs = longestCommonSubsequence(ct) + + if (lcs._1 == 0) Array.empty + else { + val sct1 = (ct._1.take(lcs._2 - lcs._1), ct._1.takeRight(ct._1.length - lcs._2)) + val sct2 = (ct._2.take(lcs._3 - lcs._1), ct._2.takeRight(ct._2.length - lcs._3)) + + Array(ct._1.slice(lcs._2 - lcs._1, lcs._2)) ++ commonSequences(sct1._1, sct2._1) ++ commonSequences(sct1._2, sct2._2) + } + } +} diff --git a/core/source/test/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetricSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetricSpec.scala new file mode 100755 index 0000000..e1ff9cf --- /dev/null +++ b/core/source/test/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetricSpec.scala @@ -0,0 +1,42 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class RatcliffObershelpMetricSpec extends ScalaTest { + "RatcliffObershelpMetric" should provide { + "compare method" when passed { + "empty arguments" should returns { + "None" in { + RatcliffObershelpMetric.compare("", "").isDefined should be (false) + RatcliffObershelpMetric.compare("abc", "").isDefined should be (false) + RatcliffObershelpMetric.compare("", "xyz").isDefined should be (false) + } + } + "equal arguments" should returns { + "0" in { + RatcliffObershelpMetric.compare("abc", "abc").get should be (1) + RatcliffObershelpMetric.compare("123", "123").get should be (1) + } + } + "unequal arguments" should returns { + "Double indicating distance" in { + RatcliffObershelpMetric.compare("abc", "xyz").get should be (0) + RatcliffObershelpMetric.compare("123", "456").get should be (0) + } + } + "valid arguments" should returns { + "Double indicating distance" in { + RatcliffObershelpMetric.compare("aleksander", "alexandre").get should be (0.7368421052631579) + RatcliffObershelpMetric.compare("alexandre", "aleksander").get should be (0.7368421052631579) + RatcliffObershelpMetric.compare("pennsylvania", "pencilvaneya").get should be (0.6666666666666666) + RatcliffObershelpMetric.compare("pencilvaneya", "pennsylvania").get should be (0.6666666666666666) + RatcliffObershelpMetric.compare("abcefglmn", "abefglmo").get should be (0.8235294117647058) + RatcliffObershelpMetric.compare("abefglmo", "abcefglmn").get should be (0.8235294117647058) + } + } + } + } +} @@ -12,6 +12,7 @@ A Scala library of string metrics and phonetic algorithms. It provides implement * __[Metaphone](http://en.wikipedia.org/wiki/Metaphone)__ (Phonetic metric and algorithm) * __[N-Gram](http://en.wikipedia.org/wiki/N-gram)__ (Similarity metric and algorithm) * __[NYSIIS](http://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System)__ (Phonetic metric and algorithm) +* __[Ratcliff / Obershelp](http://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html)__ (Similarity metric) * __[Refined NYSIIS](http://www.markcrocker.com/rexxtipsntricks/rxtt28.2.0482.html)__ (Phonetic metric and algorithm) * __[Refined Soundex](http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html)__ (Phonetic metric and algorithm) * __[Soundex](http://en.wikipedia.org/wiki/Soundex)__ (Phonetic metric and algorithm) @@ -43,10 +44,10 @@ if (distance >= 0.9) println("It's likely you're a match!") You can also use the StringMetric, StringAlgorithm, and StringFilter convenience objects: ```scala -if (StringMetric.compareWithJaroWinkler("string1", "string2") >= 0.9) +if (StringMetric.compareWithJaroWinkler("string1", "string2") >= 0.9) println("It's likely you're a match!") - -if (StringMetric.compareWithJaroWinkler("string1", "string2")(StringFilter.asciiLetterCase) >= 0.9) + +if (StringMetric.compareWithJaroWinkler("string1", "string2")(StringFilter.asciiLetterCase) >= 0.9) println("It's likely you're a match!") ``` @@ -69,7 +70,7 @@ $ metaphoneMetric abc xyz ``` Get the phonetic representation of "abc" using the Metaphone phonetic algorithm: -```shell +```shell $ metaphoneAlgorithm abc ``` @@ -121,4 +122,4 @@ Available on the [Maven Central Repository](http://search.maven.org/#search%7Cga * Memoization decorator ## Questions and Comments -Reach me at <stringmetric@rockymadden.com>.
\ No newline at end of file +Reach me at <stringmetric@rockymadden.com>. |