summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2013-01-10 12:59:59 -0700
committerRocky Madden <git@rockymadden.com>2013-01-10 12:59:59 -0700
commit2b1ee6b4f3d548414e0513149c727a640f4548fe (patch)
tree97963f86cb3563b801fd5c6de5f6d9b259e35334
parent5ba0d76e0266b6ad7ac9b1d3db4e04f04550ef50 (diff)
downloadstringmetric-2b1ee6b4f3d548414e0513149c727a640f4548fe.tar.gz
stringmetric-2b1ee6b4f3d548414e0513149c727a640f4548fe.tar.bz2
stringmetric-2b1ee6b4f3d548414e0513149c727a640f4548fe.zip
Created Ratcliff/Obershelp metric, command, and specs.
-rwxr-xr-xcli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetric.scala54
-rwxr-xr-xcli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetricSpec.scala39
-rwxr-xr-xcore/source/core/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetric.scala55
-rwxr-xr-xcore/source/test/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetricSpec.scala42
-rwxr-xr-xreadme.md11
5 files changed, 196 insertions, 5 deletions
diff --git a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetric.scala b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetric.scala
new file mode 100755
index 0000000..fa31794
--- /dev/null
+++ b/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetric.scala
@@ -0,0 +1,54 @@
+package com.rockymadden.stringmetric.cli.similarity
+
+import com.rockymadden.stringmetric.cli._
+import com.rockymadden.stringmetric.similarity.RatcliffObershelpMetric
+
+/**
+ * The ratcliffObershelpMetric [[com.rockymadden.stringmetric.cli.Command]]. Compares the similarity of two strings
+ * using the Ratcliff / Obershelp similarity index.
+ */
+object ratcliffObershelpMetric extends Command {
+ override def main(args: Array[String]): Unit = {
+ val options = OptionMap(args)
+
+ try {
+ // Help.
+ if (options.contains('h) || options.contains('help)) {
+ help()
+ exit(options)
+ // Execute.
+ } else if (options.contains('dashless) && (options('dashless): OptionMapArray).length == 2) {
+ execute(options)
+ exit(options)
+ // Invalid syntax.
+ } else throw new IllegalArgumentException("Expected valid syntax. See --help.")
+ } catch {
+ case e: Throwable => error(e, options)
+ }
+ }
+
+ override def help(): Unit = {
+ val ls = sys.props("line.separator")
+ val tab = " "
+
+ println(
+ "Compares the similarity of two strings using the Ratcliff / Obershelp similarity index." + ls + ls +
+ "Syntax:" + ls +
+ tab + "ratcliffObershelpMetric [Options] string1 string2..." + ls + ls +
+ "Options:" + ls +
+ tab + "-h, --help" + ls +
+ tab + tab + "Outputs description, syntax, and options."
+ )
+ }
+
+ override def execute(options: OptionMap): Unit = {
+ val strings: OptionMapArray = options('dashless)
+
+ println(
+ RatcliffObershelpMetric.compare(
+ strings(0),
+ strings(1)
+ ).getOrElse("not comparable")
+ )
+ }
+}
diff --git a/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetricSpec.scala b/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetricSpec.scala
new file mode 100755
index 0000000..cf7a0bf
--- /dev/null
+++ b/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/ratcliffObershelpMetricSpec.scala
@@ -0,0 +1,39 @@
+package com.rockymadden.stringmetric.cli.similarity
+
+import com.rockymadden.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class ratcliffObershelpMetricSpec extends ScalaTest {
+ "ratcliffObershelpMetric" should provide {
+ "main method" when passed {
+ "valid dashless arguments" should executes {
+ "print if they are a match" in {
+ val out = new java.io.ByteArrayOutputStream()
+
+ Console.withOut(out)(
+ ratcliffObershelpMetric.main(Array("--unitTest", "--debug", "abc", "abc"))
+ )
+
+ out.toString should equal ("1.0\n")
+ out.reset()
+
+ Console.withOut(out)(
+ ratcliffObershelpMetric.main(Array("--unitTest", "--debug", "abc", "xyz"))
+ )
+
+ out.toString should equal ("0.0\n")
+ out.reset()
+ }
+ }
+ "no dashless arguments" should throws {
+ "IllegalArgumentException" in {
+ evaluating {
+ ratcliffObershelpMetric.main(Array("--unitTest", "--debug"))
+ } should produce [IllegalArgumentException]
+ }
+ }
+ }
+ }
+}
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetric.scala
new file mode 100755
index 0000000..94e44d0
--- /dev/null
+++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetric.scala
@@ -0,0 +1,55 @@
+package com.rockymadden.stringmetric.similarity
+
+import com.rockymadden.stringmetric.{ CompareTuple, FilterableStringMetric, StringFilter, StringMetric }
+
+/** An implementation of the Ratcliff/Obershelp [[com.rockymadden.stringmetric.StringMetric]]. */
+object RatcliffObershelpMetric extends StringMetric with FilterableStringMetric {
+ type CompareReturn = Double
+
+ override def compare(charArray1: Array[Char], charArray2: Array[Char])
+ (implicit stringFilter: StringFilter): Option[CompareReturn] = {
+
+ val fca1 = stringFilter.filter(charArray1)
+ lazy val fca2 = stringFilter.filter(charArray2)
+
+ if (fca1.length == 0 || fca2.length == 0) None
+ else if (fca1.sameElements(fca2)) Some(1d)
+ else Some(2d * commonSequences(fca1, fca2).foldLeft(0)(_ + _.length) / (fca1.length + fca2.length))
+ }
+
+ override def compare(string1: String, string2: String)
+ (implicit stringFilter: StringFilter): Option[CompareReturn] =
+
+ compare(
+ stringFilter.filter(string1.toCharArray),
+ stringFilter.filter(string2.toCharArray)
+ )
+
+ private[this] def longestCommonSubsequence(ct: CompareTuple[Char]) = {
+ val m = Array.ofDim[Int](ct._1.length + 1, ct._2.length + 1)
+ var lrc = (0, 0, 0) // Length, row, column.
+
+ for (r <- 0 to ct._1.length - 1; c <- 0 to ct._2.length - 1) {
+ if (ct._1(r) == ct._2(c)) {
+ val l = m(r)(c) + 1
+ m(r + 1)(c + 1) = l
+
+ if (l > lrc._1) lrc = (l, r + 1, c + 1)
+ }
+ }
+
+ lrc
+ }
+
+ private[this] def commonSequences(ct: CompareTuple[Char]): Array[Array[Char]] = {
+ val lcs = longestCommonSubsequence(ct)
+
+ if (lcs._1 == 0) Array.empty
+ else {
+ val sct1 = (ct._1.take(lcs._2 - lcs._1), ct._1.takeRight(ct._1.length - lcs._2))
+ val sct2 = (ct._2.take(lcs._3 - lcs._1), ct._2.takeRight(ct._2.length - lcs._3))
+
+ Array(ct._1.slice(lcs._2 - lcs._1, lcs._2)) ++ commonSequences(sct1._1, sct2._1) ++ commonSequences(sct1._2, sct2._2)
+ }
+ }
+}
diff --git a/core/source/test/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetricSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetricSpec.scala
new file mode 100755
index 0000000..e1ff9cf
--- /dev/null
+++ b/core/source/test/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetricSpec.scala
@@ -0,0 +1,42 @@
+package com.rockymadden.stringmetric.similarity
+
+import com.rockymadden.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class RatcliffObershelpMetricSpec extends ScalaTest {
+ "RatcliffObershelpMetric" should provide {
+ "compare method" when passed {
+ "empty arguments" should returns {
+ "None" in {
+ RatcliffObershelpMetric.compare("", "").isDefined should be (false)
+ RatcliffObershelpMetric.compare("abc", "").isDefined should be (false)
+ RatcliffObershelpMetric.compare("", "xyz").isDefined should be (false)
+ }
+ }
+ "equal arguments" should returns {
+ "0" in {
+ RatcliffObershelpMetric.compare("abc", "abc").get should be (1)
+ RatcliffObershelpMetric.compare("123", "123").get should be (1)
+ }
+ }
+ "unequal arguments" should returns {
+ "Double indicating distance" in {
+ RatcliffObershelpMetric.compare("abc", "xyz").get should be (0)
+ RatcliffObershelpMetric.compare("123", "456").get should be (0)
+ }
+ }
+ "valid arguments" should returns {
+ "Double indicating distance" in {
+ RatcliffObershelpMetric.compare("aleksander", "alexandre").get should be (0.7368421052631579)
+ RatcliffObershelpMetric.compare("alexandre", "aleksander").get should be (0.7368421052631579)
+ RatcliffObershelpMetric.compare("pennsylvania", "pencilvaneya").get should be (0.6666666666666666)
+ RatcliffObershelpMetric.compare("pencilvaneya", "pennsylvania").get should be (0.6666666666666666)
+ RatcliffObershelpMetric.compare("abcefglmn", "abefglmo").get should be (0.8235294117647058)
+ RatcliffObershelpMetric.compare("abefglmo", "abcefglmn").get should be (0.8235294117647058)
+ }
+ }
+ }
+ }
+}
diff --git a/readme.md b/readme.md
index 4e7b665..d23c90b 100755
--- a/readme.md
+++ b/readme.md
@@ -12,6 +12,7 @@ A Scala library of string metrics and phonetic algorithms. It provides implement
* __[Metaphone](http://en.wikipedia.org/wiki/Metaphone)__ (Phonetic metric and algorithm)
* __[N-Gram](http://en.wikipedia.org/wiki/N-gram)__ (Similarity metric and algorithm)
* __[NYSIIS](http://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System)__ (Phonetic metric and algorithm)
+* __[Ratcliff / Obershelp](http://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html)__ (Similarity metric)
* __[Refined NYSIIS](http://www.markcrocker.com/rexxtipsntricks/rxtt28.2.0482.html)__ (Phonetic metric and algorithm)
* __[Refined Soundex](http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html)__ (Phonetic metric and algorithm)
* __[Soundex](http://en.wikipedia.org/wiki/Soundex)__ (Phonetic metric and algorithm)
@@ -43,10 +44,10 @@ if (distance >= 0.9) println("It's likely you're a match!")
You can also use the StringMetric, StringAlgorithm, and StringFilter convenience objects:
```scala
-if (StringMetric.compareWithJaroWinkler("string1", "string2") >= 0.9)
+if (StringMetric.compareWithJaroWinkler("string1", "string2") >= 0.9)
println("It's likely you're a match!")
-
-if (StringMetric.compareWithJaroWinkler("string1", "string2")(StringFilter.asciiLetterCase) >= 0.9)
+
+if (StringMetric.compareWithJaroWinkler("string1", "string2")(StringFilter.asciiLetterCase) >= 0.9)
println("It's likely you're a match!")
```
@@ -69,7 +70,7 @@ $ metaphoneMetric abc xyz
```
Get the phonetic representation of "abc" using the Metaphone phonetic algorithm:
-```shell
+```shell
$ metaphoneAlgorithm abc
```
@@ -121,4 +122,4 @@ Available on the [Maven Central Repository](http://search.maven.org/#search%7Cga
* Memoization decorator
## Questions and Comments
-Reach me at <stringmetric@rockymadden.com>. \ No newline at end of file
+Reach me at <stringmetric@rockymadden.com>.