summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-10-26 19:46:48 -0600
committerRocky Madden <git@rockymadden.com>2012-10-26 19:46:48 -0600
commitce234db3048af3d7cb5c825c8e217a9169ab5c3b (patch)
tree61716b04494d4fce785f2c4407401db492bd7559
parent93d1ecd3df6df537f951f63de060a5db92e01235 (diff)
downloadstringmetric-ce234db3048af3d7cb5c825c8e217a9169ab5c3b.tar.gz
stringmetric-ce234db3048af3d7cb5c825c8e217a9169ab5c3b.tar.bz2
stringmetric-ce234db3048af3d7cb5c825c8e217a9169ab5c3b.zip
Created NYSIIS algorithm, metric, command, and specs.
-rwxr-xr-xcli/source/core/scala/org/hashtree/stringmetric/cli/command/nysiisMetric.scala58
-rwxr-xr-xcli/source/test/scala/org/hashtree/stringmetric/cli/command/nysiisMetricSpec.scala46
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/phonetic/Nysiis.scala139
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/phonetic/NysiisMetric.scala29
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisMetricSpec.scala37
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisSpec.scala163
-rwxr-xr-xreadme.md1
7 files changed, 473 insertions, 0 deletions
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/command/nysiisMetric.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/nysiisMetric.scala
new file mode 100755
index 0000000..d49b99e
--- /dev/null
+++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/nysiisMetric.scala
@@ -0,0 +1,58 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.StringFilterDelegate
+import org.hashtree.stringmetric.cli._
+import org.hashtree.stringmetric.cli.command._
+import org.hashtree.stringmetric.phonetic.NysiisMetric
+
+/**
+ * The nysiisMetric [[org.hashtree.stringmetric.cli.command.Command]]. Compares two strings to determine if they are
+ * phonetically similarly, per the NYSIIS algorithm.
+ */
+object nysiisMetric extends Command {
+ override def main(args: Array[String]): Unit = {
+ val options = OptionMapUtility.toOptionMap(args)
+
+ try {
+ // Help.
+ if (options.contains('h) || options.contains('help)) {
+ help()
+ exit(options)
+ // Execute.
+ } else if (options.contains('dashless) && options('dashless).count(_ == ' ') == 1) {
+ execute(options)
+ exit(options)
+ // Invalid syntax.
+ } else {
+ throw new IllegalArgumentException("Expected valid syntax. See --help.")
+ }
+ } catch {
+ case e => error(e)(options)
+ }
+ }
+
+ override def help(): Unit = {
+ val ls = sys.props("line.separator")
+ val tab = " "
+
+ println(
+ "Compares two strings to determine if they are phonetically similarly, per the NYSIIS algorithm." + ls + ls +
+ "Syntax:" + ls +
+ tab + "nysiisMetric [Options] string1 string2..." + ls + ls +
+ "Options:" + ls +
+ tab + "-h, --help" + ls +
+ tab + tab + "Outputs description, syntax, and options."
+ )
+ }
+
+ override def execute(options: OptionMap): Unit = {
+ val strings = options('dashless).split(" ")
+
+ println(
+ NysiisMetric.compare(
+ strings(0),
+ strings(1)
+ )(new StringFilterDelegate).getOrElse("not comparable").toString
+ )
+ }
+} \ No newline at end of file
diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/command/nysiisMetricSpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/nysiisMetricSpec.scala
new file mode 100755
index 0000000..4c0beea
--- /dev/null
+++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/nysiisMetricSpec.scala
@@ -0,0 +1,46 @@
+package org.hashtree.stringmetric.cli.command
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class nysiisMetricSpec extends ScalaTest {
+ "nysiisMetric" should provide {
+ "main method" when passed {
+ "valid dashless arguments" should executes {
+ "print if they are a match" in {
+ val out = new java.io.ByteArrayOutputStream()
+
+ Console.withOut(out)(
+ nysiisMetric.main(Array("--unitTest", "--debug", "aBc", "abc"))
+ )
+
+ out.toString should equal ("true\n")
+ out.reset()
+
+ Console.withOut(out)(
+ nysiisMetric.main(Array("--unitTest", "--debug", "aBc", "xyz"))
+ )
+
+ out.toString should equal ("false\n")
+ out.reset()
+
+ Console.withOut(out)(
+ nysiisMetric.main(Array("--unitTest", "--debug", "1", "1"))
+ )
+
+ out.toString should equal ("not comparable\n")
+ out.reset()
+ }
+ }
+ "no dashless arguments" should throws {
+ "IllegalArgumentException" in {
+ evaluating {
+ nysiisMetric.main(Array("--unitTest", "--debug"))
+ } should produce [IllegalArgumentException]
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/Nysiis.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Nysiis.scala
new file mode 100755
index 0000000..90267ee
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Nysiis.scala
@@ -0,0 +1,139 @@
+package org.hashtree.stringmetric.phonetic
+
+import org.hashtree.stringmetric.{ StringAlgorithm, StringFilter, StringFilterDelegate }
+import scala.annotation.tailrec
+
+/** An implementation of the NYSIIS [[org.hashtree.stringmetric.StringAlgorithm]]. */
+object Nysiis extends StringAlgorithm {
+ override def compute(charArray: Array[Char])(implicit stringFilter: StringFilter): Option[Array[Char]] = {
+ val ca = stringFilter.filter(charArray)
+
+ if (ca.length == 0) None
+ else {
+ val thl = transcodeLast(transcodeHead(ca.map(_.toLower)))
+
+ if (thl.head < 97 || thl.head > 122) None
+ else {
+ if (thl.length == 1) Some(thl)
+ else {
+
+ val ts = thl.splitAt(1)
+ val t = transcode(ts._1, ts._2.head, ts._2.tail, ts._1)
+
+ Some(t.head +: deduplicate(transcodeClean(t.tail)))
+ }
+ }
+ }
+ }
+
+ override def compute(string: String)(implicit stringFilter: StringFilter): Option[String] = {
+ compute(stringFilter.filter(string.toCharArray))(new StringFilterDelegate) match {
+ case Some(se) => Some(se.mkString)
+ case None => None
+ }
+ }
+
+ private[this] def deduplicate(ca: Array[Char]) = {
+ if (ca.length <= 1) ca
+ else
+ ca.sliding(2).filter(a => a(0) != a(1)).map(a => a(0)).toArray[Char] :+ ca.last
+ }
+
+ private[this] def isVowel(c: Char) = (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u')
+
+ @tailrec
+ private[this] def transcode(l: Array[Char], c: Char, r: Array[Char], o: Array[Char]): Array[Char] = {
+ if (c == '\0' && r.length == 0) o
+ else {
+ val shift = (d: Int, ca: Array[Char]) => {
+ val sa = r.splitAt(d - 1)
+
+ (
+ if (sa._1.length > 0) (l :+ c) ++ sa._1 else l :+ c,
+ if (sa._2.length > 0) sa._2.head else '\0',
+ if (sa._2.length > 1) sa._2.tail else Array.empty[Char],
+ ca
+ )
+ }
+
+ val t = {
+ c match {
+ case 'a' | 'i' | 'o' | 'u' => shift(1, o :+ 'a')
+ case 'b' | 'c' | 'd' | 'f' | 'g' | 'j' | 'l' | 'n' | 'r' | 't' | 'v' | 'x' | 'y' => shift(1, o :+ c)
+ case 'e' => {
+ if (r.length >= 1 && r.head == 'v')
+ shift(2, o ++ Array('a', 'f'))
+ else
+ shift(1, o :+ 'a')
+ }
+ case 'h' => {
+ if (l.length >= 1 && (!isVowel(l.last) || (r.length >= 1 && !isVowel(r.head))))
+ shift(1, o :+ l.last)
+ else
+ shift(1, o :+ c)
+ }
+ case 'k' => if (r.length >= 1 && r.head == 'n') shift(2, o :+ 'n') else shift(1, o :+ 'c')
+ case 'm' => shift(1, o :+ 'n')
+ case 'p' => if (r.length >= 1 && r.head == 'h') shift(2, o ++ Array('f', 'f')) else shift(1, o :+ 'p')
+ case 'q' => shift(1, o :+ 'g')
+ case 's' => {
+ if (r.length >= 2 && r.head == 'c' && r(1) == 'h')
+ shift(3, o ++ Array('s', 's', 's'))
+ else
+ shift(1, o :+ c)
+ }
+ case 'w' => {
+ if (l.length >= 1 && !isVowel(l.last))
+ shift(1, o :+ l.last)
+ else
+ shift(1, o :+ c)
+ }
+ case 'z' => shift(1, o :+ 's')
+ case _ => shift(1, o)
+ }
+ }
+
+ transcode(t._1, t._2, t._3, t._4)
+ }
+ }
+
+ private[this] def transcodeClean(ca: Array[Char]) = {
+ if (ca.length >= 1 && (ca.last == 'a' || ca.last == 's'))
+ ca.reverse.dropWhile(c => c == 'a' || c == 's').reverse
+ else if (ca.length >= 2 && ca.last == 'y' && ca(ca.length - 2) == 'a')
+ ca.dropRight(2) :+ 'y'
+ else
+ ca
+ }
+
+ private[this] def transcodeHead(ca: Array[Char]) = {
+ val h = ca.take(3).padTo(3, '\0')
+
+ if (h.head == 'm' && h(1) == 'a' && h.last == 'c')
+ Array('m', 'c', 'c') ++ ca.takeRight(ca.length - 3)
+ else if (h.head == 's' && h(1) == 'c' && h.last == 'h')
+ Array('s', 's', 's') ++ ca.takeRight(ca.length - 3)
+ else if (h.head == 'p' && (h(1) == 'h' || h(1) == 'f'))
+ Array('f', 'f') ++ ca.takeRight(ca.length - 2)
+ else if (h.head == 'k' && h(1) == 'n')
+ Array('n', 'n') ++ ca.takeRight(ca.length - 2)
+ else if (h.head == 'k')
+ Array('c') ++ ca.takeRight(ca.length - 1)
+ else
+ ca
+ }
+
+ private[this] def transcodeLast(ca: Array[Char]) = {
+ val h = ca.take(2).padTo(2, '\0')
+
+ if (
+ (h.last == 't' && (h.head == 'd' || h.head == 'r' || h.head == 'n')) ||
+ (h.last == 'd' && (h.head == 'r' || h.head == 'n'))
+ )
+ Array('d') ++ ca.takeRight(ca.length - 2)
+ else if (h.last == 'e' && (h.head == 'i' || h.head == 'e'))
+ Array('y') ++ ca.takeRight(ca.length - 2)
+ else
+ ca
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/NysiisMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/NysiisMetric.scala
new file mode 100755
index 0000000..0112b06
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/NysiisMetric.scala
@@ -0,0 +1,29 @@
+package org.hashtree.stringmetric.phonetic
+
+import org.hashtree.stringmetric.{ StringFilter, StringFilterDelegate, StringMetric }
+
+/** An implementation of the NYSIIS [[org.hashtree.stringmetric.StringMetric]]. */
+object NysiisMetric extends StringMetric {
+ override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringFilter: StringFilter): Option[Boolean] = {
+ val ca1 = stringFilter.filter(charArray1)
+ val ca2 = stringFilter.filter(charArray2)
+
+ if (ca1.length == 0 || ca2.length == 0) None
+ else {
+ val ny1 = Nysiis.compute(ca1)
+ val ny2 = Nysiis.compute(ca2)
+
+ if (!ny1.isDefined || !ny2.isDefined || (ny1.get.length == 0 && ny2.get.length == 0))
+ None
+ else
+ Some(ny1.get.sameElements(ny2.get))
+ }
+ }
+
+ override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Boolean] = {
+ compare(
+ stringFilter.filter(string1.toCharArray),
+ stringFilter.filter(string2.toCharArray)
+ )(new StringFilterDelegate)
+ }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisMetricSpec.scala
new file mode 100755
index 0000000..4c4b02b
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisMetricSpec.scala
@@ -0,0 +1,37 @@
+package org.hashtree.stringmetric.phonetic
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class NysiisMetricSpec extends ScalaTest {
+ "NysiisMetric" should provide {
+ "compare method" when passed {
+ "empty arguments" should returns {
+ "None" in {
+ NysiisMetric.compare("", "").isDefined should be (false)
+ NysiisMetric.compare("abc", "").isDefined should be (false)
+ NysiisMetric.compare("", "xyz").isDefined should be (false)
+ }
+ }
+ "non-phonetic arguments" should returns {
+ "None" in {
+ NysiisMetric.compare("123", "123").isDefined should be (false)
+ NysiisMetric.compare("123", "").isDefined should be (false)
+ NysiisMetric.compare("", "123").isDefined should be (false)
+ }
+ }
+ "phonetically similar arguments" should returns {
+ "Boolean indicating true" in {
+ NysiisMetric.compare("ham", "hum").get should be (true)
+ }
+ }
+ "phonetically dissimilar arguments" should returns {
+ "Boolean indicating false" in {
+ NysiisMetric.compare("dumb", "gum").get should be (false)
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisSpec.scala
new file mode 100755
index 0000000..7186238
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisSpec.scala
@@ -0,0 +1,163 @@
+package org.hashtree.stringmetric.phonetic
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class NysiisSpec extends ScalaTest {
+ "Nysiis" should provide {
+ "compute method" when passed {
+ "empty argument" should returns {
+ "None" in {
+ Nysiis.compute("").isDefined should be (false)
+ }
+ }
+ "non-phonetic argument" should returns {
+ "None" in {
+ Nysiis.compute("123").isDefined should be (false)
+ }
+ }
+ "phonetic argument" should returns {
+ "Some" in {
+ // a
+ Nysiis.compute("a").get should equal ("a")
+ Nysiis.compute("aa").get should equal ("a")
+
+ // b
+ Nysiis.compute("b").get should equal ("b")
+ Nysiis.compute("bb").get should equal ("bb")
+
+ // c
+ Nysiis.compute("c").get should equal ("c")
+ Nysiis.compute("cc").get should equal ("cc")
+
+ // d
+ Nysiis.compute("d").get should equal ("d")
+ Nysiis.compute("dd").get should equal ("dd")
+
+ // e
+ Nysiis.compute("e").get should equal ("e")
+ Nysiis.compute("ee").get should equal ("y")
+
+ // f
+ Nysiis.compute("f").get should equal ("f")
+ Nysiis.compute("ff").get should equal ("ff")
+
+ // g
+ Nysiis.compute("g").get should equal ("g")
+ Nysiis.compute("gg").get should equal ("gg")
+
+ // h
+ Nysiis.compute("h").get should equal ("h")
+ Nysiis.compute("hh").get should equal ("hh")
+
+ // i
+ Nysiis.compute("i").get should equal ("i")
+ Nysiis.compute("ii").get should equal ("i")
+
+ // j
+ Nysiis.compute("j").get should equal ("j")
+ Nysiis.compute("jj").get should equal ("jj")
+
+ // k
+ Nysiis.compute("k").get should equal ("c")
+ Nysiis.compute("kk").get should equal ("cc")
+
+ // l
+ Nysiis.compute("l").get should equal ("l")
+ Nysiis.compute("ll").get should equal ("ll")
+
+ // m
+ Nysiis.compute("m").get should equal ("m")
+ Nysiis.compute("mm").get should equal ("mn")
+
+ // n
+ Nysiis.compute("n").get should equal ("n")
+ Nysiis.compute("nn").get should equal ("nn")
+
+ // o
+ Nysiis.compute("o").get should equal ("o")
+ Nysiis.compute("oo").get should equal ("o")
+
+ // p
+ Nysiis.compute("p").get should equal ("p")
+ Nysiis.compute("pp").get should equal ("pp")
+
+ // q
+ Nysiis.compute("q").get should equal ("q")
+ Nysiis.compute("qq").get should equal ("qg")
+
+ // r
+ Nysiis.compute("r").get should equal ("r")
+ Nysiis.compute("rr").get should equal ("rr")
+
+ // s
+ Nysiis.compute("s").get should equal ("s")
+ Nysiis.compute("ss").get should equal ("s")
+
+ // t
+ Nysiis.compute("t").get should equal ("t")
+ Nysiis.compute("tt").get should equal ("tt")
+
+ // u
+ Nysiis.compute("u").get should equal ("u")
+ Nysiis.compute("uu").get should equal ("u")
+
+ // v
+ Nysiis.compute("v").get should equal ("v")
+ Nysiis.compute("vv").get should equal ("vv")
+
+ // w
+ Nysiis.compute("w").get should equal ("w")
+ Nysiis.compute("ww").get should equal ("ww")
+
+ // x
+ Nysiis.compute("x").get should equal ("x")
+ Nysiis.compute("xx").get should equal ("xx")
+
+ // y
+ Nysiis.compute("y").get should equal ("y")
+ Nysiis.compute("yy").get should equal ("yy")
+
+ // z
+ Nysiis.compute("z").get should equal ("z")
+ Nysiis.compute("zz").get should equal ("z")
+
+ // Head cases.
+ Nysiis.compute("mac").get should equal ("mc")
+ Nysiis.compute("kn").get should equal ("nn")
+ Nysiis.compute("k").get should equal ("c")
+ Nysiis.compute("ph").get should equal ("ff")
+ Nysiis.compute("pf").get should equal ("ff")
+ Nysiis.compute("sch").get should equal ("s") // dropby wrongly says ss
+
+ // Last cases.
+ Nysiis.compute("ee").get should equal ("y")
+ Nysiis.compute("ie").get should equal ("y")
+ Nysiis.compute("dt").get should equal ("d")
+ Nysiis.compute("rt").get should equal ("d")
+ Nysiis.compute("rd").get should equal ("d")
+ Nysiis.compute("nt").get should equal ("d")
+ Nysiis.compute("nd").get should equal ("d")
+
+ // Core cases.
+ Nysiis.compute("eev").get should equal ("yv") //dropby wrongly says eaf
+ Nysiis.compute("zev").get should equal ("zaf")
+ Nysiis.compute("kkn").get should equal ("cn")
+ Nysiis.compute("sschn").get should equal ("ssn")
+ Nysiis.compute("pph").get should equal ("pf")
+
+ // Miscellaneous.
+ Nysiis.compute("macdonald").get should equal ("mcdanald")
+ Nysiis.compute("phone").get should equal ("ffan")
+ Nysiis.compute("aggregate").get should equal ("agragat")
+ Nysiis.compute("accuracy").get should equal ("acaracy")
+ Nysiis.compute("encyclopedia").get should equal ("encyclapad")
+ Nysiis.compute("honorificabilitudinitatibus").get should equal ("hanarafacabalatadanatatab")
+ Nysiis.compute("antidisestablishmentarianism").get should equal ("antadasastablasnantaranasn")
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/readme.md b/readme.md
index 0840cb5..70c6d03 100755
--- a/readme.md
+++ b/readme.md
@@ -7,6 +7,7 @@ A collection of string metrics implemented in Scala. Includes a light-weight cor
* Jaro-Winkler
* Levenshtein
* Metaphone
+* NYSIIS
* Soundex
## Building the API