diff options
author | Rocky Madden <git@rockymadden.com> | 2012-10-26 19:46:48 -0600 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2012-10-26 19:46:48 -0600 |
commit | ce234db3048af3d7cb5c825c8e217a9169ab5c3b (patch) | |
tree | 61716b04494d4fce785f2c4407401db492bd7559 | |
parent | 93d1ecd3df6df537f951f63de060a5db92e01235 (diff) | |
download | stringmetric-ce234db3048af3d7cb5c825c8e217a9169ab5c3b.tar.gz stringmetric-ce234db3048af3d7cb5c825c8e217a9169ab5c3b.tar.bz2 stringmetric-ce234db3048af3d7cb5c825c8e217a9169ab5c3b.zip |
Created NYSIIS algorithm, metric, command, and specs.
7 files changed, 473 insertions, 0 deletions
diff --git a/cli/source/core/scala/org/hashtree/stringmetric/cli/command/nysiisMetric.scala b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/nysiisMetric.scala new file mode 100755 index 0000000..d49b99e --- /dev/null +++ b/cli/source/core/scala/org/hashtree/stringmetric/cli/command/nysiisMetric.scala @@ -0,0 +1,58 @@ +package org.hashtree.stringmetric.cli.command + +import org.hashtree.stringmetric.StringFilterDelegate +import org.hashtree.stringmetric.cli._ +import org.hashtree.stringmetric.cli.command._ +import org.hashtree.stringmetric.phonetic.NysiisMetric + +/** + * The nysiisMetric [[org.hashtree.stringmetric.cli.command.Command]]. Compares two strings to determine if they are + * phonetically similarly, per the NYSIIS algorithm. + */ +object nysiisMetric extends Command { + override def main(args: Array[String]): Unit = { + val options = OptionMapUtility.toOptionMap(args) + + try { + // Help. + if (options.contains('h) || options.contains('help)) { + help() + exit(options) + // Execute. + } else if (options.contains('dashless) && options('dashless).count(_ == ' ') == 1) { + execute(options) + exit(options) + // Invalid syntax. + } else { + throw new IllegalArgumentException("Expected valid syntax. See --help.") + } + } catch { + case e => error(e)(options) + } + } + + override def help(): Unit = { + val ls = sys.props("line.separator") + val tab = " " + + println( + "Compares two strings to determine if they are phonetically similarly, per the NYSIIS algorithm." + ls + ls + + "Syntax:" + ls + + tab + "nysiisMetric [Options] string1 string2..." + ls + ls + + "Options:" + ls + + tab + "-h, --help" + ls + + tab + tab + "Outputs description, syntax, and options." + ) + } + + override def execute(options: OptionMap): Unit = { + val strings = options('dashless).split(" ") + + println( + NysiisMetric.compare( + strings(0), + strings(1) + )(new StringFilterDelegate).getOrElse("not comparable").toString + ) + } +}
\ No newline at end of file diff --git a/cli/source/test/scala/org/hashtree/stringmetric/cli/command/nysiisMetricSpec.scala b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/nysiisMetricSpec.scala new file mode 100755 index 0000000..4c0beea --- /dev/null +++ b/cli/source/test/scala/org/hashtree/stringmetric/cli/command/nysiisMetricSpec.scala @@ -0,0 +1,46 @@ +package org.hashtree.stringmetric.cli.command + +import org.hashtree.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class nysiisMetricSpec extends ScalaTest { + "nysiisMetric" should provide { + "main method" when passed { + "valid dashless arguments" should executes { + "print if they are a match" in { + val out = new java.io.ByteArrayOutputStream() + + Console.withOut(out)( + nysiisMetric.main(Array("--unitTest", "--debug", "aBc", "abc")) + ) + + out.toString should equal ("true\n") + out.reset() + + Console.withOut(out)( + nysiisMetric.main(Array("--unitTest", "--debug", "aBc", "xyz")) + ) + + out.toString should equal ("false\n") + out.reset() + + Console.withOut(out)( + nysiisMetric.main(Array("--unitTest", "--debug", "1", "1")) + ) + + out.toString should equal ("not comparable\n") + out.reset() + } + } + "no dashless arguments" should throws { + "IllegalArgumentException" in { + evaluating { + nysiisMetric.main(Array("--unitTest", "--debug")) + } should produce [IllegalArgumentException] + } + } + } + } +}
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/Nysiis.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Nysiis.scala new file mode 100755 index 0000000..90267ee --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Nysiis.scala @@ -0,0 +1,139 @@ +package org.hashtree.stringmetric.phonetic + +import org.hashtree.stringmetric.{ StringAlgorithm, StringFilter, StringFilterDelegate } +import scala.annotation.tailrec + +/** An implementation of the NYSIIS [[org.hashtree.stringmetric.StringAlgorithm]]. */ +object Nysiis extends StringAlgorithm { + override def compute(charArray: Array[Char])(implicit stringFilter: StringFilter): Option[Array[Char]] = { + val ca = stringFilter.filter(charArray) + + if (ca.length == 0) None + else { + val thl = transcodeLast(transcodeHead(ca.map(_.toLower))) + + if (thl.head < 97 || thl.head > 122) None + else { + if (thl.length == 1) Some(thl) + else { + + val ts = thl.splitAt(1) + val t = transcode(ts._1, ts._2.head, ts._2.tail, ts._1) + + Some(t.head +: deduplicate(transcodeClean(t.tail))) + } + } + } + } + + override def compute(string: String)(implicit stringFilter: StringFilter): Option[String] = { + compute(stringFilter.filter(string.toCharArray))(new StringFilterDelegate) match { + case Some(se) => Some(se.mkString) + case None => None + } + } + + private[this] def deduplicate(ca: Array[Char]) = { + if (ca.length <= 1) ca + else + ca.sliding(2).filter(a => a(0) != a(1)).map(a => a(0)).toArray[Char] :+ ca.last + } + + private[this] def isVowel(c: Char) = (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u') + + @tailrec + private[this] def transcode(l: Array[Char], c: Char, r: Array[Char], o: Array[Char]): Array[Char] = { + if (c == '\0' && r.length == 0) o + else { + val shift = (d: Int, ca: Array[Char]) => { + val sa = r.splitAt(d - 1) + + ( + if (sa._1.length > 0) (l :+ c) ++ sa._1 else l :+ c, + if (sa._2.length > 0) sa._2.head else '\0', + if (sa._2.length > 1) sa._2.tail else Array.empty[Char], + ca + ) + } + + val t = { + c match { + case 'a' | 'i' | 'o' | 'u' => shift(1, o :+ 'a') + case 'b' | 'c' | 'd' | 'f' | 'g' | 'j' | 'l' | 'n' | 'r' | 't' | 'v' | 'x' | 'y' => shift(1, o :+ c) + case 'e' => { + if (r.length >= 1 && r.head == 'v') + shift(2, o ++ Array('a', 'f')) + else + shift(1, o :+ 'a') + } + case 'h' => { + if (l.length >= 1 && (!isVowel(l.last) || (r.length >= 1 && !isVowel(r.head)))) + shift(1, o :+ l.last) + else + shift(1, o :+ c) + } + case 'k' => if (r.length >= 1 && r.head == 'n') shift(2, o :+ 'n') else shift(1, o :+ 'c') + case 'm' => shift(1, o :+ 'n') + case 'p' => if (r.length >= 1 && r.head == 'h') shift(2, o ++ Array('f', 'f')) else shift(1, o :+ 'p') + case 'q' => shift(1, o :+ 'g') + case 's' => { + if (r.length >= 2 && r.head == 'c' && r(1) == 'h') + shift(3, o ++ Array('s', 's', 's')) + else + shift(1, o :+ c) + } + case 'w' => { + if (l.length >= 1 && !isVowel(l.last)) + shift(1, o :+ l.last) + else + shift(1, o :+ c) + } + case 'z' => shift(1, o :+ 's') + case _ => shift(1, o) + } + } + + transcode(t._1, t._2, t._3, t._4) + } + } + + private[this] def transcodeClean(ca: Array[Char]) = { + if (ca.length >= 1 && (ca.last == 'a' || ca.last == 's')) + ca.reverse.dropWhile(c => c == 'a' || c == 's').reverse + else if (ca.length >= 2 && ca.last == 'y' && ca(ca.length - 2) == 'a') + ca.dropRight(2) :+ 'y' + else + ca + } + + private[this] def transcodeHead(ca: Array[Char]) = { + val h = ca.take(3).padTo(3, '\0') + + if (h.head == 'm' && h(1) == 'a' && h.last == 'c') + Array('m', 'c', 'c') ++ ca.takeRight(ca.length - 3) + else if (h.head == 's' && h(1) == 'c' && h.last == 'h') + Array('s', 's', 's') ++ ca.takeRight(ca.length - 3) + else if (h.head == 'p' && (h(1) == 'h' || h(1) == 'f')) + Array('f', 'f') ++ ca.takeRight(ca.length - 2) + else if (h.head == 'k' && h(1) == 'n') + Array('n', 'n') ++ ca.takeRight(ca.length - 2) + else if (h.head == 'k') + Array('c') ++ ca.takeRight(ca.length - 1) + else + ca + } + + private[this] def transcodeLast(ca: Array[Char]) = { + val h = ca.take(2).padTo(2, '\0') + + if ( + (h.last == 't' && (h.head == 'd' || h.head == 'r' || h.head == 'n')) || + (h.last == 'd' && (h.head == 'r' || h.head == 'n')) + ) + Array('d') ++ ca.takeRight(ca.length - 2) + else if (h.last == 'e' && (h.head == 'i' || h.head == 'e')) + Array('y') ++ ca.takeRight(ca.length - 2) + else + ca + } +}
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/NysiisMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/NysiisMetric.scala new file mode 100755 index 0000000..0112b06 --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/NysiisMetric.scala @@ -0,0 +1,29 @@ +package org.hashtree.stringmetric.phonetic + +import org.hashtree.stringmetric.{ StringFilter, StringFilterDelegate, StringMetric } + +/** An implementation of the NYSIIS [[org.hashtree.stringmetric.StringMetric]]. */ +object NysiisMetric extends StringMetric { + override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringFilter: StringFilter): Option[Boolean] = { + val ca1 = stringFilter.filter(charArray1) + val ca2 = stringFilter.filter(charArray2) + + if (ca1.length == 0 || ca2.length == 0) None + else { + val ny1 = Nysiis.compute(ca1) + val ny2 = Nysiis.compute(ca2) + + if (!ny1.isDefined || !ny2.isDefined || (ny1.get.length == 0 && ny2.get.length == 0)) + None + else + Some(ny1.get.sameElements(ny2.get)) + } + } + + override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Boolean] = { + compare( + stringFilter.filter(string1.toCharArray), + stringFilter.filter(string2.toCharArray) + )(new StringFilterDelegate) + } +}
\ No newline at end of file diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisMetricSpec.scala new file mode 100755 index 0000000..4c4b02b --- /dev/null +++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisMetricSpec.scala @@ -0,0 +1,37 @@ +package org.hashtree.stringmetric.phonetic + +import org.hashtree.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class NysiisMetricSpec extends ScalaTest { + "NysiisMetric" should provide { + "compare method" when passed { + "empty arguments" should returns { + "None" in { + NysiisMetric.compare("", "").isDefined should be (false) + NysiisMetric.compare("abc", "").isDefined should be (false) + NysiisMetric.compare("", "xyz").isDefined should be (false) + } + } + "non-phonetic arguments" should returns { + "None" in { + NysiisMetric.compare("123", "123").isDefined should be (false) + NysiisMetric.compare("123", "").isDefined should be (false) + NysiisMetric.compare("", "123").isDefined should be (false) + } + } + "phonetically similar arguments" should returns { + "Boolean indicating true" in { + NysiisMetric.compare("ham", "hum").get should be (true) + } + } + "phonetically dissimilar arguments" should returns { + "Boolean indicating false" in { + NysiisMetric.compare("dumb", "gum").get should be (false) + } + } + } + } +}
\ No newline at end of file diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisSpec.scala new file mode 100755 index 0000000..7186238 --- /dev/null +++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/NysiisSpec.scala @@ -0,0 +1,163 @@ +package org.hashtree.stringmetric.phonetic + +import org.hashtree.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class NysiisSpec extends ScalaTest { + "Nysiis" should provide { + "compute method" when passed { + "empty argument" should returns { + "None" in { + Nysiis.compute("").isDefined should be (false) + } + } + "non-phonetic argument" should returns { + "None" in { + Nysiis.compute("123").isDefined should be (false) + } + } + "phonetic argument" should returns { + "Some" in { + // a + Nysiis.compute("a").get should equal ("a") + Nysiis.compute("aa").get should equal ("a") + + // b + Nysiis.compute("b").get should equal ("b") + Nysiis.compute("bb").get should equal ("bb") + + // c + Nysiis.compute("c").get should equal ("c") + Nysiis.compute("cc").get should equal ("cc") + + // d + Nysiis.compute("d").get should equal ("d") + Nysiis.compute("dd").get should equal ("dd") + + // e + Nysiis.compute("e").get should equal ("e") + Nysiis.compute("ee").get should equal ("y") + + // f + Nysiis.compute("f").get should equal ("f") + Nysiis.compute("ff").get should equal ("ff") + + // g + Nysiis.compute("g").get should equal ("g") + Nysiis.compute("gg").get should equal ("gg") + + // h + Nysiis.compute("h").get should equal ("h") + Nysiis.compute("hh").get should equal ("hh") + + // i + Nysiis.compute("i").get should equal ("i") + Nysiis.compute("ii").get should equal ("i") + + // j + Nysiis.compute("j").get should equal ("j") + Nysiis.compute("jj").get should equal ("jj") + + // k + Nysiis.compute("k").get should equal ("c") + Nysiis.compute("kk").get should equal ("cc") + + // l + Nysiis.compute("l").get should equal ("l") + Nysiis.compute("ll").get should equal ("ll") + + // m + Nysiis.compute("m").get should equal ("m") + Nysiis.compute("mm").get should equal ("mn") + + // n + Nysiis.compute("n").get should equal ("n") + Nysiis.compute("nn").get should equal ("nn") + + // o + Nysiis.compute("o").get should equal ("o") + Nysiis.compute("oo").get should equal ("o") + + // p + Nysiis.compute("p").get should equal ("p") + Nysiis.compute("pp").get should equal ("pp") + + // q + Nysiis.compute("q").get should equal ("q") + Nysiis.compute("qq").get should equal ("qg") + + // r + Nysiis.compute("r").get should equal ("r") + Nysiis.compute("rr").get should equal ("rr") + + // s + Nysiis.compute("s").get should equal ("s") + Nysiis.compute("ss").get should equal ("s") + + // t + Nysiis.compute("t").get should equal ("t") + Nysiis.compute("tt").get should equal ("tt") + + // u + Nysiis.compute("u").get should equal ("u") + Nysiis.compute("uu").get should equal ("u") + + // v + Nysiis.compute("v").get should equal ("v") + Nysiis.compute("vv").get should equal ("vv") + + // w + Nysiis.compute("w").get should equal ("w") + Nysiis.compute("ww").get should equal ("ww") + + // x + Nysiis.compute("x").get should equal ("x") + Nysiis.compute("xx").get should equal ("xx") + + // y + Nysiis.compute("y").get should equal ("y") + Nysiis.compute("yy").get should equal ("yy") + + // z + Nysiis.compute("z").get should equal ("z") + Nysiis.compute("zz").get should equal ("z") + + // Head cases. + Nysiis.compute("mac").get should equal ("mc") + Nysiis.compute("kn").get should equal ("nn") + Nysiis.compute("k").get should equal ("c") + Nysiis.compute("ph").get should equal ("ff") + Nysiis.compute("pf").get should equal ("ff") + Nysiis.compute("sch").get should equal ("s") // dropby wrongly says ss + + // Last cases. + Nysiis.compute("ee").get should equal ("y") + Nysiis.compute("ie").get should equal ("y") + Nysiis.compute("dt").get should equal ("d") + Nysiis.compute("rt").get should equal ("d") + Nysiis.compute("rd").get should equal ("d") + Nysiis.compute("nt").get should equal ("d") + Nysiis.compute("nd").get should equal ("d") + + // Core cases. + Nysiis.compute("eev").get should equal ("yv") //dropby wrongly says eaf + Nysiis.compute("zev").get should equal ("zaf") + Nysiis.compute("kkn").get should equal ("cn") + Nysiis.compute("sschn").get should equal ("ssn") + Nysiis.compute("pph").get should equal ("pf") + + // Miscellaneous. + Nysiis.compute("macdonald").get should equal ("mcdanald") + Nysiis.compute("phone").get should equal ("ffan") + Nysiis.compute("aggregate").get should equal ("agragat") + Nysiis.compute("accuracy").get should equal ("acaracy") + Nysiis.compute("encyclopedia").get should equal ("encyclapad") + Nysiis.compute("honorificabilitudinitatibus").get should equal ("hanarafacabalatadanatatab") + Nysiis.compute("antidisestablishmentarianism").get should equal ("antadasastablasnantaranasn") + } + } + } + } +}
\ No newline at end of file @@ -7,6 +7,7 @@ A collection of string metrics implemented in Scala. Includes a light-weight cor * Jaro-Winkler * Levenshtein * Metaphone +* NYSIIS * Soundex ## Building the API |