diff options
author | Rocky Madden <git@rockymadden.com> | 2012-11-18 19:31:32 -0700 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2012-11-18 19:31:32 -0700 |
commit | 72dfa5ebe75b707f192cbe65c23393d08dbd8f8a (patch) | |
tree | d14db05d2a671f9e6cf9c0cb4c139003f36a8e50 /core/source | |
parent | a40947a8554332e97d05db8850ab1b6cd0540d9a (diff) | |
download | stringmetric-72dfa5ebe75b707f192cbe65c23393d08dbd8f8a.tar.gz stringmetric-72dfa5ebe75b707f192cbe65c23393d08dbd8f8a.tar.bz2 stringmetric-72dfa5ebe75b707f192cbe65c23393d08dbd8f8a.zip |
Created refined NYSIIS algorithm, metric, commands, and specs.
Diffstat (limited to 'core/source')
4 files changed, 413 insertions, 0 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/RefinedNysiisAlgorithm.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/RefinedNysiisAlgorithm.scala new file mode 100755 index 0000000..61393ec --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/RefinedNysiisAlgorithm.scala @@ -0,0 +1,135 @@ +package org.hashtree.stringmetric.phonetic + +import org.hashtree.stringmetric.{ FilterableStringAlgorithm, StringAlgorithm, StringFilter } +import org.hashtree.stringmetric.filter.StringFilterDelegate +import scala.annotation.tailrec + +/** An implementation of the refined NYSIIS [[org.hashtree.stringmetric.StringAlgorithm]]. */ +object RefinedNysiisAlgorithm extends StringAlgorithm with FilterableStringAlgorithm { + type ComputeReturn = String + + override def compute(charArray: Array[Char])(implicit stringFilter: StringFilter): Option[Array[Char]] = { + val ca = stringFilter.filter(charArray) + + if (ca.length == 0) None + else { + val cal = ca.map(_.toLower) + + if (cal.head < 97 || cal.head > 122) None + else { + val thl = transcodeLast(transcodeHead(cal.head +: cleanLast(cal.tail, Set('s', 'z')))) + val t = transcode(Array.empty[Char], thl.head, thl.tail, Array.empty[Char]) + + if (t.length == 1) Some(t) + else Some(deduplicate(t.head +: cleanTerminal(cleanLast(t.tail, Set('a'))))) + } + } + } + + override def compute(string: String)(implicit stringFilter: StringFilter): Option[ComputeReturn] = + compute(stringFilter.filter(string.toCharArray))(new StringFilterDelegate) match { + case Some(ny) => Some(ny.mkString) + case None => None + } + + private[this] def cleanLast(ca: Array[Char], s: Set[Char]) = + if (ca.length == 0) ca + else if(s.contains(ca.last)) ca.dropRight(ca.reverseIterator.takeWhile(c => s.contains(c)).length) + else ca + + private[this] def cleanTerminal(ca: Array[Char]) = + if (ca.length >= 2 && ca.last == 'y' && ca(ca.length - 2) == 'a') ca.dropRight(2) :+ 'y' + else ca + + private[this] def deduplicate(ca: Array[Char]) = + if (ca.length <= 1) ca + else + ca.sliding(2).withFilter(a => a(0) != a(1)).map(a => a(0)).toArray[Char] :+ ca.last + + @tailrec + private[this] def transcode(l: Array[Char], c: Char, r: Array[Char], o: Array[Char]): Array[Char] = { + if (c == '\0' && r.length == 0) o + else { + val shift = (d: Int, ca: Array[Char]) => { + val sa = r.splitAt(d - 1) + + ( + if (sa._1.length > 0) (l :+ c) ++ sa._1 else l :+ c, + if (sa._2.length > 0) sa._2.head else '\0', + if (sa._2.length > 1) sa._2.tail else Array.empty[Char], + ca + ) + } + + val t = { + c match { + case 'a' | 'i' | 'o' | 'u' => + if (l.length == 0) shift(1, o :+ c) + else shift(1, o :+ 'a') + case 'b' | 'c' | 'f' | 'j' | 'l' | 'n' | 'r' | 't' | 'v' | 'x' => shift(1, o :+ c) + case 'd' => + if (r.length >= 1 && r.head == 'g') shift(2, o :+ 'g') else shift(1, o :+ c) + case 'e' => + if (l.length == 0) shift(1, o :+ c) + else if (r.length >= 1 && r.head == 'v') shift(2, o ++ Array('a', 'f')) + else shift(1, o :+ 'a') + case 'g' => + if (r.length >= 2 && r.head == 'h' && r(1) == 't') shift(3, o ++ Array('g', 't')) + else shift(1, o :+ c) + case 'h' => + if (l.length == 0) shift(1, o :+ c) + else if (!Alphabet.isVowel(l.last) || (r.length >= 1 && !Alphabet.isVowel(r.head))) shift(1, o) + else shift(1, o :+ c) + case 'k' => if (r.length >= 1 && r.head == 'n') shift(2, o :+ 'n') else shift(1, o :+ 'c') + case 'm' => if (l.length == 0) shift(1, o :+ c) else shift(1, o :+ 'n') + case 'p' => if (r.length >= 1 && r.head == 'h') shift(2, o :+ 'f') else shift(1, o :+ c) + case 'q' => if (l.length == 0) shift(1, o :+ c) else shift(1, o :+ 'g') + case 's' => + if (r.length >= 2 && r.head == 'c' && r(1) == 'h') shift(3, o :+ c) + else if (r.length >= 1 && r.head == 'h') shift(2, o :+ c) + else shift(1, o :+ c) + case 'w' => + if (l.length >= 1 && Alphabet.isVowel(l.last)) shift(1, o) + else if (r.length >= 1 && r.head == 'r') shift(2, o :+ 'r') + else shift(1, o :+ c) + case 'y' => + if (l.length >= 1 && r.length >= 2 && r.head == 'w') shift(2, o :+ 'a') + else if (r.length >= 1 && r.head == 'w') shift(2, o :+ c) + else if (l.length >= 1 && r.length >= 1) shift(1, o :+ 'a') + else shift(1, o :+ c) + case 'z' => if (l.length == 0) shift(1, o :+ c) else shift(1, o :+ 's') + case _ => shift(1, o) + } + } + + transcode(t._1, t._2, t._3, t._4) + } + } + + private[this] def transcodeHead(ca: Array[Char]) = { + if (ca.length == 0) ca + else + ca.head match { + case 'm' if (ca.length >= 3 && (ca(1) == 'a' && ca(2) == 'c')) => Array('m', 'c') ++ ca.takeRight(ca.length - 3) + case 'p' if (ca.length >= 2 && ca(1) == 'f') => 'f' +: ca.takeRight(ca.length - 2) + case _ => ca + } + } + + private[this] def transcodeLast(ca: Array[Char]) = { + if (ca.length >= 2) { + val l = ca(ca.length - 1) + val lm1 = ca(ca.length - 2) + lazy val take2 = ca.take(ca.length - 2) + + l match { + case 'd' if (lm1 == 'n' || lm1 == 'r') => take2 :+ 'd' + case 'e' if (lm1 == 'e' || lm1 == 'i' || lm1 =='y') => take2 :+ 'y' + case 't' if (lm1 == 'd' || lm1 == 'n' || lm1 == 'r') => take2 :+ 'd' + case 'x' if (lm1 == 'e') => take2 ++ Array('e', 'c') + case 'x' if (lm1 == 'i') => take2 ++ Array('i', 'c') + case _ => ca + } + } else ca + } +}
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/RefinedNysiisMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/RefinedNysiisMetric.scala new file mode 100755 index 0000000..732386d --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/RefinedNysiisMetric.scala @@ -0,0 +1,33 @@ +package org.hashtree.stringmetric.phonetic + +import org.hashtree.stringmetric.{ FilterableStringMetric, StringFilter, StringMetric } +import org.hashtree.stringmetric.filter.StringFilterDelegate + +/** An implementation of the refined NYSIIS [[org.hashtree.stringmetric.StringMetric]]. */ +object RefinedNysiisMetric extends StringMetric with FilterableStringMetric { + type CompareReturn = Boolean + + override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit stringFilter: StringFilter): Option[CompareReturn] = { + + val ca1 = stringFilter.filter(charArray1) + lazy val ca2 = stringFilter.filter(charArray2) + + if (ca1.length == 0 || ca2.length == 0) None + else { + val rny1 = RefinedNysiisAlgorithm.compute(ca1) + lazy val rny2 = RefinedNysiisAlgorithm.compute(ca2) + + if (!rny1.isDefined || rny1.get.length == 0 || !rny2.isDefined || rny2.get.length == 0) None + else Some(rny1.get.sameElements(rny2.get)) + } + } + + override def compare(string1: String, string2: String) + (implicit stringFilter: StringFilter): Option[CompareReturn] = + + compare( + stringFilter.filter(string1.toCharArray), + stringFilter.filter(string2.toCharArray) + )(new StringFilterDelegate) +}
\ No newline at end of file diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/RefinedNysiisAlgorithmSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/RefinedNysiisAlgorithmSpec.scala new file mode 100755 index 0000000..8b47456 --- /dev/null +++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/RefinedNysiisAlgorithmSpec.scala @@ -0,0 +1,208 @@ +package org.hashtree.stringmetric.phonetic + +import org.hashtree.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class RefinedNysiisAlgorithmSpec extends ScalaTest { + "RefinedNysiisAlgorithm" should provide { + "compute method" when passed { + "empty argument" should returns { + "None" in { + RefinedNysiisAlgorithm.compute("").isDefined should be (false) + } + } + "non-phonetic argument" should returns { + "None" in { + RefinedNysiisAlgorithm.compute("123").isDefined should be (false) + } + } + "phonetic argument" should returns { + "Some" in { + // a + RefinedNysiisAlgorithm.compute("a").get should equal ("a") + RefinedNysiisAlgorithm.compute("aa").get should equal ("a") + + // b + RefinedNysiisAlgorithm.compute("b").get should equal ("b") + RefinedNysiisAlgorithm.compute("bb").get should equal ("b") + + // c + RefinedNysiisAlgorithm.compute("c").get should equal ("c") + RefinedNysiisAlgorithm.compute("cc").get should equal ("c") + + // d + RefinedNysiisAlgorithm.compute("d").get should equal ("d") + RefinedNysiisAlgorithm.compute("dd").get should equal ("d") + + // e + RefinedNysiisAlgorithm.compute("e").get should equal ("e") + RefinedNysiisAlgorithm.compute("ee").get should equal ("y") + + // f + RefinedNysiisAlgorithm.compute("f").get should equal ("f") + RefinedNysiisAlgorithm.compute("ff").get should equal ("f") + + // g + RefinedNysiisAlgorithm.compute("g").get should equal ("g") + RefinedNysiisAlgorithm.compute("gg").get should equal ("g") + + // h + RefinedNysiisAlgorithm.compute("h").get should equal ("h") + RefinedNysiisAlgorithm.compute("hh").get should equal ("h") + + // i + RefinedNysiisAlgorithm.compute("i").get should equal ("i") + RefinedNysiisAlgorithm.compute("ii").get should equal ("i") + + // j + RefinedNysiisAlgorithm.compute("j").get should equal ("j") + RefinedNysiisAlgorithm.compute("jj").get should equal ("j") + + // k + RefinedNysiisAlgorithm.compute("k").get should equal ("c") + RefinedNysiisAlgorithm.compute("kk").get should equal ("c") + + // l + RefinedNysiisAlgorithm.compute("l").get should equal ("l") + RefinedNysiisAlgorithm.compute("ll").get should equal ("l") + + // m + RefinedNysiisAlgorithm.compute("m").get should equal ("m") + RefinedNysiisAlgorithm.compute("mm").get should equal ("mn") + + // n + RefinedNysiisAlgorithm.compute("n").get should equal ("n") + RefinedNysiisAlgorithm.compute("nn").get should equal ("n") + + // o + RefinedNysiisAlgorithm.compute("o").get should equal ("o") + RefinedNysiisAlgorithm.compute("oo").get should equal ("o") + + // p + RefinedNysiisAlgorithm.compute("p").get should equal ("p") + RefinedNysiisAlgorithm.compute("pp").get should equal ("p") + + // q + RefinedNysiisAlgorithm.compute("q").get should equal ("q") + RefinedNysiisAlgorithm.compute("qq").get should equal ("qg") + + // r + RefinedNysiisAlgorithm.compute("r").get should equal ("r") + RefinedNysiisAlgorithm.compute("rr").get should equal ("r") + + // s + RefinedNysiisAlgorithm.compute("s").get should equal ("s") + RefinedNysiisAlgorithm.compute("ss").get should equal ("s") + + // t + RefinedNysiisAlgorithm.compute("t").get should equal ("t") + RefinedNysiisAlgorithm.compute("tt").get should equal ("t") + + // u + RefinedNysiisAlgorithm.compute("u").get should equal ("u") + RefinedNysiisAlgorithm.compute("uu").get should equal ("u") + + // v + RefinedNysiisAlgorithm.compute("v").get should equal ("v") + RefinedNysiisAlgorithm.compute("vv").get should equal ("v") + + // w + RefinedNysiisAlgorithm.compute("w").get should equal ("w") + RefinedNysiisAlgorithm.compute("ww").get should equal ("w") + + // x + RefinedNysiisAlgorithm.compute("x").get should equal ("x") + RefinedNysiisAlgorithm.compute("xx").get should equal ("x") + + // y + RefinedNysiisAlgorithm.compute("y").get should equal ("y") + RefinedNysiisAlgorithm.compute("yy").get should equal ("y") + RefinedNysiisAlgorithm.compute("ybyb").get should equal ("ybab") + + // z + RefinedNysiisAlgorithm.compute("z").get should equal ("z") + RefinedNysiisAlgorithm.compute("zz").get should equal ("z") + + // Head cases. + RefinedNysiisAlgorithm.compute("mac").get should equal ("mc") + RefinedNysiisAlgorithm.compute("pf").get should equal ("f") + + // Last cases. + RefinedNysiisAlgorithm.compute("ix").get should equal ("ic") + RefinedNysiisAlgorithm.compute("ex").get should equal ("ec") + RefinedNysiisAlgorithm.compute("ye").get should equal ("y") + RefinedNysiisAlgorithm.compute("ee").get should equal ("y") + RefinedNysiisAlgorithm.compute("ie").get should equal ("y") + RefinedNysiisAlgorithm.compute("dt").get should equal ("d") + RefinedNysiisAlgorithm.compute("rt").get should equal ("d") + RefinedNysiisAlgorithm.compute("rd").get should equal ("d") + RefinedNysiisAlgorithm.compute("nt").get should equal ("d") + RefinedNysiisAlgorithm.compute("nd").get should equal ("d") + + // Core cases. + RefinedNysiisAlgorithm.compute("bevb").get should equal ("bafb") + RefinedNysiisAlgorithm.compute("bghtb").get should equal ("bgtb") + RefinedNysiisAlgorithm.compute("bdgb").get should equal ("bgb") + RefinedNysiisAlgorithm.compute("bphb").get should equal ("bfb") + RefinedNysiisAlgorithm.compute("bknb").get should equal ("bnb") + RefinedNysiisAlgorithm.compute("bshb").get should equal ("bsb") + RefinedNysiisAlgorithm.compute("bschb").get should equal ("bsb") + RefinedNysiisAlgorithm.compute("bywb").get should equal ("bab") + RefinedNysiisAlgorithm.compute("byw").get should equal ("by") + RefinedNysiisAlgorithm.compute("ywb").get should equal ("yb") + RefinedNysiisAlgorithm.compute("bwrb").get should equal ("brb") + + // Transcode cases. + RefinedNysiisAlgorithm.compute("bay").get should equal ("by") + + // Miscellaneous. + RefinedNysiisAlgorithm.compute("macdonald").get should equal ("mcdanald") + RefinedNysiisAlgorithm.compute("phone").get should equal ("fan") + RefinedNysiisAlgorithm.compute("aggregate").get should equal ("agragat") + RefinedNysiisAlgorithm.compute("accuracy").get should equal ("acaracy") + RefinedNysiisAlgorithm.compute("encyclopedia").get should equal ("encaclapad") + RefinedNysiisAlgorithm.compute("honorificabilitudinitatibus").get should equal ("hanarafacabalatadanatatab") + RefinedNysiisAlgorithm.compute("antidisestablishmentarianism").get should equal ("antadasastablasnantaranasn") + + // Dropby. + RefinedNysiisAlgorithm.compute("edwards").get should equal ("edwad") + RefinedNysiisAlgorithm.compute("parez").get should equal ("par") + RefinedNysiisAlgorithm.compute("macintosh").get should equal ("mcantas") + RefinedNysiisAlgorithm.compute("phillipson").get should equal ("falapsan") + RefinedNysiisAlgorithm.compute("haddix").get should equal ("hadac") + RefinedNysiisAlgorithm.compute("essex").get should equal ("esac") + RefinedNysiisAlgorithm.compute("moye").get should equal ("my") + RefinedNysiisAlgorithm.compute("mckee").get should equal ("mcy") + RefinedNysiisAlgorithm.compute("mackie").get should equal ("mcy") + RefinedNysiisAlgorithm.compute("heitschmidt").get should equal ("hatsnad") + RefinedNysiisAlgorithm.compute("bart").get should equal ("bad") + RefinedNysiisAlgorithm.compute("hurd").get should equal ("had") + RefinedNysiisAlgorithm.compute("hunt").get should equal ("had") + RefinedNysiisAlgorithm.compute("westerlund").get should equal ("wastarlad") + RefinedNysiisAlgorithm.compute("evers").get should equal ("evar") + RefinedNysiisAlgorithm.compute("devito").get should equal ("dafat") + RefinedNysiisAlgorithm.compute("rawson").get should equal ("rasan") + RefinedNysiisAlgorithm.compute("shoulders").get should equal ("saldar") + RefinedNysiisAlgorithm.compute("leighton").get should equal ("lagtan") + RefinedNysiisAlgorithm.compute("wooldridge").get should equal ("waldrag") + RefinedNysiisAlgorithm.compute("oliphant").get should equal ("olafad") + RefinedNysiisAlgorithm.compute("hatchett").get should equal ("hatcat") + RefinedNysiisAlgorithm.compute("mcknight").get should equal ("mcnagt") + RefinedNysiisAlgorithm.compute("rickert").get should equal ("racad") + RefinedNysiisAlgorithm.compute("bowman").get should equal ("banan") + RefinedNysiisAlgorithm.compute("vasquez").get should equal ("vasg") + RefinedNysiisAlgorithm.compute("bashaw").get should equal ("bas") + RefinedNysiisAlgorithm.compute("schoenhoeft").get should equal ("sanaft") // dropby wrongly says scanaft + RefinedNysiisAlgorithm.compute("heywood").get should equal ("had") + RefinedNysiisAlgorithm.compute("hayman").get should equal ("hanan") + RefinedNysiisAlgorithm.compute("seawright").get should equal ("saragt") + RefinedNysiisAlgorithm.compute("kratzer").get should equal ("cratsar") + RefinedNysiisAlgorithm.compute("canaday").get should equal ("canady") + RefinedNysiisAlgorithm.compute("crepeau").get should equal ("crap") + } + } + } + } +}
\ No newline at end of file diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/RefinedNysiisMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/RefinedNysiisMetricSpec.scala new file mode 100755 index 0000000..940c0c2 --- /dev/null +++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/RefinedNysiisMetricSpec.scala @@ -0,0 +1,37 @@ +package org.hashtree.stringmetric.phonetic + +import org.hashtree.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class RefinedNysiisMetricSpec extends ScalaTest { + "RefinedNysiisMetric" should provide { + "compare method" when passed { + "empty arguments" should returns { + "None" in { + RefinedNysiisMetric.compare("", "").isDefined should be (false) + RefinedNysiisMetric.compare("abc", "").isDefined should be (false) + RefinedNysiisMetric.compare("", "xyz").isDefined should be (false) + } + } + "non-phonetic arguments" should returns { + "None" in { + RefinedNysiisMetric.compare("123", "123").isDefined should be (false) + RefinedNysiisMetric.compare("123", "").isDefined should be (false) + RefinedNysiisMetric.compare("", "123").isDefined should be (false) + } + } + "phonetically similar arguments" should returns { + "Boolean indicating true" in { + RefinedNysiisMetric.compare("ham", "hum").get should be (true) + } + } + "phonetically dissimilar arguments" should returns { + "Boolean indicating false" in { + RefinedNysiisMetric.compare("dumb", "gum").get should be (false) + } + } + } + } +}
\ No newline at end of file |