summaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
author Rocky Madden <git@rockymadden.com> 2012-10-20 15:34:25 -0600
committer Rocky Madden <git@rockymadden.com> 2012-10-20 15:34:25 -0600
commit 9fbe4868503142a7bd2e502b393d3bd60fdb441c (patch)
tree fa2cecb89e3868fecaeb3790e9706862decb7a51 /core
parent f95a07d2481a45ddd8d088651f9669900da51edd (diff)
download stringmetric-9fbe4868503142a7bd2e502b393d3bd60fdb441c.tar.gz
stringmetric-9fbe4868503142a7bd2e502b393d3bd60fdb441c.tar.bz2
stringmetric-9fbe4868503142a7bd2e502b393d3bd60fdb441c.zip
Created Metaphone algorithm, metric, specs, and command.
Diffstat (limited to 'core')
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala 222
-rwxr-xr-x core/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala 30
-rwxr-xr-x core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetricSpec.scala 41
-rwxr-xr-x core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala 173
4 files changed, 466 insertions, 0 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala
new file mode 100755
index 0000000..4305a58
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala
@@ -0,0 +1,222 @@
+package org.hashtree.stringmetric.phonetic
+
+import org.hashtree.stringmetric.{ StringAlgorithm, StringCleaner, StringCleanerDelegate }
+import scala.annotation.tailrec
+
+/** An implementation of the Metaphone [[org.hashtree.stringmetric.StringAlgorithm]]. */
+object Metaphone extends StringAlgorithm {
+ /**
+  * Computes the Metaphone code of a character array.
+  *
+  * The input is cleaned with the supplied cleaner and lower-cased before encoding.
+  *
+  * @param charArray the characters to encode
+  * @param stringCleaner cleaner applied to `charArray` before encoding
+  * @return the code, or `None` when the cleaned input is empty or encodes to nothing
+  */
+ override def compute(charArray: Array[Char])(implicit stringCleaner: StringCleaner): Option[Array[Char]] = {
+   val cleaned = stringCleaner.clean(charArray)
+
+   if (cleaned.isEmpty) None
+   else {
+     val prepared = exceptions(cleaned.map(_.toLower))
+     val encoded = transformations(Array.empty[Char], prepared.head, prepared.tail, Array.empty[Char])
+
+     // An input made only of characters the encoder skips produces an empty code.
+     Some(encoded).filter(_.length > 0)
+   }
+ }
+
+ /**
+  * Computes the Metaphone code of a string.
+  *
+  * The string is cleaned here and then handed to the character array overload together with a
+  * fresh `StringCleanerDelegate` (presumably a pass-through cleaner, so the input is not cleaned
+  * twice -- confirm against its definition).
+  *
+  * @param string the text to encode
+  * @param stringCleaner cleaner applied to `string` before encoding
+  * @return the code as a string, or `None` when nothing could be encoded
+  */
+ override def compute(string: String)(implicit stringCleaner: StringCleaner): Option[String] = {
+   val cleaned = stringCleaner.clean(string.toCharArray)
+
+   compute(cleaned)(new StringCleanerDelegate).map(_.mkString)
+ }
+
+ /**
+  * Applies the word-initial Metaphone exceptions and collapses doubled letters.
+  *
+  *  - A leading `x` becomes `s`.
+  *  - A leading `ae`, `gn`, `kn`, `pn` or `wr` loses its first letter.
+  *  - A leading `wh` becomes `w`.
+  *  - Adjacent duplicate letters are collapsed into one, except for `c`, so `cc` stays intact.
+  *
+  * @param ca the lower-cased characters of the word
+  * @return the adjusted characters; empty only when `ca` is empty
+  */
+ private[this] def exceptions(ca: Array[Char]): Array[Char] = {
+   // Drops the first letter of every doubled pair other than "cc". Inputs shorter than two
+   // characters are returned untouched: for them `sliding(2)` yields one short window (or none),
+   // so the pairwise comparison and the trailing `x.last` cannot be applied safely.
+   val deduplicate = (x: Array[Char]) => {
+     if (x.length < 2) x
+     else x.sliding(2).filter(a => a(0) == 'c' || a(0) != a(1)).map(a => a(0)).toArray[Char] :+ x.last
+   }
+
+   ca.length match {
+     case 0 => Array.empty[Char]
+     case 1 if ca.head == 'x' => Array('s')
+     case 1 => ca
+     case _ if ca.head == 'x' => 's' +: deduplicate(ca.tail)
+     case _ => {
+       ca.take(2).mkString match {
+         // Deduplicated like every other branch, so e.g. "knitting" does not keep its doubled t.
+         case "ae" | "gn" | "kn" | "pn" | "wr" => deduplicate(ca.tail)
+         case "wh" => 'w' +: deduplicate(ca.tail.tail)
+         case _ => deduplicate(ca)
+       }
+     }
+   }
+ }
+
+ /** Returns true when `c` is one of the lower-case vowels a, e, i, o or u. */
+ private[this] def isVowel(c: Char) = c match {
+   case 'a' | 'e' | 'i' | 'o' | 'u' => true
+   case _ => false
+ }
+
+ /**
+  * Walks the word one character at a time and accumulates its Metaphone code.
+  *
+  * Each step inspects the current character together with its left and right context, optionally
+  * appends to the output, and recurses with the cursor advanced by one to three characters.
+  *
+  * @param l input characters already consumed, i.e. everything to the left of `c`
+  * @param c the character being examined; `'\0'` is the sentinel used once the input runs out
+  * @param r input characters still to be examined, i.e. everything to the right of `c`
+  * @param o the code emitted so far
+  * @return `o` once `c` is the sentinel and `r` is empty
+  */
+ @tailrec
+ private[this] def transformations(l: Array[Char], c: Char, r: Array[Char], o: Array[Char]): Array[Char] = {
+ // Sentinel as current character and nothing left to read: the whole input has been consumed.
+ if (c == '\0' && r.length == 0) o
+ else {
+ // Cursor helpers for advancing by d characters: lshift and rshift build the new left and right
+ // context, cshift picks the new current character (the sentinel when r is too short).
+ val lshift = (d: Int) => if (d == 1) l :+ c else (l :+ c) ++ r.take(d - 1)
+ val cshift = (d: Int) => if (d > r.length) '\0' else r(d - 1)
+ val rshift = (d: Int) => if (d >= r.length) Array.empty[Char] else r.drop(d)
+
+ c match {
+ // Vowels are emitted only when nothing precedes them.
+ case 'a' | 'e' | 'i' | 'o' | 'u' => {
+ if (l.length == 0)
+ transformations(lshift(1), cshift(1), rshift(1), o :+ c)
+ else
+ transformations(lshift(1), cshift(1), rshift(1), o)
+ }
+ // These consonants are always copied to the output unchanged.
+ case 'f' | 'j' | 'l' | 'm' | 'n' | 'r' => transformations(lshift(1), cshift(1), rshift(1), o :+ c)
+ // b is dropped when it is the last character and directly follows m; otherwise it is kept.
+ case 'b' => {
+ if (
+ (l.length >= 1 && l.last == 'm' && r.length == 0)
+ )
+ transformations(lshift(1), cshift(1), rshift(1), o)
+ else
+ transformations(lshift(1), cshift(1), rshift(1), o :+ 'b')
+ }
+ // c, checked in this order: "sch" emits k (the h is then handled on its own), "cia" emits x
+ // and consumes three characters, "ch" emits x and consumes two, c before i/e/y emits nothing
+ // after s and emits s otherwise, anything else emits k.
+ // NOTE(review): the second alternative of the "ch" condition is already implied by the first.
+ case 'c' => {
+ if (
+ (r.length >= 1 && r.head == 'h' && l.length >= 1 && l.last == 's')
+ )
+ transformations(lshift(1), cshift(1), rshift(1), o :+ 'k')
+ else if (
+ (r.length >= 2 && r.head == 'i' && r.tail.head == 'a')
+ )
+ transformations(lshift(3), cshift(3), rshift(3), o :+ 'x')
+ else if (
+ (r.length >= 1 && r.head == 'h') ||
+ (l.length >= 1 && r.length >= 1 && l.last == 's' && r.head == 'h')
+ )
+ transformations(lshift(2), cshift(2), rshift(2), o :+ 'x')
+ else if (
+ (l.length >= 1 && r.length >= 1 && l.last == 's' && (
+ r.head == 'i' ||
+ r.head == 'e' ||
+ r.head == 'y'
+ )
+ )
+ )
+ transformations(lshift(1), cshift(1), rshift(1), o)
+ else if (
+ (r.length >= 1 && (
+ r.head == 'i' ||
+ r.head == 'e' ||
+ r.head == 'y'
+ )
+ )
+ )
+ transformations(lshift(1), cshift(1), rshift(1), o :+ 's')
+ else
+ transformations(lshift(1), cshift(1), rshift(1), o :+ 'k')
+ }
+ // d emits j when followed by g and then e/y/i; otherwise it emits t.
+ case 'd' => {
+ if (
+ (r.length >= 2 && r.head == 'g' && (
+ r.tail.head == 'e' ||
+ r.tail.head == 'y' ||
+ r.tail.head == 'i'
+ )
+ )
+ )
+ transformations(lshift(1), cshift(1), rshift(1), o :+ 'j') // advance past the d only; the g is handled next
+ else
+ transformations(lshift(1), cshift(1), rshift(1), o :+ 't')
+ }
+ // g emits nothing before an h that is not the last character, before a final n, or before a
+ // final "ned"; before i/e/y it emits j and consumes two characters; otherwise it emits k.
+ case 'g' => {
+ if (
+ (r.length > 1 && r.head == 'h') ||
+ (r.length == 1 && r.head == 'n') ||
+ (r.length == 3 && r.head == 'n' && r.tail.head == 'e' && r.tail.tail.head == 'd')
+ )
+ transformations(lshift(1), cshift(1), rshift(1), o) // advance past the g only, emitting nothing
+ else if (
+ (r.length >= 1 && (
+ r.head == 'i' ||
+ r.head == 'e' ||
+ r.head == 'y'
+ )
+ )
+ )
+ transformations(lshift(2), cshift(2), rshift(2), o :+ 'j')
+ else
+ transformations(lshift(1), cshift(1), rshift(1), o :+ 'k')
+ }
+ // h is dropped after a vowel unless another vowel follows, and when the two preceding input
+ // characters are "ch", "sh", "ph", "th" or "gh"; otherwise it is kept.
+ case 'h' => {
+ if (
+ (l.length >= 1 && isVowel(l.last) && (r.length == 0 || !isVowel(r.head))) ||
+ (l.length >= 2 && l.last == 'h' && (
+ l(l.length - 2) == 'c' ||
+ l(l.length - 2) == 's' ||
+ l(l.length - 2) == 'p' ||
+ l(l.length - 2) == 't' ||
+ l(l.length - 2) == 'g'
+ )
+ )
+ )
+ transformations(lshift(1), cshift(1), rshift(1), o)
+ else
+ transformations(lshift(1), cshift(1), rshift(1), o :+ 'h')
+ }
+ // k is dropped directly after c; otherwise it is kept.
+ case 'k' => {
+ if (
+ (l.length >= 1 && l.last == 'c') // looks at the preceding input character, not at the emitted code
+ )
+ transformations(lshift(1), cshift(1), rshift(1), o)
+ else
+ transformations(lshift(1), cshift(1), rshift(1), o :+ 'k')
+ }
+ // "ph" emits f and consumes both characters; any other p is kept.
+ case 'p' => {
+ if (
+ (r.length >= 1 && r.head == 'h')
+ )
+ transformations(lshift(2), cshift(2), rshift(2), o :+ 'f')
+ else
+ transformations(lshift(1), cshift(1), rshift(1), o :+ 'p')
+ }
+ case 'q' => transformations(lshift(1), cshift(1), rshift(1), o :+ 'k')
+ // "sio"/"sia" emit x and consume three characters, "sh" emits x and consumes two; any other s is kept.
+ case 's' => {
+ if (
+
+ (r.length >= 2 && r.head == 'i' && (
+ r.tail.head == 'o' ||
+ r.tail.head == 'a'
+ )
+ )
+ )
+ transformations(lshift(3), cshift(3), rshift(3), o :+ 'x')
+ else if (
+ (r.length >= 1 && r.head == 'h')
+ )
+ transformations(lshift(2), cshift(2), rshift(2), o :+ 'x')
+ else
+ transformations(lshift(1), cshift(1), rshift(1), o :+ 's')
+ }
+ // "tia"/"tio" emit x and consume three characters, "th" emits the digit '0' and consumes two,
+ // the t of "tch" is dropped; any other t is kept.
+ case 't' => {
+ if (
+ (r.length >= 2 && r.head == 'i' && (
+ r.tail.head == 'a' ||
+ r.tail.head == 'o'
+ )
+ )
+ )
+ transformations(lshift(3), cshift(3), rshift(3), o :+ 'x')
+ else if (r.length >= 1 && r.head == 'h')
+ transformations(lshift(2), cshift(2), rshift(2), o :+ '0')
+ else if (r.length >= 2 && r.head == 'c' && r.tail.head == 'h')
+ transformations(lshift(1), cshift(1), rshift(1), o) // advance past the t only; "ch" is handled next
+ else
+ transformations(lshift(1), cshift(1), rshift(1), o :+ 't')
+ }
+ case 'v' => transformations(lshift(1), cshift(1), rshift(1), o :+ 'f')
+ // w and y are kept only when a vowel follows.
+ case 'w' | 'y' => {
+ if (r.length == 0 || !isVowel(r.head))
+ transformations(lshift(1), cshift(1), rshift(1), o)
+ else
+ transformations(lshift(1), cshift(1), rshift(1), o :+ c)
+ }
+ case 'x' => transformations(lshift(1), cshift(1), rshift(1), (o :+ 'k') :+ 's')
+ case 'z' => transformations(lshift(1), cshift(1), rshift(1), o :+ 's')
+ // Every other character is skipped without emitting anything.
+ case _ => transformations(lshift(1), cshift(1), rshift(1), o)
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala
new file mode 100755
index 0000000..727d432
--- /dev/null
+++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala
@@ -0,0 +1,30 @@
+package org.hashtree.stringmetric.phonetic
+
+import org.hashtree.stringmetric.{ StringCleaner, StringCleanerDelegate, StringMetric }
+
+/** An implementation of the Metaphone [[org.hashtree.stringmetric.StringMetric]]. */
+object MetaphoneMetric extends StringMetric {
+ /**
+  * Tells whether two character arrays share the same Metaphone code.
+  *
+  * @param charArray1 the first input
+  * @param charArray2 the second input
+  * @param stringCleaner cleaner applied to both inputs before encoding
+  * @return `None` when either cleaned input is empty or cannot be encoded, otherwise whether the
+  *         two codes are element-wise equal
+  */
+ override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringCleaner: StringCleaner): Option[Boolean] = {
+   val cleaned1 = stringCleaner.clean(charArray1)
+   val cleaned2 = stringCleaner.clean(charArray2)
+
+   if (cleaned1.isEmpty || cleaned2.isEmpty) None
+   else {
+     // Both codes are computed before either is inspected.
+     val code1 = Metaphone.compute(cleaned1)
+     val code2 = Metaphone.compute(cleaned2)
+
+     (code1, code2) match {
+       case (Some(a), Some(b)) if a.length > 0 || b.length > 0 => Some(a.sameElements(b))
+       case _ => None
+     }
+   }
+ }
+
+ /**
+  * Tells whether two strings share the same Metaphone code.
+  *
+  * Both strings are cleaned here and then handed to the character array overload together with a
+  * fresh `StringCleanerDelegate` (presumably a pass-through cleaner -- confirm against its
+  * definition).
+  *
+  * @param string1 the first input
+  * @param string2 the second input
+  * @param stringCleaner cleaner applied to both inputs before encoding
+  * @return `None` when either input cannot be encoded, otherwise whether the codes match
+  */
+ override def compare(string1: String, string2: String)(implicit stringCleaner: StringCleaner): Option[Boolean] = {
+   // A simple equality check cannot be used as a shortcut: inputs that contain no letters must
+   // still be detected.
+   val cleaned1 = stringCleaner.clean(string1.toCharArray)
+   val cleaned2 = stringCleaner.clean(string2.toCharArray)
+
+   compare(cleaned1, cleaned2)(new StringCleanerDelegate)
+ }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetricSpec.scala
new file mode 100755
index 0000000..20f45a9
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetricSpec.scala
@@ -0,0 +1,41 @@
+package org.hashtree.stringmetric.phonetic
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+/**
+ * Behavioural specification for `MetaphoneMetric.compare` on string arguments: empty input,
+ * input without letters, and pairs that are expected to match or to differ.
+ */
+@RunWith(classOf[JUnitRunner])
+final class MetaphoneMetricSpec extends ScalaTest {
+ "MetaphoneMetric" should provide {
+ "compare method" when passed {
+ // No result is expected as soon as one side is empty.
+ "empty arguments" should returns {
+ "None" in {
+ MetaphoneMetric.compare("", "").isDefined should be (false)
+ MetaphoneMetric.compare("abc", "").isDefined should be (false)
+ MetaphoneMetric.compare("", "xyz").isDefined should be (false)
+ }
+ }
+ // Digits-only input yields no result either, even when both sides are identical.
+ "non-phonetic arguments" should returns {
+ "None" in {
+ MetaphoneMetric.compare("123", "123").isDefined should be (false)
+ MetaphoneMetric.compare("123", "").isDefined should be (false)
+ MetaphoneMetric.compare("", "123").isDefined should be (false)
+ }
+ }
+ // Different spellings that are expected to compare as equal.
+ "phonetically similar arguments" should returns {
+ "Boolean indicating true" in {
+ MetaphoneMetric.compare("dumb", "dum").get should be (true)
+ MetaphoneMetric.compare("smith", "smeth").get should be (true)
+ MetaphoneMetric.compare("merci", "mercy").get should be (true)
+ }
+ }
+ // Pairs that are expected to compare as different.
+ "phonetically dissimilar arguments" should returns {
+ "Boolean indicating false" in {
+ MetaphoneMetric.compare("dumb", "gum").get should be (false)
+ MetaphoneMetric.compare("smith", "kiss").get should be (false)
+ MetaphoneMetric.compare("merci", "burpy").get should be (false)
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala
new file mode 100755
index 0000000..2324298
--- /dev/null
+++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala
@@ -0,0 +1,173 @@
+package org.hashtree.stringmetric.phonetic
+
+import org.hashtree.stringmetric.ScalaTest
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+/**
+ * Behavioural specification for `Metaphone.compute` on string arguments.
+ *
+ * The "phonetic argument" case is grouped by the letter whose rule it exercises. Most inputs are
+ * synthetic and start with z, which encodes to a leading s. Trailing comments record where other
+ * implementations return a different code.
+ */
+@RunWith(classOf[JUnitRunner])
+final class MetaphoneSpec extends ScalaTest {
+ "Metaphone" should provide {
+ "compute method" when passed {
+ "empty argument" should returns {
+ "None" in {
+ Metaphone.compute("").isDefined should be (false)
+ }
+ }
+ // Digits-only input yields no code.
+ "non-phonetic argument" should returns {
+ "None" in {
+ Metaphone.compute("123").isDefined should be (false)
+ }
+ }
+ "phonetic argument" should returns {
+ "Some" in {
+ // z
+ Metaphone.compute("z").get should equal ("s")
+ Metaphone.compute("zz").get should equal ("s")
+
+ // y
+ Metaphone.compute("zy").get should equal ("s")
+ Metaphone.compute("zyz").get should equal ("ss")
+ Metaphone.compute("zya").get should equal ("sy")
+
+ // x
+ Metaphone.compute("zx").get should equal ("sks")
+ Metaphone.compute("zxz").get should equal ("skss")
+
+ // w
+ Metaphone.compute("zw").get should equal ("s")
+ Metaphone.compute("zwz").get should equal ("ss")
+ Metaphone.compute("zwa").get should equal ("sw")
+
+ // v
+ Metaphone.compute("zv").get should equal ("sf")
+ Metaphone.compute("zvz").get should equal ("sfs")
+
+ // t
+ Metaphone.compute("ztiaz").get should equal ("sxs")
+ Metaphone.compute("ztioz").get should equal ("sxs")
+ Metaphone.compute("zthz").get should equal ("s0s")
+ Metaphone.compute("ztchz").get should equal ("sxs")
+ Metaphone.compute("ztz").get should equal ("sts")
+
+ // s
+ Metaphone.compute("zshz").get should equal ("sxs")
+ Metaphone.compute("zsioz").get should equal ("sxs")
+ Metaphone.compute("zsiaz").get should equal ("sxs")
+ Metaphone.compute("zs").get should equal ("ss")
+ Metaphone.compute("zsz").get should equal ("sss")
+
+ // r
+ Metaphone.compute("zr").get should equal ("sr")
+ Metaphone.compute("zrz").get should equal ("srs")
+
+ // q
+ Metaphone.compute("zq").get should equal ("sk")
+ Metaphone.compute("zqz").get should equal ("sks")
+
+ // p
+ Metaphone.compute("zph").get should equal ("sf")
+ Metaphone.compute("zp").get should equal ("sp")
+ Metaphone.compute("zpz").get should equal ("sps")
+
+ // n
+ Metaphone.compute("zn").get should equal ("sn")
+ Metaphone.compute("znz").get should equal ("sns")
+
+ // m
+ Metaphone.compute("zm").get should equal ("sm")
+ Metaphone.compute("zmz").get should equal ("sms")
+
+ // l
+ Metaphone.compute("zl").get should equal ("sl")
+ Metaphone.compute("zlz").get should equal ("sls")
+
+ // k
+ Metaphone.compute("zck").get should equal ("sk")
+ Metaphone.compute("zk").get should equal ("sk")
+
+ // j
+ Metaphone.compute("zj").get should equal ("sj")
+ Metaphone.compute("zjz").get should equal ("sjs")
+
+ // h
+ Metaphone.compute("zh").get should equal ("sh") // php wrongly says s
+ Metaphone.compute("zah").get should equal ("s")
+ Metaphone.compute("zchh").get should equal ("sx")
+ Metaphone.compute("ha").get should equal ("h")
+
+ // g
+ Metaphone.compute("zgh").get should equal ("skh") // php wrongly says sf
+ Metaphone.compute("zghz").get should equal ("shs") // php wrongly says sfs
+ Metaphone.compute("zgha").get should equal ("sh") // php wrongly says sf, others wrongly say skh
+ Metaphone.compute("zgn").get should equal ("sn")
+ Metaphone.compute("zgns").get should equal ("skns")
+ Metaphone.compute("zgned").get should equal ("snt") // others wrongly say sknt
+ Metaphone.compute("zgneds").get should equal ("sknts") // php wrongly says snts
+ Metaphone.compute("zgi").get should equal ("sj")
+ Metaphone.compute("zgiz").get should equal ("sjs")
+ Metaphone.compute("zge").get should equal ("sj")
+ Metaphone.compute("zgez").get should equal ("sjs")
+ Metaphone.compute("zgy").get should equal ("sj")
+ Metaphone.compute("zgyz").get should equal ("sjs")
+ Metaphone.compute("zg").get should equal ("sk")
+ Metaphone.compute("zgz").get should equal ("sks")
+
+ // f
+ Metaphone.compute("zf").get should equal ("sf")
+ Metaphone.compute("zfz").get should equal ("sfs")
+
+ // d
+ Metaphone.compute("fudge").get should equal ("fjj") // php wrongly says fj
+ Metaphone.compute("dodgy").get should equal ("tjj") // php wrongly says tj, others wrongly say tjjy
+ Metaphone.compute("dodgi").get should equal ("tjj") // php wrongly says tj
+ Metaphone.compute("zd").get should equal ("st")
+ Metaphone.compute("zdz").get should equal ("sts")
+
+ // c
+ Metaphone.compute("zcia").get should equal ("sx")
+ Metaphone.compute("zciaz").get should equal ("sxs")
+ Metaphone.compute("zch").get should equal ("sx")
+ Metaphone.compute("zchz").get should equal ("sxs")
+ Metaphone.compute("zci").get should equal ("ss")
+ Metaphone.compute("zciz").get should equal ("sss")
+ Metaphone.compute("zce").get should equal ("ss")
+ Metaphone.compute("zcez").get should equal ("sss")
+ Metaphone.compute("zcy").get should equal ("ss")
+ Metaphone.compute("zcyz").get should equal ("sss")
+ Metaphone.compute("zsci").get should equal ("ss")
+ Metaphone.compute("zsciz").get should equal ("sss")
+ Metaphone.compute("zsce").get should equal ("ss")
+ Metaphone.compute("zscez").get should equal ("sss")
+ Metaphone.compute("zscy").get should equal ("ss")
+ Metaphone.compute("zscyz").get should equal ("sss")
+ Metaphone.compute("zsch").get should equal ("sskh") // php wrongly says ssx
+ Metaphone.compute("zc").get should equal ("sk")
+ Metaphone.compute("zcz").get should equal ("sks")
+
+ // b
+ Metaphone.compute("zb").get should equal ("sb")
+ Metaphone.compute("zbz").get should equal ("sbs")
+ Metaphone.compute("zmb").get should equal ("sm")
+
+ // Miscellaneous real words, including the word-initial exceptions (ae, gn, kn, pn, wr).
+ Metaphone.compute("dumb").get should equal ("tm")
+ Metaphone.compute("smith").get should equal ("sm0")
+ Metaphone.compute("school").get should equal ("skhl") // php wrongly says sxl
+ Metaphone.compute("merci").get should equal ("mrs")
+ Metaphone.compute("cool").get should equal ("kl")
+ Metaphone.compute("aebersold").get should equal ("ebrslt")
+ Metaphone.compute("gnagy").get should equal ("nj")
+ Metaphone.compute("knuth").get should equal ("n0")
+ Metaphone.compute("pniewski").get should equal ("nsk")
+ Metaphone.compute("wright").get should equal ("rht") // php wrongly says rft
+ Metaphone.compute("phone").get should equal ("fn")
+ Metaphone.compute("aggregate").get should equal ("akrkt")
+ Metaphone.compute("accuracy").get should equal ("akkrs")
+ Metaphone.compute("encyclopedia").get should equal ("ensklpt")
+ Metaphone.compute("honorificabilitudinitatibus").get should equal ("hnrfkblttnttbs")
+ Metaphone.compute("antidisestablishmentarianism").get should equal ("anttsstblxmntrnsm")
+ }
+ }
+ }
+ }
+} \ No newline at end of file