From f95a07d2481a45ddd8d088651f9669900da51edd Mon Sep 17 00:00:00 2001 From: Rocky Madden Date: Sat, 20 Oct 2012 15:25:49 -0600 Subject: Split out the Soundex algorithm from the metric. --- .../org/hashtree/stringmetric/Algorithm.scala | 6 ++ .../hashtree/stringmetric/StringAlgorithm.scala | 6 ++ .../hashtree/stringmetric/phonetic/Soundex.scala | 94 ++++++++++++++++++++++ .../stringmetric/phonetic/SoundexMetric.scala | 92 +++------------------ .../stringmetric/phonetic/SoundexMetricSpec.scala | 39 ++++----- .../stringmetric/phonetic/SoundexSpec.scala | 50 ++++++++++++ 6 files changed, 187 insertions(+), 100 deletions(-) create mode 100755 core/source/core/scala/org/hashtree/stringmetric/Algorithm.scala create mode 100755 core/source/core/scala/org/hashtree/stringmetric/StringAlgorithm.scala create mode 100755 core/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala create mode 100755 core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala diff --git a/core/source/core/scala/org/hashtree/stringmetric/Algorithm.scala b/core/source/core/scala/org/hashtree/stringmetric/Algorithm.scala new file mode 100755 index 0000000..e334d20 --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/Algorithm.scala @@ -0,0 +1,6 @@ +package org.hashtree.stringmetric + +/** Marks those which leverage traits of a stand alone algorithm. */ +trait Algorithm[T, C] { + def compute(t: T)(implicit c: C): Option[T] +} \ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/StringAlgorithm.scala b/core/source/core/scala/org/hashtree/stringmetric/StringAlgorithm.scala new file mode 100755 index 0000000..2901010 --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/StringAlgorithm.scala @@ -0,0 +1,6 @@ +package org.hashtree.stringmetric + +/** Marks those which leverage traits of a string based [[org.hashtree.stringmetric.Algorithm]]. */ +trait StringAlgorithm extends Algorithm[String, StringCleaner] { + def compute(ca: Array[Char])(implicit stringCleaner: StringCleaner): Option[Array[Char]] +} \ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala new file mode 100755 index 0000000..75762ca --- /dev/null +++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala @@ -0,0 +1,94 @@ +package org.hashtree.stringmetric.phonetic + +import org.hashtree.stringmetric.{ StringAlgorithm, StringCleaner, StringCleanerDelegate } +import scala.annotation.tailrec + +/** An implementation of the Soundex [[org.hashtree.stringmetric.StringAlgorithm]]. */ +object Soundex extends StringAlgorithm { + override def compute(charArray: Array[Char])(implicit stringCleaner: StringCleaner): Option[Array[Char]] = { + val ca = stringCleaner.clean(charArray) + + if (ca.length == 0) None + else { + letter(ca, 0) match { + case Some(l) => { + if (ca.length - 1 == l._2) Some(Array(l._1, '0', '0', '0')) + else { + Some( + code( + ca.takeRight(ca.length - (l._2 + 1)), + l._1, // Pass first letter. + Array(l._1) // Pass array with first letter. + ).padTo(4, '0') + ) + } + } + case None => None + } + } + } + + override def compute(string: String)(implicit stringCleaner: StringCleaner): Option[String] = { + compute(stringCleaner.clean(string.toCharArray))(new StringCleanerDelegate) match { + case Some(se) => Some(se.mkString) + case None => None + } + } + + @tailrec + private[this] def letter(i: Array[Char], ind: Int): Option[Tuple2[Char, Int]] = { + if (i.length == 0) None + else { + val c = i.head.toLower + + if (c >= 97 && c <= 122) Some((c, ind)) else letter(i.tail, ind + 1) + } + } + + @tailrec + private[this] def code(i: Array[Char], p: Char, o: Array[Char]): Array[Char] = { + require(p >= 97 && p <= 122) + require(o.length > 0) + + if (i.length == 0) o + else { + val c = i.head.toLower + val m2 = (mc: Char) => mc match { + case 'b' | 'f' | 'p' | 'v' => '1' + case 'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' => '2' + case 'd' | 't' => '3' + case 'l' => '4' + case 'm' | 'n' => '5' + case 'r' => '6' + case _ => '\0' + } + val m1 = (mc: Char, pc: Char) => mc match { + case 'b' | 'f' | 'p' | 'v' if pc != '1' => '1' + case 'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' if pc != '2' => '2' + case 'd' | 't' if pc != '3' => '3' + case 'l' if pc != '4' => '4' + case 'm' | 'n' if pc != '5' => '5' + case 'r' if pc != '6' => '6' + case _ => '\0' + } + val a = p match { + // Code twice. + case 'a' | 'e' | 'i' | 'o' | 'u' | 'y' => m2(c) + // Code once. + case _ => { + m1( + c, + o.last match { + case '1' | '2' | '3' | '4' | '5' | '6' => o.last + case _ => m2(o.last) + } + ) + } + } + + if (o.length == 3 && a != '\0') o :+ a + else + code(i.tail, c, if (a != '\0') o :+ a else o) + } + } +} \ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/SoundexMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/SoundexMetric.scala index c8f2d13..cc891ab 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/phonetic/SoundexMetric.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/SoundexMetric.scala @@ -1,100 +1,30 @@ package org.hashtree.stringmetric.phonetic import org.hashtree.stringmetric.{ StringCleaner, StringCleanerDelegate, StringMetric } -import scala.annotation.tailrec /** An implementation of the Soundex [[org.hashtree.stringmetric.StringMetric]]. */ object SoundexMetric extends StringMetric { - implicit val stringCleaner = new StringCleanerDelegate - override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit stringCleaner: StringCleaner): Option[Boolean] = { val ca1 = stringCleaner.clean(charArray1) val ca2 = stringCleaner.clean(charArray2) - val se1 = if (ca1.length > 0) soundex(ca1) else None - val se2 = if (ca2.length > 0) soundex(ca2) else None - if (!se1.isDefined || !se2.isDefined) None else Some(se1.get == se2.get) + if (ca1.length == 0 || ca2.length == 0) None + else { + val se1 = Soundex.compute(ca1) + val se2 = Soundex.compute(ca2) + + if (!se1.isDefined || !se2.isDefined || (se1.get.length == 0 && se2.get.length == 0)) + None + else + Some(se1.get.sameElements(se2.get)) + } } override def compare(string1: String, string2: String)(implicit stringCleaner: StringCleaner): Option[Boolean] = { + // Unable to perform simple equality check, due to situations where no letters are passed. compare( stringCleaner.clean(string1.toCharArray), stringCleaner.clean(string2.toCharArray) )(new StringCleanerDelegate) } - - private[this] def soundex(ca: Array[Char]) = { - require(ca.length > 0) - - @tailrec - def letter(ca: Array[Char], i: Int): Option[Tuple2[Char, Int]] = { - require(ca.length > 0) - - val c = ca.head.toLower - - if (c >= 97 && c <= 122) Some((c, i)) else if (ca.length == 1) None else letter(ca.tail, i + 1) - } - - @tailrec - def code(i: Array[Char], p: Char, o: Array[Char]): Array[Char] = { - require(i.length > 0) - require(p >= 97 && p <= 122) - require(o.length > 0) - - val c = i.head.toLower - val m2 = (mc: Char) => mc match { - case 'b' | 'f' | 'p' | 'v' => '1' - case 'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' => '2' - case 'd' | 't' => '3' - case 'l' => '4' - case 'm' | 'n' => '5' - case 'r' => '6' - case _ => '\0' - } - val m1 = (mc: Char, pc: Char) => mc match { - case 'b' | 'f' | 'p' | 'v' if pc != '1' => '1' - case 'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' if pc != '2' => '2' - case 'd' | 't' if pc != '3' => '3' - case 'l' if pc != '4' => '4' - case 'm' | 'n' if pc != '5' => '5' - case 'r' if pc != '6' => '6' - case _ => '\0' - } - val a = p match { - // Code twice. - case 'a' | 'e' | 'i' | 'o' | 'u' | 'y' => m2(c) - // Code once. - case _ => { - m1( - c, - o.last match { - case '1' | '2' | '3' | '4' | '5' | '6' => o.last - case _ => m2(o.last) - } - ) - } - } - - if (i.length == 1 || (o.length == 3 && a != '\0')) - if (a != '\0') o :+ a else o - else - code(i.tail, c, if (a != '\0') o :+ a else o) - } - - letter(ca, 0) match { - case Some(l) => { - if (ca.length - 1 == l._2) Some(l._1 + "000") - else { - Some( - code( - ca.takeRight(ca.length - (l._2 + 1)), - l._1, // Pass first letter. - Array(l._1) // Pass array with first letter. - ).mkString.padTo(4, '0') - ) - } - } - case None => None - } - } } \ No newline at end of file diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexMetricSpec.scala index 6664696..f92f789 100755 --- a/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexMetricSpec.scala +++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexMetricSpec.scala @@ -8,27 +8,28 @@ import org.scalatest.junit.JUnitRunner final class SoundexMetricSpec extends ScalaTest { "SoundexMetric" should provide { "compare method" when passed { - "valid arguments" should returns { - "Boolean indicating matches" in { - SoundexMetric.compare("abc", "abc").get should be (true) // a120 vs. a120 - SoundexMetric.compare("a", "a").get should be (true) // a000 vs. a000 - SoundexMetric.compare("abc", "xyz").get should be (false) // a120 vs. x200 + "empty arguments" should returns { + "None" in { SoundexMetric.compare("", "").isDefined should be (false) + SoundexMetric.compare("abc", "").isDefined should be (false) + SoundexMetric.compare("", "xyz").isDefined should be (false) + } + } + "non-phonetic arguments" should returns { + "None" in { SoundexMetric.compare("123", "123").isDefined should be (false) - SoundexMetric.compare("1", "1").isDefined should be (false) - - SoundexMetric.compare("Robert", "Rupert").get should be (true) // r163 vs. r163 - SoundexMetric.compare("Robert", "Rubin").get should be (false) // r163 vs. r150 - - SoundexMetric.compare("Ashcraft", "Ashcroft").get should be (true) // a261 vs. a261 - SoundexMetric.compare("Tymczak", "Tymczak").get should be (true) // t522 vs. t522 - SoundexMetric.compare("Pfister", "Pfister").get should be (true) // p236 vs. p236 - SoundexMetric.compare("Euler", "Ellery").get should be (true) // e460 vs. e460 - SoundexMetric.compare("Gauss", "Ghosh").get should be (true) // g200 vs. g200 - SoundexMetric.compare("Hilbert", "Heilbronn").get should be (true) // h416 vs. h416 - SoundexMetric.compare("Knuth", "Kant").get should be (true) // k530 vs. k530 - SoundexMetric.compare("Lloyd", "Ladd").get should be (true) // l300 vs. l300 - SoundexMetric.compare("Lukasiewicz", "Lissajous").get should be (true) // l222 vs. l222 + SoundexMetric.compare("123", "").isDefined should be (false) + SoundexMetric.compare("", "123").isDefined should be (false) + } + } + "phonetically similar arguments" should returns { + "Boolean indicating true" in { + SoundexMetric.compare("robert", "rupert").get should be (true) + } + } + "phonetically dissimilar arguments" should returns { + "Boolean indicating false" in { + SoundexMetric.compare("robert", "rubin").get should be (false) } } } diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala new file mode 100755 index 0000000..13c6bdd --- /dev/null +++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala @@ -0,0 +1,50 @@ +package org.hashtree.stringmetric.phonetic + +import org.hashtree.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class SoundexSpec extends ScalaTest { + "Soundex" should provide { + "compute method" when passed { + "empty argument" should returns { + "None" in { + Soundex.compute("").isDefined should be (false) + } + } + "non-phonetic argument" should returns { + "None" in { + Soundex.compute("123").isDefined should be (false) + } + } + "phonetic argument" should returns { + "Some" in { + Soundex.compute("abc").get should equal ("a120") + Soundex.compute("xyz").get should equal ("x200") + Soundex.compute("robert").get should equal ("r163") + Soundex.compute("rupert").get should equal ("r163") + Soundex.compute("rubin").get should equal ("r150") + Soundex.compute("ashcraft").get should equal ("a261") + Soundex.compute("tymczak").get should equal ("t522") + Soundex.compute("pfister").get should equal ("p236") + Soundex.compute("euler").get should equal ("e460") + Soundex.compute("gauss").get should equal ("g200") + Soundex.compute("hilbert").get should equal ("h416") + Soundex.compute("knuth").get should equal ("k530") + Soundex.compute("lloyd").get should equal ("l300") + Soundex.compute("lukasiewicz").get should equal ("l222") + Soundex.compute("ashcroft").get should equal ("a261") + Soundex.compute("tymczak").get should equal ("t522") + Soundex.compute("pfister").get should equal ("p236") + Soundex.compute("ellery").get should equal ("e460") + Soundex.compute("ghosh").get should equal ("g200") + Soundex.compute("heilbronn").get should equal ("h416") + Soundex.compute("kant").get should equal ("k530") + Soundex.compute("ladd").get should equal ("l300") + Soundex.compute("lissajous").get should equal ("l222") + } + } + } + } +} \ No newline at end of file -- cgit v1.2.3