diff options
12 files changed, 253 insertions, 282 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala index 7fe848d..5560158 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala @@ -10,13 +10,14 @@ object Metaphone extends StringAlgorithm { if (ca.length == 0) None else { - val e = exceptions(ca.map(_.toLower)) - val t = transformations(Array.empty[Char], e.head, e.tail, Array.empty[Char]) + val th = deduplicate(transcodeHead(ca.map(_.toLower))) - if (t.length == 0) - None - else - Some(t) + if (th.head < 97 || th.head > 122) None + else { + val t = transcode(Array.empty[Char], th.head, th.tail, Array.empty[Char]) + + if (t.length == 0) None else Some(t) // Single Y or W would have 0 length. + } } } @@ -27,196 +28,135 @@ object Metaphone extends StringAlgorithm { } } - private[this] def exceptions(ca: Array[Char]): Array[Char] = { - val deduplicate = (x: Array[Char]) => { - x.sliding(2).filter(a => a(0) == 'c' || a(0) != a(1)).map(a => a(0)).toArray[Char] :+ x.last - } - - ca.length match { - case 0 => Array.empty[Char] - case 1 if ca.head == 'x' => Array('s') - case 1 => ca - case _ if ca.head == 'x' => 's' +: deduplicate(ca.tail) - case _ => { - "" + ca.head + ca.tail.head match { - case "ae" | "gn" | "kn" | "pn" | "wr" => ca.tail - case "wh" => 'w' +: deduplicate(ca.tail.tail) - case _ => deduplicate(ca) - } - } - } + private[this] def deduplicate(ca: Array[Char]) = { + if (ca.length <= 1) ca + else + ca.sliding(2).filter(a => a(0) == 'c' || a(0) != a(1)).map(a => a(0)).toArray[Char] :+ ca.last } private[this] def isVowel(c: Char) = (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u') @tailrec - private[this] def transformations(l: Array[Char], c: Char, r: Array[Char], o: Array[Char]): Array[Char] = { + private[this] def transcode(l: Array[Char], c: Char, r: Array[Char], o: Array[Char]): Array[Char] = { if (c == '\0' && r.length == 0) o else { - val lshift = (d: Int) => if (d == 1) l :+ c else (l :+ c) ++ r.take(d - 1) - val cshift = (d: Int) => if (d > r.length) '\0' else r(d - 1) - val rshift = (d: Int) => if (d >= r.length) Array.empty[Char] else r.drop(d) + val shift = (d: Int, ca: Array[Char]) => { + val sa = r.splitAt(d - 1) - c match { - case 'a' | 'e' | 'i' | 'o' | 'u' => { - if (l.length == 0) - transformations(lshift(1), cshift(1), rshift(1), o :+ c) - else - transformations(lshift(1), cshift(1), rshift(1), o) - } - case 'f' | 'j' | 'l' | 'm' | 'n' | 'r' => transformations(lshift(1), cshift(1), rshift(1), o :+ c) - case 'b' => { - if ( - (l.length >= 1 && l.last == 'm' && r.length == 0) - ) - transformations(lshift(1), cshift(1), rshift(1), o) - else - transformations(lshift(1), cshift(1), rshift(1), o :+ 'b') - } - case 'c' => { - if ( - (r.length >= 1 && r.head == 'h' && l.length >= 1 && l.last == 's') - ) - transformations(lshift(1), cshift(1), rshift(1), o :+ 'k') - else if ( - (r.length >= 2 && r.head == 'i' && r.tail.head == 'a') - ) - transformations(lshift(3), cshift(3), rshift(3), o :+ 'x') - else if ( - (r.length >= 1 && r.head == 'h') || - (l.length >= 1 && r.length >= 1 && l.last == 's' && r.head == 'h') - ) - transformations(lshift(2), cshift(2), rshift(2), o :+ 'x') - else if ( - (l.length >= 1 && r.length >= 1 && l.last == 's' && ( - r.head == 'i' || - r.head == 'e' || - r.head == 'y' - ) - ) - ) - transformations(lshift(1), cshift(1), rshift(1), o) - else if ( - (r.length >= 1 && ( - r.head == 'i' || - r.head == 'e' || - r.head == 'y' - ) - ) - ) - transformations(lshift(1), cshift(1), rshift(1), o :+ 's') - else - transformations(lshift(1), cshift(1), rshift(1), o :+ 'k') - } - case 'd' => { - if ( - (r.length >= 2 && r.head == 'g' && ( - r.tail.head == 'e' || - r.tail.head == 'y' || - r.tail.head == 'i' - ) + ( + if (sa._1.length > 0) (l :+ c) ++ sa._1 else l :+ c, + if (sa._2.length > 0) sa._2.head else '\0', + if (sa._2.length > 1) sa._2.tail else Array.empty[Char], + ca + ) + } + + val t = { + c match { + case 'a' | 'e' | 'i' | 'o' | 'u' => if (l.length == 0) shift(1, o:+ c) else shift(1, o) + case 'f' | 'j' | 'l' | 'm' | 'n' | 'r' => shift(1, o :+ c) + case 'b' => if (l.length >= 1 && l.last == 'm' && r.length == 0) shift(1, o) else shift(1, o :+ 'b') + case 'c' => { + if (r.length >= 1 && r.head == 'h' && l.length >= 1 && l.last == 's') + shift(1, o :+ 'k') + else if (r.length >= 2 && r.head == 'i' && r(1) == 'a') + shift(3, o :+ 'x') + else if ( + (r.length >= 1 && r.head == 'h') || + (l.length >= 1 && r.length >= 1 && l.last == 's' && r.head == 'h') ) - ) - transformations(lshift(1), cshift(1), rshift(1), o :+ 'j') // just d - else - transformations(lshift(1), cshift(1), rshift(1), o :+ 't') - } - case 'g' => { - if ( - (r.length > 1 && r.head == 'h') || - (r.length == 1 && r.head == 'n') || - (r.length == 3 && r.head == 'n' && r.tail.head == 'e' && r.tail.tail.head == 'd') - ) - transformations(lshift(1), cshift(1), rshift(1), o) // just g - else if ( - (r.length >= 1 && ( - r.head == 'i' || - r.head == 'e' || - r.head == 'y' + shift(2, o :+ 'x') + else if (l.length >= 1 && r.length >= 1 && l.last == 's' && ( + r.head == 'i' || r.head == 'e' || r.head == 'y' ) ) - ) - transformations(lshift(2), cshift(2), rshift(2), o :+ 'j') - else - transformations(lshift(1), cshift(1), rshift(1), o :+ 'k') - } - case 'h' => { - if ( - (l.length >= 1 && isVowel(l.last) && (r.length == 0 || !isVowel(r.head))) || - (l.length >= 2 && l.last == 'h' && ( - l(l.length - 2) == 'c' || - l(l.length - 2) == 's' || - l(l.length - 2) == 'p' || - l(l.length - 2) == 't' || - l(l.length - 2) == 'g' + shift(1, o) + else if (r.length >= 1 && (r.head == 'i' || r.head == 'e' || r.head == 'y')) + shift(1, o :+ 's') + else + shift(1, o :+ 'k') + } + case 'd' => { + if (r.length >= 2 && r.head == 'g' && ( + r(1) == 'e' || r(1) == 'y' || r(1) == 'i' ) ) - ) - transformations(lshift(1), cshift(1), rshift(1), o) - else - transformations(lshift(1), cshift(1), rshift(1), o :+ 'h') - } - case 'k' => { - if ( - (l.length >= 1 && l.last == 'c') // raw - ) - transformations(lshift(1), cshift(1), rshift(1), o) - else - transformations(lshift(1), cshift(1), rshift(1), o :+ 'k') - } - case 'p' => { - if ( - (r.length >= 1 && r.head == 'h') - ) - transformations(lshift(2), cshift(2), rshift(2), o :+ 'f') - else - transformations(lshift(1), cshift(1), rshift(1), o :+ 'p') - } - case 'q' => transformations(lshift(1), cshift(1), rshift(1), o :+ 'k') - case 's' => { - if ( - - (r.length >= 2 && r.head == 'i' && ( - r.tail.head == 'o' || - r.tail.head == 'a' - ) + shift(1, o :+ 'j') + else + shift(1, o :+ 't') + } + case 'g' => { + if ( + (r.length > 1 && r.head == 'h') || + (r.length == 1 && r.head == 'n') || + (r.length == 3 && r.head == 'n' && r(1) == 'e' && r(2) == 'd') ) - ) - transformations(lshift(3), cshift(3), rshift(3), o :+ 'x') - else if ( - (r.length >= 1 && r.head == 'h') - ) - transformations(lshift(2), cshift(2), rshift(2), o :+ 'x') - else - transformations(lshift(1), cshift(1), rshift(1), o :+ 's') - } - case 't' => { - if ( - (r.length >= 2 && r.head == 'i' && ( - r.tail.head == 'a' || - r.tail.head == 'o' + shift(1, o) + else if (r.length >= 1 && (r.head == 'i' || r.head == 'e' || r.head == 'y')) + shift(2, o :+ 'j') + else + shift(1, o :+ 'k') + } + case 'h' => { + if ( + (l.length >= 1 && isVowel(l.last) && (r.length == 0 || !isVowel(r.head))) || + (l.length >= 2 && l.last == 'h' && ( + l(l.length - 2) == 'c' || l(l.length - 2) == 's' || l(l.length - 2) == 'p' || + l(l.length - 2) == 't' || l(l.length - 2) == 'g' + ) ) ) - ) - transformations(lshift(3), cshift(3), rshift(3), o :+ 'x') - else if (r.length >= 1 && r.head == 'h') - transformations(lshift(2), cshift(2), rshift(2), o :+ '0') - else if (r.length >= 2 && r.head == 'c' && r.tail.head == 'h') - transformations(lshift(1), cshift(1), rshift(1), o) // only t - else - transformations(lshift(1), cshift(1), rshift(1), o :+ 't') + shift(1, o) + else + shift(1, o :+ 'h') + } + case 'k' => if (l.length >= 1 && l.last == 'c') shift(1, o) else shift(1, o :+ 'k') + case 'p' => if (r.length >= 1 && r.head == 'h') shift(2, o :+ 'f') else shift(1, o :+ 'p') + case 'q' => shift(1, o :+ 'k') + case 's' => { + if (r.length >= 2 && r.head == 'i' && (r(1) == 'o' || r(1) == 'a')) + shift(3, o :+ 'x') + else if (r.length >= 1 && r.head == 'h') + shift(2, o :+ 'x') + else + shift(1, o :+ 's') + } + case 't' => { + if (r.length >= 2 && r.head == 'i' && (r(1) == 'a' || r(1) == 'o')) + shift(3, o :+ 'x') + else if (r.length >= 1 && r.head == 'h') + shift(2, o :+ '0') + else if (r.length >= 2 && r.head == 'c' && r(1) == 'h') + shift(1, o) + else + shift(1, o :+ 't') + } + case 'v' => shift(1, o :+ 'f') + case 'w' | 'y' => if (r.length == 0 || !isVowel(r.head)) shift(1, o) else shift(1, o :+ c) + case 'x' => shift(1, (o :+ 'k') :+ 's') + case 'z' => shift(1, o :+ 's') + case _ => shift(1, o) } - case 'v' => transformations(lshift(1), cshift(1), rshift(1), o :+ 'f') - case 'w' | 'y' => { - if (r.length == 0 || !isVowel(r.head)) - transformations(lshift(1), cshift(1), rshift(1), o) - else - transformations(lshift(1), cshift(1), rshift(1), o :+ c) - } - case 'x' => transformations(lshift(1), cshift(1), rshift(1), (o :+ 'k') :+ 's') - case 'z' => transformations(lshift(1), cshift(1), rshift(1), o :+ 's') - case _ => transformations(lshift(1), cshift(1), rshift(1), o) } + + transcode(t._1, t._2, t._3, t._4) } } + + private[this] def transcodeHead(ca: Array[Char]) = { + val h = ca.take(2).padTo(2, '\0') + + if ( + (h.head == 'a' && h.last == 'e') || + (h.last == 'n' && (h.head == 'g' || h.head == 'k' || h.head == 'p')) || + (h.head == 'w' && h.last == 'r') + ) + ca.tail + else if (h.head == 'w' && h.last == 'h') + 'w' +: ca.drop(2) + else if (h.head == 'x') + 's' +: ca.tail + else + ca + } }
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala index bd3f9bc..895c1a5 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala @@ -21,7 +21,6 @@ object MetaphoneMetric extends StringMetric { } override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Boolean] = { - // Unable to perform simple equality check, due to situations where no letters are passed. compare( stringFilter.filter(string1.toCharArray), stringFilter.filter(string2.toCharArray) diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala index 4d69b28..33b285c 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala @@ -10,20 +10,17 @@ object Soundex extends StringAlgorithm { if (ca.length == 0) None else { - letter(ca, 0) match { - case Some(l) => { - if (ca.length - 1 == l._2) Some(Array(l._1, '0', '0', '0')) - else { - Some( - code( - ca.takeRight(ca.length - (l._2 + 1)), - l._1, // Pass first letter. - Array(l._1) // Pass array with first letter. - ).padTo(4, '0') - ) - } - } - case None => None + val fc = ca.head.toLower + + if (fc < 97 || fc > 122) None + else { + Some( + transcode( + ca.tail, + fc, // Pass first letter. + Array(fc) // Pass array with first letter. + ).padTo(4, '0') + ) } } } @@ -36,18 +33,7 @@ object Soundex extends StringAlgorithm { } @tailrec - private[this] def letter(i: Array[Char], ind: Int): Option[Tuple2[Char, Int]] = { - if (i.length == 0) None - else { - val c = i.head.toLower - - if (c >= 97 && c <= 122) Some((c, ind)) else letter(i.tail, ind + 1) - } - } - - @tailrec - private[this] def code(i: Array[Char], p: Char, o: Array[Char]): Array[Char] = { - require(p >= 97 && p <= 122) + private[this] def transcode(i: Array[Char], p: Char, o: Array[Char]): Array[Char] = { require(o.length > 0) if (i.length == 0) o @@ -88,7 +74,7 @@ object Soundex extends StringAlgorithm { if (o.length == 3 && a != '\0') o :+ a else - code(i.tail, c, if (a != '\0') o :+ a else o) + transcode(i.tail, c, if (a != '\0') o :+ a else o) } } }
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala index 64a3c1a..d5e7eac 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala @@ -10,6 +10,7 @@ object DiceSorensenMetric extends StringMetric { val ca2 = stringFilter.filter(charArray2) if (ca1.length == 0 || ca2.length == 0) None + else if (ca1.sameElements(ca2)) Some(1d) else { val b = bigrams(ca1, ca2) val ms = scoreMatches(b) @@ -19,12 +20,10 @@ object DiceSorensenMetric extends StringMetric { } override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Double] = { - if (string1.length > 0 && string1.length == string2.length && string1 == string2) Some(1d) - else - compare( - stringFilter.filter(string1.toCharArray), - stringFilter.filter(string2.toCharArray) - )(new StringFilterDelegate) + compare( + stringFilter.filter(string1.toCharArray), + stringFilter.filter(string2.toCharArray) + )(new StringFilterDelegate) } private[this] def bigrams(ct: CompareTuple[Char]): MatchTuple[String] = { @@ -32,7 +31,7 @@ object DiceSorensenMetric extends StringMetric { def set(ca: Array[Char], sa: Array[String]): Array[String] = { if (ca.length <= 1) sa else - set(ca.tail, sa :+ "" + ca.head + ca.tail.head) + set(ca.tail, sa :+ "" + ca.head + ca(1)) } (set(ct._1, Array.empty[String]), set(ct._2, Array.empty[String])) diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/HammingMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/HammingMetric.scala index 04001f5..2dc1fbc 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/similarity/HammingMetric.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/HammingMetric.scala @@ -9,16 +9,15 @@ object HammingMetric extends StringMetric { val ca2 = stringFilter.filter(charArray2) if (ca1.length == 0 || ca2.length == 0 || ca1.length != ca2.length) None + else if (ca1.sameElements(ca2)) Some(0) else Some(hamming(ca1, ca2)) } override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Int] = { - if (string1.length > 0 && string1.length == string2.length && string1 == string2) Some(0) - else - compare( - stringFilter.filter(string1.toCharArray), - stringFilter.filter(string2.toCharArray) - )(new StringFilterDelegate) + compare( + stringFilter.filter(string1.toCharArray), + stringFilter.filter(string2.toCharArray) + )(new StringFilterDelegate) } private[this] def hamming(ct: CompareTuple[Char]) = { diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroMetric.scala index 503b497..2ad137f 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroMetric.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroMetric.scala @@ -14,6 +14,7 @@ object JaroMetric extends StringMetric { val ca2 = stringFilter.filter(charArray2) if (ca1.length == 0 || ca2.length == 0) None + else if (ca1.sameElements(ca2)) Some(1d) else { val mt = `match`((ca1, ca2)) val ms = scoreMatches((mt._1, mt._2)) @@ -26,12 +27,10 @@ object JaroMetric extends StringMetric { } override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Double] = { - if (string1.length > 0 && string1.length == string2.length && string1 == string2) Some(1d) - else - compare( - stringFilter.filter(string1.toCharArray), - stringFilter.filter(string2.toCharArray) - )(new StringFilterDelegate) + compare( + stringFilter.filter(string1.toCharArray), + stringFilter.filter(string2.toCharArray) + )(new StringFilterDelegate) } private[this] def `match`(ct: CompareTuple[Char]): MatchTuple[Char] = { diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroWinklerMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroWinklerMetric.scala index 9b797bc..a9f007e 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroWinklerMetric.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroWinklerMetric.scala @@ -13,6 +13,8 @@ object JaroWinklerMetric extends StringMetric { val ca2 = stringFilter.filter(charArray2) JaroMetric.compare(ca1, ca2)(new StringFilterDelegate) match { + case Some(0d) => Some(0d) + case Some(1d) => Some(1d) case Some(jaro) => { val prefix = ca1.zip(ca2).takeWhile(t => t._1 == t._2).map(_._1) @@ -23,11 +25,9 @@ object JaroWinklerMetric extends StringMetric { } override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Double] = { - if (string1.length > 0 && string1.length == string2.length && string1 == string2) Some(1d) - else - compare( - stringFilter.filter(string1.toCharArray), - stringFilter.filter(string2.toCharArray) - )(new StringFilterDelegate) + compare( + stringFilter.filter(string1.toCharArray), + stringFilter.filter(string2.toCharArray) + )(new StringFilterDelegate) } }
\ No newline at end of file diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/LevenshteinMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/LevenshteinMetric.scala index 3e6a926..4e58f02 100755 --- a/core/source/core/scala/org/hashtree/stringmetric/similarity/LevenshteinMetric.scala +++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/LevenshteinMetric.scala @@ -9,49 +9,47 @@ object LevenshteinMetric extends StringMetric { val ca2 = stringFilter.filter(charArray2) if (ca1.length == 0 && ca2.length == 0) None - else { - val levenshteinMemoize = Memoize.Y(levenshtein) - - Some(levenshteinMemoize(ca1, ca2)) - } + else if (ca1.length == 0) Some(ca2.length) + else if (ca2.length == 0) Some(ca1.length) + else Some(levenshtein(ca1, ca2)) } override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Int] = { - if (string1.length > 0 && string1.length == string2.length && string1 == string2) Some(0) - else - compare( - stringFilter.filter(string1.toCharArray), - stringFilter.filter(string2.toCharArray) - )(new StringFilterDelegate) + compare( + stringFilter.filter(string1.toCharArray), + stringFilter.filter(string2.toCharArray) + )(new StringFilterDelegate) } - private[this] def levenshtein(f: CompareTuple[Char] => Int)(ct: CompareTuple[Char]): Int = { - if (ct._1.length == 0) ct._2.length - else if (ct._2.length == 0) ct._1.length - else { - math.min( - math.min( - f(ct._1.tail, ct._2) + 1, - f(ct._1, ct._2.tail) + 1 - ), - f(ct._1.tail, ct._2.tail) + (if (ct._1.head != ct._2.head) 1 else 0) - ) + private[this] def levenshtein(ct: CompareTuple[Char]) = { + val m = Array.fill[Int](ct._1.length + 1, ct._2.length + 1)(-1) + + def distance(t: Tuple2[Int, Int]): Int = { + t match { + case (r, 0) => r + case (0, c) => c + case (r, c) if m(r)(c) != -1 => m(r)(c) + case (r, c) => { + val min = { + if (ct._1(r - 1) == ct._2(c - 1)) + distance(r - 1, c - 1) + else { + math.min( + math.min( + distance(r - 1, c) + 1, // Delete (left). + distance(r, c - 1) + 1 // Insert (up). + ), + distance(r - 1, c - 1) + 1 // Substitute (left-up). + ) + } + } + + m(r)(c) = min + min + } + } } - } - private[this] final class Memoize[-T, +R](f: T => R) extends (T => R) { - private[this] val map = scala.collection.mutable.Map.empty[T, R] - - def apply(k: T): R = map.getOrElseUpdate(k, f(k)) - } - - private[this] object Memoize { - def apply[T, R](f: T => R) = new Memoize(f) - - def Y[T, R](f: (T => R) => T => R): (T => R) = { - lazy val yf: T => R = Memoize(f(yf)(_)) - - yf - } + distance(ct._1.length, ct._2.length) } }
\ No newline at end of file diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala index 2324298..2cfcc36 100755 --- a/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala +++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala @@ -25,24 +25,33 @@ final class MetaphoneSpec extends ScalaTest { Metaphone.compute("zz").get should equal ("s") // y + Metaphone.compute("y").isDefined should be (false) Metaphone.compute("zy").get should equal ("s") Metaphone.compute("zyz").get should equal ("ss") Metaphone.compute("zya").get should equal ("sy") // x + Metaphone.compute("x").get should equal ("s") Metaphone.compute("zx").get should equal ("sks") Metaphone.compute("zxz").get should equal ("skss") // w + Metaphone.compute("w").isDefined should be (false) Metaphone.compute("zw").get should equal ("s") Metaphone.compute("zwz").get should equal ("ss") Metaphone.compute("zwa").get should equal ("sw") // v + Metaphone.compute("v").get should equal ("f") Metaphone.compute("zv").get should equal ("sf") Metaphone.compute("zvz").get should equal ("sfs") + // u + Metaphone.compute("u").get should equal ("u") + Metaphone.compute("zu").get should equal ("s") + // t + Metaphone.compute("t").get should equal ("t") Metaphone.compute("ztiaz").get should equal ("sxs") Metaphone.compute("ztioz").get should equal ("sxs") Metaphone.compute("zthz").get should equal ("s0s") @@ -50,6 +59,7 @@ final class MetaphoneSpec extends ScalaTest { Metaphone.compute("ztz").get should equal ("sts") // s + Metaphone.compute("s").get should equal ("s") Metaphone.compute("zshz").get should equal ("sxs") Metaphone.compute("zsioz").get should equal ("sxs") Metaphone.compute("zsiaz").get should equal ("sxs") @@ -57,45 +67,64 @@ final class MetaphoneSpec extends ScalaTest { Metaphone.compute("zsz").get should equal ("sss") // r + Metaphone.compute("r").get should equal ("r") Metaphone.compute("zr").get should equal ("sr") Metaphone.compute("zrz").get should equal ("srs") // q + Metaphone.compute("q").get should equal ("k") Metaphone.compute("zq").get should equal ("sk") Metaphone.compute("zqz").get should equal ("sks") // p - Metaphone.compute("zph").get should equal ("sf") + Metaphone.compute("p").get should equal ("p") Metaphone.compute("zp").get should equal ("sp") + Metaphone.compute("zph").get should equal ("sf") Metaphone.compute("zpz").get should equal ("sps") + // o + Metaphone.compute("o").get should equal ("o") + Metaphone.compute("zo").get should equal ("s") + // n + Metaphone.compute("n").get should equal ("n") Metaphone.compute("zn").get should equal ("sn") Metaphone.compute("znz").get should equal ("sns") // m + Metaphone.compute("m").get should equal ("m") Metaphone.compute("zm").get should equal ("sm") Metaphone.compute("zmz").get should equal ("sms") // l + Metaphone.compute("l").get should equal ("l") Metaphone.compute("zl").get should equal ("sl") Metaphone.compute("zlz").get should equal ("sls") // k - Metaphone.compute("zck").get should equal ("sk") + Metaphone.compute("k").get should equal ("k") Metaphone.compute("zk").get should equal ("sk") + Metaphone.compute("zck").get should equal ("sk") // j + Metaphone.compute("j").get should equal ("j") Metaphone.compute("zj").get should equal ("sj") Metaphone.compute("zjz").get should equal ("sjs") + // i + Metaphone.compute("i").get should equal ("i") + Metaphone.compute("zi").get should equal ("s") + // h + Metaphone.compute("h").get should equal ("h") // php wrongly says nothing Metaphone.compute("zh").get should equal ("sh") // php wrongly says s Metaphone.compute("zah").get should equal ("s") Metaphone.compute("zchh").get should equal ("sx") Metaphone.compute("ha").get should equal ("h") // g + Metaphone.compute("g").get should equal ("k") + Metaphone.compute("zg").get should equal ("sk") Metaphone.compute("zgh").get should equal ("skh") // php wrongly says sf Metaphone.compute("zghz").get should equal ("shs") // php wrongly says sfs Metaphone.compute("zgha").get should equal ("sh") // php wrongly says sf others wrongly say skh @@ -109,14 +138,19 @@ final class MetaphoneSpec extends ScalaTest { Metaphone.compute("zgez").get should equal ("sjs") Metaphone.compute("zgy").get should equal ("sj") Metaphone.compute("zgyz").get should equal ("sjs") - Metaphone.compute("zg").get should equal ("sk") Metaphone.compute("zgz").get should equal ("sks") // f + Metaphone.compute("f").get should equal ("f") Metaphone.compute("zf").get should equal ("sf") Metaphone.compute("zfz").get should equal ("sfs") + // e + Metaphone.compute("e").get should equal ("e") + Metaphone.compute("ze").get should equal ("s") + // d + Metaphone.compute("d").get should equal ("t") Metaphone.compute("fudge").get should equal ("fjj") // php wrongly says fj Metaphone.compute("dodgy").get should equal ("tjj") // php wrongly says tj others wrongly say tjjy Metaphone.compute("dodgi").get should equal ("tjj") // php wrongly says tj @@ -124,6 +158,7 @@ final class MetaphoneSpec extends ScalaTest { Metaphone.compute("zdz").get should equal ("sts") // c + Metaphone.compute("c").get should equal ("k") Metaphone.compute("zcia").get should equal ("sx") Metaphone.compute("zciaz").get should equal ("sxs") Metaphone.compute("zch").get should equal ("sx") @@ -145,10 +180,15 @@ final class MetaphoneSpec extends ScalaTest { Metaphone.compute("zcz").get should equal ("sks") // b + Metaphone.compute("b").get should equal ("b") Metaphone.compute("zb").get should equal ("sb") Metaphone.compute("zbz").get should equal ("sbs") Metaphone.compute("zmb").get should equal ("sm") + // a + Metaphone.compute("a").get should equal ("a") + Metaphone.compute("za").get should equal ("s") + // Miscellaneous. Metaphone.compute("dumb").get should equal ("tm") Metaphone.compute("smith").get should equal ("sm0") diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala index 13c6bdd..dbefc51 100755 --- a/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala +++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala @@ -43,6 +43,7 @@ final class SoundexSpec extends ScalaTest { Soundex.compute("kant").get should equal ("k530") Soundex.compute("ladd").get should equal ("l300") Soundex.compute("lissajous").get should equal ("l222") + Soundex.compute("x123").get should equal ("x000") } } } diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/JaroMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/JaroMetricSpec.scala index 8fd12b1..318f241 100755 --- a/core/source/test/scala/org/hashtree/stringmetric/similarity/JaroMetricSpec.scala +++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/JaroMetricSpec.scala @@ -28,17 +28,17 @@ final class JaroMetricSpec extends ScalaTest { } "valid arguments" should returns { "Double indicating distance" in { - JaroWinklerMetric.compare("aa", "a").get should be (0.8500000000000001) - JaroWinklerMetric.compare("a", "aa").get should be (0.8500000000000001) - JaroWinklerMetric.compare("veryveryverylong", "v").get should be (0.71875) - JaroWinklerMetric.compare("v", "veryveryverylong").get should be (0.71875) - JaroWinklerMetric.compare("martha", "marhta").get should be (0.9611111111111111) - JaroWinklerMetric.compare("dwayne", "duane").get should be (0.8400000000000001) - JaroWinklerMetric.compare("dixon", "dicksonx").get should be (0.8133333333333332) - JaroWinklerMetric.compare("abcvwxyz", "cabvwxyz").get should be (0.9583333333333334) - JaroWinklerMetric.compare("jones", "johnson").get should be (0.8323809523809523) - JaroWinklerMetric.compare("henka", "henkan").get should be (0.9666666666666667) - JaroWinklerMetric.compare("fvie", "ten").get should be (0) + JaroMetric.compare("aa", "a").get should be (0.8333333333333334) + JaroMetric.compare("a", "aa").get should be (0.8333333333333334) + JaroMetric.compare("veryveryverylong", "v").get should be (0.6875) + JaroMetric.compare("v", "veryveryverylong").get should be (0.6875) + JaroMetric.compare("martha", "marhta").get should be (0.9444444444444445) + JaroMetric.compare("dwayne", "duane").get should be (0.8222222222222223) + JaroMetric.compare("dixon", "dicksonx").get should be (0.7666666666666666) + JaroMetric.compare("abcvwxyz", "cabvwxyz").get should be (0.9583333333333334) + JaroMetric.compare("jones", "johnson").get should be (0.7904761904761904) + JaroMetric.compare("henka", "henkan").get should be (0.9444444444444445) + JaroMetric.compare("fvie", "ten").get should be (0) JaroMetric.compare("zac ephron", "zac efron").get should be > JaroMetric.compare("zac ephron", "kai ephron").get diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/LevenshteinMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/LevenshteinMetricSpec.scala index 53ac7e6..6091cf5 100755 --- a/core/source/test/scala/org/hashtree/stringmetric/similarity/LevenshteinMetricSpec.scala +++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/LevenshteinMetricSpec.scala @@ -31,8 +31,18 @@ final class LevenshteinMetricSpec extends ScalaTest { LevenshteinMetric.compare("a", "abc").get should be (2) LevenshteinMetric.compare("abc", "c").get should be (2) LevenshteinMetric.compare("c", "abc").get should be (2) + LevenshteinMetric.compare("sitting", "kitten").get should be (3) LevenshteinMetric.compare("kitten", "sitting").get should be (3) + LevenshteinMetric.compare("cake", "drake").get should be (2) LevenshteinMetric.compare("drake", "cake").get should be (2) + LevenshteinMetric.compare("saturday", "sunday").get should be (3) + LevenshteinMetric.compare("sunday", "saturday").get should be (3) + LevenshteinMetric.compare("book", "back").get should be (2) + LevenshteinMetric.compare("dog", "fog").get should be (1) + LevenshteinMetric.compare("foq", "fog").get should be (1) + LevenshteinMetric.compare("fvg", "fog").get should be (1) + LevenshteinMetric.compare("encyclopedia", "encyclopediaz").get should be (1) + LevenshteinMetric.compare("encyclopediz", "encyclopediaz").get should be (1) } } } |