summaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorRocky Madden <git@rockymadden.com>2012-10-26 19:38:54 -0600
committerRocky Madden <git@rockymadden.com>2012-10-26 19:38:54 -0600
commit93d1ecd3df6df537f951f63de060a5db92e01235 (patch)
tree7c8618f7c34c3bf2738a00bc0958432166333710 /core
parentb98b36d4b6b50c54ff724148ec0e2c31e4d5eb6d (diff)
downloadstringmetric-93d1ecd3df6df537f951f63de060a5db92e01235.tar.gz
stringmetric-93d1ecd3df6df537f951f63de060a5db92e01235.tar.bz2
stringmetric-93d1ecd3df6df537f951f63de060a5db92e01235.zip
Performance enhancements and expanded unit tests.
Diffstat (limited to 'core')
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala296
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala1
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala40
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala13
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/similarity/HammingMetric.scala11
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/similarity/JaroMetric.scala11
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/similarity/JaroWinklerMetric.scala12
-rwxr-xr-xcore/source/core/scala/org/hashtree/stringmetric/similarity/LevenshteinMetric.scala72
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala46
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala1
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/similarity/JaroMetricSpec.scala22
-rwxr-xr-xcore/source/test/scala/org/hashtree/stringmetric/similarity/LevenshteinMetricSpec.scala10
12 files changed, 253 insertions, 282 deletions
diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala
index 7fe848d..5560158 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Metaphone.scala
@@ -10,13 +10,14 @@ object Metaphone extends StringAlgorithm {
if (ca.length == 0) None
else {
- val e = exceptions(ca.map(_.toLower))
- val t = transformations(Array.empty[Char], e.head, e.tail, Array.empty[Char])
+ val th = deduplicate(transcodeHead(ca.map(_.toLower)))
- if (t.length == 0)
- None
- else
- Some(t)
+ if (th.head < 97 || th.head > 122) None
+ else {
+ val t = transcode(Array.empty[Char], th.head, th.tail, Array.empty[Char])
+
+ if (t.length == 0) None else Some(t) // Single Y or W would have 0 length.
+ }
}
}
@@ -27,196 +28,135 @@ object Metaphone extends StringAlgorithm {
}
}
- private[this] def exceptions(ca: Array[Char]): Array[Char] = {
- val deduplicate = (x: Array[Char]) => {
- x.sliding(2).filter(a => a(0) == 'c' || a(0) != a(1)).map(a => a(0)).toArray[Char] :+ x.last
- }
-
- ca.length match {
- case 0 => Array.empty[Char]
- case 1 if ca.head == 'x' => Array('s')
- case 1 => ca
- case _ if ca.head == 'x' => 's' +: deduplicate(ca.tail)
- case _ => {
- "" + ca.head + ca.tail.head match {
- case "ae" | "gn" | "kn" | "pn" | "wr" => ca.tail
- case "wh" => 'w' +: deduplicate(ca.tail.tail)
- case _ => deduplicate(ca)
- }
- }
- }
+ private[this] def deduplicate(ca: Array[Char]) = {
+ if (ca.length <= 1) ca
+ else
+ ca.sliding(2).filter(a => a(0) == 'c' || a(0) != a(1)).map(a => a(0)).toArray[Char] :+ ca.last
}
private[this] def isVowel(c: Char) = (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u')
@tailrec
- private[this] def transformations(l: Array[Char], c: Char, r: Array[Char], o: Array[Char]): Array[Char] = {
+ private[this] def transcode(l: Array[Char], c: Char, r: Array[Char], o: Array[Char]): Array[Char] = {
if (c == '\0' && r.length == 0) o
else {
- val lshift = (d: Int) => if (d == 1) l :+ c else (l :+ c) ++ r.take(d - 1)
- val cshift = (d: Int) => if (d > r.length) '\0' else r(d - 1)
- val rshift = (d: Int) => if (d >= r.length) Array.empty[Char] else r.drop(d)
+ val shift = (d: Int, ca: Array[Char]) => {
+ val sa = r.splitAt(d - 1)
- c match {
- case 'a' | 'e' | 'i' | 'o' | 'u' => {
- if (l.length == 0)
- transformations(lshift(1), cshift(1), rshift(1), o :+ c)
- else
- transformations(lshift(1), cshift(1), rshift(1), o)
- }
- case 'f' | 'j' | 'l' | 'm' | 'n' | 'r' => transformations(lshift(1), cshift(1), rshift(1), o :+ c)
- case 'b' => {
- if (
- (l.length >= 1 && l.last == 'm' && r.length == 0)
- )
- transformations(lshift(1), cshift(1), rshift(1), o)
- else
- transformations(lshift(1), cshift(1), rshift(1), o :+ 'b')
- }
- case 'c' => {
- if (
- (r.length >= 1 && r.head == 'h' && l.length >= 1 && l.last == 's')
- )
- transformations(lshift(1), cshift(1), rshift(1), o :+ 'k')
- else if (
- (r.length >= 2 && r.head == 'i' && r.tail.head == 'a')
- )
- transformations(lshift(3), cshift(3), rshift(3), o :+ 'x')
- else if (
- (r.length >= 1 && r.head == 'h') ||
- (l.length >= 1 && r.length >= 1 && l.last == 's' && r.head == 'h')
- )
- transformations(lshift(2), cshift(2), rshift(2), o :+ 'x')
- else if (
- (l.length >= 1 && r.length >= 1 && l.last == 's' && (
- r.head == 'i' ||
- r.head == 'e' ||
- r.head == 'y'
- )
- )
- )
- transformations(lshift(1), cshift(1), rshift(1), o)
- else if (
- (r.length >= 1 && (
- r.head == 'i' ||
- r.head == 'e' ||
- r.head == 'y'
- )
- )
- )
- transformations(lshift(1), cshift(1), rshift(1), o :+ 's')
- else
- transformations(lshift(1), cshift(1), rshift(1), o :+ 'k')
- }
- case 'd' => {
- if (
- (r.length >= 2 && r.head == 'g' && (
- r.tail.head == 'e' ||
- r.tail.head == 'y' ||
- r.tail.head == 'i'
- )
+ (
+ if (sa._1.length > 0) (l :+ c) ++ sa._1 else l :+ c,
+ if (sa._2.length > 0) sa._2.head else '\0',
+ if (sa._2.length > 1) sa._2.tail else Array.empty[Char],
+ ca
+ )
+ }
+
+ val t = {
+ c match {
+ case 'a' | 'e' | 'i' | 'o' | 'u' => if (l.length == 0) shift(1, o:+ c) else shift(1, o)
+ case 'f' | 'j' | 'l' | 'm' | 'n' | 'r' => shift(1, o :+ c)
+ case 'b' => if (l.length >= 1 && l.last == 'm' && r.length == 0) shift(1, o) else shift(1, o :+ 'b')
+ case 'c' => {
+ if (r.length >= 1 && r.head == 'h' && l.length >= 1 && l.last == 's')
+ shift(1, o :+ 'k')
+ else if (r.length >= 2 && r.head == 'i' && r(1) == 'a')
+ shift(3, o :+ 'x')
+ else if (
+ (r.length >= 1 && r.head == 'h') ||
+ (l.length >= 1 && r.length >= 1 && l.last == 's' && r.head == 'h')
)
- )
- transformations(lshift(1), cshift(1), rshift(1), o :+ 'j') // just d
- else
- transformations(lshift(1), cshift(1), rshift(1), o :+ 't')
- }
- case 'g' => {
- if (
- (r.length > 1 && r.head == 'h') ||
- (r.length == 1 && r.head == 'n') ||
- (r.length == 3 && r.head == 'n' && r.tail.head == 'e' && r.tail.tail.head == 'd')
- )
- transformations(lshift(1), cshift(1), rshift(1), o) // just g
- else if (
- (r.length >= 1 && (
- r.head == 'i' ||
- r.head == 'e' ||
- r.head == 'y'
+ shift(2, o :+ 'x')
+ else if (l.length >= 1 && r.length >= 1 && l.last == 's' && (
+ r.head == 'i' || r.head == 'e' || r.head == 'y'
)
)
- )
- transformations(lshift(2), cshift(2), rshift(2), o :+ 'j')
- else
- transformations(lshift(1), cshift(1), rshift(1), o :+ 'k')
- }
- case 'h' => {
- if (
- (l.length >= 1 && isVowel(l.last) && (r.length == 0 || !isVowel(r.head))) ||
- (l.length >= 2 && l.last == 'h' && (
- l(l.length - 2) == 'c' ||
- l(l.length - 2) == 's' ||
- l(l.length - 2) == 'p' ||
- l(l.length - 2) == 't' ||
- l(l.length - 2) == 'g'
+ shift(1, o)
+ else if (r.length >= 1 && (r.head == 'i' || r.head == 'e' || r.head == 'y'))
+ shift(1, o :+ 's')
+ else
+ shift(1, o :+ 'k')
+ }
+ case 'd' => {
+ if (r.length >= 2 && r.head == 'g' && (
+ r(1) == 'e' || r(1) == 'y' || r(1) == 'i'
)
)
- )
- transformations(lshift(1), cshift(1), rshift(1), o)
- else
- transformations(lshift(1), cshift(1), rshift(1), o :+ 'h')
- }
- case 'k' => {
- if (
- (l.length >= 1 && l.last == 'c') // raw
- )
- transformations(lshift(1), cshift(1), rshift(1), o)
- else
- transformations(lshift(1), cshift(1), rshift(1), o :+ 'k')
- }
- case 'p' => {
- if (
- (r.length >= 1 && r.head == 'h')
- )
- transformations(lshift(2), cshift(2), rshift(2), o :+ 'f')
- else
- transformations(lshift(1), cshift(1), rshift(1), o :+ 'p')
- }
- case 'q' => transformations(lshift(1), cshift(1), rshift(1), o :+ 'k')
- case 's' => {
- if (
-
- (r.length >= 2 && r.head == 'i' && (
- r.tail.head == 'o' ||
- r.tail.head == 'a'
- )
+ shift(1, o :+ 'j')
+ else
+ shift(1, o :+ 't')
+ }
+ case 'g' => {
+ if (
+ (r.length > 1 && r.head == 'h') ||
+ (r.length == 1 && r.head == 'n') ||
+ (r.length == 3 && r.head == 'n' && r(1) == 'e' && r(2) == 'd')
)
- )
- transformations(lshift(3), cshift(3), rshift(3), o :+ 'x')
- else if (
- (r.length >= 1 && r.head == 'h')
- )
- transformations(lshift(2), cshift(2), rshift(2), o :+ 'x')
- else
- transformations(lshift(1), cshift(1), rshift(1), o :+ 's')
- }
- case 't' => {
- if (
- (r.length >= 2 && r.head == 'i' && (
- r.tail.head == 'a' ||
- r.tail.head == 'o'
+ shift(1, o)
+ else if (r.length >= 1 && (r.head == 'i' || r.head == 'e' || r.head == 'y'))
+ shift(2, o :+ 'j')
+ else
+ shift(1, o :+ 'k')
+ }
+ case 'h' => {
+ if (
+ (l.length >= 1 && isVowel(l.last) && (r.length == 0 || !isVowel(r.head))) ||
+ (l.length >= 2 && l.last == 'h' && (
+ l(l.length - 2) == 'c' || l(l.length - 2) == 's' || l(l.length - 2) == 'p' ||
+ l(l.length - 2) == 't' || l(l.length - 2) == 'g'
+ )
)
)
- )
- transformations(lshift(3), cshift(3), rshift(3), o :+ 'x')
- else if (r.length >= 1 && r.head == 'h')
- transformations(lshift(2), cshift(2), rshift(2), o :+ '0')
- else if (r.length >= 2 && r.head == 'c' && r.tail.head == 'h')
- transformations(lshift(1), cshift(1), rshift(1), o) // only t
- else
- transformations(lshift(1), cshift(1), rshift(1), o :+ 't')
+ shift(1, o)
+ else
+ shift(1, o :+ 'h')
+ }
+ case 'k' => if (l.length >= 1 && l.last == 'c') shift(1, o) else shift(1, o :+ 'k')
+ case 'p' => if (r.length >= 1 && r.head == 'h') shift(2, o :+ 'f') else shift(1, o :+ 'p')
+ case 'q' => shift(1, o :+ 'k')
+ case 's' => {
+ if (r.length >= 2 && r.head == 'i' && (r(1) == 'o' || r(1) == 'a'))
+ shift(3, o :+ 'x')
+ else if (r.length >= 1 && r.head == 'h')
+ shift(2, o :+ 'x')
+ else
+ shift(1, o :+ 's')
+ }
+ case 't' => {
+ if (r.length >= 2 && r.head == 'i' && (r(1) == 'a' || r(1) == 'o'))
+ shift(3, o :+ 'x')
+ else if (r.length >= 1 && r.head == 'h')
+ shift(2, o :+ '0')
+ else if (r.length >= 2 && r.head == 'c' && r(1) == 'h')
+ shift(1, o)
+ else
+ shift(1, o :+ 't')
+ }
+ case 'v' => shift(1, o :+ 'f')
+ case 'w' | 'y' => if (r.length == 0 || !isVowel(r.head)) shift(1, o) else shift(1, o :+ c)
+ case 'x' => shift(1, (o :+ 'k') :+ 's')
+ case 'z' => shift(1, o :+ 's')
+ case _ => shift(1, o)
}
- case 'v' => transformations(lshift(1), cshift(1), rshift(1), o :+ 'f')
- case 'w' | 'y' => {
- if (r.length == 0 || !isVowel(r.head))
- transformations(lshift(1), cshift(1), rshift(1), o)
- else
- transformations(lshift(1), cshift(1), rshift(1), o :+ c)
- }
- case 'x' => transformations(lshift(1), cshift(1), rshift(1), (o :+ 'k') :+ 's')
- case 'z' => transformations(lshift(1), cshift(1), rshift(1), o :+ 's')
- case _ => transformations(lshift(1), cshift(1), rshift(1), o)
}
+
+ transcode(t._1, t._2, t._3, t._4)
}
}
+
+ private[this] def transcodeHead(ca: Array[Char]) = {
+ val h = ca.take(2).padTo(2, '\0')
+
+ if (
+ (h.head == 'a' && h.last == 'e') ||
+ (h.last == 'n' && (h.head == 'g' || h.head == 'k' || h.head == 'p')) ||
+ (h.head == 'w' && h.last == 'r')
+ )
+ ca.tail
+ else if (h.head == 'w' && h.last == 'h')
+ 'w' +: ca.drop(2)
+ else if (h.head == 'x')
+ 's' +: ca.tail
+ else
+ ca
+ }
} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala
index bd3f9bc..895c1a5 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/MetaphoneMetric.scala
@@ -21,7 +21,6 @@ object MetaphoneMetric extends StringMetric {
}
override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Boolean] = {
- // Unable to perform simple equality check, due to situations where no letters are passed.
compare(
stringFilter.filter(string1.toCharArray),
stringFilter.filter(string2.toCharArray)
diff --git a/core/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala
index 4d69b28..33b285c 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/phonetic/Soundex.scala
@@ -10,20 +10,17 @@ object Soundex extends StringAlgorithm {
if (ca.length == 0) None
else {
- letter(ca, 0) match {
- case Some(l) => {
- if (ca.length - 1 == l._2) Some(Array(l._1, '0', '0', '0'))
- else {
- Some(
- code(
- ca.takeRight(ca.length - (l._2 + 1)),
- l._1, // Pass first letter.
- Array(l._1) // Pass array with first letter.
- ).padTo(4, '0')
- )
- }
- }
- case None => None
+ val fc = ca.head.toLower
+
+ if (fc < 97 || fc > 122) None
+ else {
+ Some(
+ transcode(
+ ca.tail,
+ fc, // Pass first letter.
+ Array(fc) // Pass array with first letter.
+ ).padTo(4, '0')
+ )
}
}
}
@@ -36,18 +33,7 @@ object Soundex extends StringAlgorithm {
}
@tailrec
- private[this] def letter(i: Array[Char], ind: Int): Option[Tuple2[Char, Int]] = {
- if (i.length == 0) None
- else {
- val c = i.head.toLower
-
- if (c >= 97 && c <= 122) Some((c, ind)) else letter(i.tail, ind + 1)
- }
- }
-
- @tailrec
- private[this] def code(i: Array[Char], p: Char, o: Array[Char]): Array[Char] = {
- require(p >= 97 && p <= 122)
+ private[this] def transcode(i: Array[Char], p: Char, o: Array[Char]): Array[Char] = {
require(o.length > 0)
if (i.length == 0) o
@@ -88,7 +74,7 @@ object Soundex extends StringAlgorithm {
if (o.length == 3 && a != '\0') o :+ a
else
- code(i.tail, c, if (a != '\0') o :+ a else o)
+ transcode(i.tail, c, if (a != '\0') o :+ a else o)
}
}
} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala
index 64a3c1a..d5e7eac 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/DiceSorensenMetric.scala
@@ -10,6 +10,7 @@ object DiceSorensenMetric extends StringMetric {
val ca2 = stringFilter.filter(charArray2)
if (ca1.length == 0 || ca2.length == 0) None
+ else if (ca1.sameElements(ca2)) Some(1d)
else {
val b = bigrams(ca1, ca2)
val ms = scoreMatches(b)
@@ -19,12 +20,10 @@ object DiceSorensenMetric extends StringMetric {
}
override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Double] = {
- if (string1.length > 0 && string1.length == string2.length && string1 == string2) Some(1d)
- else
- compare(
- stringFilter.filter(string1.toCharArray),
- stringFilter.filter(string2.toCharArray)
- )(new StringFilterDelegate)
+ compare(
+ stringFilter.filter(string1.toCharArray),
+ stringFilter.filter(string2.toCharArray)
+ )(new StringFilterDelegate)
}
private[this] def bigrams(ct: CompareTuple[Char]): MatchTuple[String] = {
@@ -32,7 +31,7 @@ object DiceSorensenMetric extends StringMetric {
def set(ca: Array[Char], sa: Array[String]): Array[String] = {
if (ca.length <= 1) sa
else
- set(ca.tail, sa :+ "" + ca.head + ca.tail.head)
+ set(ca.tail, sa :+ "" + ca.head + ca(1))
}
(set(ct._1, Array.empty[String]), set(ct._2, Array.empty[String]))
diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/HammingMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/HammingMetric.scala
index 04001f5..2dc1fbc 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/similarity/HammingMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/HammingMetric.scala
@@ -9,16 +9,15 @@ object HammingMetric extends StringMetric {
val ca2 = stringFilter.filter(charArray2)
if (ca1.length == 0 || ca2.length == 0 || ca1.length != ca2.length) None
+ else if (ca1.sameElements(ca2)) Some(0)
else Some(hamming(ca1, ca2))
}
override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Int] = {
- if (string1.length > 0 && string1.length == string2.length && string1 == string2) Some(0)
- else
- compare(
- stringFilter.filter(string1.toCharArray),
- stringFilter.filter(string2.toCharArray)
- )(new StringFilterDelegate)
+ compare(
+ stringFilter.filter(string1.toCharArray),
+ stringFilter.filter(string2.toCharArray)
+ )(new StringFilterDelegate)
}
private[this] def hamming(ct: CompareTuple[Char]) = {
diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroMetric.scala
index 503b497..2ad137f 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroMetric.scala
@@ -14,6 +14,7 @@ object JaroMetric extends StringMetric {
val ca2 = stringFilter.filter(charArray2)
if (ca1.length == 0 || ca2.length == 0) None
+ else if (ca1.sameElements(ca2)) Some(1d)
else {
val mt = `match`((ca1, ca2))
val ms = scoreMatches((mt._1, mt._2))
@@ -26,12 +27,10 @@ object JaroMetric extends StringMetric {
}
override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Double] = {
- if (string1.length > 0 && string1.length == string2.length && string1 == string2) Some(1d)
- else
- compare(
- stringFilter.filter(string1.toCharArray),
- stringFilter.filter(string2.toCharArray)
- )(new StringFilterDelegate)
+ compare(
+ stringFilter.filter(string1.toCharArray),
+ stringFilter.filter(string2.toCharArray)
+ )(new StringFilterDelegate)
}
private[this] def `match`(ct: CompareTuple[Char]): MatchTuple[Char] = {
diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroWinklerMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroWinklerMetric.scala
index 9b797bc..a9f007e 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroWinklerMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/JaroWinklerMetric.scala
@@ -13,6 +13,8 @@ object JaroWinklerMetric extends StringMetric {
val ca2 = stringFilter.filter(charArray2)
JaroMetric.compare(ca1, ca2)(new StringFilterDelegate) match {
+ case Some(0d) => Some(0d)
+ case Some(1d) => Some(1d)
case Some(jaro) => {
val prefix = ca1.zip(ca2).takeWhile(t => t._1 == t._2).map(_._1)
@@ -23,11 +25,9 @@ object JaroWinklerMetric extends StringMetric {
}
override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Double] = {
- if (string1.length > 0 && string1.length == string2.length && string1 == string2) Some(1d)
- else
- compare(
- stringFilter.filter(string1.toCharArray),
- stringFilter.filter(string2.toCharArray)
- )(new StringFilterDelegate)
+ compare(
+ stringFilter.filter(string1.toCharArray),
+ stringFilter.filter(string2.toCharArray)
+ )(new StringFilterDelegate)
}
} \ No newline at end of file
diff --git a/core/source/core/scala/org/hashtree/stringmetric/similarity/LevenshteinMetric.scala b/core/source/core/scala/org/hashtree/stringmetric/similarity/LevenshteinMetric.scala
index 3e6a926..4e58f02 100755
--- a/core/source/core/scala/org/hashtree/stringmetric/similarity/LevenshteinMetric.scala
+++ b/core/source/core/scala/org/hashtree/stringmetric/similarity/LevenshteinMetric.scala
@@ -9,49 +9,47 @@ object LevenshteinMetric extends StringMetric {
val ca2 = stringFilter.filter(charArray2)
if (ca1.length == 0 && ca2.length == 0) None
- else {
- val levenshteinMemoize = Memoize.Y(levenshtein)
-
- Some(levenshteinMemoize(ca1, ca2))
- }
+ else if (ca1.length == 0) Some(ca2.length)
+ else if (ca2.length == 0) Some(ca1.length)
+ else Some(levenshtein(ca1, ca2))
}
override def compare(string1: String, string2: String)(implicit stringFilter: StringFilter): Option[Int] = {
- if (string1.length > 0 && string1.length == string2.length && string1 == string2) Some(0)
- else
- compare(
- stringFilter.filter(string1.toCharArray),
- stringFilter.filter(string2.toCharArray)
- )(new StringFilterDelegate)
+ compare(
+ stringFilter.filter(string1.toCharArray),
+ stringFilter.filter(string2.toCharArray)
+ )(new StringFilterDelegate)
}
- private[this] def levenshtein(f: CompareTuple[Char] => Int)(ct: CompareTuple[Char]): Int = {
- if (ct._1.length == 0) ct._2.length
- else if (ct._2.length == 0) ct._1.length
- else {
- math.min(
- math.min(
- f(ct._1.tail, ct._2) + 1,
- f(ct._1, ct._2.tail) + 1
- ),
- f(ct._1.tail, ct._2.tail) + (if (ct._1.head != ct._2.head) 1 else 0)
- )
+ private[this] def levenshtein(ct: CompareTuple[Char]) = {
+ val m = Array.fill[Int](ct._1.length + 1, ct._2.length + 1)(-1)
+
+ def distance(t: Tuple2[Int, Int]): Int = {
+ t match {
+ case (r, 0) => r
+ case (0, c) => c
+ case (r, c) if m(r)(c) != -1 => m(r)(c)
+ case (r, c) => {
+ val min = {
+ if (ct._1(r - 1) == ct._2(c - 1))
+ distance(r - 1, c - 1)
+ else {
+ math.min(
+ math.min(
+ distance(r - 1, c) + 1, // Delete (left).
+ distance(r, c - 1) + 1 // Insert (up).
+ ),
+ distance(r - 1, c - 1) + 1 // Substitute (left-up).
+ )
+ }
+ }
+
+ m(r)(c) = min
+ min
+ }
+ }
}
- }
- private[this] final class Memoize[-T, +R](f: T => R) extends (T => R) {
- private[this] val map = scala.collection.mutable.Map.empty[T, R]
-
- def apply(k: T): R = map.getOrElseUpdate(k, f(k))
- }
-
- private[this] object Memoize {
- def apply[T, R](f: T => R) = new Memoize(f)
-
- def Y[T, R](f: (T => R) => T => R): (T => R) = {
- lazy val yf: T => R = Memoize(f(yf)(_))
-
- yf
- }
+ distance(ct._1.length, ct._2.length)
}
} \ No newline at end of file
diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala
index 2324298..2cfcc36 100755
--- a/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala
+++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/MetaphoneSpec.scala
@@ -25,24 +25,33 @@ final class MetaphoneSpec extends ScalaTest {
Metaphone.compute("zz").get should equal ("s")
// y
+ Metaphone.compute("y").isDefined should be (false)
Metaphone.compute("zy").get should equal ("s")
Metaphone.compute("zyz").get should equal ("ss")
Metaphone.compute("zya").get should equal ("sy")
// x
+ Metaphone.compute("x").get should equal ("s")
Metaphone.compute("zx").get should equal ("sks")
Metaphone.compute("zxz").get should equal ("skss")
// w
+ Metaphone.compute("w").isDefined should be (false)
Metaphone.compute("zw").get should equal ("s")
Metaphone.compute("zwz").get should equal ("ss")
Metaphone.compute("zwa").get should equal ("sw")
// v
+ Metaphone.compute("v").get should equal ("f")
Metaphone.compute("zv").get should equal ("sf")
Metaphone.compute("zvz").get should equal ("sfs")
+ // u
+ Metaphone.compute("u").get should equal ("u")
+ Metaphone.compute("zu").get should equal ("s")
+
// t
+ Metaphone.compute("t").get should equal ("t")
Metaphone.compute("ztiaz").get should equal ("sxs")
Metaphone.compute("ztioz").get should equal ("sxs")
Metaphone.compute("zthz").get should equal ("s0s")
@@ -50,6 +59,7 @@ final class MetaphoneSpec extends ScalaTest {
Metaphone.compute("ztz").get should equal ("sts")
// s
+ Metaphone.compute("s").get should equal ("s")
Metaphone.compute("zshz").get should equal ("sxs")
Metaphone.compute("zsioz").get should equal ("sxs")
Metaphone.compute("zsiaz").get should equal ("sxs")
@@ -57,45 +67,64 @@ final class MetaphoneSpec extends ScalaTest {
Metaphone.compute("zsz").get should equal ("sss")
// r
+ Metaphone.compute("r").get should equal ("r")
Metaphone.compute("zr").get should equal ("sr")
Metaphone.compute("zrz").get should equal ("srs")
// q
+ Metaphone.compute("q").get should equal ("k")
Metaphone.compute("zq").get should equal ("sk")
Metaphone.compute("zqz").get should equal ("sks")
// p
- Metaphone.compute("zph").get should equal ("sf")
+ Metaphone.compute("p").get should equal ("p")
Metaphone.compute("zp").get should equal ("sp")
+ Metaphone.compute("zph").get should equal ("sf")
Metaphone.compute("zpz").get should equal ("sps")
+ // o
+ Metaphone.compute("o").get should equal ("o")
+ Metaphone.compute("zo").get should equal ("s")
+
// n
+ Metaphone.compute("n").get should equal ("n")
Metaphone.compute("zn").get should equal ("sn")
Metaphone.compute("znz").get should equal ("sns")
// m
+ Metaphone.compute("m").get should equal ("m")
Metaphone.compute("zm").get should equal ("sm")
Metaphone.compute("zmz").get should equal ("sms")
// l
+ Metaphone.compute("l").get should equal ("l")
Metaphone.compute("zl").get should equal ("sl")
Metaphone.compute("zlz").get should equal ("sls")
// k
- Metaphone.compute("zck").get should equal ("sk")
+ Metaphone.compute("k").get should equal ("k")
Metaphone.compute("zk").get should equal ("sk")
+ Metaphone.compute("zck").get should equal ("sk")
// j
+ Metaphone.compute("j").get should equal ("j")
Metaphone.compute("zj").get should equal ("sj")
Metaphone.compute("zjz").get should equal ("sjs")
+ // i
+ Metaphone.compute("i").get should equal ("i")
+ Metaphone.compute("zi").get should equal ("s")
+
// h
+ Metaphone.compute("h").get should equal ("h") // php wrongly says nothing
Metaphone.compute("zh").get should equal ("sh") // php wrongly says s
Metaphone.compute("zah").get should equal ("s")
Metaphone.compute("zchh").get should equal ("sx")
Metaphone.compute("ha").get should equal ("h")
// g
+ Metaphone.compute("g").get should equal ("k")
+ Metaphone.compute("zg").get should equal ("sk")
Metaphone.compute("zgh").get should equal ("skh") // php wrongly says sf
Metaphone.compute("zghz").get should equal ("shs") // php wrongly says sfs
Metaphone.compute("zgha").get should equal ("sh") // php wrongly says sf others wrongly say skh
@@ -109,14 +138,19 @@ final class MetaphoneSpec extends ScalaTest {
Metaphone.compute("zgez").get should equal ("sjs")
Metaphone.compute("zgy").get should equal ("sj")
Metaphone.compute("zgyz").get should equal ("sjs")
- Metaphone.compute("zg").get should equal ("sk")
Metaphone.compute("zgz").get should equal ("sks")
// f
+ Metaphone.compute("f").get should equal ("f")
Metaphone.compute("zf").get should equal ("sf")
Metaphone.compute("zfz").get should equal ("sfs")
+ // e
+ Metaphone.compute("e").get should equal ("e")
+ Metaphone.compute("ze").get should equal ("s")
+
// d
+ Metaphone.compute("d").get should equal ("t")
Metaphone.compute("fudge").get should equal ("fjj") // php wrongly says fj
Metaphone.compute("dodgy").get should equal ("tjj") // php wrongly says tj others wrongly say tjjy
Metaphone.compute("dodgi").get should equal ("tjj") // php wrongly says tj
@@ -124,6 +158,7 @@ final class MetaphoneSpec extends ScalaTest {
Metaphone.compute("zdz").get should equal ("sts")
// c
+ Metaphone.compute("c").get should equal ("k")
Metaphone.compute("zcia").get should equal ("sx")
Metaphone.compute("zciaz").get should equal ("sxs")
Metaphone.compute("zch").get should equal ("sx")
@@ -145,10 +180,15 @@ final class MetaphoneSpec extends ScalaTest {
Metaphone.compute("zcz").get should equal ("sks")
// b
+ Metaphone.compute("b").get should equal ("b")
Metaphone.compute("zb").get should equal ("sb")
Metaphone.compute("zbz").get should equal ("sbs")
Metaphone.compute("zmb").get should equal ("sm")
+ // a
+ Metaphone.compute("a").get should equal ("a")
+ Metaphone.compute("za").get should equal ("s")
+
// Miscellaneous.
Metaphone.compute("dumb").get should equal ("tm")
Metaphone.compute("smith").get should equal ("sm0")
diff --git a/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala
index 13c6bdd..dbefc51 100755
--- a/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala
+++ b/core/source/test/scala/org/hashtree/stringmetric/phonetic/SoundexSpec.scala
@@ -43,6 +43,7 @@ final class SoundexSpec extends ScalaTest {
Soundex.compute("kant").get should equal ("k530")
Soundex.compute("ladd").get should equal ("l300")
Soundex.compute("lissajous").get should equal ("l222")
+ Soundex.compute("x123").get should equal ("x000")
}
}
}
diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/JaroMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/JaroMetricSpec.scala
index 8fd12b1..318f241 100755
--- a/core/source/test/scala/org/hashtree/stringmetric/similarity/JaroMetricSpec.scala
+++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/JaroMetricSpec.scala
@@ -28,17 +28,17 @@ final class JaroMetricSpec extends ScalaTest {
}
"valid arguments" should returns {
"Double indicating distance" in {
- JaroWinklerMetric.compare("aa", "a").get should be (0.8500000000000001)
- JaroWinklerMetric.compare("a", "aa").get should be (0.8500000000000001)
- JaroWinklerMetric.compare("veryveryverylong", "v").get should be (0.71875)
- JaroWinklerMetric.compare("v", "veryveryverylong").get should be (0.71875)
- JaroWinklerMetric.compare("martha", "marhta").get should be (0.9611111111111111)
- JaroWinklerMetric.compare("dwayne", "duane").get should be (0.8400000000000001)
- JaroWinklerMetric.compare("dixon", "dicksonx").get should be (0.8133333333333332)
- JaroWinklerMetric.compare("abcvwxyz", "cabvwxyz").get should be (0.9583333333333334)
- JaroWinklerMetric.compare("jones", "johnson").get should be (0.8323809523809523)
- JaroWinklerMetric.compare("henka", "henkan").get should be (0.9666666666666667)
- JaroWinklerMetric.compare("fvie", "ten").get should be (0)
+ JaroMetric.compare("aa", "a").get should be (0.8333333333333334)
+ JaroMetric.compare("a", "aa").get should be (0.8333333333333334)
+ JaroMetric.compare("veryveryverylong", "v").get should be (0.6875)
+ JaroMetric.compare("v", "veryveryverylong").get should be (0.6875)
+ JaroMetric.compare("martha", "marhta").get should be (0.9444444444444445)
+ JaroMetric.compare("dwayne", "duane").get should be (0.8222222222222223)
+ JaroMetric.compare("dixon", "dicksonx").get should be (0.7666666666666666)
+ JaroMetric.compare("abcvwxyz", "cabvwxyz").get should be (0.9583333333333334)
+ JaroMetric.compare("jones", "johnson").get should be (0.7904761904761904)
+ JaroMetric.compare("henka", "henkan").get should be (0.9444444444444445)
+ JaroMetric.compare("fvie", "ten").get should be (0)
JaroMetric.compare("zac ephron", "zac efron").get should be >
JaroMetric.compare("zac ephron", "kai ephron").get
diff --git a/core/source/test/scala/org/hashtree/stringmetric/similarity/LevenshteinMetricSpec.scala b/core/source/test/scala/org/hashtree/stringmetric/similarity/LevenshteinMetricSpec.scala
index 53ac7e6..6091cf5 100755
--- a/core/source/test/scala/org/hashtree/stringmetric/similarity/LevenshteinMetricSpec.scala
+++ b/core/source/test/scala/org/hashtree/stringmetric/similarity/LevenshteinMetricSpec.scala
@@ -31,8 +31,18 @@ final class LevenshteinMetricSpec extends ScalaTest {
LevenshteinMetric.compare("a", "abc").get should be (2)
LevenshteinMetric.compare("abc", "c").get should be (2)
LevenshteinMetric.compare("c", "abc").get should be (2)
+ LevenshteinMetric.compare("sitting", "kitten").get should be (3)
LevenshteinMetric.compare("kitten", "sitting").get should be (3)
+ LevenshteinMetric.compare("cake", "drake").get should be (2)
LevenshteinMetric.compare("drake", "cake").get should be (2)
+ LevenshteinMetric.compare("saturday", "sunday").get should be (3)
+ LevenshteinMetric.compare("sunday", "saturday").get should be (3)
+ LevenshteinMetric.compare("book", "back").get should be (2)
+ LevenshteinMetric.compare("dog", "fog").get should be (1)
+ LevenshteinMetric.compare("foq", "fog").get should be (1)
+ LevenshteinMetric.compare("fvg", "fog").get should be (1)
+ LevenshteinMetric.compare("encyclopedia", "encyclopediaz").get should be (1)
+ LevenshteinMetric.compare("encyclopediz", "encyclopediaz").get should be (1)
}
}
}