diff options
Diffstat (limited to 'stringmetric-core/source/core/scala')
46 files changed, 1719 insertions, 0 deletions
diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Algorithm.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Algorithm.scala new file mode 100755 index 0000000..10bc2cd --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Algorithm.scala @@ -0,0 +1,5 @@ +package com.rockymadden.stringmetric + +trait Algorithm[A, B, C] { + def compute(a: A)(implicit b: B): Option[C] +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Alphabet.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Alphabet.scala new file mode 100755 index 0000000..d2ede81 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Alphabet.scala @@ -0,0 +1,55 @@ +package com.rockymadden.stringmetric + +import scala.collection.immutable.Set + +object Alphabet { + protected sealed abstract class AlphabetSet { + protected[Alphabet] val Chars: Set[Char] + + def isSuperset(char: Char): Boolean = Chars.contains(char) + + def isSuperset(charArray: Array[Char]): Boolean = + charArray.length > 0 && charArray.takeWhile(Chars.contains(_)).length == charArray.length + + def isSuperset(string: String): Boolean = isSuperset(string.toCharArray) + } + + case object LowercaseConsonant extends AlphabetSet { + override protected[Alphabet] final val Chars = + Set('b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x' ,'z') + } + case object UppercaseConsonant extends AlphabetSet { + override protected[Alphabet] final val Chars = + Set('B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X' ,'Z') + } + case object Consonant extends AlphabetSet { + override protected[Alphabet] final val Chars = LowercaseConsonant.Chars ++ UppercaseConsonant.Chars + } + case object LowercaseVowel extends AlphabetSet { + override protected[Alphabet] final val Chars = Set('a', 'e', 'i', 'o', 'u') + } + case object UppercaseVowel extends AlphabetSet { + override protected[Alphabet] final val Chars = Set('A', 'E', 'I', 'O', 'U') + } + case object Vowel extends AlphabetSet { + override protected[Alphabet] final val Chars = LowercaseVowel.Chars ++ UppercaseVowel.Chars + } + case object LowercaseY extends AlphabetSet { + override protected[Alphabet] final val Chars = Set('y') + } + case object UppercaseY extends AlphabetSet { + override protected[Alphabet] final val Chars = Set('Y') + } + case object Y extends AlphabetSet { + override protected[Alphabet] final val Chars = LowercaseY.Chars ++ UppercaseY.Chars + } + case object LowercaseAlpha extends AlphabetSet { + override protected[Alphabet] final val Chars = LowercaseConsonant.Chars ++ LowercaseVowel.Chars ++ LowercaseY.Chars + } + case object UppercaseAlpha extends AlphabetSet { + override protected[Alphabet] final val Chars = UppercaseConsonant.Chars ++ UppercaseVowel.Chars ++ UppercaseY.Chars + } + case object Alpha extends AlphabetSet { + override protected[Alphabet] final val Chars = LowercaseAlpha.Chars ++ UppercaseAlpha.Chars + } +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Filter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Filter.scala new file mode 100755 index 0000000..2a02f6b --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Filter.scala @@ -0,0 +1,5 @@ +package com.rockymadden.stringmetric + +trait Filter[A] extends Filterable[A] { + override def filter(a: A): A = a +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Filterable.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Filterable.scala new file mode 100755 index 0000000..77dc0bf --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Filterable.scala @@ -0,0 +1,5 @@ +package com.rockymadden.stringmetric + +trait Filterable[A] { + def filter(a: A): A +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Metric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Metric.scala new file mode 100755 index 0000000..6862321 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Metric.scala @@ -0,0 +1,5 @@ +package com.rockymadden.stringmetric + +trait Metric[A, B, C] { + def compare(a1: A, a2: A)(implicit b: B): Option[C] +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringAlgorithm.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringAlgorithm.scala new file mode 100755 index 0000000..0d194da --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringAlgorithm.scala @@ -0,0 +1,42 @@ +package com.rockymadden.stringmetric + +trait StringAlgorithm[A, B] extends Algorithm[String, A, B] { + def compute(charArray: Array[Char])(implicit a: A): Option[Array[Char]] +} + +object StringAlgorithm { + type Metaphone = com.rockymadden.stringmetric.phonetic.MetaphoneAlgorithm + val Metaphone = com.rockymadden.stringmetric.phonetic.MetaphoneAlgorithm + + type Nysiis = com.rockymadden.stringmetric.phonetic.NysiisAlgorithm + val Nysiis = com.rockymadden.stringmetric.phonetic.NysiisAlgorithm + + type RefinedNysiis = com.rockymadden.stringmetric.phonetic.RefinedNysiisAlgorithm + val RefinedNysiis = com.rockymadden.stringmetric.phonetic.RefinedNysiisAlgorithm + + type RefinedSoundex = com.rockymadden.stringmetric.phonetic.RefinedSoundexAlgorithm + val RefinedSoundex = com.rockymadden.stringmetric.phonetic.RefinedSoundexAlgorithm + + type Soundex = com.rockymadden.stringmetric.phonetic.SoundexAlgorithm + val Soundex = com.rockymadden.stringmetric.phonetic.SoundexAlgorithm + + def computeWithMetaphone(charArray: Array[Char]) = Metaphone.compute(charArray) + + def computeWithMetaphone(string: String) = Metaphone.compute(string) + + def computeWithNysiis(charArray: Array[Char]) = Nysiis.compute(charArray) + + def computeWithNysiis(string: String) = Nysiis.compute(string) + + def computeWithRefinedNysiis(charArray: Array[Char]) = RefinedNysiis.compute(charArray) + + def computeWithRefinedNysiis(string: String) = RefinedNysiis.compute(string) + + def computeWithRefinedSoundex(charArray: Array[Char]) = RefinedSoundex.compute(charArray) + + def computeWithRefinedSoundex(string: String) = RefinedSoundex.compute(string) + + def computeWithSoundex(charArray: Array[Char]) = Soundex.compute(charArray) + + def computeWithSoundex(string: String) = Soundex.compute(string) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringFilter.scala new file mode 100755 index 0000000..1430d34 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringFilter.scala @@ -0,0 +1,45 @@ +package com.rockymadden.stringmetric + +import com.rockymadden.stringmetric.filter.StringFilterDelegate + +trait StringFilter extends Filter[String] with StringFilterable { + override def filter(charArray: Array[Char]): Array[Char] = charArray +} + +object StringFilter { + type AsciiControl = com.rockymadden.stringmetric.filter.AsciiControlFilter + lazy val asciiControl = new StringFilterDelegate with AsciiControl + + type AsciiControlOnly = com.rockymadden.stringmetric.filter.AsciiControlOnlyFilter + lazy val asciiControlOnly = new StringFilterDelegate with AsciiControlOnly + + type AsciiLetterNumber = com.rockymadden.stringmetric.filter.AsciiLetterNumberFilter + lazy val asciiLetterNumber = new StringFilterDelegate with AsciiLetterNumber + + type AsciiLetterNumberOnly = com.rockymadden.stringmetric.filter.AsciiLetterNumberOnlyFilter + lazy val asciiLetterNumberOnly = new StringFilterDelegate with AsciiLetterNumberOnly + + type AsciiLetter = com.rockymadden.stringmetric.filter.AsciiLetterFilter + lazy val asciiLetter = new StringFilterDelegate with AsciiLetter + + type AsciiLetterOnly = com.rockymadden.stringmetric.filter.AsciiLetterOnlyFilter + lazy val asciiLetterOnly = new StringFilterDelegate with AsciiLetterOnly + + type AsciiNumber = com.rockymadden.stringmetric.filter.AsciiNumberFilter + lazy val asciiNumber = new StringFilterDelegate with AsciiNumber + + type AsciiNumberOnly = com.rockymadden.stringmetric.filter.AsciiNumberOnlyFilter + lazy val asciiNumberOnly = new StringFilterDelegate with AsciiNumberOnly + + type AsciiSpace = com.rockymadden.stringmetric.filter.AsciiSpaceFilter + lazy val asciiSpace = new StringFilterDelegate with AsciiSpace + + type AsciiSymbol = com.rockymadden.stringmetric.filter.AsciiSymbolFilter + lazy val asciiSymbol = new StringFilterDelegate with AsciiSymbol + + type AsciiSymbolOnly = com.rockymadden.stringmetric.filter.AsciiSymbolOnlyFilter + lazy val asciiSymbolOnly = new StringFilterDelegate with AsciiSymbolOnly + + type IgnoreAsciiLetterCase = com.rockymadden.stringmetric.filter.IgnoreAsciiLetterCaseFilter + lazy val ignoreAsciiLetterCase = new StringFilterDelegate with IgnoreAsciiLetterCase +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringFilterable.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringFilterable.scala new file mode 100755 index 0000000..d639dfb --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringFilterable.scala @@ -0,0 +1,5 @@ +package com.rockymadden.stringmetric + +trait StringFilterable extends Filterable[String] { + def filter(charArray: Array[Char]): Array[Char] +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringMetric.scala new file mode 100755 index 0000000..212f76d --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringMetric.scala @@ -0,0 +1,120 @@ +package com.rockymadden.stringmetric + +trait StringMetric[A, B] extends Metric[String, A, B] { + def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit a: A): Option[B] +} + +object StringMetric { + type DiceSorensen = com.rockymadden.stringmetric.similarity.DiceSorensenMetric + val DiceSorensen = com.rockymadden.stringmetric.similarity.DiceSorensenMetric + + type Hamming = com.rockymadden.stringmetric.similarity.HammingMetric + val Hamming = com.rockymadden.stringmetric.similarity.HammingMetric + + type Jaccard = com.rockymadden.stringmetric.similarity.JaccardMetric + val Jaccard = com.rockymadden.stringmetric.similarity.JaccardMetric + + type Jaro = com.rockymadden.stringmetric.similarity.JaroMetric + val Jaro = com.rockymadden.stringmetric.similarity.JaroMetric + + type JaroWinkler = com.rockymadden.stringmetric.similarity.JaroWinklerMetric + val JaroWinkler = com.rockymadden.stringmetric.similarity.JaroWinklerMetric + + type Levenshtein = com.rockymadden.stringmetric.similarity.LevenshteinMetric + val Levenshtein = com.rockymadden.stringmetric.similarity.LevenshteinMetric + + type Metaphone = com.rockymadden.stringmetric.phonetic.MetaphoneMetric + val Metaphone = com.rockymadden.stringmetric.phonetic.MetaphoneMetric + + type NGram = com.rockymadden.stringmetric.similarity.NGramMetric + val NGram = com.rockymadden.stringmetric.similarity.NGramMetric + + type Nysiis = com.rockymadden.stringmetric.phonetic.NysiisMetric + val Nysiis = com.rockymadden.stringmetric.phonetic.NysiisMetric + + type Overlap = com.rockymadden.stringmetric.similarity.OverlapMetric + val Overlap = com.rockymadden.stringmetric.similarity.OverlapMetric + + type RefinedNysiis = com.rockymadden.stringmetric.phonetic.RefinedNysiisMetric + val RefinedNysiis = com.rockymadden.stringmetric.phonetic.RefinedNysiisMetric + + type RefinedSoundex = com.rockymadden.stringmetric.phonetic.RefinedSoundexMetric + val RefinedSoundex = com.rockymadden.stringmetric.phonetic.RefinedSoundexMetric + + type Soundex = com.rockymadden.stringmetric.phonetic.SoundexMetric + val Soundex = com.rockymadden.stringmetric.phonetic.SoundexMetric + + type WeightedLevenshtein = com.rockymadden.stringmetric.similarity.WeightedLevenshteinMetric + val WeightedLevenshtein = com.rockymadden.stringmetric.similarity.WeightedLevenshteinMetric + + def compareWithDiceSorensen(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = + DiceSorensen.compare(charArray1, charArray2)(n) + + def compareWithDiceSorensen(string1: String, string2: String)(n: Int) = DiceSorensen.compare(string1, string2)(n) + + def compareWithHamming(charArray1: Array[Char], charArray2: Array[Char]) = Hamming.compare(charArray1, charArray2) + + def compareWithHamming(string1: String, string2: String)= Hamming.compare(string1, string2) + + def compareWithJaccard(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = + Jaccard.compare(charArray1, charArray2)(n) + + def compareWithJaccard(string1: String, string2: String)(n: Int) = Jaccard.compare(string1, string2)(n) + + def compareWithJaro(charArray1: Array[Char], charArray2: Array[Char]) = Jaro.compare(charArray1, charArray2) + + def compareWithJaro(string1: String, string2: String) = Jaro.compare(string1, string2) + + def compareWithJaroWinkler(charArray1: Array[Char], charArray2: Array[Char]) = + JaroWinkler.compare(charArray1, charArray2) + + def compareWithJaroWinkler(string1: String, string2: String) = JaroWinkler.compare(string1, string2) + + def compareWithLevenshtein(charArray1: Array[Char], charArray2: Array[Char]) = + Levenshtein.compare(charArray1, charArray2) + + def compareWithLevenshtein(string1: String, string2: String) = Levenshtein.compare(string1, string2) + + def compareWithMetaphone(charArray1: Array[Char], charArray2: Array[Char]) = + Metaphone.compare(charArray1, charArray2) + + def compareWithMetaphone(string1: String, string2: String) = Metaphone.compare(string1, string2) + + def compareWithNGram(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = + NGram.compare(charArray1, charArray2)(n) + + def compareWithNGram(string1: String, string2: String)(n: Int) = NGram.compare(string1, string2)(n) + + def compareWithNysiis(charArray1: Array[Char], charArray2: Array[Char]) = Nysiis.compare(charArray1, charArray2) + + def compareWithNysiis(string1: String, string2: String) = Nysiis.compare(string1, string2) + + def compareWithOverlap(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = + Overlap.compare(charArray1, charArray2)(n) + + def compareWithOverlap(string1: String, string2: String)(n: Int) = Overlap.compare(string1, string2)(n) + + def compareWithRefinedNysiis(charArray1: Array[Char], charArray2: Array[Char]) = + RefinedNysiis.compare(charArray1, charArray2) + + def compareWithRefinedNysiis(string1: String, string2: String) = RefinedNysiis.compare(string1, string2) + + def compareWithRefinedSoundex(charArray1: Array[Char], charArray2: Array[Char]) = + RefinedSoundex.compare(charArray1, charArray2) + + def compareWithRefinedSoundex(string1: String, string2: String) = RefinedSoundex.compare(string1, string2) + + def compareWithSoundex(charArray1: Array[Char], charArray2: Array[Char]) = Soundex.compare(charArray1, charArray2) + + def compareWithSoundex(string1: String, string2: String) = Soundex.compare(string1, string2) + + def compareWithWeightedLevenshtein(charArray1: Array[Char], charArray2: Array[Char]) + (options: (BigDecimal, BigDecimal, BigDecimal)) = + + WeightedLevenshtein.compare(charArray1, charArray2)(options) + + def compareWithWeightedLevenshtein(string1: String, string2: String) + (options: (BigDecimal, BigDecimal, BigDecimal)) = + + WeightedLevenshtein.compare(string1, string2)(options) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala new file mode 100755 index 0000000..bef56d9 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala @@ -0,0 +1,14 @@ +package com.rockymadden.stringmetric + +trait StringTokenizer[A, B] extends Tokenizer[String, A, B] { + def tokenize(charArray: Array[Char])(implicit a: A): Option[Array[Array[Char]]] +} + +object StringTokenizer { + type NGram = com.rockymadden.stringmetric.tokenization.NGramTokenizer + val NGram = com.rockymadden.stringmetric.tokenization.NGramTokenizer + + def tokenizeWithNGram(charArray: Array[Char])(n: Int) = NGram.tokenize(charArray)(n) + + def tokenizeWithNGram(string: String)(n: Int) = NGram.tokenize(string)(n) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala new file mode 100755 index 0000000..c9edae5 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala @@ -0,0 +1,5 @@ +package com.rockymadden.stringmetric + +trait Tokenizer[A, B, C] { + def tokenize(a: A)(implicit b: B): Option[C] +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiControlFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiControlFilter.scala new file mode 100755 index 0000000..bd45ecf --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiControlFilter.scala @@ -0,0 +1,11 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +/** Ensures ASCII controls do not matter. */ +trait AsciiControlFilter extends StringFilter { + abstract override def filter(charArray: Array[Char]): Array[Char] = + super.filter(charArray.filter(c => !(c <= 31 || c == 127))) + + abstract override def filter(string: String): String = filter(string.toCharArray).mkString +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiControlOnlyFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiControlOnlyFilter.scala new file mode 100755 index 0000000..c08b686 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiControlOnlyFilter.scala @@ -0,0 +1,11 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +/** Ensures only ASCII control characters matter. */ +trait AsciiControlOnlyFilter extends StringFilter { + abstract override def filter(charArray: Array[Char]): Array[Char] = + super.filter(charArray.filter(c => (c <= 31 || c == 127))) + + abstract override def filter(string: String): String = filter(string.toCharArray).mkString +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiLetterFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiLetterFilter.scala new file mode 100755 index 0000000..24509cb --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiLetterFilter.scala @@ -0,0 +1,11 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +/** Ensures ASCII letters do not matter. */ +trait AsciiLetterFilter extends StringFilter { + abstract override def filter(charArray: Array[Char]): Array[Char] = + super.filter(charArray.filter(c => !((c >= 65 && c <= 90 ) || (c >= 97 && c <= 122)))) + + abstract override def filter(string: String): String = filter(string.toCharArray).mkString +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiLetterNumberFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiLetterNumberFilter.scala new file mode 100755 index 0000000..e17c715 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiLetterNumberFilter.scala @@ -0,0 +1,15 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +/** Ensures ASCII letters and numbers do not matter. */ +trait AsciiLetterNumberFilter extends StringFilter { + abstract override def filter(charArray: Array[Char]): Array[Char] = + super.filter( + charArray.filter(c => + !((c >= 48 && c <= 57 ) || (c >= 65 && c <= 90 ) || (c >= 97 && c <= 122)) + ) + ) + + abstract override def filter(string: String): String = filter(string.toCharArray).mkString +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiLetterNumberOnlyFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiLetterNumberOnlyFilter.scala new file mode 100755 index 0000000..7cf97ba --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiLetterNumberOnlyFilter.scala @@ -0,0 +1,15 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +/** Ensures only ASCII letters and numbers matter. */ +trait AsciiLetterNumberOnlyFilter extends StringFilter { + abstract override def filter(charArray: Array[Char]): Array[Char] = + super.filter( + charArray.filter(c => + ((c >= 48 && c <= 57 ) || (c >= 65 && c <= 90 ) || (c >= 97 && c <= 122)) + ) + ) + + abstract override def filter(string: String): String = filter(string.toCharArray).mkString +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiLetterOnlyFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiLetterOnlyFilter.scala new file mode 100755 index 0000000..70032d9 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiLetterOnlyFilter.scala @@ -0,0 +1,11 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +/** Ensures only ASCII letters matter. */ +trait AsciiLetterOnlyFilter extends StringFilter { + abstract override def filter(charArray: Array[Char]): Array[Char] = + super.filter(charArray.filter(c => ((c >= 65 && c <= 90 ) || (c >= 97 && c <= 122)))) + + abstract override def filter(string: String): String = filter(string.toCharArray).mkString +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiNumberFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiNumberFilter.scala new file mode 100755 index 0000000..42fe77e --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiNumberFilter.scala @@ -0,0 +1,11 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +/** Ensures ASCII numbers do not matter. */ +trait AsciiNumberFilter extends StringFilter { + abstract override def filter(charArray: Array[Char]): Array[Char] = + super.filter(charArray.filter(c => !(c >= 48 && c <= 57))) + + abstract override def filter(string: String): String = filter(string.toCharArray).mkString +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiNumberOnlyFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiNumberOnlyFilter.scala new file mode 100755 index 0000000..3f17099 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiNumberOnlyFilter.scala @@ -0,0 +1,11 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +/** Ensures only ASCII numbers matter. */ +trait AsciiNumberOnlyFilter extends StringFilter { + abstract override def filter(charArray: Array[Char]): Array[Char] = + super.filter(charArray.filter(c => (c >= 48 && c <= 57 ))) + + abstract override def filter(string: String): String = filter(string.toCharArray).mkString +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiSpaceFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiSpaceFilter.scala new file mode 100755 index 0000000..538107d --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiSpaceFilter.scala @@ -0,0 +1,10 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +/** Ensures ASCII spaces do not matter. */ +trait AsciiSpaceFilter extends StringFilter { + abstract override def filter(charArray: Array[Char]): Array[Char] = super.filter(charArray.filter(_ != ' ')) + + abstract override def filter(string: String): String = filter(string.toCharArray).mkString +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiSymbolFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiSymbolFilter.scala new file mode 100755 index 0000000..7b0c810 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiSymbolFilter.scala @@ -0,0 +1,15 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +/** Ensures ASCII symbols do not matter. */ +trait AsciiSymbolFilter extends StringFilter { + abstract override def filter(charArray: Array[Char]): Array[Char] = + super.filter( + charArray.filter(c => + !((c >= 32 && c <= 47) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126)) + ) + ) + + abstract override def filter(string: String): String = filter(string.toCharArray).mkString +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiSymbolOnlyFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiSymbolOnlyFilter.scala new file mode 100755 index 0000000..5cb5e94 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/AsciiSymbolOnlyFilter.scala @@ -0,0 +1,15 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +/** Ensures only ASCII symbols matter. */ +trait AsciiSymbolOnlyFilter extends StringFilter { + abstract override def filter(charArray: Array[Char]): Array[Char] = + super.filter( + charArray.filter(c => + ((c >= 32 && c <= 47) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126)) + ) + ) + + abstract override def filter(string: String): String = filter(string.toCharArray).mkString +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/IgnoreAsciiLetterCaseFilter.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/IgnoreAsciiLetterCaseFilter.scala new file mode 100755 index 0000000..54fe66f --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/IgnoreAsciiLetterCaseFilter.scala @@ -0,0 +1,11 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +/** Ensures ASCII letter case-sensitivity does not matter. */ +trait IgnoreAsciiLetterCaseFilter extends StringFilter { + abstract override def filter(charArray: Array[Char]): Array[Char] = + super.filter(charArray.map(c => if (c >= 65 && c <= 90) (c + 32).toChar else c)) + + abstract override def filter(string: String): String = filter(string.toCharArray).mkString +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/StringFilterDelegate.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/StringFilterDelegate.scala new file mode 100755 index 0000000..8ece42d --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/filter/StringFilterDelegate.scala @@ -0,0 +1,9 @@ +package com.rockymadden.stringmetric.filter + +import com.rockymadden.stringmetric.StringFilter + +class StringFilterDelegate extends StringFilter { + override def filter(charArray: Array[Char]): Array[Char] = charArray + + override def filter(string: String): String = string +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/package.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/package.scala new file mode 100755 index 0000000..6752f4d --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/package.scala @@ -0,0 +1,7 @@ +package com.rockymadden + +package object stringmetric { + type CompareTuple[T] = (Array[T], Array[T]) + + type MatchTuple[T] = (Array[T], Array[T]) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/MetaphoneAlgorithm.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/MetaphoneAlgorithm.scala new file mode 100755 index 0000000..c580fd3 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/MetaphoneAlgorithm.scala @@ -0,0 +1,122 @@ +package com.rockymadden.stringmetric.phonetic + +import com.rockymadden.stringmetric.{StringAlgorithm, StringFilter} +import com.rockymadden.stringmetric.Alphabet.{Alpha, LowercaseVowel} +import scala.annotation.{switch, tailrec} + +/** An implementation of the Metaphone algorithm. */ +class MetaphoneAlgorithm extends StringAlgorithm[DummyImplicit, String] { this: StringFilter => + final override def compute(charArray: Array[Char])(implicit di: DummyImplicit): Option[Array[Char]] = { + val fca = filter(charArray) + + if (fca.length == 0 || !(Alpha isSuperset fca.head)) None + else { + val th = deduplicate(transcodeHead(fca.map(_.toLower))) + val t = transcode(Array.empty[Char], th.head, th.tail, Array.empty[Char]) + + if (t.length == 0) None else Some(t) // Single Y or W would have 0 length. + } + } + + final override def compute(string: String)(implicit di: DummyImplicit): Option[String] = + compute(string.toCharArray).map(_.mkString) + + private[this] def deduplicate(ca: Array[Char]) = + if (ca.length <= 1) ca + else ca.sliding(2).withFilter(a => a(0) == 'c' || a(0) != a(1)).map(a => a(0)).toArray[Char] :+ ca.last + + @tailrec + private[this] def transcode(l: Array[Char], c: Char, r: Array[Char], o: Array[Char]): Array[Char] = { + if (c == '\0' && r.length == 0) o + else { + def shift(d: Int, ca: Array[Char]) = { + val sca = r.splitAt(d - 1) + + ( + if (sca._1.length > 0) (l :+ c) ++ sca._1 else l :+ c, + if (sca._2.length > 0) sca._2.head else '\0', + if (sca._2.length > 1) sca._2.tail else Array.empty[Char], + ca + ) + } + + val t = { + (c: @switch) match { + case 'a' | 'e' | 'i' | 'o' | 'u' => if (l.length == 0) shift(1, o:+ c) else shift(1, o) + case 'f' | 'j' | 'l' | 'm' | 'n' | 'r' => shift(1, o :+ c) + case 'b' => if (l.length >= 1 && l.last == 'm' && r.length == 0) shift(1, o) else shift(1, o :+ 'b') + case 'c' => + if (r.length >= 1 && r.head == 'h' && l.length >= 1 && l.last == 's') shift(1, o :+ 'k') + else if (r.length >= 2 && r.head == 'i' && r(1) == 'a') shift(3, o :+ 'x') + else if ((r.length >= 1 && r.head == 'h') + || (l.length >= 1 && r.length >= 1 && l.last == 's' && r.head == 'h')) shift(2, o :+ 'x') + else if (l.length >= 1 && r.length >= 1 && l.last == 's' + && (r.head == 'i' || r.head == 'e' || r.head == 'y')) shift(1, o) + else if (r.length >= 1 && (r.head == 'i' || r.head == 'e' || r.head == 'y')) shift(1, o :+ 's') + else shift(1, o :+ 'k') + case 'd' => + if (r.length >= 2 && r.head == 'g' + && (r(1) == 'e' || r(1) == 'y' || r(1) == 'i')) shift(1, o :+ 'j') + else shift(1, o :+ 't') + case 'g' => + if ((r.length > 1 && r.head == 'h') + || (r.length == 1 && r.head == 'n') + || (r.length == 3 && r.head == 'n' && r(1) == 'e' && r(2) == 'd')) shift(1, o) + else if (r.length >= 1 && (r.head == 'i' || r.head == 'e' || r.head == 'y')) shift(2, o :+ 'j') + else shift(1, o :+ 'k') + case 'h' => + if ((l.length >= 1 && (LowercaseVowel isSuperset l.last) && (r.length == 0 || !(LowercaseVowel isSuperset r.head))) + || (l.length >= 2 && l.last == 'h' + && (l(l.length - 2) == 'c' || l(l.length - 2) == 's' || l(l.length - 2) == 'p' + || l(l.length - 2) == 't' || l(l.length - 2) == 'g'))) shift(1, o) + else shift(1, o :+ 'h') + case 'k' => if (l.length >= 1 && l.last == 'c') shift(1, o) else shift(1, o :+ 'k') + case 'p' => if (r.length >= 1 && r.head == 'h') shift(2, o :+ 'f') else shift(1, o :+ 'p') + case 'q' => shift(1, o :+ 'k') + case 's' => + if (r.length >= 2 && r.head == 'i' && (r(1) == 'o' || r(1) == 'a')) shift(3, o :+ 'x') + else if (r.length >= 1 && r.head == 'h') shift(2, o :+ 'x') + else shift(1, o :+ 's') + case 't' => + if (r.length >= 2 && r.head == 'i' && (r(1) == 'a' || r(1) == 'o')) shift(3, o :+ 'x') + else if (r.length >= 1 && r.head == 'h') shift(2, o :+ '0') + else if (r.length >= 2 && r.head == 'c' && r(1) == 'h') shift(1, o) + else shift(1, o :+ 't') + case 'v' => shift(1, o :+ 'f') + case 'w' | 'y' => if (r.length == 0 || !(LowercaseVowel isSuperset r.head)) shift(1, o) else shift(1, o :+ c) + case 'x' => shift(1, (o :+ 'k') :+ 's') + case 'z' => shift(1, o :+ 's') + case _ => shift(1, o) + } + } + + transcode(t._1, t._2, t._3, t._4) + } + } + + private[this] def transcodeHead(ca: Array[Char]) = { + (ca.length: @switch) match { + case 0 => ca + case 1 => if (ca.head == 'x') Array('s') else ca + case _ => + (ca.head: @switch) match { + case 'a' if (ca(1) == 'e') => ca.tail + case 'g' | 'k' | 'p' if (ca(1) == 'n') => ca.tail + case 'w' if (ca(1) == 'r') => ca.tail + case 'w' if (ca(1) == 'h') => 'w' +: ca.drop(2) + case 'x' => 's' +: ca.tail + case _ => ca + } + } + } +} + +object MetaphoneAlgorithm { + private lazy val self = apply() + + def apply(): MetaphoneAlgorithm = new MetaphoneAlgorithm with StringFilter + + def compute(charArray: Array[Char]) = self.compute(charArray) + + def compute(string: String) = self.compute(string) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/MetaphoneMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/MetaphoneMetric.scala new file mode 100755 index 0000000..2975ad3 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/MetaphoneMetric.scala @@ -0,0 +1,32 @@ +package com.rockymadden.stringmetric.phonetic + +import com.rockymadden.stringmetric.{StringFilter, StringMetric} +import com.rockymadden.stringmetric.Alphabet.Alpha + +/** An implementation of the Metaphone metric. */ +class MetaphoneMetric extends StringMetric[DummyImplicit, Boolean] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit di: DummyImplicit): Option[Boolean] = { + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length == 0 || !(Alpha isSuperset fca1.head) || fca2.length == 0 || !(Alpha isSuperset fca2.head)) None + else MetaphoneAlgorithm.compute(fca1).filter(_.length > 0).flatMap(mp1 => + MetaphoneAlgorithm.compute(fca2).filter(_.length > 0).map(mp1.sameElements(_)) + ) + } + + final override def compare(string1: String, string2: String)(implicit di: DummyImplicit): Option[Boolean] = + compare(string1.toCharArray, string2.toCharArray) +} + +object MetaphoneMetric { + private lazy val self = apply() + + def apply(): MetaphoneMetric = new MetaphoneMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char]) = self.compare(charArray1, charArray2) + + def compare(string1: String, string2: String) = self.compare(string1, string2) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/NysiisAlgorithm.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/NysiisAlgorithm.scala new file mode 100755 index 0000000..ff0b3d6 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/NysiisAlgorithm.scala @@ -0,0 +1,131 @@ +package com.rockymadden.stringmetric.phonetic + +import com.rockymadden.stringmetric.{StringAlgorithm, StringFilter} +import com.rockymadden.stringmetric.Alphabet.{Alpha, LowercaseVowel} +import scala.annotation.{switch, tailrec} + +/** An implementation of the NYSIIS algorithm. */ +class NysiisAlgorithm extends StringAlgorithm[DummyImplicit, String] { this: StringFilter => + final override def compute(charArray: Array[Char])(implicit di: DummyImplicit): Option[Array[Char]] = { + val fca = filter(charArray) + + if (fca.length == 0 || !(Alpha isSuperset fca.head)) None + else { + val tr = transcodeRight(fca.map(_.toLower)) + val tl = transcodeLeft(tr._1) + val t = + if (tl._2.length == 0) tl._1 ++ tr._2 + else tl._1 ++ transcodeCenter( + Array.empty[Char], + tl._2.head, + if (tl._2.length > 1) tl._2.tail else Array.empty[Char], + Array.empty[Char] + ) ++ tr._2 + + if (t.length == 1) Some(t) + else Some(t.head +: deduplicate(cleanTerminal(cleanLast(t.tail)))) + } + } + + final override def compute(string: String)(implicit di: DummyImplicit): Option[String] = + compute(string.toCharArray).map(_.mkString) + + private[this] def cleanLast(ca: Array[Char]) = + if (ca.length == 0) ca + else if(ca.last == 'a' || ca.last == 's') ca.dropRight(ca.reverseIterator.takeWhile(c => c == 'a' || c == 's').length) + else ca + + private[this] def cleanTerminal(ca: Array[Char]) = + if (ca.length >= 2 && ca.last == 'y' && ca(ca.length - 2) == 'a') ca.dropRight(2) :+ 'y' + else ca + + private[this] def deduplicate(ca: Array[Char]) = + if (ca.length <= 1) ca + else ca.sliding(2).withFilter(a => a(0) != a(1)).map(a => a(0)).toArray[Char] :+ ca.last + + @tailrec + private[this] def transcodeCenter(l: Array[Char], c: Char, r: Array[Char], o: Array[Char]): Array[Char] = { + if (c == '\0' && r.length == 0) o + else { + def shift(d: Int, ca: Array[Char]) = { + val sca = r.splitAt(d - 1) + + ( + if (sca._1.length > 0) (l :+ c) ++ sca._1 else l :+ c, + if (sca._2.length > 0) sca._2.head else '\0', + if (sca._2.length > 1) sca._2.tail else Array.empty[Char], + ca + ) + } + + val t = { + (c: @switch) match { + case 'a' | 'i' | 'o' | 'u' => shift(1, o :+ 'a') + case 'b' | 'c' | 'd' | 'f' | 'g' | 'j' | 'l' | 'n' | 'r' | 't' | 'v' | 'x' | 'y' => shift(1, o :+ c) + case 'e' => + if (r.length >= 1 && r.head == 'v') shift(2, o ++ Array('a', 'f')) + else shift(1, o :+ 'a') + case 'h' => + if (l.length >= 1 && (!(LowercaseVowel isSuperset l.last) || (r.length >= 1 && !(LowercaseVowel isSuperset r.head)))) shift(1, o) + else shift(1, o :+ c) + case 'k' => if (r.length >= 1 && r.head == 'n') shift(2, o :+ 'n') else shift(1, o :+ 'c') + case 'm' => shift(1, o :+ 'n') + case 'p' => if (r.length >= 1 && r.head == 'h') shift(2, o :+ 'f') else shift(1, o :+ c) + case 'q' => shift(1, o :+ 'g') + case 's' => + if (r.length >= 2 && r.head == 'c' && r(1) == 'h') shift(3, o :+ c) + else shift(1, o :+ c) + case 'w' => + if (l.length >= 1 && (LowercaseVowel isSuperset l.last)) shift(1, o) + else shift(1, o :+ c) + case 'z' => shift(1, o :+ 's') + case _ => shift(1, o) + } + } + + transcodeCenter(t._1, t._2, t._3, t._4) + } + } + + private[this] def transcodeLeft(ca: Array[Char]) = { + if (ca.length == 0) (Array.empty[Char], ca) + else { + lazy val tr2 = ca.takeRight(ca.length - 2) + lazy val tr3 = ca.takeRight(ca.length - 3) + + (ca.head: @switch) match { + case 'k' if (ca.length >= 2 && ca(1) == 'n') => (Array('n', 'n'), tr2) + case 'k' => (Array('c'), ca.tail) + case 'm' if (ca.length >= 3 && (ca(1) == 'a' && ca(2) == 'c')) => (Array('m', 'c'), tr3) + case 'p' if (ca.length >= 2 && (ca(1) == 'h' || ca(1) == 'f')) => (Array('f', 'f'), tr2) + case 's' if (ca.length >= 3 && (ca(1) == 'c' && ca(2) == 'h')) => (Array('s', 's'), tr3) + case _ => (Array(ca.head), ca.tail) + } + } + } + + private[this] def transcodeRight(ca: Array[Char]) = { + if (ca.length >= 2) { + val lc = ca(ca.length - 1) + val lcm1 = ca(ca.length - 2) + lazy val t2 = ca.take(ca.length - 2) + + (lc: @switch) match { + case 'd' if (lcm1 == 'n' || lcm1 == 'r') => (t2, Array('d')) + case 'e' if (lcm1 == 'e' || lcm1 == 'i') => (t2, Array('y')) + case 't' if (lcm1 == 'd' || lcm1 == 'n' || lcm1 == 'r') => (t2, Array('d')) + case _ => (ca, Array.empty[Char]) + } + } else (ca, Array.empty[Char]) + } +} + +object NysiisAlgorithm { + private lazy val self = apply() + + def apply(): NysiisAlgorithm = new NysiisAlgorithm with StringFilter + + def compute(charArray: Array[Char]) = self.compute(charArray) + + def compute(string: String) = self.compute(string) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/NysiisMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/NysiisMetric.scala new file mode 100755 index 0000000..6d1c22c --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/NysiisMetric.scala @@ -0,0 +1,40 @@ +package com.rockymadden.stringmetric.phonetic + +import com.rockymadden.stringmetric.{StringFilter, StringMetric} +import com.rockymadden.stringmetric.Alphabet.Alpha + +/** An implementation of the NYSIIS metric. */ +class NysiisMetric extends StringMetric[DummyImplicit, Boolean] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit di: DummyImplicit): Option[Boolean] = { + + val unequal = (c1: Char, c2: Char) => { + val lc1 = c1.toLower + val lc2 = c2.toLower + + (if (lc1 == 'k') 'c' else lc1) != (if (lc2 == 'k') 'c' else lc2) + } + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length == 0 || !(Alpha isSuperset fca1.head) || fca2.length == 0 || !(Alpha isSuperset fca2.head)) None + else if (unequal(fca1.head, fca2.head)) Some(false) + else NysiisAlgorithm.compute(fca1).filter(_.length > 0).flatMap(ny1 => + NysiisAlgorithm.compute(fca2).filter(_.length > 0).map(ny1.sameElements(_)) + ) + } + + final override def compare(string1: String, string2: String)(implicit di: DummyImplicit): Option[Boolean] = + compare(string1.toCharArray, string2.toCharArray) +} + +object NysiisMetric { + private lazy val self = apply() + + def apply(): NysiisMetric = new NysiisMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char]) = self.compare(charArray1, charArray2) + + def compare(string1: String, string2: String) = self.compare(string1, string2) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/RefinedNysiisAlgorithm.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/RefinedNysiisAlgorithm.scala new file mode 100755 index 0000000..334e9e3 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/RefinedNysiisAlgorithm.scala @@ -0,0 +1,135 @@ +package com.rockymadden.stringmetric.phonetic + +import com.rockymadden.stringmetric.{StringAlgorithm, StringFilter} +import com.rockymadden.stringmetric.Alphabet.{Alpha, LowercaseVowel} +import scala.annotation.{switch, tailrec} + +/** An implementation of the refined NYSIIS algorithm. */ +class RefinedNysiisAlgorithm extends StringAlgorithm[DummyImplicit, String] { this: StringFilter => + final override def compute(charArray: Array[Char])(implicit di: DummyImplicit): Option[Array[Char]] = { + val fca = filter(charArray) + + if (fca.length == 0 || !(Alpha isSuperset fca.head)) None + else { + val lfca = fca.map(_.toLower) + val tlh = transcodeLast(transcodeHead(lfca.head +: cleanLast(lfca.tail, Set('s', 'z')))) + val t = transcode(Array.empty[Char], tlh.head, tlh.tail, Array.empty[Char]) + + if (t.length == 1) Some(t) + else Some(deduplicate(t.head +: cleanTerminal(cleanLast(t.tail, Set('a'))))) + } + } + + final override def compute(string: String)(implicit di: DummyImplicit): Option[String] = + compute(string.toCharArray).map(_.mkString) + + private[this] def cleanLast(ca: Array[Char], s: Set[Char]) = + if (ca.length == 0) ca + else if(s.contains(ca.last)) ca.dropRight(ca.reverseIterator.takeWhile(c => s.contains(c)).length) + else ca + + private[this] def cleanTerminal(ca: Array[Char]) = + if (ca.length >= 2 && ca.last == 'y' && ca(ca.length - 2) == 'a') ca.dropRight(2) :+ 'y' + else ca + + private[this] def deduplicate(ca: Array[Char]) = + if (ca.length <= 1) ca + else ca.sliding(2).withFilter(a => a(0) != a(1)).map(a => a(0)).toArray[Char] :+ ca.last + + @tailrec + private[this] def transcode(l: Array[Char], c: Char, r: Array[Char], o: Array[Char]): Array[Char] = { + if (c == '\0' && r.length == 0) o + else { + def shift(d: Int, ca: Array[Char]) = { + val sca = r.splitAt(d - 1) + + ( + if (sca._1.length > 0) (l :+ c) ++ sca._1 else l :+ c, + if (sca._2.length > 0) sca._2.head else '\0', + if (sca._2.length > 1) sca._2.tail else Array.empty[Char], + ca + ) + } + + val t = { + (c: @switch) match { + case 'a' | 'i' | 'o' | 'u' => + if (l.length == 0) shift(1, o :+ c) + else shift(1, o :+ 'a') + case 'b' | 'c' | 'f' | 'j' | 'l' | 'n' | 'r' | 't' | 'v' | 'x' => shift(1, o :+ c) + case 'd' => + if (r.length >= 1 && r.head == 'g') shift(2, o :+ 'g') else shift(1, o :+ c) + case 'e' => + if (l.length == 0) shift(1, o :+ c) + else if (r.length >= 1 && r.head == 'v') shift(2, o ++ Array('a', 'f')) + else shift(1, o :+ 'a') + case 'g' => + if (r.length >= 2 && r.head == 'h' && r(1) == 't') shift(3, o ++ Array('g', 't')) + else shift(1, o :+ c) + case 'h' => + if (l.length == 0) shift(1, o :+ c) + else if (!(LowercaseVowel isSuperset l.last) || (r.length >= 1 && !(LowercaseVowel isSuperset r.head))) shift(1, o) + else shift(1, o :+ c) + case 'k' => if (r.length >= 1 && r.head == 'n') shift(2, o :+ 'n') else shift(1, o :+ 'c') + case 'm' => if (l.length == 0) shift(1, o :+ c) else shift(1, o :+ 'n') + case 'p' => if (r.length >= 1 && r.head == 'h') shift(2, o :+ 'f') else shift(1, o :+ c) + case 'q' => if (l.length == 0) shift(1, o :+ c) else shift(1, o :+ 'g') + case 's' => + if (r.length >= 2 && r.head == 'c' && r(1) == 'h') shift(3, o :+ c) + else if (r.length >= 1 && r.head == 'h') shift(2, o :+ c) + else shift(1, o :+ c) + case 'w' => + if (l.length >= 1 && (LowercaseVowel isSuperset l.last)) shift(1, o) + else if (r.length >= 1 && r.head == 'r') shift(2, o :+ 'r') + else shift(1, o :+ c) + case 'y' => + if (l.length >= 1 && r.length >= 2 && r.head == 'w') shift(2, o :+ 'a') + else if (r.length >= 1 && r.head == 'w') shift(2, o :+ c) + else if (l.length >= 1 && r.length >= 1) shift(1, o :+ 'a') + else shift(1, o :+ c) + case 'z' => if (l.length == 0) shift(1, o :+ c) else shift(1, o :+ 's') + case _ => shift(1, o) + } + } + + transcode(t._1, t._2, t._3, t._4) + } + } + + private[this] def transcodeHead(ca: Array[Char]) = { + if (ca.length == 0) ca + else + (ca.head: @switch) match { + case 'm' if (ca.length >= 3 && ca(1) == 'a' && ca(2) == 'c') => Array('m', 'c') ++ ca.takeRight(ca.length - 3) + case 'p' if (ca.length >= 2 && ca(1) == 'f') => 'f' +: ca.takeRight(ca.length - 2) + case _ => ca + } + } + + private[this] def transcodeLast(ca: Array[Char]) = { + if (ca.length >= 2) { + val lc = ca(ca.length - 1) + val lcm1 = ca(ca.length - 2) + lazy val t2 = ca.take(ca.length - 2) + + (lc: @switch) match { + case 'd' if (lcm1 == 'n' || lcm1 == 'r') => t2 :+ 'd' + case 'e' if (lcm1 == 'e' || lcm1 == 'i' || lcm1 =='y') => t2 :+ 'y' + case 't' if (lcm1 == 'd' || lcm1 == 'n' || lcm1 == 'r') => t2 :+ 'd' + case 'x' if (lcm1 == 'e') => t2 ++ Array('e', 'c') + case 'x' if (lcm1 == 'i') => t2 ++ Array('i', 'c') + case _ => ca + } + } else ca + } +} + +object RefinedNysiisAlgorithm { + private lazy val self = apply() + + def apply(): RefinedNysiisAlgorithm = new RefinedNysiisAlgorithm with StringFilter + + def compute(charArray: Array[Char]) = self.compute(charArray) + + def compute(string: String) = self.compute(string) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/RefinedNysiisMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/RefinedNysiisMetric.scala new file mode 100755 index 0000000..c96cc52 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/RefinedNysiisMetric.scala @@ -0,0 +1,40 @@ +package com.rockymadden.stringmetric.phonetic + +import com.rockymadden.stringmetric.{StringFilter, StringMetric} +import com.rockymadden.stringmetric.Alphabet.Alpha + +/** An implementation of the refined NYSIIS metric. */ +class RefinedNysiisMetric extends StringMetric[DummyImplicit, Boolean] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit di: DummyImplicit): Option[Boolean] = { + + val unequal = (c1: Char, c2: Char) => { + val lc1 = c1.toLower + val lc2 = c2.toLower + + (if (lc1 == 'k') 'c' else lc1) != (if (lc2 == 'k') 'c' else lc2) + } + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length == 0 || !(Alpha isSuperset fca1.head) || fca2.length == 0 || !(Alpha isSuperset fca2.head)) None + else if (unequal(fca1.head, fca2.head)) Some(false) + else RefinedNysiisAlgorithm.compute(fca1).filter(_.length > 0).flatMap(rny1 => + RefinedNysiisAlgorithm.compute(fca2).filter(_.length > 0).map(rny1.sameElements(_)) + ) + } + + final override def compare(string1: String, string2: String)(implicit di: DummyImplicit): Option[Boolean] = + compare(string1.toCharArray, string2.toCharArray) +} + +object RefinedNysiisMetric { + private lazy val self = apply() + + def apply(): RefinedNysiisMetric = new RefinedNysiisMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char]) = self.compare(charArray1, charArray2) + + def compare(string1: String, string2: String) = self.compare(string1, string2) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/RefinedSoundexAlgorithm.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/RefinedSoundexAlgorithm.scala new file mode 100755 index 0000000..f22bde1 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/RefinedSoundexAlgorithm.scala @@ -0,0 +1,75 @@ +package com.rockymadden.stringmetric.phonetic + +import com.rockymadden.stringmetric.{StringAlgorithm, StringFilter} +import com.rockymadden.stringmetric.Alphabet.Alpha +import scala.annotation.{switch, tailrec} + +/** An implementation of the refined Soundex algorithm. */ +class RefinedSoundexAlgorithm extends StringAlgorithm[DummyImplicit, String] { this: StringFilter => + final override def compute(charArray: Array[Char])(implicit di: DummyImplicit): Option[Array[Char]] = { + val fca = filter(charArray) + + if (fca.length == 0 || !(Alpha isSuperset fca.head)) None + else Some(transcode(fca, Array(fca.head.toLower))) + } + + final override def compute(string: String)(implicit di: DummyImplicit): Option[String] = + compute(string.toCharArray).map(_.mkString) + + @tailrec + private[this] def transcode(i: Array[Char], o: Array[Char]): Array[Char] = { + if (i.length == 0) o + else { + val c = i.head.toLower + val m2 = (mc: Char) => (mc: @switch) match { + case 'a' | 'e' | 'h' | 'i' | 'o' | 'u' | 'w' | 'y' => '0' + case 'b' | 'p' => '1' + case 'f' | 'v' => '2' + case 'c' | 'k' | 's' => '3' + case 'g' | 'j' => '4' + case 'q' | 'x' | 'z' => '5' + case 'd' | 't' => '6' + case 'l' => '7' + case 'm' | 'n' => '8' + case 'r' => '9' + case _ => '\0' + } + val m1 = (mc: Char, pc: Char) => (mc: @switch) match { + case 'a' | 'e' | 'h' | 'i' | 'o' | 'u' | 'w' | 'y' if pc != '0' => '0' + case 'b' | 'p' if pc != '1' => '1' + case 'f' | 'v' if pc != '2' => '2' + case 'c' | 'k' | 's' if pc != '3' => '3' + case 'g' | 'j' if pc != '4' => '4' + case 'q' | 'x' | 'z' if pc != '5' => '5' + case 'd' | 't' if pc != '6' => '6' + case 'l' if pc != '7' => '7' + case 'm' | 'n' if pc != '8' => '8' + case 'r' if pc != '9' => '9' + case _ => '\0' + } + val a = + // Code twice. + if (o.length == 1) m2(c) + // Code once. + else m1( + c, + (o.last: @switch) match { + case '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => o.last + case _ => m2(o.last) + } + ) + + transcode(i.tail, if (a != '\0') o :+ a else o) + } + } +} + +object RefinedSoundexAlgorithm { + private lazy val self = apply() + + def apply(): RefinedSoundexAlgorithm = new RefinedSoundexAlgorithm with StringFilter + + def compute(charArray: Array[Char]) = self.compute(charArray) + + def compute(string: String) = self.compute(string) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/RefinedSoundexMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/RefinedSoundexMetric.scala new file mode 100755 index 0000000..eb2f01e --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/RefinedSoundexMetric.scala @@ -0,0 +1,33 @@ +package com.rockymadden.stringmetric.phonetic + +import com.rockymadden.stringmetric.{StringFilter, StringMetric} +import com.rockymadden.stringmetric.Alphabet.Alpha + +/** An implementation of the refined Soundex metric. */ +class RefinedSoundexMetric extends StringMetric[DummyImplicit, Boolean] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit di: DummyImplicit): Option[Boolean] = { + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length == 0 || !(Alpha isSuperset fca1.head) || fca2.length == 0 || !(Alpha isSuperset fca2.head)) None + else if (fca1.head.toLower != fca2.head.toLower) Some(false) + else RefinedSoundexAlgorithm.compute(fca1).filter(_.length > 0).flatMap(rse1 => + RefinedSoundexAlgorithm.compute(fca2).filter(_.length > 0).map(rse1.sameElements(_)) + ) + } + + final override def compare(string1: String, string2: String)(implicit di: DummyImplicit): Option[Boolean] = + compare(string1.toCharArray, string2.toCharArray) +} + +object RefinedSoundexMetric { + private lazy val self = apply() + + def apply(): RefinedSoundexMetric = new RefinedSoundexMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char]) = self.compare(charArray1, charArray2) + + def compare(string1: String, string2: String) = self.compare(string1, string2) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/SoundexAlgorithm.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/SoundexAlgorithm.scala new file mode 100755 index 0000000..361047d --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/SoundexAlgorithm.scala @@ -0,0 +1,73 @@ +package com.rockymadden.stringmetric.phonetic + +import com.rockymadden.stringmetric.{StringAlgorithm, StringFilter} +import com.rockymadden.stringmetric.Alphabet.Alpha +import scala.annotation.{switch, tailrec} + +/** An implementation of the Soundex algorithm. */ +class SoundexAlgorithm extends StringAlgorithm[DummyImplicit, String] { this: StringFilter => + final override def compute(charArray: Array[Char])(implicit di: DummyImplicit): Option[Array[Char]] = { + val fca = filter(charArray) + + if (fca.length == 0 || !(Alpha isSuperset fca.head)) None + else { + val fc = fca.head.toLower + + Some(transcode(fca.tail, fc, Array(fc)).padTo(4, '0')) + } + } + + final override def compute(string: String)(implicit di: DummyImplicit): Option[String] = + compute(string.toCharArray).map(_.mkString) + + @tailrec + private[this] def transcode(i: Array[Char], pc: Char, o: Array[Char]): Array[Char] = { + if (i.length == 0) o + else { + val c = i.head.toLower + val m2 = (mc: Char) => (mc: @switch) match { + case 'b' | 'f' | 'p' | 'v' => '1' + case 'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' => '2' + case 'd' | 't' => '3' + case 'l' => '4' + case 'm' | 'n' => '5' + case 'r' => '6' + case _ => '\0' + } + val m1 = (mc: Char, pc: Char) => (mc: @switch) match { + case 'b' | 'f' | 'p' | 'v' if pc != '1' => '1' + case 'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' if pc != '2' => '2' + case 'd' | 't' if pc != '3' => '3' + case 'l' if pc != '4' => '4' + case 'm' | 'n' if pc != '5' => '5' + case 'r' if pc != '6' => '6' + case _ => '\0' + } + val a = pc match { + // Code twice. + case 'a' | 'e' | 'i' | 'o' | 'u' | 'y' => m2(c) + // Code once. + case _ => m1( + c, + (o.last: @switch) match { + case '1' | '2' | '3' | '4' | '5' | '6' => o.last + case _ => m2(o.last) + } + ) + } + + if (o.length == 3 && a != '\0') o :+ a + else transcode(i.tail, c, if (a != '\0') o :+ a else o) + } + } +} + +object SoundexAlgorithm { + private lazy val self = apply() + + def apply(): SoundexAlgorithm = new SoundexAlgorithm with StringFilter + + def compute(charArray: Array[Char]) = self.compute(charArray) + + def compute(string: String) = self.compute(string) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/SoundexMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/SoundexMetric.scala new file mode 100755 index 0000000..e4daa17 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/phonetic/SoundexMetric.scala @@ -0,0 +1,33 @@ +package com.rockymadden.stringmetric.phonetic + +import com.rockymadden.stringmetric.{StringFilter, StringMetric} +import com.rockymadden.stringmetric.Alphabet.Alpha + +/** An implementation of the Soundex metric. */ +class SoundexMetric extends StringMetric[DummyImplicit, Boolean] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit di: DummyImplicit): Option[Boolean] = { + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length == 0 || !(Alpha isSuperset fca1.head) || fca2.length == 0 || !(Alpha isSuperset fca2.head)) None + else if (fca1.head.toLower != fca2.head.toLower) Some(false) + else SoundexAlgorithm.compute(fca1).filter(_.length > 0).flatMap(se1 => + SoundexAlgorithm.compute(fca2).filter(_.length > 0).map(se1.sameElements(_)) + ) + } + + final override def compare(string1: String, string2: String)(implicit di: DummyImplicit): Option[Boolean] = + compare(string1.toCharArray, string2.toCharArray) +} + +object SoundexMetric { + private lazy val self = apply() + + def apply(): SoundexMetric = new SoundexMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char]) = self.compare(charArray1, charArray2) + + def compare(string1: String, string2: String) = self.compare(string1, string2) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala new file mode 100755 index 0000000..5e01bb1 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala @@ -0,0 +1,42 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{StringMetric, MatchTuple, StringFilter} +import com.rockymadden.stringmetric.tokenization.NGramTokenizer + +/** + * An implementation of the Dice/Sorensen metric. This implementation differs in that n-gram size is required. + * Traditionally, the algorithm uses bigrams. + */ +class DiceSorensenMetric extends StringMetric[Int, Double] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit n: Int): Option[Double] = { + if (n <= 0) throw new IllegalArgumentException("Expected valid n.") + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. + else if (fca1.sameElements(fca2)) Some(1d) + else NGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + NGramTokenizer.tokenize(fca2)(n).map { ca2bg => + val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString)) + + (2d * ms) / (ca1bg.length + ca2bg.length) + } + } + } + + final override def compare(string1: String, string2: String)(implicit n: Int): Option[Double] = + compare(string1.toCharArray, string2.toCharArray)(n: Int) + + private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length +} + +object DiceSorensenMetric { + private lazy val self = apply() + + def apply(): DiceSorensenMetric = new DiceSorensenMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = self.compare(charArray1, charArray2)(n) + + def compare(string1: String, string2: String)(n: Int) = self.compare(string1, string2)(n) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/HammingMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/HammingMetric.scala new file mode 100755 index 0000000..95ff203 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/HammingMetric.scala @@ -0,0 +1,37 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{CompareTuple, StringFilter, StringMetric} + +/** An implementation of the Hamming metric. */ +class HammingMetric extends StringMetric[DummyImplicit, Int] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit di: DummyImplicit): Option[Int] = { + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length == 0 || fca2.length == 0 || fca1.length != fca2.length) None + else if (fca1.sameElements(fca2)) Some(0) + else Some(hamming(fca1, fca2)) + } + + final override def compare(string1: String, string2: String)(implicit di: DummyImplicit): Option[Int] = + compare(string1.toCharArray, string2.toCharArray) + + private[this] def hamming(ct: CompareTuple[Char]) = { + require(ct._1.length == ct._2.length) + + if (ct._1.length == 0) 0 + else ct._1.zip(ct._2).count(t => t._1 != t._2) + } +} + +object HammingMetric { + private lazy val self = apply() + + def apply(): HammingMetric = new HammingMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char]) = self.compare(charArray1, charArray2) + + def compare(string1: String, string2: String) = self.compare(string1, string2) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala new file mode 100755 index 0000000..e32c926 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala @@ -0,0 +1,37 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{StringMetric, StringFilter} +import com.rockymadden.stringmetric.tokenization.NGramTokenizer + +/* An implementation of the Jaccard metric. */ +class JaccardMetric extends StringMetric[Int, Double] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit n: Int): Option[Double] = { + if (n <= 0) throw new IllegalArgumentException("Expected valid n.") + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. + else if (fca1.sameElements(fca2)) Some(1d) + else NGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + NGramTokenizer.tokenize(fca2)(n).map { ca2bg => + val i = (ca1bg.map(_.mkString) intersect ca2bg.map(_.mkString)).length + + i.toDouble / (ca1bg.length + ca2bg.length - i) + } + } + } + + final override def compare(string1: String, string2: String)(implicit n: Int): Option[Double] = + compare(string1.toCharArray, string2.toCharArray)(n: Int) +} + +object JaccardMetric { + private lazy val self = apply() + + def apply(): JaccardMetric = new JaccardMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = self.compare(charArray1, charArray2)(n) + + def compare(string1: String, string2: String)(n: Int) = self.compare(string1, string2)(n) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/JaroMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/JaroMetric.scala new file mode 100755 index 0000000..b7ce2c5 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/JaroMetric.scala @@ -0,0 +1,87 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{CompareTuple, MatchTuple, StringFilter, StringMetric} +import scala.collection.mutable.{ArrayBuffer, HashSet} + +/** + * An implementation of the Jaro metric. One differing detail in this implementation is that if a character is matched + * in string2, it cannot be matched upon again. This results in a more penalized distance in these scenarios. + */ +class JaroMetric extends StringMetric[DummyImplicit, Double] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit di: DummyImplicit): Option[Double] = { + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length == 0 || fca2.length == 0) None + else if (fca1.sameElements(fca2)) Some(1d) + else { + val mt = `match`(fca1, fca2) + val ms = scoreMatches(mt._1, mt._2) + + if (ms == 0) Some(0d) + else { + val ts = scoreTranspositions(mt._1, mt._2) + + Some(((ms.toDouble / fca1.length) + (ms.toDouble / fca2.length) + ((ms.toDouble - ts) / ms)) / 3) + } + } + } + + final override def compare(string1: String, string2: String)(implicit di: DummyImplicit): Option[Double] = + compare(string1.toCharArray, string2.toCharArray) + + private[this] def `match`(ct: CompareTuple[Char]): MatchTuple[Char] = { + lazy val window = math.abs((math.max(ct._1.length, ct._2.length) / 2d).floor.toInt - 1) + val one = ArrayBuffer.empty[Int] + val two = HashSet.empty[Int] + var i = 0 + var bi = false + + while (i < ct._1.length && !bi) { + val start = if (i - window <= 0) 0 else i - window + val end = if (i + window >= ct._2.length - 1) ct._2.length - 1 else i + window + + if (start > ct._2.length - 1) bi = !bi + else { + var ii = start + var bii = false + + while (ii <= end && !bii) { + if (!two.contains(ii) && ct._1(i) == ct._2(ii)) { + one += i + two += ii + bii = !bii + } else ii += 1 + } + + i += 1 + } + } + + (one.toArray.map(ct._1(_)), two.toArray.sortWith(_ < _).map(ct._2(_))) + } + + private[this] def scoreMatches(mt: MatchTuple[Char]) = { + require(mt._1.length == mt._2.length) + + mt._1.length + } + + private[this] def scoreTranspositions(mt: MatchTuple[Char]) = { + require(mt._1.length == mt._2.length) + + (mt._1.zip(mt._2).count(t => t._1 != t._2) / 2d).floor.toInt + } +} + +object JaroMetric { + private lazy val self = apply() + + def apply(): JaroMetric = new JaroMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char]) = self.compare(charArray1, charArray2) + + def compare(string1: String, string2: String) = self.compare(string1, string2) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/JaroWinklerMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/JaroWinklerMetric.scala new file mode 100755 index 0000000..4e9aebd --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/JaroWinklerMetric.scala @@ -0,0 +1,40 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{StringFilter, StringMetric} + +/** + * An implementation of the Jaro-Winkler metric. One differing detail in this implementation is that if a character is + * matched in string2, it cannot be matched upon again. This results in a more penalized distance in these scenarios + * (e.g. comparing henka and henkan distance is 0.9666 versus the typical 0.9722). + */ +class JaroWinklerMetric extends StringMetric[DummyImplicit, Double] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit di: DummyImplicit): Option[Double] = { + + val fca1 = filter(charArray1) + val fca2 = filter(charArray2) + + JaroMetric.compare(fca1, fca2).map { + case 0d => 0d + case 1d => 1d + case jaro => { + val prefix = fca1.zip(fca2).takeWhile(t => t._1 == t._2) + + jaro + ((if (prefix.length <= 4) prefix.length else 4) * 0.1d * (1 - jaro)) + } + } + } + + final override def compare(string1: String, string2: String)(implicit di: DummyImplicit): Option[Double] = + compare(string1.toCharArray, string2.toCharArray) +} + +object JaroWinklerMetric { + private lazy val self = apply() + + def apply(): JaroWinklerMetric = new JaroWinklerMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char]) = self.compare(charArray1, charArray2) + + def compare(string1: String, string2: String) = self.compare(string1, string2) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/LevenshteinMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/LevenshteinMetric.scala new file mode 100755 index 0000000..47dff23 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/LevenshteinMetric.scala @@ -0,0 +1,58 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{CompareTuple, StringFilter, StringMetric} + +/** An implementation of the Levenshtein metric. */ +class LevenshteinMetric extends StringMetric[DummyImplicit, Int] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit di: DummyImplicit): Option[Int] = { + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length == 0 || fca2.length == 0) None + else if (fca1.sameElements(fca2)) Some(0) + else Some(levenshtein(fca1, fca2)) + } + + final override def compare(string1: String, string2: String)(implicit di: DummyImplicit): Option[Int] = + compare(string1.toCharArray, string2.toCharArray) + + private[this] def levenshtein(ct: CompareTuple[Char]) = { + val m = Array.fill[Int](ct._1.length + 1, ct._2.length + 1)(-1) + + def distance(t: (Int, Int)): Int = { + t match { + case (r, 0) => r + case (0, c) => c + case (r, c) if m(r)(c) != -1 => m(r)(c) + case (r, c) => { + val min = + if (ct._1(r - 1) == ct._2(c - 1)) distance(r - 1, c - 1) + else math.min( + math.min( + distance(r - 1, c) + 1, // Delete (left). + distance(r, c - 1) + 1 // Insert (up). + ), + distance(r - 1, c - 1) + 1 // Substitute (left-up). + ) + + m(r)(c) = min + min + } + } + } + + distance(ct._1.length, ct._2.length) + } +} + +object LevenshteinMetric { + private lazy val self = apply() + + def apply(): LevenshteinMetric = new LevenshteinMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char]) = self.compare(charArray1, charArray2) + + def compare(string1: String, string2: String) = self.compare(string1, string2) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala new file mode 100755 index 0000000..e74e8eb --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala @@ -0,0 +1,40 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{StringMetric, MatchTuple, StringFilter} +import com.rockymadden.stringmetric.tokenization.NGramTokenizer +import scala.math + +/** An implementation of the N-Gram metric. */ +class NGramMetric extends StringMetric[Int, Double] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit n: Int): Option[Double] = { + if (n <= 0) throw new IllegalArgumentException("Expected valid n.") + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. + else if (fca1.sameElements(fca2)) Some(1d) + else NGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + NGramTokenizer.tokenize(fca2)(n).map { ca2bg => + val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString))) + + ms.toDouble / math.max(ca1bg.length, ca2bg.length) + } + } + } + + final override def compare(string1: String, string2: String)(implicit n: Int): Option[Double] = + compare(string1.toCharArray, string2.toCharArray)(n) + + private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length +} + +object NGramMetric { + private lazy val self = apply() + + def apply(): NGramMetric = new NGramMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = self.compare(charArray1, charArray2)(n) + + def compare(string1: String, string2: String)(n: Int) = self.compare(string1, string2)(n) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala new file mode 100755 index 0000000..a543a7e --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala @@ -0,0 +1,40 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{StringMetric, MatchTuple, StringFilter} +import com.rockymadden.stringmetric.tokenization.NGramTokenizer +import scala.math + +/* An implementation of the overlap metric. */ +class OverlapMetric extends StringMetric[Int, Double] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char])(implicit n: Int): Option[Double] = { + if (n <= 0) throw new IllegalArgumentException("Expected valid n.") + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. + else if (fca1.sameElements(fca2)) Some(1d) + else NGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + NGramTokenizer.tokenize(fca2)(n).map { ca2bg => + val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString)) + + ms.toDouble / (math.min(ca1bg.length, ca2bg.length)) + } + } + } + + final override def compare(string1: String, string2: String)(implicit n: Int): Option[Double] = + compare(string1.toCharArray, string2.toCharArray)(n: Int) + + private[this] def scoreMatches(mt: MatchTuple[String]) = mt._1.intersect(mt._2).length +} + +object OverlapMetric { + private lazy val self = apply() + + def apply(): OverlapMetric = new OverlapMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char])(n: Int) = self.compare(charArray1, charArray2)(n) + + def compare(string1: String, string2: String)(n: Int) = self.compare(string1, string2)(n) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetric.scala new file mode 100755 index 0000000..1017b1f --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/RatcliffObershelpMetric.scala @@ -0,0 +1,57 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{CompareTuple, StringFilter, StringMetric} + +/** An implementation of the Ratcliff/Obershelp metric. */ +class RatcliffObershelpMetric extends StringMetric[DummyImplicit, Double] { this: StringFilter => + final override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit di: DummyImplicit): Option[Double] = { + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length == 0 || fca2.length == 0) None + else if (fca1.sameElements(fca2)) Some(1d) + else Some(2d * commonSequences(fca1, fca2).foldLeft(0)(_ + _.length) / (fca1.length + fca2.length)) + } + + final override def compare(string1: String, string2: String)(implicit di: DummyImplicit): Option[Double] = + compare(string1.toCharArray, string2.toCharArray) + + private[this] def longestCommonSubsequence(ct: CompareTuple[Char]) = { + val m = Array.ofDim[Int](ct._1.length + 1, ct._2.length + 1) + var lrc = (0, 0, 0) // Length, row, column. + + for (r <- 0 to ct._1.length - 1; c <- 0 to ct._2.length - 1) { + if (ct._1(r) == ct._2(c)) { + val l = m(r)(c) + 1 + m(r + 1)(c + 1) = l + if (l > lrc._1) lrc = (l, r + 1, c + 1) + } + } + + lrc + } + + private[this] def commonSequences(ct: CompareTuple[Char]): Array[Array[Char]] = { + val lcs = longestCommonSubsequence(ct) + + if (lcs._1 == 0) Array.empty + else { + val sct1 = (ct._1.take(lcs._2 - lcs._1), ct._1.takeRight(ct._1.length - lcs._2)) + val sct2 = (ct._2.take(lcs._3 - lcs._1), ct._2.takeRight(ct._2.length - lcs._3)) + + Array(ct._1.slice(lcs._2 - lcs._1, lcs._2)) ++ commonSequences(sct1._1, sct2._1) ++ commonSequences(sct1._2, sct2._2) + } + } +} + +object RatcliffObershelpMetric { + private lazy val self = apply() + + def apply(): RatcliffObershelpMetric = new RatcliffObershelpMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char]) = self.compare(charArray1, charArray2) + + def compare(string1: String, string2: String) = self.compare(string1, string2) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/WeightedLevenshteinMetric.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/WeightedLevenshteinMetric.scala new file mode 100755 index 0000000..976b01a --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/similarity/WeightedLevenshteinMetric.scala @@ -0,0 +1,61 @@ +package com.rockymadden.stringmetric.similarity + +import com.rockymadden.stringmetric.{CompareTuple, StringMetric, StringFilter} +import scala.math.BigDecimal + +/** An implementation of a weighted Levenshtein metric. */ +class WeightedLevenshteinMetric + extends StringMetric[(BigDecimal, BigDecimal, BigDecimal), Double] { this: StringFilter => + + /** Options order is delete, insert, then substitute weight. */ + final override def compare(charArray1: Array[Char], charArray2: Array[Char]) + (implicit options: (BigDecimal, BigDecimal, BigDecimal)): Option[Double] = { + + if (options._1 < 0 || options._2 < 0 || options._3 < 0) + throw new IllegalArgumentException("Expected valid weight options.") + + val fca1 = filter(charArray1) + lazy val fca2 = filter(charArray2) + + if (fca1.length == 0 || fca2.length == 0) None + else if (fca1.sameElements(fca2)) Some(0d) + else Some(weightedLevenshtein((fca1, fca2), options).toDouble) + } + + /** Options order is delete, insert, then substitute weight. */ + final override def compare(string1: String, string2: String) + (implicit options: (BigDecimal, BigDecimal, BigDecimal)): Option[Double] = + + compare(string1.toCharArray, string2.toCharArray)(options) + + private[this] def weightedLevenshtein(ct: CompareTuple[Char], w: (BigDecimal, BigDecimal, BigDecimal)) = { + val m = Array.ofDim[BigDecimal](ct._1.length + 1, ct._2.length + 1) + + for (r <- 0 to ct._1.length) m(r)(0) = w._1 * r + for (c <- 0 to ct._2.length) m(0)(c) = w._2 * c + + for (r <- 1 to ct._1.length; c <- 1 to ct._2.length) { + m(r)(c) = + if (ct._1(r - 1) == ct._2(c - 1)) m(r - 1)(c - 1) + else (m(r - 1)(c) + w._1).min( // Delete (left). + (m(r)(c - 1) + w._2).min( // Insert (up). + m(r - 1)(c - 1) + w._3 // Substitute (left-up). + ) + ) + } + + m(ct._1.length)(ct._2.length) + } +} + +object WeightedLevenshteinMetric { + private lazy val self = apply() + + def apply(): WeightedLevenshteinMetric = new WeightedLevenshteinMetric with StringFilter + + def compare(charArray1: Array[Char], charArray2: Array[Char])(options: (BigDecimal, BigDecimal, BigDecimal)) = + self.compare(charArray1, charArray2)(options) + + def compare(string1: String, string2: String)(options: (BigDecimal, BigDecimal, BigDecimal)) = + self.compare(string1, string2)(options) +} diff --git a/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala new file mode 100755 index 0000000..d66fd62 --- /dev/null +++ b/stringmetric-core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala @@ -0,0 +1,37 @@ +package com.rockymadden.stringmetric.tokenization + +import com.rockymadden.stringmetric.{StringFilter, StringTokenizer} +import scala.annotation.tailrec + +/** An implementation of the N-Gram tokenizer. */ +class NGramTokenizer extends StringTokenizer[Int, Array[String]] { this: StringFilter => + final override def tokenize(charArray: Array[Char])(implicit n: Int): Option[Array[Array[Char]]] = { + if (n <= 0) throw new IllegalArgumentException("Expected valid n.") + + val fca = filter(charArray) + + if (fca.length < n) None + else Some(sequence(fca, Array.empty[Array[Char]], n)) + } + + final override def tokenize(string: String)(implicit n: Int): Option[Array[String]] = + tokenize(string.toCharArray)(n).map(_.map(_.mkString)) + + @tailrec + private[this] def sequence(i: Array[Char], o: Array[Array[Char]], n: Int): Array[Array[Char]] = { + require(n > 0) + + if (i.length <= n) o :+ i + else sequence(i.tail, o :+ i.take(n), n) + } +} + +object NGramTokenizer { + private lazy val self = apply() + + def apply(): NGramTokenizer = new NGramTokenizer with StringFilter + + def tokenize(charArray: Array[Char])(n: Int) = self.tokenize(charArray)(n) + + def tokenize(string: String)(n: Int) = self.tokenize(string)(n) +} |