diff options
author | Rocky Madden <git@rockymadden.com> | 2013-03-18 19:47:49 -0600 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2013-03-18 19:47:49 -0600 |
commit | 05af73ab936168ebea891ed2f0120c49e8ec5066 (patch) | |
tree | 54a724d2d674a8458e06adfc042c68d4819b7e11 /core | |
parent | b558d56b50ff1a9fe8023b927fabf1e3d46a8119 (diff) | |
download | stringmetric-05af73ab936168ebea891ed2f0120c49e8ec5066.tar.gz stringmetric-05af73ab936168ebea891ed2f0120c49e8ec5066.tar.bz2 stringmetric-05af73ab936168ebea891ed2f0120c49e8ec5066.zip |
Broke out n-gram tokenizer algorithm into tokenization package.
Diffstat (limited to 'core')
13 files changed, 119 insertions, 103 deletions
diff --git a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmBenchmark.scala b/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala index ae34220..d5ff1f3 100755 --- a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmBenchmark.scala +++ b/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala @@ -1,11 +1,11 @@ -package com.rockymadden.stringmetric.similarity +package com.rockymadden.stringmetric.tokenization import com.google.caliper.Param import com.rockymadden.stringmetric.{ CaliperBenchmark, CaliperRunner } import scala.util.Random -final class NGramAlgorithmBenchmark extends CaliperBenchmark { - import NGramAlgorithmBenchmark.Algorithm +final class NGramTokenizerBenchmark extends CaliperBenchmark { + import NGramTokenizerBenchmark.Tokenizer @Param(Array("0", "1", "2", "4", "8", "16")) var length: Int = _ @@ -22,14 +22,14 @@ final class NGramAlgorithmBenchmark extends CaliperBenchmark { } def timeComputeWithCharArray(reps: Int) = run(reps) { - Algorithm.compute(charArray)(n) + Tokenizer.tokenize(charArray)(n) } def timeComputeWithString(reps: Int) = run(reps) { - Algorithm.compute(string)(n) + Tokenizer.tokenize(string)(n) } } -object NGramAlgorithmBenchmark extends CaliperRunner(classOf[NGramAlgorithmBenchmark]) { - private final val Algorithm = NGramAlgorithm() +object NGramTokenizerBenchmark extends CaliperRunner(classOf[NGramTokenizerBenchmark]) { + private final val Tokenizer = NGramTokenizer() } diff --git a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala index d447538..38d8714 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala @@ -4,11 +4,4 @@ trait ConfigurableStringAlgorithm[R, O] extends ConfigurableAlgorithm[String, R, def compute(charArray: Array[Char])(implicit o: O): Option[Array[_]] } -object ConfigurableStringAlgorithm { - type NGram = com.rockymadden.stringmetric.similarity.NGramAlgorithm - val NGram = com.rockymadden.stringmetric.similarity.NGramAlgorithm - - def computeWithNGram(charArray: Array[Char])(n: Int) = NGram.compute(charArray)(n) - - def computeWithNGram(string: String)(n: Int) = NGram.compute(string)(n) -} +object ConfigurableStringAlgorithm diff --git a/core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala b/core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala new file mode 100755 index 0000000..5494274 --- /dev/null +++ b/core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala @@ -0,0 +1,14 @@ +package com.rockymadden.stringmetric + +trait StringTokenizer[O, R] extends Tokenizer[String, O, R] { + def tokenize(charArray: Array[Char])(implicit o: O): Option[Array[Array[Char]]] +} + +object StringTokenizer { + type NGram = com.rockymadden.stringmetric.tokenization.NGramTokenizer + val NGram = com.rockymadden.stringmetric.tokenization.NGramTokenizer + + def tokenizeWithNGram(charArray: Array[Char])(n: Int) = NGram.tokenize(charArray)(n) + + def tokenizeWithNGram(string: String)(n: Int) = NGram.tokenize(string)(n) +} diff --git a/core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala b/core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala new file mode 100755 index 0000000..6744a68 --- /dev/null +++ b/core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala @@ -0,0 +1,5 @@ +package com.rockymadden.stringmetric + +trait Tokenizer[T, O, R] { + def tokenize(t: T)(implicit o: O): Option[R] +} diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala index 4dd35a7..9b3b975 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala @@ -1,6 +1,7 @@ package com.rockymadden.stringmetric.similarity import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter } +import com.rockymadden.stringmetric.tokenization.NGramTokenizer /** * An implementation of the Dice/Sorensen metric. This implementation differs in that n-gram size is required. @@ -16,10 +17,10 @@ class DiceSorensenMetric extends ConfigurableStringMetric[Double, Int] { this: S if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. else if (fca1.sameElements(fca2)) Some(1d) else { - val nGramAlgorithm = NGramAlgorithm() + val nGramTokenizer = NGramTokenizer() - nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg => - nGramAlgorithm.compute(fca2)(n).map { ca2bg => + nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + nGramTokenizer.tokenize(fca2)(n).map { ca2bg => val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString)) (2d * ms) / (ca1bg.length + ca2bg.length) diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala index 5971701..f36333f 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala @@ -1,6 +1,7 @@ package com.rockymadden.stringmetric.similarity import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter } +import com.rockymadden.stringmetric.tokenization.NGramTokenizer /* An implementation of the Jaccard metric. */ class JaccardMetric extends ConfigurableStringMetric[Double, Int] { this: StringFilter => @@ -13,10 +14,10 @@ class JaccardMetric extends ConfigurableStringMetric[Double, Int] { this: String if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. else if (fca1.sameElements(fca2)) Some(1d) else { - val nGramAlgorithm = NGramAlgorithm() + val nGramTokenizer = NGramTokenizer() - nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg => - nGramAlgorithm.compute(fca2)(n).map { ca2bg => + nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + nGramTokenizer.tokenize(fca2)(n).map { ca2bg => val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString)) ms.toDouble / (ca1bg.length + ca2bg.length) diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala deleted file mode 100755 index cc67e04..0000000 --- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala +++ /dev/null @@ -1,37 +0,0 @@ -package com.rockymadden.stringmetric.similarity - -import com.rockymadden.stringmetric.{ ConfigurableStringAlgorithm, StringFilter } -import scala.annotation.tailrec - -/** An implementation of the N-Gram algorithm. */ -class NGramAlgorithm extends ConfigurableStringAlgorithm[Array[String], Int] { this: StringFilter => - final override def compute(charArray: Array[Char])(implicit n: Int): Option[Array[Array[Char]]] = { - if (n <= 0) throw new IllegalArgumentException("Expected valid n.") - - val fca = filter(charArray) - - if (fca.length < n) None - else Some(sequence(fca, Array.empty[Array[Char]], n)) - } - - final override def compute(string: String)(implicit n: Int): Option[Array[String]] = - compute(string.toCharArray)(n).map(_.map(_.mkString)) - - @tailrec - private[this] def sequence(i: Array[Char], o: Array[Array[Char]], n: Int): Array[Array[Char]] = { - require(n > 0) - - if (i.length <= n) o :+ i - else sequence(i.tail, o :+ i.take(n), n) - } -} - -object NGramAlgorithm { - private lazy val self = apply() - - def apply(): NGramAlgorithm = new NGramAlgorithm with StringFilter - - def compute(charArray: Array[Char])(n: Int) = self.compute(charArray)(n) - - def compute(string: String)(n: Int) = self.compute(string)(n) -} diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala index 935e576..91cfa68 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala @@ -1,6 +1,7 @@ package com.rockymadden.stringmetric.similarity import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter } +import com.rockymadden.stringmetric.tokenization.NGramTokenizer import scala.math /** An implementation of the N-Gram metric. */ @@ -14,10 +15,10 @@ class NGramMetric extends ConfigurableStringMetric[Double, Int] { this: StringFi if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. else if (fca1.sameElements(fca2)) Some(1d) else { - val nGramAlgorithm = NGramAlgorithm() + val nGramTokenizer = NGramTokenizer() - nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg => - nGramAlgorithm.compute(fca2)(n).map { ca2bg => + nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + nGramTokenizer.tokenize(fca2)(n).map { ca2bg => val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString))) ms.toDouble / math.max(ca1bg.length, ca2bg.length) diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala index 6927138..9f81109 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala @@ -1,6 +1,7 @@ package com.rockymadden.stringmetric.similarity import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter } +import com.rockymadden.stringmetric.tokenization.NGramTokenizer import scala.math /* An implementation of the overlap metric. */ @@ -14,10 +15,10 @@ class OverlapMetric extends ConfigurableStringMetric[Double, Int] { this: String if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. else if (fca1.sameElements(fca2)) Some(1d) else { - val nGramAlgorithm = NGramAlgorithm() + val nGramTokenizer = NGramTokenizer() - nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg => - nGramAlgorithm.compute(fca2)(n).map { ca2bg => + nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + nGramTokenizer.tokenize(fca2)(n).map { ca2bg => val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString)) ms.toDouble / (math.min(ca1bg.length, ca2bg.length)) diff --git a/core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala b/core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala new file mode 100755 index 0000000..8e27420 --- /dev/null +++ b/core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala @@ -0,0 +1,37 @@ +package com.rockymadden.stringmetric.tokenization + +import com.rockymadden.stringmetric.{ StringFilter, StringTokenizer } +import scala.annotation.tailrec + +/** An implementation of the N-Gram tokenizer. */ +class NGramTokenizer extends StringTokenizer[Int, Array[String]] { this: StringFilter => + final override def tokenize(charArray: Array[Char])(implicit n: Int): Option[Array[Array[Char]]] = { + if (n <= 0) throw new IllegalArgumentException("Expected valid n.") + + val fca = filter(charArray) + + if (fca.length < n) None + else Some(sequence(fca, Array.empty[Array[Char]], n)) + } + + final override def tokenize(string: String)(implicit n: Int): Option[Array[String]] = + tokenize(string.toCharArray)(n).map(_.map(_.mkString)) + + @tailrec + private[this] def sequence(i: Array[Char], o: Array[Array[Char]], n: Int): Array[Array[Char]] = { + require(n > 0) + + if (i.length <= n) o :+ i + else sequence(i.tail, o :+ i.take(n), n) + } +} + +object NGramTokenizer { + private lazy val self = apply() + + def apply(): NGramTokenizer = new NGramTokenizer with StringFilter + + def tokenize(charArray: Array[Char])(n: Int) = self.tokenize(charArray)(n) + + def tokenize(string: String)(n: Int) = self.tokenize(string)(n) +} diff --git a/core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala deleted file mode 100755 index eb09a25..0000000 --- a/core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala +++ /dev/null @@ -1,23 +0,0 @@ -package com.rockymadden.stringmetric - -import com.rockymadden.stringmetric.similarity._ -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner - -@RunWith(classOf[JUnitRunner]) -final class ConfigurableStringAlgorithmSpec extends ScalaTest { - "StringAlgorithm standalone object" should provide { - "compute method, type, and companion object pass-throughs" in { - val nGram: ConfigurableStringAlgorithm.NGram = ConfigurableStringAlgorithm.NGram() - - nGram.compute("testone")(1).get should - equal (ConfigurableStringAlgorithm.computeWithNGram("testone")(1).get) - nGram.compute("testone".toCharArray)(1).get should - equal (ConfigurableStringAlgorithm.computeWithNGram("testone".toCharArray)(1).get) - nGram.compute("testone".toCharArray)(1).get should - equal (NGramAlgorithm.compute("testone".toCharArray)(1).get) - } - } -} - - diff --git a/core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala new file mode 100755 index 0000000..8837c25 --- /dev/null +++ b/core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala @@ -0,0 +1,23 @@ +package com.rockymadden.stringmetric + +import com.rockymadden.stringmetric.tokenization.NGramTokenizer +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class StringTokenizerSpec extends ScalaTest { + "StringTokenizer standalone object" should provide { + "tokenize method, type, and companion object pass-throughs" in { + val nGram: StringTokenizer.NGram = StringTokenizer.NGram() + + nGram.tokenize("testone")(1).get should + equal (StringTokenizer.tokenizeWithNGram("testone")(1).get) + nGram.tokenize("testone".toCharArray)(1).get should + equal (StringTokenizer.tokenizeWithNGram("testone".toCharArray)(1).get) + nGram.tokenize("testone".toCharArray)(1).get should + equal (NGramTokenizer.tokenize("testone".toCharArray)(1).get) + } + } +} + + diff --git a/core/source/test/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala index 82a0c69..56fdc13 100755 --- a/core/source/test/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmSpec.scala +++ b/core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala @@ -1,46 +1,46 @@ -package com.rockymadden.stringmetric.similarity +package com.rockymadden.stringmetric.tokenization import com.rockymadden.stringmetric.ScalaTest import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -final class NGramAlgorithmSpec extends ScalaTest { - import NGramAlgorithmSpec.Algorithm +final class NGramTokenizerSpec extends ScalaTest { + import NGramTokenizerSpec.Tokenizer - "NGramAlgorithm" should provide { - "compute method" when passed { + "NGramTokenizer" should provide { + "tokenize method" when passed { "empty argument" should returns { "None" in { - Algorithm.compute("")(1).isDefined should be (false) + Tokenizer.tokenize("")(1).isDefined should be (false) } } "invalid n argument" should throws { "IllegalArgumentException" in { evaluating { - Algorithm.compute("")(0).isDefined should be (false) + Tokenizer.tokenize("")(0).isDefined should be (false) } should produce [IllegalArgumentException] evaluating { - Algorithm.compute("")(-1).isDefined should be (false) + Tokenizer.tokenize("")(-1).isDefined should be (false) } should produce [IllegalArgumentException] } } "valid argument" should returns { "Array[String]" in { - Algorithm.compute("abcdefghijklmnopqrstuvwxyz")(1).get should equal ( + Tokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(1).get should equal ( Array( "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" ) ) - Algorithm.compute("abcdefghijklmnopqrstuvwxyz")(2).get should equal ( + Tokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(2).get should equal ( Array( "ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl", "lm", "mn", "no", "op", "pq", "qr", "rs", "st", "tu", "uv", "vw", "wx", "xy", "yz" ) ) - Algorithm.compute("abcdefghijklmnopqrstuvwxyz")(3).get should equal ( + Tokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(3).get should equal ( Array( "abc", "bcd", "cde", "def", "efg", "fgh", "ghi", "hij", "ijk", "jkl", "klm", "lmn", "mno", "nop", "opq", "pqr", "qrs", "rst", "stu", "tuv", "uvw", "vwx", "wxy", "xyz" @@ -50,10 +50,10 @@ final class NGramAlgorithmSpec extends ScalaTest { } } } - "NGramAlgorithm companion object" should provide { - "pass-through compute method" should returns { + "NGramTokenizer companion object" should provide { + "pass-through tokenize method" should returns { "same value as class" in { - NGramAlgorithm.compute("abcdefghijklmnopqrstuvwxyz")(1).get should equal ( + NGramTokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(1).get should equal ( Array( "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" @@ -64,6 +64,6 @@ final class NGramAlgorithmSpec extends ScalaTest { } } -object NGramAlgorithmSpec { - private final val Algorithm = NGramAlgorithm() +object NGramTokenizerSpec { + private final val Tokenizer = NGramTokenizer() } |