diff options
author | Rocky Madden <git@rockymadden.com> | 2013-03-18 19:47:49 -0600 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2013-03-18 19:47:49 -0600 |
commit | 05af73ab936168ebea891ed2f0120c49e8ec5066 (patch) | |
tree | 54a724d2d674a8458e06adfc042c68d4819b7e11 | |
parent | b558d56b50ff1a9fe8023b927fabf1e3d46a8119 (diff) | |
download | stringmetric-05af73ab936168ebea891ed2f0120c49e8ec5066.tar.gz stringmetric-05af73ab936168ebea891ed2f0120c49e8ec5066.tar.bz2 stringmetric-05af73ab936168ebea891ed2f0120c49e8ec5066.zip |
Broke out n-gram tokenizer algorithm into tokenization package.
16 files changed, 134 insertions, 134 deletions
diff --git a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/nGramAlgorithm.scala b/cli/source/core/scala/com/rockymadden/stringmetric/cli/tokenization/nGramTokenizer.scala index 707f7fd..566ddb6 100755 --- a/cli/source/core/scala/com/rockymadden/stringmetric/cli/similarity/nGramAlgorithm.scala +++ b/cli/source/core/scala/com/rockymadden/stringmetric/cli/tokenization/nGramTokenizer.scala @@ -1,13 +1,13 @@ -package com.rockymadden.stringmetric.cli.similarity +package com.rockymadden.stringmetric.cli.tokenization import com.rockymadden.stringmetric.cli._ -import com.rockymadden.stringmetric.similarity.NGramAlgorithm +import com.rockymadden.stringmetric.tokenization.NGramTokenizer /** - * The nGramAlgorithm [[com.rockymadden.stringmetric.cli.Command]]. Returns the N-Gram representation of the passed + * The nGramTokenizer [[com.rockymadden.stringmetric.cli.Command]]. Returns the N-Gram representation of the passed * string. */ -object nGramAlgorithm extends Command { +object nGramTokenizer extends Command { override def main(args: Array[String]): Unit = { val opts: OptionMap = args @@ -31,7 +31,7 @@ object nGramAlgorithm extends Command { println( "Returns the N-Gram representation of the passed string." + ls + ls + "Syntax:" + ls + - tab + "nGramAlgorithm [Options] string..." + ls + ls + + tab + "nGramTokenizer [Options] string..." + ls + ls + "Options:" + ls + tab + "-h, --help" + ls + tab + tab + "Outputs description, syntax, and opts." + @@ -41,8 +41,8 @@ object nGramAlgorithm extends Command { } override def execute(opts: OptionMap): Unit = - NGramAlgorithm.compute(opts('dashless))(opts('n)) match { - // Implicits a pain here. + NGramTokenizer.tokenize(opts('dashless))(opts('n)) match { + // Implicits are a pain here. case Some(c) => { val sb = new StringBuilder diff --git a/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/nGramAlgorithmSpec.scala b/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenization/nGramTokenizerSpec.scala index 55ac2da..144acb4 100755 --- a/cli/source/test/scala/com/rockymadden/stringmetric/cli/similarity/nGramAlgorithmSpec.scala +++ b/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenization/nGramTokenizerSpec.scala @@ -1,19 +1,19 @@ -package com.rockymadden.stringmetric.cli.similarity +package com.rockymadden.stringmetric.cli.tokenization import com.rockymadden.stringmetric.ScalaTest import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -final class nGramAlgorithmSpec extends ScalaTest { - "nGramAlgorithm" should provide { +final class nGramTokenizerSpec extends ScalaTest { + "nGramTokenizer" should provide { "main method" when passed { "valid dashless argument and valid n argument" should executes { "print N-Gram representation" in { val out = new java.io.ByteArrayOutputStream() Console.withOut(out)( - nGramAlgorithm.main( + nGramTokenizer.main( Array( "--unitTest", "--debug", @@ -27,7 +27,7 @@ final class nGramAlgorithmSpec extends ScalaTest { out.reset() Console.withOut(out)( - nGramAlgorithm.main( + nGramTokenizer.main( Array( "--unitTest", "--debug", @@ -44,7 +44,7 @@ final class nGramAlgorithmSpec extends ScalaTest { "valid dashless argument and invalid n argument" should throws { "IllegalArgumentException" in { evaluating { - nGramAlgorithm.main( + nGramTokenizer.main( Array( "--unitTest", "abc", @@ -57,7 +57,7 @@ final class nGramAlgorithmSpec extends ScalaTest { "no dashless argument" should throws { "IllegalArgumentException" in { evaluating { - nGramAlgorithm.main(Array("--unitTest", "--debug")) + nGramTokenizer.main(Array("--unitTest", "--debug")) } should produce [IllegalArgumentException] } } diff --git a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmBenchmark.scala b/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala index ae34220..d5ff1f3 100755 --- a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmBenchmark.scala +++ b/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala @@ -1,11 +1,11 @@ -package com.rockymadden.stringmetric.similarity +package com.rockymadden.stringmetric.tokenization import com.google.caliper.Param import com.rockymadden.stringmetric.{ CaliperBenchmark, CaliperRunner } import scala.util.Random -final class NGramAlgorithmBenchmark extends CaliperBenchmark { - import NGramAlgorithmBenchmark.Algorithm +final class NGramTokenizerBenchmark extends CaliperBenchmark { + import NGramTokenizerBenchmark.Tokenizer @Param(Array("0", "1", "2", "4", "8", "16")) var length: Int = _ @@ -22,14 +22,14 @@ final class NGramAlgorithmBenchmark extends CaliperBenchmark { } def timeComputeWithCharArray(reps: Int) = run(reps) { - Algorithm.compute(charArray)(n) + Tokenizer.tokenize(charArray)(n) } def timeComputeWithString(reps: Int) = run(reps) { - Algorithm.compute(string)(n) + Tokenizer.tokenize(string)(n) } } -object NGramAlgorithmBenchmark extends CaliperRunner(classOf[NGramAlgorithmBenchmark]) { - private final val Algorithm = NGramAlgorithm() +object NGramTokenizerBenchmark extends CaliperRunner(classOf[NGramTokenizerBenchmark]) { + private final val Tokenizer = NGramTokenizer() } diff --git a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala index d447538..38d8714 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala @@ -4,11 +4,4 @@ trait ConfigurableStringAlgorithm[R, O] extends ConfigurableAlgorithm[String, R, def compute(charArray: Array[Char])(implicit o: O): Option[Array[_]] } -object ConfigurableStringAlgorithm { - type NGram = com.rockymadden.stringmetric.similarity.NGramAlgorithm - val NGram = com.rockymadden.stringmetric.similarity.NGramAlgorithm - - def computeWithNGram(charArray: Array[Char])(n: Int) = NGram.compute(charArray)(n) - - def computeWithNGram(string: String)(n: Int) = NGram.compute(string)(n) -} +object ConfigurableStringAlgorithm diff --git a/core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala b/core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala new file mode 100755 index 0000000..5494274 --- /dev/null +++ b/core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala @@ -0,0 +1,14 @@ +package com.rockymadden.stringmetric + +trait StringTokenizer[O, R] extends Tokenizer[String, O, R] { + def tokenize(charArray: Array[Char])(implicit o: O): Option[Array[Array[Char]]] +} + +object StringTokenizer { + type NGram = com.rockymadden.stringmetric.tokenization.NGramTokenizer + val NGram = com.rockymadden.stringmetric.tokenization.NGramTokenizer + + def tokenizeWithNGram(charArray: Array[Char])(n: Int) = NGram.tokenize(charArray)(n) + + def tokenizeWithNGram(string: String)(n: Int) = NGram.tokenize(string)(n) +} diff --git a/core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala b/core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala new file mode 100755 index 0000000..6744a68 --- /dev/null +++ b/core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala @@ -0,0 +1,5 @@ +package com.rockymadden.stringmetric + +trait Tokenizer[T, O, R] { + def tokenize(t: T)(implicit o: O): Option[R] +} diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala index 4dd35a7..9b3b975 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala @@ -1,6 +1,7 @@ package com.rockymadden.stringmetric.similarity import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter } +import com.rockymadden.stringmetric.tokenization.NGramTokenizer /** * An implementation of the Dice/Sorensen metric. This implementation differs in that n-gram size is required. @@ -16,10 +17,10 @@ class DiceSorensenMetric extends ConfigurableStringMetric[Double, Int] { this: S if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. else if (fca1.sameElements(fca2)) Some(1d) else { - val nGramAlgorithm = NGramAlgorithm() + val nGramTokenizer = NGramTokenizer() - nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg => - nGramAlgorithm.compute(fca2)(n).map { ca2bg => + nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + nGramTokenizer.tokenize(fca2)(n).map { ca2bg => val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString)) (2d * ms) / (ca1bg.length + ca2bg.length) diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala index 5971701..f36333f 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala @@ -1,6 +1,7 @@ package com.rockymadden.stringmetric.similarity import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter } +import com.rockymadden.stringmetric.tokenization.NGramTokenizer /* An implementation of the Jaccard metric. */ class JaccardMetric extends ConfigurableStringMetric[Double, Int] { this: StringFilter => @@ -13,10 +14,10 @@ class JaccardMetric extends ConfigurableStringMetric[Double, Int] { this: String if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. else if (fca1.sameElements(fca2)) Some(1d) else { - val nGramAlgorithm = NGramAlgorithm() + val nGramTokenizer = NGramTokenizer() - nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg => - nGramAlgorithm.compute(fca2)(n).map { ca2bg => + nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + nGramTokenizer.tokenize(fca2)(n).map { ca2bg => val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString)) ms.toDouble / (ca1bg.length + ca2bg.length) diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala deleted file mode 100755 index cc67e04..0000000 --- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala +++ /dev/null @@ -1,37 +0,0 @@ -package com.rockymadden.stringmetric.similarity - -import com.rockymadden.stringmetric.{ ConfigurableStringAlgorithm, StringFilter } -import scala.annotation.tailrec - -/** An implementation of the N-Gram algorithm. */ -class NGramAlgorithm extends ConfigurableStringAlgorithm[Array[String], Int] { this: StringFilter => - final override def compute(charArray: Array[Char])(implicit n: Int): Option[Array[Array[Char]]] = { - if (n <= 0) throw new IllegalArgumentException("Expected valid n.") - - val fca = filter(charArray) - - if (fca.length < n) None - else Some(sequence(fca, Array.empty[Array[Char]], n)) - } - - final override def compute(string: String)(implicit n: Int): Option[Array[String]] = - compute(string.toCharArray)(n).map(_.map(_.mkString)) - - @tailrec - private[this] def sequence(i: Array[Char], o: Array[Array[Char]], n: Int): Array[Array[Char]] = { - require(n > 0) - - if (i.length <= n) o :+ i - else sequence(i.tail, o :+ i.take(n), n) - } -} - -object NGramAlgorithm { - private lazy val self = apply() - - def apply(): NGramAlgorithm = new NGramAlgorithm with StringFilter - - def compute(charArray: Array[Char])(n: Int) = self.compute(charArray)(n) - - def compute(string: String)(n: Int) = self.compute(string)(n) -} diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala index 935e576..91cfa68 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala @@ -1,6 +1,7 @@ package com.rockymadden.stringmetric.similarity import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter } +import com.rockymadden.stringmetric.tokenization.NGramTokenizer import scala.math /** An implementation of the N-Gram metric. */ @@ -14,10 +15,10 @@ class NGramMetric extends ConfigurableStringMetric[Double, Int] { this: StringFi if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. else if (fca1.sameElements(fca2)) Some(1d) else { - val nGramAlgorithm = NGramAlgorithm() + val nGramTokenizer = NGramTokenizer() - nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg => - nGramAlgorithm.compute(fca2)(n).map { ca2bg => + nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + nGramTokenizer.tokenize(fca2)(n).map { ca2bg => val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString))) ms.toDouble / math.max(ca1bg.length, ca2bg.length) diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala index 6927138..9f81109 100755 --- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala +++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala @@ -1,6 +1,7 @@ package com.rockymadden.stringmetric.similarity import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter } +import com.rockymadden.stringmetric.tokenization.NGramTokenizer import scala.math /* An implementation of the overlap metric. */ @@ -14,10 +15,10 @@ class OverlapMetric extends ConfigurableStringMetric[Double, Int] { this: String if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare. else if (fca1.sameElements(fca2)) Some(1d) else { - val nGramAlgorithm = NGramAlgorithm() + val nGramTokenizer = NGramTokenizer() - nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg => - nGramAlgorithm.compute(fca2)(n).map { ca2bg => + nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg => + nGramTokenizer.tokenize(fca2)(n).map { ca2bg => val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString)) ms.toDouble / (math.min(ca1bg.length, ca2bg.length)) diff --git a/core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala b/core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala new file mode 100755 index 0000000..8e27420 --- /dev/null +++ b/core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala @@ -0,0 +1,37 @@ +package com.rockymadden.stringmetric.tokenization + +import com.rockymadden.stringmetric.{ StringFilter, StringTokenizer } +import scala.annotation.tailrec + +/** An implementation of the N-Gram tokenizer. */ +class NGramTokenizer extends StringTokenizer[Int, Array[String]] { this: StringFilter => + final override def tokenize(charArray: Array[Char])(implicit n: Int): Option[Array[Array[Char]]] = { + if (n <= 0) throw new IllegalArgumentException("Expected valid n.") + + val fca = filter(charArray) + + if (fca.length < n) None + else Some(sequence(fca, Array.empty[Array[Char]], n)) + } + + final override def tokenize(string: String)(implicit n: Int): Option[Array[String]] = + tokenize(string.toCharArray)(n).map(_.map(_.mkString)) + + @tailrec + private[this] def sequence(i: Array[Char], o: Array[Array[Char]], n: Int): Array[Array[Char]] = { + require(n > 0) + + if (i.length <= n) o :+ i + else sequence(i.tail, o :+ i.take(n), n) + } +} + +object NGramTokenizer { + private lazy val self = apply() + + def apply(): NGramTokenizer = new NGramTokenizer with StringFilter + + def tokenize(charArray: Array[Char])(n: Int) = self.tokenize(charArray)(n) + + def tokenize(string: String)(n: Int) = self.tokenize(string)(n) +} diff --git a/core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala deleted file mode 100755 index eb09a25..0000000 --- a/core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala +++ /dev/null @@ -1,23 +0,0 @@ -package com.rockymadden.stringmetric - -import com.rockymadden.stringmetric.similarity._ -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner - -@RunWith(classOf[JUnitRunner]) -final class ConfigurableStringAlgorithmSpec extends ScalaTest { - "StringAlgorithm standalone object" should provide { - "compute method, type, and companion object pass-throughs" in { - val nGram: ConfigurableStringAlgorithm.NGram = ConfigurableStringAlgorithm.NGram() - - nGram.compute("testone")(1).get should - equal (ConfigurableStringAlgorithm.computeWithNGram("testone")(1).get) - nGram.compute("testone".toCharArray)(1).get should - equal (ConfigurableStringAlgorithm.computeWithNGram("testone".toCharArray)(1).get) - nGram.compute("testone".toCharArray)(1).get should - equal (NGramAlgorithm.compute("testone".toCharArray)(1).get) - } - } -} - - diff --git a/core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala new file mode 100755 index 0000000..8837c25 --- /dev/null +++ b/core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala @@ -0,0 +1,23 @@ +package com.rockymadden.stringmetric + +import com.rockymadden.stringmetric.tokenization.NGramTokenizer +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class StringTokenizerSpec extends ScalaTest { + "StringTokenizer standalone object" should provide { + "tokenize method, type, and companion object pass-throughs" in { + val nGram: StringTokenizer.NGram = StringTokenizer.NGram() + + nGram.tokenize("testone")(1).get should + equal (StringTokenizer.tokenizeWithNGram("testone")(1).get) + nGram.tokenize("testone".toCharArray)(1).get should + equal (StringTokenizer.tokenizeWithNGram("testone".toCharArray)(1).get) + nGram.tokenize("testone".toCharArray)(1).get should + equal (NGramTokenizer.tokenize("testone".toCharArray)(1).get) + } + } +} + + diff --git a/core/source/test/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala index 82a0c69..56fdc13 100755 --- a/core/source/test/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmSpec.scala +++ b/core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala @@ -1,46 +1,46 @@ -package com.rockymadden.stringmetric.similarity +package com.rockymadden.stringmetric.tokenization import com.rockymadden.stringmetric.ScalaTest import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -final class NGramAlgorithmSpec extends ScalaTest { - import NGramAlgorithmSpec.Algorithm +final class NGramTokenizerSpec extends ScalaTest { + import NGramTokenizerSpec.Tokenizer - "NGramAlgorithm" should provide { - "compute method" when passed { + "NGramTokenizer" should provide { + "tokenize method" when passed { "empty argument" should returns { "None" in { - Algorithm.compute("")(1).isDefined should be (false) + Tokenizer.tokenize("")(1).isDefined should be (false) } } "invalid n argument" should throws { "IllegalArgumentException" in { evaluating { - Algorithm.compute("")(0).isDefined should be (false) + Tokenizer.tokenize("")(0).isDefined should be (false) } should produce [IllegalArgumentException] evaluating { - Algorithm.compute("")(-1).isDefined should be (false) + Tokenizer.tokenize("")(-1).isDefined should be (false) } should produce [IllegalArgumentException] } } "valid argument" should returns { "Array[String]" in { - Algorithm.compute("abcdefghijklmnopqrstuvwxyz")(1).get should equal ( + Tokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(1).get should equal ( Array( "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" ) ) - Algorithm.compute("abcdefghijklmnopqrstuvwxyz")(2).get should equal ( + Tokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(2).get should equal ( Array( "ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl", "lm", "mn", "no", "op", "pq", "qr", "rs", "st", "tu", "uv", "vw", "wx", "xy", "yz" ) ) - Algorithm.compute("abcdefghijklmnopqrstuvwxyz")(3).get should equal ( + Tokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(3).get should equal ( Array( "abc", "bcd", "cde", "def", "efg", "fgh", "ghi", "hij", "ijk", "jkl", "klm", "lmn", "mno", "nop", "opq", "pqr", "qrs", "rst", "stu", "tuv", "uvw", "vwx", "wxy", "xyz" @@ -50,10 +50,10 @@ final class NGramAlgorithmSpec extends ScalaTest { } } } - "NGramAlgorithm companion object" should provide { - "pass-through compute method" should returns { + "NGramTokenizer companion object" should provide { + "pass-through tokenize method" should returns { "same value as class" in { - NGramAlgorithm.compute("abcdefghijklmnopqrstuvwxyz")(1).get should equal ( + NGramTokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(1).get should equal ( Array( "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" @@ -64,6 +64,6 @@ final class NGramAlgorithmSpec extends ScalaTest { } } -object NGramAlgorithmSpec { - private final val Algorithm = NGramAlgorithm() +object NGramTokenizerSpec { + private final val Tokenizer = NGramTokenizer() } @@ -13,7 +13,7 @@ String metrics and phonetic algorithms for Scala. The library provides facilitie * __[Monge-Elkan](http://www.cs.cmu.edu/~pradeepr/papers/ijcai03.pdf)__ (Queued similarity metric) * __[Match Rating Approach](http://en.wikipedia.org/wiki/Match_rating_approach)__ (Queued phonetic metric and algorithm) * __[Needleman-Wunch](http://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm)__ (Queued similarity metric) -* __[N-Gram](http://en.wikipedia.org/wiki/N-gram)__ (Similarity metric and algorithm) +* __[N-Gram](http://en.wikipedia.org/wiki/N-gram)__ (Similarity metric) * __[NYSIIS](http://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System)__ (Phonetic metric and algorithm) * __[Overlap](http://en.wikipedia.org/wiki/Overlap_coefficient)__ (Similarity metric) * __[Ratcliff-Obershelp](http://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html)__ (Similarity metric) @@ -165,22 +165,6 @@ Output: --- -__N-Gram Algorithm:__ _(Note you must specify the size of the n-gram you wish to use. This can be done implicitly.)_ -```scala -println(NGramAlgorithm.compute("abcdefghijklmnopqrstuvwxyz")(1)) -println(NGramAlgorithm.compute("abcdefghijklmnopqrstuvwxyz")(2)) -println(NGramAlgorithm.compute("abcdefghijklmnopqrstuvwxyz")(3)) -``` - -Output: -``` -Array("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z") -Array("ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl", "lm", "mn", "no", "op", "pq", "qr", "rs", "st", "tu", "uv", "vw", "wx", "xy", "yz") -Array("abc", "bcd", "cde", "def", "efg", "fgh", "ghi", "hij", "ijk", "jkl", "klm", "lmn", "mno", "nop", "opq", "pqr", "qrs", "rst", "stu", "tuv", "uvw", "vwx", "wxy", "xyz") -``` - ---- - __Overlap Metric:__ _(Note you must specify the size of the n-gram you wish to use. This can be done implicitly.)_ ```scala println(OverlapMetric.compare("night", "nacht")(1)) |