author     Rocky Madden <git@rockymadden.com>  2013-03-18 19:47:49 -0600
committer  Rocky Madden <git@rockymadden.com>  2013-03-18 19:47:49 -0600
commit     05af73ab936168ebea891ed2f0120c49e8ec5066 (patch)
tree       54a724d2d674a8458e06adfc042c68d4819b7e11 /core
parent     b558d56b50ff1a9fe8023b927fabf1e3d46a8119 (diff)
Broke out n-gram tokenizer algorithm into tokenization package.
Diffstat (limited to 'core')
-rwxr-xr-x  core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala (renamed from core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmBenchmark.scala)  14
-rwxr-xr-x  core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala  9
-rwxr-xr-x  core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala  14
-rwxr-xr-x  core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala  5
-rwxr-xr-x  core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala  7
-rwxr-xr-x  core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala  7
-rwxr-xr-x  core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala  37
-rwxr-xr-x  core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala  7
-rwxr-xr-x  core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala  7
-rwxr-xr-x  core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala  37
-rwxr-xr-x  core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala  23
-rwxr-xr-x  core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala  23
-rwxr-xr-x  core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala (renamed from core/source/test/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmSpec.scala)  32
13 files changed, 119 insertions(+), 103 deletions(-)
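
For reference, a minimal usage sketch of the renamed API, based on the NGramTokenizer and StringTokenizer code added in this diff. The input string and expected results are illustrative, assuming the mixed-in StringFilter leaves the input unchanged:

    import com.rockymadden.stringmetric.StringTokenizer
    import com.rockymadden.stringmetric.tokenization.NGramTokenizer

    // Before this commit, the equivalent call was NGramAlgorithm.compute("abcdef")(2).
    NGramTokenizer.tokenize("abcdef")(2)            // Some(Array("ab", "bc", "cd", "de", "ef"))
    NGramTokenizer.tokenize("a")(2)                 // None: input is shorter than n
    StringTokenizer.tokenizeWithNGram("abcdef")(2)  // pass-through to NGramTokenizer.tokenize
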
diff --git a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmBenchmark.scala b/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala
index ae34220..d5ff1f3 100755
--- a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmBenchmark.scala
+++ b/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala
@@ -1,11 +1,11 @@
-package com.rockymadden.stringmetric.similarity
+package com.rockymadden.stringmetric.tokenization
import com.google.caliper.Param
import com.rockymadden.stringmetric.{ CaliperBenchmark, CaliperRunner }
import scala.util.Random
-final class NGramAlgorithmBenchmark extends CaliperBenchmark {
- import NGramAlgorithmBenchmark.Algorithm
+final class NGramTokenizerBenchmark extends CaliperBenchmark {
+ import NGramTokenizerBenchmark.Tokenizer
@Param(Array("0", "1", "2", "4", "8", "16"))
var length: Int = _
@@ -22,14 +22,14 @@ final class NGramAlgorithmBenchmark extends CaliperBenchmark {
}
def timeComputeWithCharArray(reps: Int) = run(reps) {
- Algorithm.compute(charArray)(n)
+ Tokenizer.tokenize(charArray)(n)
}
def timeComputeWithString(reps: Int) = run(reps) {
- Algorithm.compute(string)(n)
+ Tokenizer.tokenize(string)(n)
}
}
-object NGramAlgorithmBenchmark extends CaliperRunner(classOf[NGramAlgorithmBenchmark]) {
- private final val Algorithm = NGramAlgorithm()
+object NGramTokenizerBenchmark extends CaliperRunner(classOf[NGramTokenizerBenchmark]) {
+ private final val Tokenizer = NGramTokenizer()
}
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala
index d447538..38d8714 100755
--- a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala
+++ b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala
@@ -4,11 +4,4 @@ trait ConfigurableStringAlgorithm[R, O] extends ConfigurableAlgorithm[String, R,
def compute(charArray: Array[Char])(implicit o: O): Option[Array[_]]
}
-object ConfigurableStringAlgorithm {
- type NGram = com.rockymadden.stringmetric.similarity.NGramAlgorithm
- val NGram = com.rockymadden.stringmetric.similarity.NGramAlgorithm
-
- def computeWithNGram(charArray: Array[Char])(n: Int) = NGram.compute(charArray)(n)
-
- def computeWithNGram(string: String)(n: Int) = NGram.compute(string)(n)
-}
+object ConfigurableStringAlgorithm
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala b/core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala
new file mode 100755
index 0000000..5494274
--- /dev/null
+++ b/core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala
@@ -0,0 +1,14 @@
+package com.rockymadden.stringmetric
+
+trait StringTokenizer[O, R] extends Tokenizer[String, O, R] {
+ def tokenize(charArray: Array[Char])(implicit o: O): Option[Array[Array[Char]]]
+}
+
+object StringTokenizer {
+ type NGram = com.rockymadden.stringmetric.tokenization.NGramTokenizer
+ val NGram = com.rockymadden.stringmetric.tokenization.NGramTokenizer
+
+ def tokenizeWithNGram(charArray: Array[Char])(n: Int) = NGram.tokenize(charArray)(n)
+
+ def tokenizeWithNGram(string: String)(n: Int) = NGram.tokenize(string)(n)
+}
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala b/core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala
new file mode 100755
index 0000000..6744a68
--- /dev/null
+++ b/core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala
@@ -0,0 +1,5 @@
+package com.rockymadden.stringmetric
+
+trait Tokenizer[T, O, R] {
+ def tokenize(t: T)(implicit o: O): Option[R]
+}
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
index 4dd35a7..9b3b975 100755
--- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
+++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
@@ -1,6 +1,7 @@
package com.rockymadden.stringmetric.similarity
import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter }
+import com.rockymadden.stringmetric.tokenization.NGramTokenizer
/**
* An implementation of the Dice/Sorensen metric. This implementation differs in that n-gram size is required.
@@ -16,10 +17,10 @@ class DiceSorensenMetric extends ConfigurableStringMetric[Double, Int] { this: S
if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare.
else if (fca1.sameElements(fca2)) Some(1d)
else {
- val nGramAlgorithm = NGramAlgorithm()
+ val nGramTokenizer = NGramTokenizer()
- nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg =>
- nGramAlgorithm.compute(fca2)(n).map { ca2bg =>
+ nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg =>
+ nGramTokenizer.tokenize(fca2)(n).map { ca2bg =>
val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString))
(2d * ms) / (ca1bg.length + ca2bg.length)
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala
index 5971701..f36333f 100755
--- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala
+++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala
@@ -1,6 +1,7 @@
package com.rockymadden.stringmetric.similarity
import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter }
+import com.rockymadden.stringmetric.tokenization.NGramTokenizer
/* An implementation of the Jaccard metric. */
class JaccardMetric extends ConfigurableStringMetric[Double, Int] { this: StringFilter =>
@@ -13,10 +14,10 @@ class JaccardMetric extends ConfigurableStringMetric[Double, Int] { this: String
if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare.
else if (fca1.sameElements(fca2)) Some(1d)
else {
- val nGramAlgorithm = NGramAlgorithm()
+ val nGramTokenizer = NGramTokenizer()
- nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg =>
- nGramAlgorithm.compute(fca2)(n).map { ca2bg =>
+ nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg =>
+ nGramTokenizer.tokenize(fca2)(n).map { ca2bg =>
val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString))
ms.toDouble / (ca1bg.length + ca2bg.length)
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala
deleted file mode 100755
index cc67e04..0000000
--- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala
+++ /dev/null
@@ -1,37 +0,0 @@
-package com.rockymadden.stringmetric.similarity
-
-import com.rockymadden.stringmetric.{ ConfigurableStringAlgorithm, StringFilter }
-import scala.annotation.tailrec
-
-/** An implementation of the N-Gram algorithm. */
-class NGramAlgorithm extends ConfigurableStringAlgorithm[Array[String], Int] { this: StringFilter =>
- final override def compute(charArray: Array[Char])(implicit n: Int): Option[Array[Array[Char]]] = {
- if (n <= 0) throw new IllegalArgumentException("Expected valid n.")
-
- val fca = filter(charArray)
-
- if (fca.length < n) None
- else Some(sequence(fca, Array.empty[Array[Char]], n))
- }
-
- final override def compute(string: String)(implicit n: Int): Option[Array[String]] =
- compute(string.toCharArray)(n).map(_.map(_.mkString))
-
- @tailrec
- private[this] def sequence(i: Array[Char], o: Array[Array[Char]], n: Int): Array[Array[Char]] = {
- require(n > 0)
-
- if (i.length <= n) o :+ i
- else sequence(i.tail, o :+ i.take(n), n)
- }
-}
-
-object NGramAlgorithm {
- private lazy val self = apply()
-
- def apply(): NGramAlgorithm = new NGramAlgorithm with StringFilter
-
- def compute(charArray: Array[Char])(n: Int) = self.compute(charArray)(n)
-
- def compute(string: String)(n: Int) = self.compute(string)(n)
-}
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala
index 935e576..91cfa68 100755
--- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala
+++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala
@@ -1,6 +1,7 @@
package com.rockymadden.stringmetric.similarity
import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter }
+import com.rockymadden.stringmetric.tokenization.NGramTokenizer
import scala.math
/** An implementation of the N-Gram metric. */
@@ -14,10 +15,10 @@ class NGramMetric extends ConfigurableStringMetric[Double, Int] { this: StringFi
if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare.
else if (fca1.sameElements(fca2)) Some(1d)
else {
- val nGramAlgorithm = NGramAlgorithm()
+ val nGramTokenizer = NGramTokenizer()
- nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg =>
- nGramAlgorithm.compute(fca2)(n).map { ca2bg =>
+ nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg =>
+ nGramTokenizer.tokenize(fca2)(n).map { ca2bg =>
val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString)))
ms.toDouble / math.max(ca1bg.length, ca2bg.length)
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala
index 6927138..9f81109 100755
--- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala
+++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala
@@ -1,6 +1,7 @@
package com.rockymadden.stringmetric.similarity
import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter }
+import com.rockymadden.stringmetric.tokenization.NGramTokenizer
import scala.math
/* An implementation of the overlap metric. */
@@ -14,10 +15,10 @@ class OverlapMetric extends ConfigurableStringMetric[Double, Int] { this: String
if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare.
else if (fca1.sameElements(fca2)) Some(1d)
else {
- val nGramAlgorithm = NGramAlgorithm()
+ val nGramTokenizer = NGramTokenizer()
- nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg =>
- nGramAlgorithm.compute(fca2)(n).map { ca2bg =>
+ nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg =>
+ nGramTokenizer.tokenize(fca2)(n).map { ca2bg =>
val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString))
ms.toDouble / (math.min(ca1bg.length, ca2bg.length))
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala b/core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala
new file mode 100755
index 0000000..8e27420
--- /dev/null
+++ b/core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala
@@ -0,0 +1,37 @@
+package com.rockymadden.stringmetric.tokenization
+
+import com.rockymadden.stringmetric.{ StringFilter, StringTokenizer }
+import scala.annotation.tailrec
+
+/** An implementation of the N-Gram tokenizer. */
+class NGramTokenizer extends StringTokenizer[Int, Array[String]] { this: StringFilter =>
+ final override def tokenize(charArray: Array[Char])(implicit n: Int): Option[Array[Array[Char]]] = {
+ if (n <= 0) throw new IllegalArgumentException("Expected valid n.")
+
+ val fca = filter(charArray)
+
+ if (fca.length < n) None
+ else Some(sequence(fca, Array.empty[Array[Char]], n))
+ }
+
+ final override def tokenize(string: String)(implicit n: Int): Option[Array[String]] =
+ tokenize(string.toCharArray)(n).map(_.map(_.mkString))
+
+ @tailrec
+ private[this] def sequence(i: Array[Char], o: Array[Array[Char]], n: Int): Array[Array[Char]] = {
+ require(n > 0)
+
+ if (i.length <= n) o :+ i
+ else sequence(i.tail, o :+ i.take(n), n)
+ }
+}
+
+object NGramTokenizer {
+ private lazy val self = apply()
+
+ def apply(): NGramTokenizer = new NGramTokenizer with StringFilter
+
+ def tokenize(charArray: Array[Char])(n: Int) = self.tokenize(charArray)(n)
+
+ def tokenize(string: String)(n: Int) = self.tokenize(string)(n)
+}
diff --git a/core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala
deleted file mode 100755
index eb09a25..0000000
--- a/core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.rockymadden.stringmetric
-
-import com.rockymadden.stringmetric.similarity._
-import org.junit.runner.RunWith
-import org.scalatest.junit.JUnitRunner
-
-@RunWith(classOf[JUnitRunner])
-final class ConfigurableStringAlgorithmSpec extends ScalaTest {
- "StringAlgorithm standalone object" should provide {
- "compute method, type, and companion object pass-throughs" in {
- val nGram: ConfigurableStringAlgorithm.NGram = ConfigurableStringAlgorithm.NGram()
-
- nGram.compute("testone")(1).get should
- equal (ConfigurableStringAlgorithm.computeWithNGram("testone")(1).get)
- nGram.compute("testone".toCharArray)(1).get should
- equal (ConfigurableStringAlgorithm.computeWithNGram("testone".toCharArray)(1).get)
- nGram.compute("testone".toCharArray)(1).get should
- equal (NGramAlgorithm.compute("testone".toCharArray)(1).get)
- }
- }
-}
-
-
diff --git a/core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala
new file mode 100755
index 0000000..8837c25
--- /dev/null
+++ b/core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala
@@ -0,0 +1,23 @@
+package com.rockymadden.stringmetric
+
+import com.rockymadden.stringmetric.tokenization.NGramTokenizer
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class StringTokenizerSpec extends ScalaTest {
+ "StringTokenizer standalone object" should provide {
+ "tokenize method, type, and companion object pass-throughs" in {
+ val nGram: StringTokenizer.NGram = StringTokenizer.NGram()
+
+ nGram.tokenize("testone")(1).get should
+ equal (StringTokenizer.tokenizeWithNGram("testone")(1).get)
+ nGram.tokenize("testone".toCharArray)(1).get should
+ equal (StringTokenizer.tokenizeWithNGram("testone".toCharArray)(1).get)
+ nGram.tokenize("testone".toCharArray)(1).get should
+ equal (NGramTokenizer.tokenize("testone".toCharArray)(1).get)
+ }
+ }
+}
+
+
diff --git a/core/source/test/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala
index 82a0c69..56fdc13 100755
--- a/core/source/test/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmSpec.scala
+++ b/core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala
@@ -1,46 +1,46 @@
-package com.rockymadden.stringmetric.similarity
+package com.rockymadden.stringmetric.tokenization
import com.rockymadden.stringmetric.ScalaTest
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
@RunWith(classOf[JUnitRunner])
-final class NGramAlgorithmSpec extends ScalaTest {
- import NGramAlgorithmSpec.Algorithm
+final class NGramTokenizerSpec extends ScalaTest {
+ import NGramTokenizerSpec.Tokenizer
- "NGramAlgorithm" should provide {
- "compute method" when passed {
+ "NGramTokenizer" should provide {
+ "tokenize method" when passed {
"empty argument" should returns {
"None" in {
- Algorithm.compute("")(1).isDefined should be (false)
+ Tokenizer.tokenize("")(1).isDefined should be (false)
}
}
"invalid n argument" should throws {
"IllegalArgumentException" in {
evaluating {
- Algorithm.compute("")(0).isDefined should be (false)
+ Tokenizer.tokenize("")(0).isDefined should be (false)
} should produce [IllegalArgumentException]
evaluating {
- Algorithm.compute("")(-1).isDefined should be (false)
+ Tokenizer.tokenize("")(-1).isDefined should be (false)
} should produce [IllegalArgumentException]
}
}
"valid argument" should returns {
"Array[String]" in {
- Algorithm.compute("abcdefghijklmnopqrstuvwxyz")(1).get should equal (
+ Tokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(1).get should equal (
Array(
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
"s", "t", "u", "v", "w", "x", "y", "z"
)
)
- Algorithm.compute("abcdefghijklmnopqrstuvwxyz")(2).get should equal (
+ Tokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(2).get should equal (
Array(
"ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl", "lm", "mn", "no", "op",
"pq", "qr", "rs", "st", "tu", "uv", "vw", "wx", "xy", "yz"
)
)
- Algorithm.compute("abcdefghijklmnopqrstuvwxyz")(3).get should equal (
+ Tokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(3).get should equal (
Array(
"abc", "bcd", "cde", "def", "efg", "fgh", "ghi", "hij", "ijk", "jkl", "klm", "lmn", "mno",
"nop", "opq", "pqr", "qrs", "rst", "stu", "tuv", "uvw", "vwx", "wxy", "xyz"
@@ -50,10 +50,10 @@ final class NGramAlgorithmSpec extends ScalaTest {
}
}
}
- "NGramAlgorithm companion object" should provide {
- "pass-through compute method" should returns {
+ "NGramTokenizer companion object" should provide {
+ "pass-through tokenize method" should returns {
"same value as class" in {
- NGramAlgorithm.compute("abcdefghijklmnopqrstuvwxyz")(1).get should equal (
+ NGramTokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(1).get should equal (
Array(
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
"s", "t", "u", "v", "w", "x", "y", "z"
@@ -64,6 +64,6 @@ final class NGramAlgorithmSpec extends ScalaTest {
}
}
-object NGramAlgorithmSpec {
- private final val Algorithm = NGramAlgorithm()
+object NGramTokenizerSpec {
+ private final val Tokenizer = NGramTokenizer()
}