Broke out n-gram tokenizer algorithm into tokenization package.

author: Rocky Madden <git@rockymadden.com> 2013-03-18 19:47:49 -0600
committer: Rocky Madden <git@rockymadden.com> 2013-03-18 19:47:49 -0600
commit: 05af73ab936168ebea891ed2f0120c49e8ec5066 (patch)
tree: 54a724d2d674a8458e06adfc042c68d4819b7e11 /core
parent: b558d56b50ff1a9fe8023b927fabf1e3d46a8119 (diff)
download: stringmetric-05af73ab936168ebea891ed2f0120c49e8ec5066.tar.gz
stringmetric-05af73ab936168ebea891ed2f0120c49e8ec5066.tar.bz2
stringmetric-05af73ab936168ebea891ed2f0120c49e8ec5066.zip
13 files changed, 119 insertions, 103 deletions
diff --git a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmBenchmark.scala b/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala
index ae34220..d5ff1f3 100755
--- a/core/source/benchmark/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmBenchmark.scala
+++ b/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala
@@ -1,11 +1,11 @@
-package com.rockymadden.stringmetric.similarity
+package com.rockymadden.stringmetric.tokenization
 
 import com.google.caliper.Param
 import com.rockymadden.stringmetric.{ CaliperBenchmark, CaliperRunner }
 import scala.util.Random
 
-final class NGramAlgorithmBenchmark extends CaliperBenchmark {
-	import NGramAlgorithmBenchmark.Algorithm
+final class NGramTokenizerBenchmark extends CaliperBenchmark {
+	import NGramTokenizerBenchmark.Tokenizer
 
 	@Param(Array("0", "1", "2", "4", "8", "16"))
 	var length: Int = _
@@ -22,14 +22,14 @@ final class NGramAlgorithmBenchmark extends CaliperBenchmark {
 	}
 
 	def timeComputeWithCharArray(reps: Int) = run(reps) {
-		Algorithm.compute(charArray)(n)
+		Tokenizer.tokenize(charArray)(n)
 	}
 
 	def timeComputeWithString(reps: Int) = run(reps) {
-		Algorithm.compute(string)(n)
+		Tokenizer.tokenize(string)(n)
 	}
 }
 
-object NGramAlgorithmBenchmark extends CaliperRunner(classOf[NGramAlgorithmBenchmark]) {
-	private final val Algorithm = NGramAlgorithm()
+object NGramTokenizerBenchmark extends CaliperRunner(classOf[NGramTokenizerBenchmark]) {
+	private final val Tokenizer = NGramTokenizer()
 }
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala
index d447538..38d8714 100755
--- a/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala
+++ b/core/source/core/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithm.scala
@@ -4,11 +4,4 @@ trait ConfigurableStringAlgorithm[R, O] extends ConfigurableAlgorithm[String, R,
 	def compute(charArray: Array[Char])(implicit o: O): Option[Array[_]]
 }
 
-object ConfigurableStringAlgorithm {
-	type NGram = com.rockymadden.stringmetric.similarity.NGramAlgorithm
-	val NGram = com.rockymadden.stringmetric.similarity.NGramAlgorithm
-
-	def computeWithNGram(charArray: Array[Char])(n: Int) = NGram.compute(charArray)(n)
-
-	def computeWithNGram(string: String)(n: Int) = NGram.compute(string)(n)
-}
+object ConfigurableStringAlgorithm
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala b/core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala
new file mode 100755
index 0000000..5494274
--- /dev/null
+++ b/core/source/core/scala/com/rockymadden/stringmetric/StringTokenizer.scala
@@ -0,0 +1,14 @@
+package com.rockymadden.stringmetric
+
+trait StringTokenizer[O, R] extends Tokenizer[String, O, R] {
+	def tokenize(charArray: Array[Char])(implicit o: O): Option[Array[Array[Char]]]
+}
+
+object StringTokenizer {
+	type NGram = com.rockymadden.stringmetric.tokenization.NGramTokenizer
+	val NGram = com.rockymadden.stringmetric.tokenization.NGramTokenizer
+
+	def tokenizeWithNGram(charArray: Array[Char])(n: Int) = NGram.tokenize(charArray)(n)
+
+	def tokenizeWithNGram(string: String)(n: Int) = NGram.tokenize(string)(n)
+}
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala b/core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala
new file mode 100755
index 0000000..6744a68
--- /dev/null
+++ b/core/source/core/scala/com/rockymadden/stringmetric/Tokenizer.scala
@@ -0,0 +1,5 @@
+package com.rockymadden.stringmetric
+
+trait Tokenizer[T, O, R] {
+	def tokenize(t: T)(implicit o: O): Option[R]
+}
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
index 4dd35a7..9b3b975 100755
--- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
+++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
@@ -1,6 +1,7 @@
 package com.rockymadden.stringmetric.similarity
 
 import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter }
+import com.rockymadden.stringmetric.tokenization.NGramTokenizer
 
 /**
  * An implementation of the Dice/Sorensen metric. This implementation differs in that n-gram size is required.
@@ -16,10 +17,10 @@ class DiceSorensenMetric extends ConfigurableStringMetric[Double, Int] { this: S
 		if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare.
 		else if (fca1.sameElements(fca2)) Some(1d)
 		else {
-			val nGramAlgorithm = NGramAlgorithm()
+			val nGramTokenizer = NGramTokenizer()
 
-			nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg =>
-				nGramAlgorithm.compute(fca2)(n).map { ca2bg =>
+			nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg =>
+				nGramTokenizer.tokenize(fca2)(n).map { ca2bg =>
 					val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString))
 
 					(2d * ms) / (ca1bg.length + ca2bg.length)
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala
index 5971701..f36333f 100755
--- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala
+++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala
@@ -1,6 +1,7 @@
 package com.rockymadden.stringmetric.similarity
 
 import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter }
+import com.rockymadden.stringmetric.tokenization.NGramTokenizer
 
 /* An implementation of the Jaccard metric. */
 class JaccardMetric extends ConfigurableStringMetric[Double, Int] { this: StringFilter =>
@@ -13,10 +14,10 @@ class JaccardMetric extends ConfigurableStringMetric[Double, Int] { this: String
 		if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare.
 		else if (fca1.sameElements(fca2)) Some(1d)
 		else {
-			val nGramAlgorithm = NGramAlgorithm()
+			val nGramTokenizer = NGramTokenizer()
 
-			nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg =>
-				nGramAlgorithm.compute(fca2)(n).map { ca2bg =>
+			nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg =>
+				nGramTokenizer.tokenize(fca2)(n).map { ca2bg =>
 					val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString))
 
 					ms.toDouble / (ca1bg.length + ca2bg.length)
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala
deleted file mode 100755
index cc67e04..0000000
--- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithm.scala
+++ /dev/null
@@ -1,37 +0,0 @@
-package com.rockymadden.stringmetric.similarity
-
-import com.rockymadden.stringmetric.{ ConfigurableStringAlgorithm, StringFilter }
-import scala.annotation.tailrec
-
-/** An implementation of the N-Gram algorithm. */
-class NGramAlgorithm extends ConfigurableStringAlgorithm[Array[String], Int] { this: StringFilter =>
-	final override def compute(charArray: Array[Char])(implicit n: Int): Option[Array[Array[Char]]] = {
-		if (n <= 0) throw new IllegalArgumentException("Expected valid n.")
-
-		val fca = filter(charArray)
-
-		if (fca.length < n) None
-		else Some(sequence(fca, Array.empty[Array[Char]], n))
-	}
-
-	final override def compute(string: String)(implicit n: Int): Option[Array[String]] =
-		compute(string.toCharArray)(n).map(_.map(_.mkString))
-
-	@tailrec
-	private[this] def sequence(i: Array[Char], o: Array[Array[Char]], n: Int): Array[Array[Char]] = {
-		require(n > 0)
-
-		if (i.length <= n) o :+ i
-		else sequence(i.tail, o :+ i.take(n), n)
-	}
-}
-
-object NGramAlgorithm {
-	private lazy val self = apply()
-
-	def apply(): NGramAlgorithm = new NGramAlgorithm with StringFilter
-
-	def compute(charArray: Array[Char])(n: Int) = self.compute(charArray)(n)
-
-	def compute(string: String)(n: Int) = self.compute(string)(n)
-}
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala
index 935e576..91cfa68 100755
--- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala
+++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala
@@ -1,6 +1,7 @@
 package com.rockymadden.stringmetric.similarity
 
 import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter }
+import com.rockymadden.stringmetric.tokenization.NGramTokenizer
 import scala.math
 
 /** An implementation of the N-Gram metric. */
@@ -14,10 +15,10 @@ class NGramMetric extends ConfigurableStringMetric[Double, Int] { this: StringFi
 		if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare.
 		else if (fca1.sameElements(fca2)) Some(1d)
 		else {
-			val nGramAlgorithm = NGramAlgorithm()
+			val nGramTokenizer = NGramTokenizer()
 
-			nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg =>
-				nGramAlgorithm.compute(fca2)(n).map { ca2bg =>
+			nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg =>
+				nGramTokenizer.tokenize(fca2)(n).map { ca2bg =>
 					val ms = scoreMatches((ca1bg.map(_.mkString), ca2bg.map(_.mkString)))
 
 					ms.toDouble / math.max(ca1bg.length, ca2bg.length)
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala
index 6927138..9f81109 100755
--- a/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala
+++ b/core/source/core/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala
@@ -1,6 +1,7 @@
 package com.rockymadden.stringmetric.similarity
 
 import com.rockymadden.stringmetric.{ ConfigurableStringMetric, MatchTuple, StringFilter }
+import com.rockymadden.stringmetric.tokenization.NGramTokenizer
 import scala.math
 
 /* An implementation of the overlap metric. */
@@ -14,10 +15,10 @@ class OverlapMetric extends ConfigurableStringMetric[Double, Int] { this: String
 		if (fca1.length < n || fca2.length < n) None // Because length is less than n, it is not possible to compare.
 		else if (fca1.sameElements(fca2)) Some(1d)
 		else {
-			val nGramAlgorithm = NGramAlgorithm()
+			val nGramTokenizer = NGramTokenizer()
 
-			nGramAlgorithm.compute(fca1)(n).flatMap { ca1bg =>
-				nGramAlgorithm.compute(fca2)(n).map { ca2bg =>
+			nGramTokenizer.tokenize(fca1)(n).flatMap { ca1bg =>
+				nGramTokenizer.tokenize(fca2)(n).map { ca2bg =>
 					val ms = scoreMatches(ca1bg.map(_.mkString), ca2bg.map(_.mkString))
 
 					ms.toDouble / (math.min(ca1bg.length, ca2bg.length))
diff --git a/core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala b/core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala
new file mode 100755
index 0000000..8e27420
--- /dev/null
+++ b/core/source/core/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala
@@ -0,0 +1,37 @@
+package com.rockymadden.stringmetric.tokenization
+
+import com.rockymadden.stringmetric.{ StringFilter, StringTokenizer }
+import scala.annotation.tailrec
+
+/** An implementation of the N-Gram tokenizer. */
+class NGramTokenizer extends StringTokenizer[Int, Array[String]] { this: StringFilter =>
+	final override def tokenize(charArray: Array[Char])(implicit n: Int): Option[Array[Array[Char]]] = {
+		if (n <= 0) throw new IllegalArgumentException("Expected valid n.")
+
+		val fca = filter(charArray)
+
+		if (fca.length < n) None
+		else Some(sequence(fca, Array.empty[Array[Char]], n))
+	}
+
+	final override def tokenize(string: String)(implicit n: Int): Option[Array[String]] =
+		tokenize(string.toCharArray)(n).map(_.map(_.mkString))
+
+	@tailrec
+	private[this] def sequence(i: Array[Char], o: Array[Array[Char]], n: Int): Array[Array[Char]] = {
+		require(n > 0)
+
+		if (i.length <= n) o :+ i
+		else sequence(i.tail, o :+ i.take(n), n)
+	}
+}
+
+object NGramTokenizer {
+	private lazy val self = apply()
+
+	def apply(): NGramTokenizer = new NGramTokenizer with StringFilter
+
+	def tokenize(charArray: Array[Char])(n: Int) = self.tokenize(charArray)(n)
+
+	def tokenize(string: String)(n: Int) = self.tokenize(string)(n)
+}
diff --git a/core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala
deleted file mode 100755
index eb09a25..0000000
--- a/core/source/test/scala/com/rockymadden/stringmetric/ConfigurableStringAlgorithmSpec.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.rockymadden.stringmetric
-
-import com.rockymadden.stringmetric.similarity._
-import org.junit.runner.RunWith
-import org.scalatest.junit.JUnitRunner
-
-@RunWith(classOf[JUnitRunner])
-final class ConfigurableStringAlgorithmSpec extends ScalaTest {
-	"StringAlgorithm standalone object" should provide {
-		"compute method, type, and companion object pass-throughs" in {
-			val nGram: ConfigurableStringAlgorithm.NGram = ConfigurableStringAlgorithm.NGram()
-
-			nGram.compute("testone")(1).get should
-				equal (ConfigurableStringAlgorithm.computeWithNGram("testone")(1).get)
-			nGram.compute("testone".toCharArray)(1).get should
-				equal (ConfigurableStringAlgorithm.computeWithNGram("testone".toCharArray)(1).get)
-			nGram.compute("testone".toCharArray)(1).get should
-				equal (NGramAlgorithm.compute("testone".toCharArray)(1).get)
-		}
-	}
-}
-
-
diff --git a/core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala
new file mode 100755
index 0000000..8837c25
--- /dev/null
+++ b/core/source/test/scala/com/rockymadden/stringmetric/StringTokenizerSpec.scala
@@ -0,0 +1,23 @@
+package com.rockymadden.stringmetric
+
+import com.rockymadden.stringmetric.tokenization.NGramTokenizer
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+final class StringTokenizerSpec extends ScalaTest {
+	"StringTokenizer standalone object" should provide {
+		"tokenize method, type, and companion object pass-throughs" in {
+			val nGram: StringTokenizer.NGram = StringTokenizer.NGram()
+
+			nGram.tokenize("testone")(1).get should
+				equal (StringTokenizer.tokenizeWithNGram("testone")(1).get)
+			nGram.tokenize("testone".toCharArray)(1).get should
+				equal (StringTokenizer.tokenizeWithNGram("testone".toCharArray)(1).get)
+			nGram.tokenize("testone".toCharArray)(1).get should
+				equal (NGramTokenizer.tokenize("testone".toCharArray)(1).get)
+		}
+	}
+}
+
+
diff --git a/core/source/test/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala
index 82a0c69..56fdc13 100755
--- a/core/source/test/scala/com/rockymadden/stringmetric/similarity/NGramAlgorithmSpec.scala
+++ b/core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala
@@ -1,46 +1,46 @@
-package com.rockymadden.stringmetric.similarity
+package com.rockymadden.stringmetric.tokenization
 
 import com.rockymadden.stringmetric.ScalaTest
 import org.junit.runner.RunWith
 import org.scalatest.junit.JUnitRunner
 
 @RunWith(classOf[JUnitRunner])
-final class NGramAlgorithmSpec extends ScalaTest {
-	import NGramAlgorithmSpec.Algorithm
+final class NGramTokenizerSpec extends ScalaTest {
+	import NGramTokenizerSpec.Tokenizer
 
-	"NGramAlgorithm" should provide {
-		"compute method" when passed {
+	"NGramTokenizer" should provide {
+		"tokenize method" when passed {
 			"empty argument" should returns {
 				"None" in {
-					Algorithm.compute("")(1).isDefined should be (false)
+					Tokenizer.tokenize("")(1).isDefined should be (false)
 				}
 			}
 			"invalid n argument" should throws {
 				"IllegalArgumentException" in {
 					evaluating {
-						Algorithm.compute("")(0).isDefined should be (false)
+						Tokenizer.tokenize("")(0).isDefined should be (false)
 					} should produce [IllegalArgumentException]
 
 					evaluating {
-						Algorithm.compute("")(-1).isDefined should be (false)
+						Tokenizer.tokenize("")(-1).isDefined should be (false)
 					} should produce [IllegalArgumentException]
 				}
 			}
 			"valid argument" should returns {
 				"Array[String]" in {
-					Algorithm.compute("abcdefghijklmnopqrstuvwxyz")(1).get should equal (
+					Tokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(1).get should equal (
 						Array(
 							"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
 							"s", "t", "u", "v", "w", "x", "y", "z"
 						)
 					)
-					Algorithm.compute("abcdefghijklmnopqrstuvwxyz")(2).get should equal (
+					Tokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(2).get should equal (
 						Array(
 							"ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl", "lm", "mn", "no", "op",
 							"pq", "qr", "rs", "st", "tu", "uv", "vw", "wx", "xy", "yz"
 						)
 					)
-					Algorithm.compute("abcdefghijklmnopqrstuvwxyz")(3).get should equal (
+					Tokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(3).get should equal (
 						Array(
 							"abc", "bcd", "cde", "def", "efg", "fgh", "ghi", "hij", "ijk", "jkl", "klm", "lmn", "mno",
 							"nop", "opq", "pqr", "qrs", "rst", "stu", "tuv", "uvw", "vwx", "wxy", "xyz"
@@ -50,10 +50,10 @@ final class NGramAlgorithmSpec extends ScalaTest {
 			}
 		}
 	}
-	"NGramAlgorithm companion object" should provide {
-		"pass-through compute method" should returns {
+	"NGramTokenizer companion object" should provide {
+		"pass-through tokenize method" should returns {
 			"same value as class" in {
-				NGramAlgorithm.compute("abcdefghijklmnopqrstuvwxyz")(1).get should equal (
+				NGramTokenizer.tokenize("abcdefghijklmnopqrstuvwxyz")(1).get should equal (
 					Array(
 						"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
 						"s", "t", "u", "v", "w", "x", "y", "z"
@@ -64,6 +64,6 @@ final class NGramAlgorithmSpec extends ScalaTest {
 	}
 }
 
-object NGramAlgorithmSpec {
-	private final val Algorithm = NGramAlgorithm()
+object NGramTokenizerSpec {
+	private final val Tokenizer = NGramTokenizer()
 }
author	Rocky Madden <git@rockymadden.com>	2013-03-18 19:47:49 -0600
committer	Rocky Madden <git@rockymadden.com>	2013-03-18 19:47:49 -0600
commit	05af73ab936168ebea891ed2f0120c49e8ec5066 (patch)
tree	54a724d2d674a8458e06adfc042c68d4819b7e11 /core
parent	b558d56b50ff1a9fe8023b927fabf1e3d46a8119 (diff)
download	stringmetric-05af73ab936168ebea891ed2f0120c49e8ec5066.tar.gz stringmetric-05af73ab936168ebea891ed2f0120c49e8ec5066.tar.bz2 stringmetric-05af73ab936168ebea891ed2f0120c49e8ec5066.zip