From 8a6853a76a61184bc2ad559e59292ef7ea1dfd4a Mon Sep 17 00:00:00 2001 From: Rocky Madden Date: Thu, 2 Jan 2014 11:22:50 -0700 Subject: Merged package contents into module. --- .../stringmetric/cli/tokenize/ngramtokenizer.scala | 31 ----------- .../cli/tokenize/ngramtokenizerSpec.scala | 64 ---------------------- .../com/rockymadden/stringmetric/Tokenize.scala | 23 +++++++- .../similarity/DiceSorensenMetric.scala | 2 +- .../stringmetric/similarity/JaccardMetric.scala | 2 +- .../stringmetric/similarity/NGramMetric.scala | 2 +- .../stringmetric/similarity/OverlapMetric.scala | 2 +- .../stringmetric/tokenize/NGramTokenizer.scala | 19 ------- .../rockymadden/stringmetric/TokenizeSpec.scala | 45 +++++++++++++++ .../stringmetric/tokenize/NGramTokenizerSpec.scala | 44 --------------- 10 files changed, 69 insertions(+), 165 deletions(-) delete mode 100755 cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala delete mode 100755 cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala delete mode 100755 core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala create mode 100755 core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala delete mode 100755 core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala diff --git a/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala b/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala deleted file mode 100755 index cbd33d7..0000000 --- a/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala +++ /dev/null @@ -1,31 +0,0 @@ -package com.rockymadden.stringmetric.cli.tokenize - -import com.rockymadden.stringmetric.cli._ -import com.rockymadden.stringmetric.tokenize.NGramTokenizer - -case object ngramtokenizer extends Command( - (opts) => - "Returns the N-Gram representation of the passed string." + Ls + Ls + - "Syntax:" + Ls + - Tab + "ngramtokenizer [Options] string..." + Ls + Ls + - "Options:" + Ls + - Tab + "-h, --help" + Ls + - Tab + Tab + "Outputs description, syntax, and opts." + - Tab + "--n" + Ls + - Tab + Tab + "The n.", - (opts) => opts.contains('dashless) && (opts('dashless): Array[String]).length == 1 && - opts.contains('n) && (opts('n): Int) >= 1, - (opts) => NGramTokenizer(opts('n)).tokenize(opts('dashless)) match { - case Some(c) => { - val sb = new StringBuilder - - Range(0, c.length).foreach { i => - sb.append(c(i)) - if (i < c.length - 1) sb.append("|") - } - - sb.result() - } - case None => "not computable" - } -) diff --git a/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala b/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala deleted file mode 100755 index 552fcf4..0000000 --- a/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala +++ /dev/null @@ -1,64 +0,0 @@ -package com.rockymadden.stringmetric.cli.tokenize - -import com.rockymadden.stringmetric.ScalaTest -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner - -@RunWith(classOf[JUnitRunner]) -final class ngramtokenizerSpec extends ScalaTest { "ngramtokenizer" should provide { - "main method" when passed { - "valid dashless argument and valid n argument" should executes { - "print N-Gram representation" in { - val out = new java.io.ByteArrayOutputStream() - - Console.withOut(out)( - ngramtokenizer.main( - Array( - "--unitTest", - "--debug", - "--n=1", - "abc" - ) - ) - ) - - out.toString should equal ("a|b|c\n") - out.reset() - - Console.withOut(out)( - ngramtokenizer.main( - Array( - "--unitTest", - "--debug", - "--n=2", - "abc" - ) - ) - ) - - out.toString should equal ("ab|bc\n") - out.reset() - } - } - "valid dashless argument and invalid n argument" should throws { - "IllegalArgumentException" in { - evaluating { - ngramtokenizer.main( - Array( - "--unitTest", - "abc", - "abc" - ) - ) - } should produce [IllegalArgumentException] - } - } - "no dashless argument" should throws { - "IllegalArgumentException" in { - evaluating { - ngramtokenizer.main(Array("--unitTest", "--debug")) - } should produce [IllegalArgumentException] - } - } - } -}} diff --git a/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala b/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala index 36b7eef..00b173d 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala @@ -1,19 +1,36 @@ package com.rockymadden.stringmetric object Tokenize { - trait Tokenizer[A] { + sealed trait Tokenizer[A] { def tokenize(a: A): Option[Array[A]] } - trait StringTokenizer extends Tokenizer[Array[Char]] { + sealed trait StringTokenizer extends Tokenizer[Array[Char]] { def tokenize(a: String): Option[Array[String]] } object StringTokenizer { - val NGram = tokenize.NGramTokenizer + val NGram = NGramTokenizer def tokenizeWithNGram(n: Int)(charArray: Array[Char]) = NGram(n).tokenize(charArray) } + + + final case class NGramTokenizer(n: Int) extends StringTokenizer { + override def tokenize(a: Array[Char]): Option[Array[Array[Char]]] = { + if (n <= 0) return None + + if (a.length < n) None + else Some(sequence(a, Array.empty[Array[Char]], n)) + } + + override def tokenize(a: String): Option[Array[String]] = tokenize(a.toCharArray).map(_.map(_.mkString)) + + @annotation.tailrec + private val sequence: ((Array[Char], Array[Array[Char]], Int) => Array[Array[Char]]) = (i, o, n) => + if (i.length <= n) o :+ i + else sequence(i.tail, o :+ i.take(n), n) + } } diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala index 1e07432..0ad3915 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala @@ -7,7 +7,7 @@ import com.rockymadden.stringmetric.Metric.StringMetric * Traditionally, the algorithm uses bigrams. */ final case class DiceSorensenMetric(n: Int) extends StringMetric[Double] { - import com.rockymadden.stringmetric.tokenize.NGramTokenizer + import com.rockymadden.stringmetric.Tokenize.NGramTokenizer import com.rockymadden.stringmetric.MatchTuple override def compare(a: Array[Char], b: Array[Char]): Option[Double] = diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala index 629eaa0..6ec5db4 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala @@ -3,7 +3,7 @@ package com.rockymadden.stringmetric.similarity import com.rockymadden.stringmetric.Metric.StringMetric final case class JaccardMetric(n: Int) extends StringMetric[Double] { - import com.rockymadden.stringmetric.tokenize.NGramTokenizer + import com.rockymadden.stringmetric.Tokenize.NGramTokenizer override def compare(a: Array[Char], b: Array[Char]): Option[Double] = if (n <= 0 || a.length < n || b.length < n) None // Because length is less than n, it is not possible to compare. diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala index d712738..8c194ce 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala @@ -4,7 +4,7 @@ import com.rockymadden.stringmetric.Metric.StringMetric final case class NGramMetric(n: Int) extends StringMetric[Double] { import com.rockymadden.stringmetric.MatchTuple - import com.rockymadden.stringmetric.tokenize.NGramTokenizer + import com.rockymadden.stringmetric.Tokenize.NGramTokenizer import scala.math override def compare(a: Array[Char], b: Array[Char]): Option[Double] = diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala index cc33a26..8f0418b 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala @@ -4,7 +4,7 @@ import com.rockymadden.stringmetric.Metric.StringMetric final case class OverlapMetric(n: Int) extends StringMetric[Double] { import com.rockymadden.stringmetric.MatchTuple - import com.rockymadden.stringmetric.tokenize.NGramTokenizer + import com.rockymadden.stringmetric.Tokenize.NGramTokenizer import scala.math override def compare(a: Array[Char], b: Array[Char]): Option[Double] = diff --git a/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala b/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala deleted file mode 100755 index aa89b31..0000000 --- a/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala +++ /dev/null @@ -1,19 +0,0 @@ -package com.rockymadden.stringmetric.tokenize - -import com.rockymadden.stringmetric.Tokenize.StringTokenizer - -final case class NGramTokenizer(n: Int) extends StringTokenizer { - override def tokenize(a: Array[Char]): Option[Array[Array[Char]]] = { - if (n <= 0) return None - - if (a.length < n) None - else Some(sequence(a, Array.empty[Array[Char]], n)) - } - - override def tokenize(a: String): Option[Array[String]] = tokenize(a.toCharArray).map(_.map(_.mkString)) - - @annotation.tailrec - private val sequence: ((Array[Char], Array[Array[Char]], Int) => Array[Array[Char]]) = (i, o, n) => - if (i.length <= n) o :+ i - else sequence(i.tail, o :+ i.take(n), n) -} diff --git a/core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala new file mode 100755 index 0000000..cfba0f7 --- /dev/null +++ b/core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala @@ -0,0 +1,45 @@ +package com.rockymadden.stringmetric + +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class TokenizeSpec extends ScalaTest { "NGramTokenizer" should provide { + import Tokenize._ + + "tokenize method" when passed { + "empty argument" should returns { + "None" in { + NGramTokenizer(1).tokenize("").isDefined should be (false) + } + } + "invalid n argument" should returns { + "None" in { + NGramTokenizer(0).tokenize("").isDefined should be (false) + NGramTokenizer(-1).tokenize("").isDefined should be (false) + } + } + "valid argument" should returns { + "Array[String]" in { + NGramTokenizer(1).tokenize("abcdefghijklmnopqrstuvwxyz").get should equal ( + Array( + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", + "s", "t", "u", "v", "w", "x", "y", "z" + ) + ) + NGramTokenizer(2).tokenize("abcdefghijklmnopqrstuvwxyz").get should equal ( + Array( + "ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl", "lm", "mn", "no", "op", + "pq", "qr", "rs", "st", "tu", "uv", "vw", "wx", "xy", "yz" + ) + ) + NGramTokenizer(3).tokenize("abcdefghijklmnopqrstuvwxyz").get should equal ( + Array( + "abc", "bcd", "cde", "def", "efg", "fgh", "ghi", "hij", "ijk", "jkl", "klm", "lmn", "mno", + "nop", "opq", "pqr", "qrs", "rst", "stu", "tuv", "uvw", "vwx", "wxy", "xyz" + ) + ) + } + } + } +}} diff --git a/core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala deleted file mode 100755 index 01636f0..0000000 --- a/core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala +++ /dev/null @@ -1,44 +0,0 @@ -package com.rockymadden.stringmetric.tokenize - -import com.rockymadden.stringmetric.ScalaTest -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner - -@RunWith(classOf[JUnitRunner]) -final class NGramTokenizerSpec extends ScalaTest { "NGramTokenizer" should provide { - "tokenize method" when passed { - "empty argument" should returns { - "None" in { - NGramTokenizer(1).tokenize("").isDefined should be (false) - } - } - "invalid n argument" should returns { - "None" in { - NGramTokenizer(0).tokenize("").isDefined should be (false) - NGramTokenizer(-1).tokenize("").isDefined should be (false) - } - } - "valid argument" should returns { - "Array[String]" in { - NGramTokenizer(1).tokenize("abcdefghijklmnopqrstuvwxyz").get should equal ( - Array( - "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", - "s", "t", "u", "v", "w", "x", "y", "z" - ) - ) - NGramTokenizer(2).tokenize("abcdefghijklmnopqrstuvwxyz").get should equal ( - Array( - "ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl", "lm", "mn", "no", "op", - "pq", "qr", "rs", "st", "tu", "uv", "vw", "wx", "xy", "yz" - ) - ) - NGramTokenizer(3).tokenize("abcdefghijklmnopqrstuvwxyz").get should equal ( - Array( - "abc", "bcd", "cde", "def", "efg", "fgh", "ghi", "hij", "ijk", "jkl", "klm", "lmn", "mno", - "nop", "opq", "pqr", "qrs", "rst", "stu", "tuv", "uvw", "vwx", "wxy", "xyz" - ) - ) - } - } - } -}} -- cgit v1.2.3