From ba7295409ed82fe4ae30a871004226465a758859 Mon Sep 17 00:00:00 2001 From: Rocky Madden Date: Sat, 28 Dec 2013 11:55:39 -0700 Subject: Renamed package. --- .../cli/tokenization/ngramtokenizer.scala | 58 ------------------- .../stringmetric/cli/tokenize/ngramtokenizer.scala | 58 +++++++++++++++++++ .../cli/tokenization/ngramtokenizerSpec.scala | 66 ---------------------- .../cli/tokenize/ngramtokenizerSpec.scala | 66 ++++++++++++++++++++++ .../tokenization/NGramTokenizerBenchmark.scala | 35 ------------ .../tokenize/NGramTokenizerBenchmark.scala | 35 ++++++++++++ .../similarity/DiceSorensenMetric.scala | 2 +- .../stringmetric/similarity/JaccardMetric.scala | 2 +- .../stringmetric/similarity/NGramMetric.scala | 2 +- .../stringmetric/similarity/OverlapMetric.scala | 2 +- .../stringmetric/tokenization/NGramTokenizer.scala | 22 -------- .../stringmetric/tokenize/NGramTokenizer.scala | 22 ++++++++ .../tokenization/NGramTokenizerSpec.scala | 44 --------------- .../stringmetric/tokenize/NGramTokenizerSpec.scala | 44 +++++++++++++++ 14 files changed, 229 insertions(+), 229 deletions(-) delete mode 100755 cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenization/ngramtokenizer.scala create mode 100755 cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala delete mode 100755 cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenization/ngramtokenizerSpec.scala create mode 100755 cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala delete mode 100755 core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala create mode 100755 core/source/benchmark/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerBenchmark.scala delete mode 100755 core/source/main/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala create mode 100755 core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala delete mode 100755 core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala create mode 100755 core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala diff --git a/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenization/ngramtokenizer.scala b/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenization/ngramtokenizer.scala deleted file mode 100755 index 9139e3a..0000000 --- a/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenization/ngramtokenizer.scala +++ /dev/null @@ -1,58 +0,0 @@ -package com.rockymadden.stringmetric.cli.tokenization - -import com.rockymadden.stringmetric.cli._ -import com.rockymadden.stringmetric.tokenization.NGramTokenizer - -/** - * The ngramtokenizer [[com.rockymadden.stringmetric.cli.Command]]. Returns the N-Gram representation of the passed - * string. - */ -object ngramtokenizer extends Command { - override def main(args: Array[String]): Unit = { - val opts: OptionMap = args - - try - if (opts.contains('h) || opts.contains('help)) { - help() - exit(opts) - } else if (opts.contains('dashless) && (opts('dashless): Array[String]).length == 1 - && opts.contains('n) && (opts('n): Int) >= 1) { - - execute(opts) - exit(opts) - } else throw new IllegalArgumentException("Expected valid syntax. See --help.") - catch { case e: Throwable => error(e, opts) } - } - - override def help(): Unit = { - val ls = sys.props("line.separator") - val tab = " " - - println( - "Returns the N-Gram representation of the passed string." + ls + ls + - "Syntax:" + ls + - tab + "ngramtokenizer [Options] string..." + ls + ls + - "Options:" + ls + - tab + "-h, --help" + ls + - tab + tab + "Outputs description, syntax, and opts." + - tab + "--n" + ls + - tab + tab + "The n." - ) - } - - override def execute(opts: OptionMap): Unit = - NGramTokenizer(opts('n)).tokenize(opts('dashless)) match { - // Implicits are a pain here. - case Some(c) => { - val sb = new StringBuilder - - Range(0, c.length).foreach { i => - sb.append(c(i)) - if (i < c.length - 1) sb.append("|") - } - - println(sb.result()) - } - case None => println("not computable") - } -} diff --git a/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala b/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala new file mode 100755 index 0000000..52fea44 --- /dev/null +++ b/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala @@ -0,0 +1,58 @@ +package com.rockymadden.stringmetric.cli.tokenize + +import com.rockymadden.stringmetric.cli._ +import com.rockymadden.stringmetric.tokenize.NGramTokenizer + +/** + * The ngramtokenizer [[com.rockymadden.stringmetric.cli.Command]]. Returns the N-Gram representation of the passed + * string. + */ +object ngramtokenizer extends Command { + override def main(args: Array[String]): Unit = { + val opts: OptionMap = args + + try + if (opts.contains('h) || opts.contains('help)) { + help() + exit(opts) + } else if (opts.contains('dashless) && (opts('dashless): Array[String]).length == 1 + && opts.contains('n) && (opts('n): Int) >= 1) { + + execute(opts) + exit(opts) + } else throw new IllegalArgumentException("Expected valid syntax. See --help.") + catch { case e: Throwable => error(e, opts) } + } + + override def help(): Unit = { + val ls = sys.props("line.separator") + val tab = " " + + println( + "Returns the N-Gram representation of the passed string." + ls + ls + + "Syntax:" + ls + + tab + "ngramtokenizer [Options] string..." + ls + ls + + "Options:" + ls + + tab + "-h, --help" + ls + + tab + tab + "Outputs description, syntax, and opts." + + tab + "--n" + ls + + tab + tab + "The n." + ) + } + + override def execute(opts: OptionMap): Unit = + NGramTokenizer(opts('n)).tokenize(opts('dashless)) match { + // Implicits are a pain here. + case Some(c) => { + val sb = new StringBuilder + + Range(0, c.length).foreach { i => + sb.append(c(i)) + if (i < c.length - 1) sb.append("|") + } + + println(sb.result()) + } + case None => println("not computable") + } +} diff --git a/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenization/ngramtokenizerSpec.scala b/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenization/ngramtokenizerSpec.scala deleted file mode 100755 index 8a1ea1d..0000000 --- a/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenization/ngramtokenizerSpec.scala +++ /dev/null @@ -1,66 +0,0 @@ -package com.rockymadden.stringmetric.cli.tokenization - -import com.rockymadden.stringmetric.ScalaTest -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner - -@RunWith(classOf[JUnitRunner]) -final class ngramtokenizerSpec extends ScalaTest { - "ngramtokenizer" should provide { - "main method" when passed { - "valid dashless argument and valid n argument" should executes { - "print N-Gram representation" in { - val out = new java.io.ByteArrayOutputStream() - - Console.withOut(out)( - ngramtokenizer.main( - Array( - "--unitTest", - "--debug", - "--n=1", - "abc" - ) - ) - ) - - out.toString should equal ("a|b|c\n") - out.reset() - - Console.withOut(out)( - ngramtokenizer.main( - Array( - "--unitTest", - "--debug", - "--n=2", - "abc" - ) - ) - ) - - out.toString should equal ("ab|bc\n") - out.reset() - } - } - "valid dashless argument and invalid n argument" should throws { - "IllegalArgumentException" in { - evaluating { - ngramtokenizer.main( - Array( - "--unitTest", - "abc", - "abc" - ) - ) - } should produce [IllegalArgumentException] - } - } - "no dashless argument" should throws { - "IllegalArgumentException" in { - evaluating { - ngramtokenizer.main(Array("--unitTest", "--debug")) - } should produce [IllegalArgumentException] - } - } - } - } -} diff --git a/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala b/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala new file mode 100755 index 0000000..5fea2e9 --- /dev/null +++ b/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala @@ -0,0 +1,66 @@ +package com.rockymadden.stringmetric.cli.tokenize + +import com.rockymadden.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class ngramtokenizerSpec extends ScalaTest { + "ngramtokenizer" should provide { + "main method" when passed { + "valid dashless argument and valid n argument" should executes { + "print N-Gram representation" in { + val out = new java.io.ByteArrayOutputStream() + + Console.withOut(out)( + ngramtokenizer.main( + Array( + "--unitTest", + "--debug", + "--n=1", + "abc" + ) + ) + ) + + out.toString should equal ("a|b|c\n") + out.reset() + + Console.withOut(out)( + ngramtokenizer.main( + Array( + "--unitTest", + "--debug", + "--n=2", + "abc" + ) + ) + ) + + out.toString should equal ("ab|bc\n") + out.reset() + } + } + "valid dashless argument and invalid n argument" should throws { + "IllegalArgumentException" in { + evaluating { + ngramtokenizer.main( + Array( + "--unitTest", + "abc", + "abc" + ) + ) + } should produce [IllegalArgumentException] + } + } + "no dashless argument" should throws { + "IllegalArgumentException" in { + evaluating { + ngramtokenizer.main(Array("--unitTest", "--debug")) + } should produce [IllegalArgumentException] + } + } + } + } +} diff --git a/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala b/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala deleted file mode 100755 index 7e62662..0000000 --- a/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerBenchmark.scala +++ /dev/null @@ -1,35 +0,0 @@ -package com.rockymadden.stringmetric.tokenization - -import com.google.caliper.Param -import com.rockymadden.stringmetric.{CaliperBenchmark, CaliperRunner} -import scala.util.Random - -final class NGramTokenizerBenchmark extends CaliperBenchmark { - import NGramTokenizerBenchmark.Tokenizer - - @Param(Array("0", "1", "2", "4", "8", "16")) - var length: Int = _ - - @Param(Array("2", "3")) - var n: Int = _ - - var string: String = _ - var charArray: Array[Char] = _ - - override protected def setUp() { - string = Random.alphanumeric.take(length).mkString - charArray = string.toCharArray - } - - def timeComputeWithCharArray(reps: Int) = run(reps) { - Tokenizer.tokenize(charArray)(n) - } - - def timeComputeWithString(reps: Int) = run(reps) { - Tokenizer.tokenize(string)(n) - } -} - -object NGramTokenizerBenchmark extends CaliperRunner(classOf[NGramTokenizerBenchmark]) { - private final val Tokenizer = NGramTokenizer() -} diff --git a/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerBenchmark.scala b/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerBenchmark.scala new file mode 100755 index 0000000..dbc48d4 --- /dev/null +++ b/core/source/benchmark/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerBenchmark.scala @@ -0,0 +1,35 @@ +package com.rockymadden.stringmetric.tokenize + +import com.google.caliper.Param +import com.rockymadden.stringmetric.{CaliperBenchmark, CaliperRunner} +import scala.util.Random + +final class NGramTokenizerBenchmark extends CaliperBenchmark { + import NGramTokenizerBenchmark.Tokenizer + + @Param(Array("0", "1", "2", "4", "8", "16")) + var length: Int = _ + + @Param(Array("2", "3")) + var n: Int = _ + + var string: String = _ + var charArray: Array[Char] = _ + + override protected def setUp() { + string = Random.alphanumeric.take(length).mkString + charArray = string.toCharArray + } + + def timeComputeWithCharArray(reps: Int) = run(reps) { + Tokenizer.tokenize(charArray)(n) + } + + def timeComputeWithString(reps: Int) = run(reps) { + Tokenizer.tokenize(string)(n) + } +} + +object NGramTokenizerBenchmark extends CaliperRunner(classOf[NGramTokenizerBenchmark]) { + private final val Tokenizer = NGramTokenizer() +} diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala index 8381921..21c9f16 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala @@ -7,7 +7,7 @@ import com.rockymadden.stringmetric.Metric.StringMetricLike * Traditionally, the algorithm uses bigrams. */ final case class DiceSorensenMetric(private val n: Int) extends StringMetricLike[Double] { - import com.rockymadden.stringmetric.tokenization.NGramTokenizer + import com.rockymadden.stringmetric.tokenize.NGramTokenizer import com.rockymadden.stringmetric.MatchTuple override def compare(a: Array[Char], b: Array[Char]): Option[Double] = { diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala index e1fd4ed..154ee48 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala @@ -3,7 +3,7 @@ package com.rockymadden.stringmetric.similarity import com.rockymadden.stringmetric.Metric.StringMetricLike final case class JaccardMetric(private val n: Int) extends StringMetricLike[Double] { - import com.rockymadden.stringmetric.tokenization.NGramTokenizer + import com.rockymadden.stringmetric.tokenize.NGramTokenizer override def compare(a: Array[Char], b: Array[Char]): Option[Double] = { if (n <= 0) return None diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala index 8025f38..f6397f1 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala @@ -4,7 +4,7 @@ import com.rockymadden.stringmetric.Metric.StringMetricLike final case class NGramMetric(private val n: Int) extends StringMetricLike[Double] { import com.rockymadden.stringmetric.MatchTuple - import com.rockymadden.stringmetric.tokenization.NGramTokenizer + import com.rockymadden.stringmetric.tokenize.NGramTokenizer import scala.math override def compare(a: Array[Char], b: Array[Char]): Option[Double] = { diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala index 3bfe604..f4c66b7 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala @@ -4,7 +4,7 @@ import com.rockymadden.stringmetric.Metric.StringMetricLike final case class OverlapMetric(private val n: Int) extends StringMetricLike[Double] { import com.rockymadden.stringmetric.MatchTuple - import com.rockymadden.stringmetric.tokenization.NGramTokenizer + import com.rockymadden.stringmetric.tokenize.NGramTokenizer import scala.math override def compare(a: Array[Char], b: Array[Char]): Option[Double] = { diff --git a/core/source/main/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala b/core/source/main/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala deleted file mode 100755 index 209288f..0000000 --- a/core/source/main/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizer.scala +++ /dev/null @@ -1,22 +0,0 @@ -package com.rockymadden.stringmetric.tokenization - -import com.rockymadden.stringmetric.Tokenizer.StringTokenizerLike - -final case class NGramTokenizer(private val n: Int) extends StringTokenizerLike { - override def tokenize(a: Array[Char]): Option[Array[Array[Char]]] = { - if (n <= 0) return None - - if (a.length < n) None - else Some(sequence(a, Array.empty[Array[Char]], n)) - } - - override def tokenize(a: String): Option[Array[String]] = tokenize(a.toCharArray).map(_.map(_.mkString)) - - @annotation.tailrec - private[this] def sequence(i: Array[Char], o: Array[Array[Char]], n: Int): Array[Array[Char]] = { - require(n > 0) - - if (i.length <= n) o :+ i - else sequence(i.tail, o :+ i.take(n), n) - } -} diff --git a/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala b/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala new file mode 100755 index 0000000..6769bde --- /dev/null +++ b/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala @@ -0,0 +1,22 @@ +package com.rockymadden.stringmetric.tokenize + +import com.rockymadden.stringmetric.Tokenizer.StringTokenizerLike + +final case class NGramTokenizer(private val n: Int) extends StringTokenizerLike { + override def tokenize(a: Array[Char]): Option[Array[Array[Char]]] = { + if (n <= 0) return None + + if (a.length < n) None + else Some(sequence(a, Array.empty[Array[Char]], n)) + } + + override def tokenize(a: String): Option[Array[String]] = tokenize(a.toCharArray).map(_.map(_.mkString)) + + @annotation.tailrec + private[this] def sequence(i: Array[Char], o: Array[Array[Char]], n: Int): Array[Array[Char]] = { + require(n > 0) + + if (i.length <= n) o :+ i + else sequence(i.tail, o :+ i.take(n), n) + } +} diff --git a/core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala deleted file mode 100755 index 1598a93..0000000 --- a/core/source/test/scala/com/rockymadden/stringmetric/tokenization/NGramTokenizerSpec.scala +++ /dev/null @@ -1,44 +0,0 @@ -package com.rockymadden.stringmetric.tokenization - -import com.rockymadden.stringmetric.ScalaTest -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner - -@RunWith(classOf[JUnitRunner]) -final class NGramTokenizerSpec extends ScalaTest { "NGramTokenizer" should provide { - "tokenize method" when passed { - "empty argument" should returns { - "None" in { - NGramTokenizer(1).tokenize("").isDefined should be (false) - } - } - "invalid n argument" should returns { - "None" in { - NGramTokenizer(0).tokenize("").isDefined should be (false) - NGramTokenizer(-1).tokenize("").isDefined should be (false) - } - } - "valid argument" should returns { - "Array[String]" in { - NGramTokenizer(1).tokenize("abcdefghijklmnopqrstuvwxyz").get should equal ( - Array( - "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", - "s", "t", "u", "v", "w", "x", "y", "z" - ) - ) - NGramTokenizer(2).tokenize("abcdefghijklmnopqrstuvwxyz").get should equal ( - Array( - "ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl", "lm", "mn", "no", "op", - "pq", "qr", "rs", "st", "tu", "uv", "vw", "wx", "xy", "yz" - ) - ) - NGramTokenizer(3).tokenize("abcdefghijklmnopqrstuvwxyz").get should equal ( - Array( - "abc", "bcd", "cde", "def", "efg", "fgh", "ghi", "hij", "ijk", "jkl", "klm", "lmn", "mno", - "nop", "opq", "pqr", "qrs", "rst", "stu", "tuv", "uvw", "vwx", "wxy", "xyz" - ) - ) - } - } - } -}} diff --git a/core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala new file mode 100755 index 0000000..01636f0 --- /dev/null +++ b/core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala @@ -0,0 +1,44 @@ +package com.rockymadden.stringmetric.tokenize + +import com.rockymadden.stringmetric.ScalaTest +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +final class NGramTokenizerSpec extends ScalaTest { "NGramTokenizer" should provide { + "tokenize method" when passed { + "empty argument" should returns { + "None" in { + NGramTokenizer(1).tokenize("").isDefined should be (false) + } + } + "invalid n argument" should returns { + "None" in { + NGramTokenizer(0).tokenize("").isDefined should be (false) + NGramTokenizer(-1).tokenize("").isDefined should be (false) + } + } + "valid argument" should returns { + "Array[String]" in { + NGramTokenizer(1).tokenize("abcdefghijklmnopqrstuvwxyz").get should equal ( + Array( + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", + "s", "t", "u", "v", "w", "x", "y", "z" + ) + ) + NGramTokenizer(2).tokenize("abcdefghijklmnopqrstuvwxyz").get should equal ( + Array( + "ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl", "lm", "mn", "no", "op", + "pq", "qr", "rs", "st", "tu", "uv", "vw", "wx", "xy", "yz" + ) + ) + NGramTokenizer(3).tokenize("abcdefghijklmnopqrstuvwxyz").get should equal ( + Array( + "abc", "bcd", "cde", "def", "efg", "fgh", "ghi", "hij", "ijk", "jkl", "klm", "lmn", "mno", + "nop", "opq", "pqr", "qrs", "rst", "stu", "tuv", "uvw", "vwx", "wxy", "xyz" + ) + ) + } + } + } +}} -- cgit v1.2.3