diff options
author | Rocky Madden <git@rockymadden.com> | 2014-01-02 11:22:50 -0700 |
---|---|---|
committer | Rocky Madden <git@rockymadden.com> | 2014-01-02 11:22:50 -0700 |
commit | 8a6853a76a61184bc2ad559e59292ef7ea1dfd4a (patch) | |
tree | 13e6edce58cb1de2845975bc3aaab7a2317a83c1 | |
parent | 46b69a796ef7632dafda2df0467b811008906bb0 (diff) | |
download | stringmetric-8a6853a76a61184bc2ad559e59292ef7ea1dfd4a.tar.gz stringmetric-8a6853a76a61184bc2ad559e59292ef7ea1dfd4a.tar.bz2 stringmetric-8a6853a76a61184bc2ad559e59292ef7ea1dfd4a.zip |
Merged package contents into module.
-rwxr-xr-x | cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala | 31 | ||||
-rwxr-xr-x | cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala | 64 | ||||
-rwxr-xr-x | core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala | 23 | ||||
-rwxr-xr-x | core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala | 2 | ||||
-rwxr-xr-x | core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala | 2 | ||||
-rwxr-xr-x | core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala | 2 | ||||
-rwxr-xr-x | core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala | 2 | ||||
-rwxr-xr-x | core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala | 19 | ||||
-rwxr-xr-x | core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala (renamed from core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala) | 7 |
9 files changed, 28 insertions, 124 deletions
diff --git a/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala b/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala deleted file mode 100755 index cbd33d7..0000000 --- a/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala +++ /dev/null @@ -1,31 +0,0 @@ -package com.rockymadden.stringmetric.cli.tokenize - -import com.rockymadden.stringmetric.cli._ -import com.rockymadden.stringmetric.tokenize.NGramTokenizer - -case object ngramtokenizer extends Command( - (opts) => - "Returns the N-Gram representation of the passed string." + Ls + Ls + - "Syntax:" + Ls + - Tab + "ngramtokenizer [Options] string..." + Ls + Ls + - "Options:" + Ls + - Tab + "-h, --help" + Ls + - Tab + Tab + "Outputs description, syntax, and opts." + - Tab + "--n" + Ls + - Tab + Tab + "The n.", - (opts) => opts.contains('dashless) && (opts('dashless): Array[String]).length == 1 && - opts.contains('n) && (opts('n): Int) >= 1, - (opts) => NGramTokenizer(opts('n)).tokenize(opts('dashless)) match { - case Some(c) => { - val sb = new StringBuilder - - Range(0, c.length).foreach { i => - sb.append(c(i)) - if (i < c.length - 1) sb.append("|") - } - - sb.result() - } - case None => "not computable" - } -) diff --git a/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala b/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala deleted file mode 100755 index 552fcf4..0000000 --- a/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala +++ /dev/null @@ -1,64 +0,0 @@ -package com.rockymadden.stringmetric.cli.tokenize - -import com.rockymadden.stringmetric.ScalaTest -import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner - -@RunWith(classOf[JUnitRunner]) -final class ngramtokenizerSpec extends ScalaTest { "ngramtokenizer" should provide { - "main method" when passed { - "valid dashless argument and valid n argument" should executes { - "print N-Gram representation" in { - val out = new java.io.ByteArrayOutputStream() - - Console.withOut(out)( - ngramtokenizer.main( - Array( - "--unitTest", - "--debug", - "--n=1", - "abc" - ) - ) - ) - - out.toString should equal ("a|b|c\n") - out.reset() - - Console.withOut(out)( - ngramtokenizer.main( - Array( - "--unitTest", - "--debug", - "--n=2", - "abc" - ) - ) - ) - - out.toString should equal ("ab|bc\n") - out.reset() - } - } - "valid dashless argument and invalid n argument" should throws { - "IllegalArgumentException" in { - evaluating { - ngramtokenizer.main( - Array( - "--unitTest", - "abc", - "abc" - ) - ) - } should produce [IllegalArgumentException] - } - } - "no dashless argument" should throws { - "IllegalArgumentException" in { - evaluating { - ngramtokenizer.main(Array("--unitTest", "--debug")) - } should produce [IllegalArgumentException] - } - } - } -}} diff --git a/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala b/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala index 36b7eef..00b173d 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala @@ -1,19 +1,36 @@ package com.rockymadden.stringmetric object Tokenize { - trait Tokenizer[A] { + sealed trait Tokenizer[A] { def tokenize(a: A): Option[Array[A]] } - trait StringTokenizer extends Tokenizer[Array[Char]] { + sealed trait StringTokenizer extends Tokenizer[Array[Char]] { def tokenize(a: String): Option[Array[String]] } object StringTokenizer { - val NGram = tokenize.NGramTokenizer + val NGram = NGramTokenizer def tokenizeWithNGram(n: Int)(charArray: Array[Char]) = NGram(n).tokenize(charArray) } + + + final case class NGramTokenizer(n: Int) extends StringTokenizer { + override def tokenize(a: Array[Char]): Option[Array[Array[Char]]] = { + if (n <= 0) return None + + if (a.length < n) None + else Some(sequence(a, Array.empty[Array[Char]], n)) + } + + override def tokenize(a: String): Option[Array[String]] = tokenize(a.toCharArray).map(_.map(_.mkString)) + + @annotation.tailrec + private val sequence: ((Array[Char], Array[Array[Char]], Int) => Array[Array[Char]]) = (i, o, n) => + if (i.length <= n) o :+ i + else sequence(i.tail, o :+ i.take(n), n) + } } diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala index 1e07432..0ad3915 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala @@ -7,7 +7,7 @@ import com.rockymadden.stringmetric.Metric.StringMetric * Traditionally, the algorithm uses bigrams. */ final case class DiceSorensenMetric(n: Int) extends StringMetric[Double] { - import com.rockymadden.stringmetric.tokenize.NGramTokenizer + import com.rockymadden.stringmetric.Tokenize.NGramTokenizer import com.rockymadden.stringmetric.MatchTuple override def compare(a: Array[Char], b: Array[Char]): Option[Double] = diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala index 629eaa0..6ec5db4 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala @@ -3,7 +3,7 @@ package com.rockymadden.stringmetric.similarity import com.rockymadden.stringmetric.Metric.StringMetric final case class JaccardMetric(n: Int) extends StringMetric[Double] { - import com.rockymadden.stringmetric.tokenize.NGramTokenizer + import com.rockymadden.stringmetric.Tokenize.NGramTokenizer override def compare(a: Array[Char], b: Array[Char]): Option[Double] = if (n <= 0 || a.length < n || b.length < n) None // Because length is less than n, it is not possible to compare. diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala index d712738..8c194ce 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala @@ -4,7 +4,7 @@ import com.rockymadden.stringmetric.Metric.StringMetric final case class NGramMetric(n: Int) extends StringMetric[Double] { import com.rockymadden.stringmetric.MatchTuple - import com.rockymadden.stringmetric.tokenize.NGramTokenizer + import com.rockymadden.stringmetric.Tokenize.NGramTokenizer import scala.math override def compare(a: Array[Char], b: Array[Char]): Option[Double] = diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala index cc33a26..8f0418b 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala @@ -4,7 +4,7 @@ import com.rockymadden.stringmetric.Metric.StringMetric final case class OverlapMetric(n: Int) extends StringMetric[Double] { import com.rockymadden.stringmetric.MatchTuple - import com.rockymadden.stringmetric.tokenize.NGramTokenizer + import com.rockymadden.stringmetric.Tokenize.NGramTokenizer import scala.math override def compare(a: Array[Char], b: Array[Char]): Option[Double] = diff --git a/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala b/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala deleted file mode 100755 index aa89b31..0000000 --- a/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala +++ /dev/null @@ -1,19 +0,0 @@ -package com.rockymadden.stringmetric.tokenize - -import com.rockymadden.stringmetric.Tokenize.StringTokenizer - -final case class NGramTokenizer(n: Int) extends StringTokenizer { - override def tokenize(a: Array[Char]): Option[Array[Array[Char]]] = { - if (n <= 0) return None - - if (a.length < n) None - else Some(sequence(a, Array.empty[Array[Char]], n)) - } - - override def tokenize(a: String): Option[Array[String]] = tokenize(a.toCharArray).map(_.map(_.mkString)) - - @annotation.tailrec - private val sequence: ((Array[Char], Array[Array[Char]], Int) => Array[Array[Char]]) = (i, o, n) => - if (i.length <= n) o :+ i - else sequence(i.tail, o :+ i.take(n), n) -} diff --git a/core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala index 01636f0..cfba0f7 100755 --- a/core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala +++ b/core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala @@ -1,11 +1,12 @@ -package com.rockymadden.stringmetric.tokenize +package com.rockymadden.stringmetric -import com.rockymadden.stringmetric.ScalaTest import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -final class NGramTokenizerSpec extends ScalaTest { "NGramTokenizer" should provide { +final class TokenizeSpec extends ScalaTest { "NGramTokenizer" should provide { + import Tokenize._ + "tokenize method" when passed { "empty argument" should returns { "None" in { |