author     Rocky Madden <git@rockymadden.com>  2014-01-02 11:22:50 -0700
committer  Rocky Madden <git@rockymadden.com>  2014-01-02 11:22:50 -0700
commit     8a6853a76a61184bc2ad559e59292ef7ea1dfd4a (patch)
tree       13e6edce58cb1de2845975bc3aaab7a2317a83c1
parent     46b69a796ef7632dafda2df0467b811008906bb0 (diff)
Merged package contents into module.
-rwxr-xr-x  cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala      31
-rwxr-xr-x  cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala   64
-rwxr-xr-x  core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala                         23
-rwxr-xr-x  core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala     2
-rwxr-xr-x  core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala          2
-rwxr-xr-x  core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala            2
-rwxr-xr-x  core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala          2
-rwxr-xr-x  core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala          19
-rwxr-xr-x  core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala (renamed from core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala)  7
9 files changed, 28 insertions, 124 deletions
diff --git a/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala b/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala
deleted file mode 100755
index cbd33d7..0000000
--- a/cli/source/main/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizer.scala
+++ /dev/null
@@ -1,31 +0,0 @@
-package com.rockymadden.stringmetric.cli.tokenize
-
-import com.rockymadden.stringmetric.cli._
-import com.rockymadden.stringmetric.tokenize.NGramTokenizer
-
-case object ngramtokenizer extends Command(
- (opts) =>
- "Returns the N-Gram representation of the passed string." + Ls + Ls +
- "Syntax:" + Ls +
- Tab + "ngramtokenizer [Options] string..." + Ls + Ls +
- "Options:" + Ls +
- Tab + "-h, --help" + Ls +
- Tab + Tab + "Outputs description, syntax, and opts." +
- Tab + "--n" + Ls +
- Tab + Tab + "The n.",
- (opts) => opts.contains('dashless) && (opts('dashless): Array[String]).length == 1 &&
- opts.contains('n) && (opts('n): Int) >= 1,
- (opts) => NGramTokenizer(opts('n)).tokenize(opts('dashless)) match {
- case Some(c) => {
- val sb = new StringBuilder
-
- Range(0, c.length).foreach { i =>
- sb.append(c(i))
- if (i < c.length - 1) sb.append("|")
- }
-
- sb.result()
- }
- case None => "not computable"
- }
-)
diff --git a/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala b/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala
deleted file mode 100755
index 552fcf4..0000000
--- a/cli/source/test/scala/com/rockymadden/stringmetric/cli/tokenize/ngramtokenizerSpec.scala
+++ /dev/null
@@ -1,64 +0,0 @@
-package com.rockymadden.stringmetric.cli.tokenize
-
-import com.rockymadden.stringmetric.ScalaTest
-import org.junit.runner.RunWith
-import org.scalatest.junit.JUnitRunner
-
-@RunWith(classOf[JUnitRunner])
-final class ngramtokenizerSpec extends ScalaTest { "ngramtokenizer" should provide {
- "main method" when passed {
- "valid dashless argument and valid n argument" should executes {
- "print N-Gram representation" in {
- val out = new java.io.ByteArrayOutputStream()
-
- Console.withOut(out)(
- ngramtokenizer.main(
- Array(
- "--unitTest",
- "--debug",
- "--n=1",
- "abc"
- )
- )
- )
-
- out.toString should equal ("a|b|c\n")
- out.reset()
-
- Console.withOut(out)(
- ngramtokenizer.main(
- Array(
- "--unitTest",
- "--debug",
- "--n=2",
- "abc"
- )
- )
- )
-
- out.toString should equal ("ab|bc\n")
- out.reset()
- }
- }
- "valid dashless argument and invalid n argument" should throws {
- "IllegalArgumentException" in {
- evaluating {
- ngramtokenizer.main(
- Array(
- "--unitTest",
- "abc",
- "abc"
- )
- )
- } should produce [IllegalArgumentException]
- }
- }
- "no dashless argument" should throws {
- "IllegalArgumentException" in {
- evaluating {
- ngramtokenizer.main(Array("--unitTest", "--debug"))
- } should produce [IllegalArgumentException]
- }
- }
- }
-}}
diff --git a/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala b/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala
index 36b7eef..00b173d 100755
--- a/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala
+++ b/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala
@@ -1,19 +1,36 @@
package com.rockymadden.stringmetric
object Tokenize {
- trait Tokenizer[A] {
+ sealed trait Tokenizer[A] {
def tokenize(a: A): Option[Array[A]]
}
- trait StringTokenizer extends Tokenizer[Array[Char]] {
+ sealed trait StringTokenizer extends Tokenizer[Array[Char]] {
def tokenize(a: String): Option[Array[String]]
}
object StringTokenizer {
- val NGram = tokenize.NGramTokenizer
+ val NGram = NGramTokenizer
def tokenizeWithNGram(n: Int)(charArray: Array[Char]) = NGram(n).tokenize(charArray)
}
+
+
+ final case class NGramTokenizer(n: Int) extends StringTokenizer {
+ override def tokenize(a: Array[Char]): Option[Array[Array[Char]]] = {
+ if (n <= 0) return None
+
+ if (a.length < n) None
+ else Some(sequence(a, Array.empty[Array[Char]], n))
+ }
+
+ override def tokenize(a: String): Option[Array[String]] = tokenize(a.toCharArray).map(_.map(_.mkString))
+
+ @annotation.tailrec
+ private val sequence: ((Array[Char], Array[Array[Char]], Int) => Array[Array[Char]]) = (i, o, n) =>
+ if (i.length <= n) o :+ i
+ else sequence(i.tail, o :+ i.take(n), n)
+ }
}
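
The hunk above inlines the tokenizer into the Tokenize module. A minimal usage sketch, assuming the merged Tokenize.scala compiles as shown; the TokenizeExample object and the printed formatting are illustrative only and not part of the commit:

// Illustrative usage: NGramTokenizer is now reached through the Tokenize
// module instead of the removed com.rockymadden.stringmetric.tokenize package.
import com.rockymadden.stringmetric.Tokenize._

object TokenizeExample extends App {
	// Direct construction of the case class nested in Tokenize.
	val bigrams = NGramTokenizer(2).tokenize("abc")
	println(bigrams.map(_.mkString("|")))                            // Some(ab|bc)

	// The StringTokenizer companion still exposes the same tokenizer.
	val unigrams = StringTokenizer.NGram(1).tokenize("abc".toCharArray)
	println(unigrams.map(_.map(_.mkString).mkString("|")))           // Some(a|b|c)

	// n <= 0 and too-short input both yield None rather than throwing.
	println(NGramTokenizer(0).tokenize("abc"))                       // None
	println(StringTokenizer.tokenizeWithNGram(4)("abc".toCharArray)) // None
}
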
diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
index 1e07432..0ad3915 100755
--- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
+++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala
@@ -7,7 +7,7 @@ import com.rockymadden.stringmetric.Metric.StringMetric
* Traditionally, the algorithm uses bigrams.
*/
final case class DiceSorensenMetric(n: Int) extends StringMetric[Double] {
- import com.rockymadden.stringmetric.tokenize.NGramTokenizer
+ import com.rockymadden.stringmetric.Tokenize.NGramTokenizer
import com.rockymadden.stringmetric.MatchTuple
override def compare(a: Array[Char], b: Array[Char]): Option[Double] =
diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala
index 629eaa0..6ec5db4 100755
--- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala
+++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala
@@ -3,7 +3,7 @@ package com.rockymadden.stringmetric.similarity
import com.rockymadden.stringmetric.Metric.StringMetric
final case class JaccardMetric(n: Int) extends StringMetric[Double] {
- import com.rockymadden.stringmetric.tokenize.NGramTokenizer
+ import com.rockymadden.stringmetric.Tokenize.NGramTokenizer
override def compare(a: Array[Char], b: Array[Char]): Option[Double] =
if (n <= 0 || a.length < n || b.length < n) None // Because length is less than n, it is not possible to compare.
diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala
index d712738..8c194ce 100755
--- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala
+++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala
@@ -4,7 +4,7 @@ import com.rockymadden.stringmetric.Metric.StringMetric
final case class NGramMetric(n: Int) extends StringMetric[Double] {
import com.rockymadden.stringmetric.MatchTuple
- import com.rockymadden.stringmetric.tokenize.NGramTokenizer
+ import com.rockymadden.stringmetric.Tokenize.NGramTokenizer
import scala.math
override def compare(a: Array[Char], b: Array[Char]): Option[Double] =
diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala
index cc33a26..8f0418b 100755
--- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala
+++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala
@@ -4,7 +4,7 @@ import com.rockymadden.stringmetric.Metric.StringMetric
final case class OverlapMetric(n: Int) extends StringMetric[Double] {
import com.rockymadden.stringmetric.MatchTuple
- import com.rockymadden.stringmetric.tokenize.NGramTokenizer
+ import com.rockymadden.stringmetric.Tokenize.NGramTokenizer
import scala.math
override def compare(a: Array[Char], b: Array[Char]): Option[Double] =
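
The four similarity metrics above change only the import that supplies NGramTokenizer; their constructors and compare signatures are untouched. A sketch of the unchanged call sites follows (outputs omitted, since the metric bodies are not part of this diff):

// Illustrative only: the import swap above does not affect callers.
import com.rockymadden.stringmetric.similarity._

object MetricExample extends App {
	val (a, b) = ("night".toCharArray, "nacht".toCharArray)

	// Each metric is built with the n-gram size and returns Option[Double].
	println(DiceSorensenMetric(2).compare(a, b))
	println(JaccardMetric(2).compare(a, b))
	println(NGramMetric(2).compare(a, b))
	println(OverlapMetric(2).compare(a, b))
}
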
diff --git a/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala b/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala
deleted file mode 100755
index aa89b31..0000000
--- a/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala
+++ /dev/null
@@ -1,19 +0,0 @@
-package com.rockymadden.stringmetric.tokenize
-
-import com.rockymadden.stringmetric.Tokenize.StringTokenizer
-
-final case class NGramTokenizer(n: Int) extends StringTokenizer {
- override def tokenize(a: Array[Char]): Option[Array[Array[Char]]] = {
- if (n <= 0) return None
-
- if (a.length < n) None
- else Some(sequence(a, Array.empty[Array[Char]], n))
- }
-
- override def tokenize(a: String): Option[Array[String]] = tokenize(a.toCharArray).map(_.map(_.mkString))
-
- @annotation.tailrec
- private val sequence: ((Array[Char], Array[Array[Char]], Int) => Array[Array[Char]]) = (i, o, n) =>
- if (i.length <= n) o :+ i
- else sequence(i.tail, o :+ i.take(n), n)
-}
diff --git a/core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala
index 01636f0..cfba0f7 100755
--- a/core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala
+++ b/core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala
@@ -1,11 +1,12 @@
-package com.rockymadden.stringmetric.tokenize
+package com.rockymadden.stringmetric
-import com.rockymadden.stringmetric.ScalaTest
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
@RunWith(classOf[JUnitRunner])
-final class NGramTokenizerSpec extends ScalaTest { "NGramTokenizer" should provide {
+final class TokenizeSpec extends ScalaTest { "NGramTokenizer" should provide {
+ import Tokenize._
+
"tokenize method" when passed {
"empty argument" should returns {
"None" in {