diff options
Diffstat (limited to 'core')
-rwxr-xr-x | core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala | 23 | ||||
-rwxr-xr-x | core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala | 2 | ||||
-rwxr-xr-x | core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala | 2 | ||||
-rwxr-xr-x | core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala | 2 | ||||
-rwxr-xr-x | core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala | 2 | ||||
-rwxr-xr-x | core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala | 19 | ||||
-rwxr-xr-x | core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala (renamed from core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala) | 7 |
7 files changed, 28 insertions, 29 deletions
diff --git a/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala b/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala index 36b7eef..00b173d 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/Tokenize.scala @@ -1,19 +1,36 @@ package com.rockymadden.stringmetric object Tokenize { - trait Tokenizer[A] { + sealed trait Tokenizer[A] { def tokenize(a: A): Option[Array[A]] } - trait StringTokenizer extends Tokenizer[Array[Char]] { + sealed trait StringTokenizer extends Tokenizer[Array[Char]] { def tokenize(a: String): Option[Array[String]] } object StringTokenizer { - val NGram = tokenize.NGramTokenizer + val NGram = NGramTokenizer def tokenizeWithNGram(n: Int)(charArray: Array[Char]) = NGram(n).tokenize(charArray) } + + + final case class NGramTokenizer(n: Int) extends StringTokenizer { + override def tokenize(a: Array[Char]): Option[Array[Array[Char]]] = { + if (n <= 0) return None + + if (a.length < n) None + else Some(sequence(a, Array.empty[Array[Char]], n)) + } + + override def tokenize(a: String): Option[Array[String]] = tokenize(a.toCharArray).map(_.map(_.mkString)) + + @annotation.tailrec + private val sequence: ((Array[Char], Array[Array[Char]], Int) => Array[Array[Char]]) = (i, o, n) => + if (i.length <= n) o :+ i + else sequence(i.tail, o :+ i.take(n), n) + } } diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala index 1e07432..0ad3915 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/DiceSorensenMetric.scala @@ -7,7 +7,7 @@ import com.rockymadden.stringmetric.Metric.StringMetric * Traditionally, the algorithm uses bigrams. */ final case class DiceSorensenMetric(n: Int) extends StringMetric[Double] { - import com.rockymadden.stringmetric.tokenize.NGramTokenizer + import com.rockymadden.stringmetric.Tokenize.NGramTokenizer import com.rockymadden.stringmetric.MatchTuple override def compare(a: Array[Char], b: Array[Char]): Option[Double] = diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala index 629eaa0..6ec5db4 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/JaccardMetric.scala @@ -3,7 +3,7 @@ package com.rockymadden.stringmetric.similarity import com.rockymadden.stringmetric.Metric.StringMetric final case class JaccardMetric(n: Int) extends StringMetric[Double] { - import com.rockymadden.stringmetric.tokenize.NGramTokenizer + import com.rockymadden.stringmetric.Tokenize.NGramTokenizer override def compare(a: Array[Char], b: Array[Char]): Option[Double] = if (n <= 0 || a.length < n || b.length < n) None // Because length is less than n, it is not possible to compare. diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala index d712738..8c194ce 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/NGramMetric.scala @@ -4,7 +4,7 @@ import com.rockymadden.stringmetric.Metric.StringMetric final case class NGramMetric(n: Int) extends StringMetric[Double] { import com.rockymadden.stringmetric.MatchTuple - import com.rockymadden.stringmetric.tokenize.NGramTokenizer + import com.rockymadden.stringmetric.Tokenize.NGramTokenizer import scala.math override def compare(a: Array[Char], b: Array[Char]): Option[Double] = diff --git a/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala b/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala index cc33a26..8f0418b 100755 --- a/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala +++ b/core/source/main/scala/com/rockymadden/stringmetric/similarity/OverlapMetric.scala @@ -4,7 +4,7 @@ import com.rockymadden.stringmetric.Metric.StringMetric final case class OverlapMetric(n: Int) extends StringMetric[Double] { import com.rockymadden.stringmetric.MatchTuple - import com.rockymadden.stringmetric.tokenize.NGramTokenizer + import com.rockymadden.stringmetric.Tokenize.NGramTokenizer import scala.math override def compare(a: Array[Char], b: Array[Char]): Option[Double] = diff --git a/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala b/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala deleted file mode 100755 index aa89b31..0000000 --- a/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala +++ /dev/null @@ -1,19 +0,0 @@ -package com.rockymadden.stringmetric.tokenize - -import com.rockymadden.stringmetric.Tokenize.StringTokenizer - -final case class NGramTokenizer(n: Int) extends StringTokenizer { - override def tokenize(a: Array[Char]): Option[Array[Array[Char]]] = { - if (n <= 0) return None - - if (a.length < n) None - else Some(sequence(a, Array.empty[Array[Char]], n)) - } - - override def tokenize(a: String): Option[Array[String]] = tokenize(a.toCharArray).map(_.map(_.mkString)) - - @annotation.tailrec - private val sequence: ((Array[Char], Array[Array[Char]], Int) => Array[Array[Char]]) = (i, o, n) => - if (i.length <= n) o :+ i - else sequence(i.tail, o :+ i.take(n), n) -} diff --git a/core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala b/core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala index 01636f0..cfba0f7 100755 --- a/core/source/test/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizerSpec.scala +++ b/core/source/test/scala/com/rockymadden/stringmetric/TokenizeSpec.scala @@ -1,11 +1,12 @@ -package com.rockymadden.stringmetric.tokenize +package com.rockymadden.stringmetric -import com.rockymadden.stringmetric.ScalaTest import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -final class NGramTokenizerSpec extends ScalaTest { "NGramTokenizer" should provide { +final class TokenizeSpec extends ScalaTest { "NGramTokenizer" should provide { + import Tokenize._ + "tokenize method" when passed { "empty argument" should returns { "None" in { |