summaryrefslogtreecommitdiff
path: root/core/source/core/scala/org/hashtree/stringmetric/similarity/NGramAlgorithm.scala
blob: 280483f52ea1d213a2b3cf4bfa39a0a4ca96256b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
package org.hashtree.stringmetric.similarity

import org.hashtree.stringmetric.{ FilterableConfigurableStringAlgorithm, StringAlgorithm, StringFilter }
import scala.annotation.tailrec

/**
 * An implementation of the N-Gram [[org.hashtree.stringmetric.StringAlgorithm]].
 *
 * Splits filtered input into every window of `n` consecutive characters, in order of appearance.
 */
object NGramAlgorithm extends StringAlgorithm with FilterableConfigurableStringAlgorithm[Int] {
	type ComputeReturn = Array[String]

	/**
	 * Computes the n-grams of a character array.
	 *
	 * @param charArray the characters to split; `stringFilter` is applied to them first
	 * @param n the length of each gram; must be positive
	 * @param stringFilter the filter applied to `charArray` before it is split
	 * @return `None` when fewer than `n` characters remain after filtering, otherwise every
	 * window of `n` consecutive filtered characters, ordered left to right
	 * @throws IllegalArgumentException when `n` is not positive
	 */
	override def compute(charArray: Array[Char])(n: Int)
		(implicit stringFilter: StringFilter): Option[Array[Array[Char]]] = {

		if (n <= 0) throw new IllegalArgumentException("Expected valid n.")

		val fca = stringFilter.filter(charArray)

		// The length guard guarantees that sliding only yields full windows of exactly n characters.
		if (fca.length < n) None
		else Some(fca.sliding(n).toArray)
	}

	/**
	 * Computes the n-grams of a string.
	 *
	 * @param string the text to split; `stringFilter` is applied to its characters first
	 * @param n the length of each gram; must be positive
	 * @param stringFilter the filter applied to the characters of `string` before they are split
	 * @return `None` when fewer than `n` characters remain after filtering, otherwise each gram
	 * as a string, ordered left to right
	 * @throws IllegalArgumentException when `n` is not positive
	 */
	override def compute(string: String)(n: Int)
		(implicit stringFilter: StringFilter): Option[ComputeReturn] =

		// The character array overload applies the filter itself. Filtering here as well would
		// run the filter twice, which corrupts the result for any filter that is not idempotent.
		compute(string.toCharArray)(n).map(_.map(_.mkString))
}