summaryrefslogblamecommitdiff
path: root/core/source/main/scala/com/rockymadden/stringmetric/tokenize/NGramTokenizer.scala
blob: d6378d712e406147fb3466459957a358c17d8882 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
                                             
 
                                                             
 
                                                                 

                                                                             
 

                                                                   

         
                                                                                                                
 
                           
                                                                                                           

                                                        
 
package com.rockymadden.stringmetric.tokenize

import com.rockymadden.stringmetric.Tokenizer.StringTokenizer

/**
 * Tokenizer that splits its input into overlapping n-grams: every contiguous
 * run of `n` characters, in left-to-right order.
 *
 * @param n size of each gram; a value that is zero or negative makes every
 *          `tokenize` call return `None`
 */
final case class NGramTokenizer(n: Int) extends StringTokenizer {
	/**
	 * Produces the n-grams of a character array.
	 *
	 * @param a characters to tokenize
	 * @return `None` when `n <= 0` or when `a` holds fewer than `n` characters;
	 *         otherwise `Some` of the `a.length - n + 1` windows of exactly `n`
	 *         characters each, ordered by starting position
	 */
	override def tokenize(a: Array[Char]): Option[Array[Array[Char]]] =
		if (n <= 0 || a.length < n) None
		// The guard guarantees a.length >= n, so `sliding` only ever yields full
		// windows of n characters. It walks the input iteratively, which keeps
		// stack usage constant regardless of input length, and every window is a
		// fresh array rather than a view of (or alias to) the caller's array.
		else Some(a.sliding(n).toArray)

	/**
	 * Produces the n-grams of a string by delegating to the character-array
	 * overload and turning each gram back into a `String`.
	 *
	 * @param a text to tokenize
	 * @return `None` when `n <= 0` or when `a` is shorter than `n`; otherwise
	 *         `Some` of the n-gram strings in order of starting position
	 */
	override def tokenize(a: String): Option[Array[String]] =
		tokenize(a.toCharArray).map(grams => grams.map(_.mkString))
}