summaryrefslogtreecommitdiff
path: root/core/source/main/scala/com/rockymadden/stringmetric/phonetic/RefinedNysiisAlgorithm.scala
blob: 72bd84eb205a4298033d18bad534f341bea734da (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package com.rockymadden.stringmetric.phonetic

import com.rockymadden.stringmetric.Algorithm.StringAlgorithmLike

case object RefinedNysiisAlgorithm extends StringAlgorithmLike {
	import com.rockymadden.stringmetric.Alphabet.{Alpha, LowercaseVowel}

	/**
	 * Computes the refined NYSIIS code of a character array.
	 *
	 * The input is lowercased, trailing 's'/'z' characters after the first character are removed,
	 * the head and tail rewrites are applied, and the remainder is transcoded character by
	 * character. A multi-character result is then stripped of trailing 'a' characters, has a
	 * terminal "ay" collapsed to "y", and has adjacent repeated characters merged.
	 *
	 * @param a the characters to encode
	 * @return `None` when `a` is empty or its first character is not a member of `Alpha`;
	 *         otherwise the encoded characters
	 */
	override def compute(a: Array[Char]): Option[Array[Char]] =
		if (a.isEmpty || !(Alpha isSuperset a.head)) None
		else {
			val lowered = a.map(_.toLower)
			// The first character is never subject to the trailing s/z clean-up.
			val trimmed = lowered.head +: cleanLast(lowered.tail, Set('s', 'z'))
			val prepared = transcodeLast(transcodeHead(trimmed))
			val coded = transcode(Array.empty[Char], prepared.head, prepared.tail, Array.empty[Char])

			Some(
				if (coded.length == 1) coded
				else deduplicate(coded.head +: cleanTerminal(cleanLast(coded.tail, Set('a'))))
			)
		}

	/**
	 * String convenience overload: encodes the characters of `string` and joins the resulting
	 * characters back into a `String`.
	 *
	 * @param string the text to encode
	 * @return `None` under the same conditions as the character-array overload, otherwise the code
	 */
	override def compute(string: String): Option[String] =
		compute(string.toCharArray).map(chars => new String(chars))

	/**
	 * Removes the longest run of characters belonging to `s` from the end of `ca`.
	 *
	 * @param ca the characters to trim
	 * @param s the characters that may be stripped from the end
	 * @return `ca` itself when there is nothing to strip, otherwise a shortened copy
	 */
	private def cleanLast(ca: Array[Char], s: Set[Char]) = {
		// Index of the right-most character that has to be kept; -1 when there is none.
		val lastKept = ca.lastIndexWhere(c => !s.contains(c))

		if (lastKept == ca.length - 1) ca else ca.take(lastKept + 1)
	}

	/**
	 * Collapses a terminal "ay" into "y".
	 *
	 * @param ca the characters to inspect
	 * @return a copy ending in 'y' instead of "ay" when `ca` ends in "ay", otherwise `ca` itself
	 */
	private def cleanTerminal(ca: Array[Char]) =
		ca.takeRight(2) match {
			case Array('a', 'y') => ca.dropRight(2) :+ 'y'
			case _ => ca
		}

	/**
	 * Merges every run of adjacent identical characters into a single character.
	 *
	 * @param ca the characters to compact
	 * @return `ca` itself when it holds at most one character, otherwise the compacted copy
	 */
	private def deduplicate(ca: Array[Char]) =
		if (ca.length <= 1) ca
		else {
			// Keep a character only when its right-hand neighbour differs; the final character
			// has no neighbour and is therefore always kept.
			val distinctFromNext = ca.zip(ca.tail).collect { case (cur, next) if cur != next => cur }

			distinctFromNext :+ ca.last
		}

	/**
	 * Transcodes a word one character at a time, carrying the already-consumed characters along
	 * so that rules can look both behind and ahead.
	 *
	 * The NUL character is used as an "input exhausted" sentinel for `c`. It is spelled as a
	 * unicode escape rather than an octal escape because octal escapes have been deprecated
	 * since Scala 2.11 and are rejected by later compilers; the character value is the same.
	 *
	 * @param l characters already consumed, in their original (untranscoded) form
	 * @param c the character currently being examined, or NUL when none is left
	 * @param r the characters that follow `c`
	 * @param o the code emitted so far
	 * @return the emitted code once `c` is NUL and `r` is empty
	 */
	@annotation.tailrec
	private def transcode(l: Array[Char], c: Char, r: Array[Char], o: Array[Char]): Array[Char] = {
		if (c == '\u0000' && r.length == 0) o
		else {
			// Consumes `c` plus the next `d - 1` characters of `r` and pairs the resulting
			// (consumed, current, remaining) state with the new output `ca`.
			def shift(d: Int, ca: Array[Char]) = {
				val sca = r.splitAt(d - 1)

				(
					if (sca._1.length > 0) (l :+ c) ++ sca._1 else l :+ c,
					if (sca._2.length > 0) sca._2.head else '\u0000',
					if (sca._2.length > 1) sca._2.tail else Array.empty[Char],
					ca
				)
			}

			val t = {
				(c: @annotation.switch) match {
					// These vowels are kept as-is in first position and become 'a' elsewhere.
					case 'a' | 'i' | 'o' | 'u' =>
						if (l.length == 0) shift(1, o :+ c)
						else shift(1, o :+ 'a')
					// Consonants that are always emitted unchanged.
					case 'b' | 'c' | 'f' | 'j' | 'l' | 'n' | 'r' | 't' | 'v' | 'x' => shift(1, o :+ c)
					// "dg" -> "g"
					case 'd' =>
						if (r.length >= 1 && r.head == 'g') shift(2, o :+ 'g') else shift(1, o :+ c)
					// Leading 'e' is kept; otherwise "ev" -> "af" and any other 'e' -> 'a'.
					case 'e' =>
						if (l.length == 0) shift(1, o :+ c)
						else if (r.length >= 1 && r.head == 'v') shift(2, o ++ Array('a', 'f'))
						else shift(1, o :+ 'a')
					// "ght" -> "gt"
					case 'g' =>
						if (r.length >= 2 && r.head == 'h' && r(1) == 't') shift(3, o ++ Array('g', 't'))
						else shift(1, o :+ c)
					// A non-leading 'h' is dropped when the previous character is not a vowel or
					// when a following character exists and is not a vowel.
					case 'h' =>
						if (l.length == 0) shift(1, o :+ c)
						else if (!(LowercaseVowel isSuperset l.last) || (r.length >= 1 && !(LowercaseVowel isSuperset r.head))) shift(1, o)
						else shift(1, o :+ c)
					// "kn" -> "n", any other 'k' -> 'c'
					case 'k' => if (r.length >= 1 && r.head == 'n') shift(2, o :+ 'n') else shift(1, o :+ 'c')
					// Non-leading 'm' -> 'n'
					case 'm' => if (l.length == 0) shift(1, o :+ c) else shift(1, o :+ 'n')
					// "ph" -> "f"
					case 'p' => if (r.length >= 1 && r.head == 'h') shift(2, o :+ 'f') else shift(1, o :+ c)
					// Non-leading 'q' -> 'g'
					case 'q' => if (l.length == 0) shift(1, o :+ c) else shift(1, o :+ 'g')
					// "sch" -> "s", "sh" -> "s"
					case 's' =>
						if (r.length >= 2 && r.head == 'c' && r(1) == 'h') shift(3, o :+ c)
						else if (r.length >= 1 && r.head == 'h') shift(2, o :+ c)
						else shift(1, o :+ c)
					// 'w' after a vowel is dropped; otherwise "wr" -> "r".
					case 'w' =>
						if (l.length >= 1 && (LowercaseVowel isSuperset l.last)) shift(1, o)
						else if (r.length >= 1 && r.head == 'r') shift(2, o :+ 'r')
						else shift(1, o :+ c)
					// "yw" loses its 'w'; a 'y' that is neither first nor last becomes 'a'.
					case 'y' =>
						if (l.length >= 1 && r.length >= 2 && r.head == 'w') shift(2, o :+ 'a')
						else if (r.length >= 1 && r.head == 'w') shift(2, o :+ c)
						else if (l.length >= 1 && r.length >= 1) shift(1, o :+ 'a')
						else shift(1, o :+ c)
					// Non-leading 'z' -> 's'
					case 'z' => if (l.length == 0) shift(1, o :+ c) else shift(1, o :+ 's')
					// Anything else is consumed without emitting output.
					case _ => shift(1, o)
				}
			}

			transcode(t._1, t._2, t._3, t._4)
		}
	}

	/**
	 * Rewrites the beginning of a word: a leading "mac" becomes "mc" and a leading "pf"
	 * becomes "f".
	 *
	 * @param ca the characters to inspect
	 * @return a rewritten copy when one of the prefixes matches, otherwise `ca` itself
	 */
	private def transcodeHead(ca: Array[Char]) = {
		val startsWithMac = ca.length >= 3 && ca(0) == 'm' && ca(1) == 'a' && ca(2) == 'c'
		val startsWithPf = ca.length >= 2 && ca(0) == 'p' && ca(1) == 'f'

		if (startsWithMac) Array('m', 'c') ++ ca.drop(3)
		else if (startsWithPf) 'f' +: ca.drop(2)
		else ca
	}

	/**
	 * Rewrites the final two characters of a word:
	 * "nd", "rd", "dt", "nt" and "rt" become "d"; "ee", "ie" and "ye" become "y";
	 * "ex" becomes "ec" and "ix" becomes "ic".
	 *
	 * @param ca the characters to inspect
	 * @return a rewritten copy when the ending matches a rule, otherwise `ca` itself
	 */
	private def transcodeLast(ca: Array[Char]) =
		if (ca.length < 2) ca
		else {
			val (stem, ending) = ca.splitAt(ca.length - 2)

			(ending(0), ending(1)) match {
				case ('n' | 'r', 'd') => stem :+ 'd'
				case ('e' | 'i' | 'y', 'e') => stem :+ 'y'
				case ('d' | 'n' | 'r', 't') => stem :+ 'd'
				case ('e', 'x') => stem ++ Array('e', 'c')
				case ('i', 'x') => stem ++ Array('i', 'c')
				case _ => ca
			}
		}
}