path: root/library/src/main/scala/scala/scalajs/niocharset/UTF_8.scala
diff options
Diffstat (limited to 'library/src/main/scala/scala/scalajs/niocharset/UTF_8.scala')
1 files changed, 455 insertions, 0 deletions
diff --git a/library/src/main/scala/scala/scalajs/niocharset/UTF_8.scala b/library/src/main/scala/scala/scalajs/niocharset/UTF_8.scala
new file mode 100644
index 0000000..57f4ad6
--- /dev/null
+++ b/library/src/main/scala/scala/scalajs/niocharset/UTF_8.scala
@@ -0,0 +1,455 @@
+/* __ *\
+** ________ ___ / / ___ __ ____ Scala.js API **
+** / __/ __// _ | / / / _ | __ / // __/ (c) 2013, LAMP/EPFL **
+** __\ \/ /__/ __ |/ /__/ __ |/_// /_\ \ **
+** /____/\___/_/ |_/____/_/ | |__/ /____/ **
+** |/____/ **
+\* */
+package scala.scalajs.niocharset
+import scala.annotation.{switch, tailrec}
+import java.nio._
+import java.nio.charset._
+private[niocharset] object UTF_8 extends Charset("UTF-8", Array(
+ "UTF8", "unicode-1-1-utf-8")) {
+ import java.lang.Character._
+ /** Reports whether this charset contains `that`; unconditionally answers
+  *  `true`, whatever `that` is.
+  *  NOTE(review): presumably because UTF-8 can encode every Unicode
+  *  character - confirm this is the intended contract.
+  */
+ def contains(that: Charset): Boolean = true
+ /** Creates a new instance of the private `Decoder` class of this charset. */
+ def newDecoder(): CharsetDecoder = new Decoder
+ /** Creates a new instance of the private `Encoder` class of this charset. */
+ def newEncoder(): CharsetEncoder = new Encoder
+ /* The table below summarizes the UTF-8 bit layout and how the first byte
+ * of a sequence determines the length of that sequence.
+ * For more information, see e.g. RFC 3629, section 3.
+ *
+ * -------------------------------------------------------------------
+ * Byte 0    Byte 1    Byte 2    Byte 3    Value
+ * -------------------------------------------------------------------
+ * 0xxxxxxx                                00000000 00000000 0xxxxxxx
+ * 110yyyyy  10xxxxxx                      00000000 00000yyy yyxxxxxx
+ * 1110zzzz  10yyyyyy  10xxxxxx            00000000 zzzzyyyy yyxxxxxx
+ * 11110uuu  10zzzzzz  10yyyyyy  10xxxxxx  000uuuzz zzzzyyyy yyxxxxxx
+ */
+  // Length of the UTF-8 sequence announced by a non-ASCII leading byte, or -1
+  // when the byte cannot start a sequence. The decoder indexes this table with
+  // the low seven bits of the leading byte (`leading & 0x7f`), so index 0
+  // stands for byte 0x80 and index 127 for byte 0xff.
+  private val lengthByLeading: Array[Int] = Array.tabulate(128) { index =>
+    if (index < 0x40) -1      // 10wwwwww
+    else if (index < 0x60) 2  // 110yyyyy
+    else if (index < 0x70) 3  // 1110zzzz
+    else if (index < 0x78) 4  // 11110uuu
+    else -1                   // > 11110111
+  }
+  /** Outcome of decoding one multi-byte sequence.
+   *
+   *  Either `failure` is non-null and describes why decoding stopped, or it is
+   *  `null` and `high` (plus `low`, when non-zero) hold the decoded chars.
+   */
+  @inline
+  private class DecodedMultiByte(val failure: CoderResult,
+      val high: Char, val low: Char)
+
+  private object DecodedMultiByte {
+    /** A failed decoding; both chars are left at zero. */
+    @inline def apply(failure: CoderResult): DecodedMultiByte =
+      new DecodedMultiByte(failure, '\u0000', '\u0000')
+
+    /** A successful decoding to one char; `low` is zero. */
+    @inline def apply(single: Char): DecodedMultiByte =
+      new DecodedMultiByte(null, single, '\u0000')
+
+    /** A successful decoding to the two chars `high` and `low`. */
+    @inline def apply(high: Char, low: Char): DecodedMultiByte =
+      new DecodedMultiByte(null, high, low)
+  }
+  /** Decoder turning UTF-8 byte sequences into UTF-16 chars.
+   *
+   *  4-byte sequences are emitted as a surrogate pair. Overlong encodings,
+   *  encoded surrogates and values above `MAX_CODE_POINT` are reported as
+   *  malformed input. Whenever a result other than a plain continuation is
+   *  returned, the input position is left on the leading byte of the sequence
+   *  that could not be processed.
+   */
+  private class Decoder extends CharsetDecoder(UTF_8, 1.0f, 1.0f) {
+    /** Decodes as much of `in` as possible into `out`.
+     *
+     *  Uses the array-based loop when both buffers expose a backing array,
+     *  and the relative get/put loop otherwise.
+     */
+    def decodeLoop(in: ByteBuffer, out: CharBuffer): CoderResult = {
+      if (in.hasArray && out.hasArray)
+        decodeLoopArray(in, out)
+      else
+        decodeLoopNoArray(in, out)
+    }
+
+    /** Decoding loop working directly on the backing arrays. */
+    private def decodeLoopArray(in: ByteBuffer, out: CharBuffer): CoderResult = {
+      val inArray = in.array
+      val inOffset = in.arrayOffset
+      val inStart = in.position + inOffset
+      val inEnd = in.limit + inOffset
+
+      val outArray = out.array
+      val outOffset = out.arrayOffset
+      val outStart = out.position + outOffset
+      val outEnd = out.limit + outOffset
+
+      @inline
+      @tailrec
+      def loop(inPos: Int, outPos: Int): CoderResult = {
+        // Publishes the array indices back into the buffers' positions.
+        @inline
+        def finalize(result: CoderResult): CoderResult = {
+          in.position(inPos - inOffset)
+          out.position(outPos - outOffset)
+          result
+        }
+
+        if (inPos == inEnd) {
+          finalize(CoderResult.UNDERFLOW)
+        } else {
+          val leading = inArray(inPos).toInt
+          if (leading >= 0) {
+            // US-ASCII repertoire
+            if (outPos == outEnd) {
+              finalize(CoderResult.OVERFLOW)
+            } else {
+              outArray(outPos) = leading.toChar
+              loop(inPos+1, outPos+1)
+            }
+          } else {
+            // Multi-byte
+            val length = lengthByLeading(leading & 0x7f)
+            if (length == -1) {
+              finalize(CoderResult.malformedForLength(1))
+            } else if (inPos + length > inEnd) {
+              // The sequence is cut short by the end of the input: validate
+              // the continuation bytes that are present before asking for more
+              finalize(truncatedSequence(in, inPos - inOffset,
+                  inEnd - inPos - 1))
+            } else {
+              val decoded = {
+                val b2 = inArray(inPos+1)
+                if (length == 2) decode2(leading, b2)
+                else if (length == 3) decode3(leading, b2, inArray(inPos+2))
+                else decode4(leading, b2, inArray(inPos+2), inArray(inPos+3))
+              }
+
+              if (decoded.failure != null) {
+                finalize(decoded.failure)
+              } else if (decoded.low == 0) {
+                // not a surrogate pair
+                if (outPos == outEnd)
+                  finalize(CoderResult.OVERFLOW)
+                else {
+                  outArray(outPos) = decoded.high
+                  loop(inPos+length, outPos+1)
+                }
+              } else {
+                // a surrogate pair
+                if (outPos + 2 > outEnd)
+                  finalize(CoderResult.OVERFLOW)
+                else {
+                  outArray(outPos) = decoded.high
+                  outArray(outPos+1) = decoded.low
+                  loop(inPos+length, outPos+2)
+                }
+              }
+            }
+          }
+        }
+      }
+
+      loop(inStart, outStart)
+    }
+
+    /** Decoding loop using only relative gets and puts on the buffers. */
+    private def decodeLoopNoArray(in: ByteBuffer, out: CharBuffer): CoderResult = {
+      @inline
+      @tailrec
+      def loop(): CoderResult = {
+        // Un-reads the `read` bytes consumed for the sequence being decoded.
+        @inline
+        def finalize(read: Int, result: CoderResult): CoderResult = {
+          in.position(in.position - read)
+          result
+        }
+
+        if (!in.hasRemaining) {
+          CoderResult.UNDERFLOW
+        } else {
+          val leading = in.get().toInt
+          if (leading >= 0) {
+            // US-ASCII repertoire
+            if (!out.hasRemaining) {
+              finalize(1, CoderResult.OVERFLOW)
+            } else {
+              out.put(leading.toChar)
+              loop()
+            }
+          } else {
+            // Multi-byte
+            val length = lengthByLeading(leading & 0x7f)
+            if (length == -1) {
+              finalize(1, CoderResult.malformedForLength(1))
+            } else if (in.remaining < length-1) {
+              // The sequence is cut short by the end of the input: validate
+              // the continuation bytes that are present before asking for more.
+              // The leading byte has been consumed, hence `position - 1`.
+              finalize(1, truncatedSequence(in, in.position - 1, in.remaining))
+            } else {
+              // Arguments are evaluated left to right, so all `length` bytes
+              // are consumed before the decodeN helper runs
+              val decoded = {
+                if (length == 2) decode2(leading, in.get())
+                else if (length == 3) decode3(leading, in.get(), in.get())
+                else decode4(leading, in.get(), in.get(), in.get())
+              }
+
+              if (decoded.failure != null) {
+                finalize(length, decoded.failure)
+              } else if (decoded.low == 0) {
+                // not a surrogate pair
+                if (!out.hasRemaining)
+                  finalize(length, CoderResult.OVERFLOW)
+                else {
+                  out.put(decoded.high)
+                  loop()
+                }
+              } else {
+                // a surrogate pair
+                if (out.remaining < 2)
+                  finalize(length, CoderResult.OVERFLOW)
+                else {
+                  out.put(decoded.high)
+                  out.put(decoded.low)
+                  loop()
+                }
+              }
+            }
+          }
+        }
+      }
+
+      loop()
+    }
+
+    /** Result for a multi-byte sequence that is cut short by the input limit.
+     *
+     *  Without this check an input such as `E0 41` would yield UNDERFLOW, and
+     *  the valid `41` would later be swallowed into one malformed run if no
+     *  more input arrives. Only absolute gets are used, so the position of
+     *  `in` is not modified.
+     *
+     *  @param in           the input buffer
+     *  @param leadingIndex buffer index (not array index) of the leading byte
+     *  @param available    number of bytes present after the leading byte
+     *  @return `malformedForLength(i)` if the i-th byte after the leading
+     *          byte is not a continuation byte, `UNDERFLOW` if all available
+     *          bytes are valid continuation bytes
+     */
+    private def truncatedSequence(in: ByteBuffer, leadingIndex: Int,
+        available: Int): CoderResult = {
+      @tailrec
+      def scan(i: Int): CoderResult = {
+        if (i > available)
+          CoderResult.UNDERFLOW
+        else if (isInvalidNextByte(in.get(leadingIndex + i)))
+          CoderResult.malformedForLength(i)
+        else
+          scan(i + 1)
+      }
+
+      scan(1)
+    }
+
+    /** True if `b` does not have the `10xxxxxx` shape of a continuation byte. */
+    @inline private def isInvalidNextByte(b: Int): Boolean =
+      (b & 0xc0) != 0x80
+
+    /** Decodes the 2-byte sequence `b1 b2`, rejecting overlong encodings. */
+    @inline private def decode2(b1: Int, b2: Int): DecodedMultiByte = {
+      if (isInvalidNextByte(b2))
+        DecodedMultiByte(CoderResult.malformedForLength(1))
+      else {
+        val codePoint = (((b1 & 0x1f) << 6) | (b2 & 0x3f))
+        // By construction, 0 <= codePoint <= 0x7ff < MIN_SURROGATE
+        if (codePoint < 0x80) {
+          // Should have been encoded with only 1 byte
+          DecodedMultiByte(CoderResult.malformedForLength(2))
+        } else {
+          DecodedMultiByte(codePoint.toChar)
+        }
+      }
+    }
+
+    /** Decodes the 3-byte sequence `b1 b2 b3`, rejecting overlong encodings
+     *  and encoded surrogates.
+     */
+    @inline private def decode3(b1: Int, b2: Int, b3: Int): DecodedMultiByte = {
+      if (isInvalidNextByte(b2))
+        DecodedMultiByte(CoderResult.malformedForLength(1))
+      else if (isInvalidNextByte(b3))
+        DecodedMultiByte(CoderResult.malformedForLength(2))
+      else {
+        val codePoint = (((b1 & 0xf) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3f))
+        // By construction, 0 <= codePoint <= 0xffff < MIN_SUPPLEMENTARY_CODE_POINT
+        if ((codePoint < 0x800) ||
+            (codePoint >= MIN_SURROGATE && codePoint <= MAX_SURROGATE)) {
+          // Should have been encoded with only 1 or 2 bytes
+          // or it is a surrogate, which is not a valid code point
+          DecodedMultiByte(CoderResult.malformedForLength(3))
+        } else {
+          DecodedMultiByte(codePoint.toChar)
+        }
+      }
+    }
+
+    /** Decodes the 4-byte sequence `b1 b2 b3 b4` into a surrogate pair,
+     *  rejecting overlong encodings and values above `MAX_CODE_POINT`.
+     */
+    @inline private def decode4(b1: Int, b2: Int, b3: Int, b4: Int): DecodedMultiByte = {
+      if (isInvalidNextByte(b2))
+        DecodedMultiByte(CoderResult.malformedForLength(1))
+      else if (isInvalidNextByte(b3))
+        DecodedMultiByte(CoderResult.malformedForLength(2))
+      else if (isInvalidNextByte(b4))
+        DecodedMultiByte(CoderResult.malformedForLength(3))
+      else {
+        val codePoint = (((b1 & 0x7) << 18) | ((b2 & 0x3f) << 12) |
+            ((b3 & 0x3f) << 6) | (b4 & 0x3f))
+        // By construction, 0 <= codePoint <= 0x1fffff
+        if (codePoint < 0x10000 || codePoint > MAX_CODE_POINT) {
+          // It should have been encoded with 1, 2, or 3 bytes
+          // or it is not a valid code point
+          DecodedMultiByte(CoderResult.malformedForLength(4))
+        } else {
+          // Here, we need to encode the code point as a surrogate pair:
+          // the upper 10 bits of the offset go into the high surrogate,
+          // the lower 10 bits into the low surrogate.
+          val offsetCodePoint = codePoint - 0x10000
+          DecodedMultiByte(
+              ((offsetCodePoint >> 10) | 0xd800).toChar,
+              ((offsetCodePoint & 0x3ff) | 0xdc00).toChar)
+        }
+      }
+    }
+  }
+  /** Encoder turning UTF-16 chars into UTF-8 byte sequences.
+   *
+   *  Chars below 0x80 take one byte, chars below 0x800 two bytes, any other
+   *  non-surrogate char three bytes, and a high/low surrogate pair four bytes.
+   *  A surrogate that is not part of such a pair is reported as malformed
+   *  input of length 1. On return the input position is left on the first
+   *  char that was not encoded.
+   */
+  private class Encoder extends CharsetEncoder(UTF_8, 1.1f, 4.0f) {
+    /** Encodes as much of `in` as possible into `out`, picking the
+     *  array-based loop when both buffers expose a backing array.
+     */
+    def encodeLoop(in: CharBuffer, out: ByteBuffer): CoderResult = {
+      if (in.hasArray && out.hasArray) encodeLoopArray(in, out)
+      else encodeLoopNoArray(in, out)
+    }
+
+    /** Encoding loop working directly on the backing arrays. */
+    private def encodeLoopArray(in: CharBuffer, out: ByteBuffer): CoderResult = {
+      val src = in.array
+      val srcShift = in.arrayOffset
+      val srcLimit = in.limit + srcShift
+
+      val dst = out.array
+      val dstShift = out.arrayOffset
+      val dstLimit = out.limit + dstShift
+
+      @inline
+      @tailrec
+      def loop(srcPos: Int, dstPos: Int): CoderResult = {
+        // Writes the array indices back into the buffers, then yields `result`
+        @inline
+        def stop(result: CoderResult): CoderResult = {
+          in.position(srcPos - srcShift)
+          out.position(dstPos - dstShift)
+          result
+        }
+
+        if (srcPos == srcLimit) {
+          stop(CoderResult.UNDERFLOW)
+        } else {
+          val c1 = src(srcPos)
+          if (isSurrogate(c1)) {
+            // Surrogates are all >= 0x800, so testing them first selects the
+            // same chars as testing them after the 1- and 2-byte ranges
+            if (!isHighSurrogate(c1)) {
+              // Lone low surrogate
+              stop(CoderResult.malformedForLength(1))
+            } else if (srcPos + 1 == srcLimit) {
+              // The low surrogate may come with the next chunk of input
+              stop(CoderResult.UNDERFLOW)
+            } else {
+              val c2 = src(srcPos + 1)
+              if (!isLowSurrogate(c2)) {
+                stop(CoderResult.malformedForLength(1))
+              } else if (dstPos + 4 > dstLimit) {
+                stop(CoderResult.OVERFLOW)
+              } else {
+                // Surrogate pair: 4 bytes
+                val cp = toCodePoint(c1, c2)
+                dst(dstPos) = ((cp >> 18) | 0xf0).toByte
+                dst(dstPos + 1) = (((cp >> 12) & 0x3f) | 0x80).toByte
+                dst(dstPos + 2) = (((cp >> 6) & 0x3f) | 0x80).toByte
+                dst(dstPos + 3) = ((cp & 0x3f) | 0x80).toByte
+                loop(srcPos + 2, dstPos + 4)
+              }
+            }
+          } else {
+            // Single char: 1, 2 or 3 bytes depending on its magnitude
+            val size = if (c1 < 0x80) 1 else if (c1 < 0x800) 2 else 3
+            if (dstPos + size > dstLimit) {
+              stop(CoderResult.OVERFLOW)
+            } else {
+              size match {
+                case 1 =>
+                  dst(dstPos) = c1.toByte
+                case 2 =>
+                  dst(dstPos) = ((c1 >> 6) | 0xc0).toByte
+                  dst(dstPos + 1) = ((c1 & 0x3f) | 0x80).toByte
+                case _ =>
+                  dst(dstPos) = ((c1 >> 12) | 0xe0).toByte
+                  dst(dstPos + 1) = (((c1 >> 6) & 0x3f) | 0x80).toByte
+                  dst(dstPos + 2) = ((c1 & 0x3f) | 0x80).toByte
+              }
+              loop(srcPos + 1, dstPos + size)
+            }
+          }
+        }
+      }
+
+      loop(in.position + srcShift, out.position + dstShift)
+    }
+
+    /** Encoding loop using only relative gets and puts on the buffers. */
+    private def encodeLoopNoArray(in: CharBuffer, out: ByteBuffer): CoderResult = {
+      @inline
+      @tailrec
+      def loop(): CoderResult = {
+        // Un-reads the `consumed` chars taken for the current code point
+        @inline
+        def stop(consumed: Int, result: CoderResult): CoderResult = {
+          in.position(in.position - consumed)
+          result
+        }
+
+        if (!in.hasRemaining) {
+          CoderResult.UNDERFLOW
+        } else {
+          val c1 = in.get()
+          if (isSurrogate(c1)) {
+            if (!isHighSurrogate(c1)) {
+              // Lone low surrogate
+              stop(1, CoderResult.malformedForLength(1))
+            } else if (!in.hasRemaining) {
+              // The low surrogate may come with the next chunk of input
+              stop(1, CoderResult.UNDERFLOW)
+            } else {
+              val c2 = in.get()
+              if (!isLowSurrogate(c2)) {
+                stop(2, CoderResult.malformedForLength(1))
+              } else if (out.remaining < 4) {
+                stop(2, CoderResult.OVERFLOW)
+              } else {
+                // Surrogate pair: 4 bytes
+                val cp = toCodePoint(c1, c2)
+                out.put(((cp >> 18) | 0xf0).toByte)
+                out.put((((cp >> 12) & 0x3f) | 0x80).toByte)
+                out.put((((cp >> 6) & 0x3f) | 0x80).toByte)
+                out.put(((cp & 0x3f) | 0x80).toByte)
+                loop()
+              }
+            }
+          } else {
+            // Single char: 1, 2 or 3 bytes depending on its magnitude
+            val size = if (c1 < 0x80) 1 else if (c1 < 0x800) 2 else 3
+            if (out.remaining < size) {
+              stop(1, CoderResult.OVERFLOW)
+            } else {
+              size match {
+                case 1 =>
+                  out.put(c1.toByte)
+                case 2 =>
+                  out.put(((c1 >> 6) | 0xc0).toByte)
+                  out.put(((c1 & 0x3f) | 0x80).toByte)
+                case _ =>
+                  out.put(((c1 >> 12) | 0xe0).toByte)
+                  out.put((((c1 >> 6) & 0x3f) | 0x80).toByte)
+                  out.put(((c1 & 0x3f) | 0x80).toByte)
+              }
+              loop()
+            }
+          }
+        }
+      }
+
+      loop()
+    }
+  }
+ // Bit patterns used to recognize UTF-16 surrogate code units: the mask keeps
+ // the top five bits of a 16-bit value, and the ID is the value those bits
+ // take (11011) for every unit in the range 0xd800 to 0xdfff.
+ private final val SurrogateMask = 0xf800 // 11111 0 00 00000000
+ private final val SurrogateID = 0xd800 // 11011 0 00 00000000
+ /** Tests whether `c` is a surrogate code unit, high or low: its bits
+  *  selected by `SurrogateMask` must equal `SurrogateID`, which holds
+  *  exactly for values from 0xd800 to 0xdfff.
+  */
+ @inline private def isSurrogate(c: Char): Boolean =
+ (c & SurrogateMask) == SurrogateID