diff options
Diffstat (limited to 'examples/scala-js/library/src/main/scala/scala/scalajs/niocharset/UTF_8.scala')
-rw-r--r-- | examples/scala-js/library/src/main/scala/scala/scalajs/niocharset/UTF_8.scala | 455 |
1 files changed, 455 insertions, 0 deletions
diff --git a/examples/scala-js/library/src/main/scala/scala/scalajs/niocharset/UTF_8.scala b/examples/scala-js/library/src/main/scala/scala/scalajs/niocharset/UTF_8.scala new file mode 100644 index 0000000..57f4ad6 --- /dev/null +++ b/examples/scala-js/library/src/main/scala/scala/scalajs/niocharset/UTF_8.scala @@ -0,0 +1,455 @@ +/* __ *\ +** ________ ___ / / ___ __ ____ Scala.js API ** +** / __/ __// _ | / / / _ | __ / // __/ (c) 2013, LAMP/EPFL ** +** __\ \/ /__/ __ |/ /__/ __ |/_// /_\ \ http://scala-lang.org/ ** +** /____/\___/_/ |_/____/_/ | |__/ /____/ ** +** |/____/ ** +\* */ + + +package scala.scalajs.niocharset + +import scala.annotation.{switch, tailrec} + +import java.nio._ +import java.nio.charset._ + +private[niocharset] object UTF_8 extends Charset("UTF-8", Array( + "UTF8", "unicode-1-1-utf-8")) { + + import java.lang.Character._ + + def contains(that: Charset): Boolean = true + + def newDecoder(): CharsetDecoder = new Decoder + def newEncoder(): CharsetEncoder = new Encoder + + /* The next table contains information about UTF-8 charset and + * correspondence of 1st byte to the length of sequence + * For information please visit http://www.ietf.org/rfc/rfc3629.txt + * + * ------------------------------------------------------------------- + * 0 1 2 3 Value + * ------------------------------------------------------------------- + * 0xxxxxxx 00000000 00000000 0xxxxxxx + * 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx + * 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx + * 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx 000uuuzz zzzzyyyy yyxxxxxx + */ + + private val lengthByLeading: Array[Int] = Array( + // 10wwwwww + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + // 110yyyyy + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + // 1110zzzz + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + // 11110uuu + 4, 4, 4, 4, 4, 4, 4, 4, + // > 11110111 + -1, -1, -1, -1, -1, -1, -1, -1 + ) + + @inline + private class DecodedMultiByte(val failure: CoderResult, + val high: Char, val low: Char) + + private object DecodedMultiByte { + @inline def apply(failure: CoderResult): DecodedMultiByte = + new DecodedMultiByte(failure, 0, 0) + + @inline def apply(single: Char): DecodedMultiByte = + new DecodedMultiByte(null, single, 0) + + @inline def apply(high: Char, low: Char): DecodedMultiByte = + new DecodedMultiByte(null, high, low) + } + + private class Decoder extends CharsetDecoder(UTF_8, 1.0f, 1.0f) { + def decodeLoop(in: ByteBuffer, out: CharBuffer): CoderResult = { + if (in.hasArray && out.hasArray) + decodeLoopArray(in, out) + else + decodeLoopNoArray(in, out) + } + + private def decodeLoopArray(in: ByteBuffer, out: CharBuffer): CoderResult = { + val inArray = in.array + val inOffset = in.arrayOffset + val inStart = in.position + inOffset + val inEnd = in.limit + inOffset + + val outArray = out.array + val outOffset = out.arrayOffset + val outStart = out.position + outOffset + val outEnd = out.limit + outOffset + + @inline + @tailrec + def loop(inPos: Int, outPos: Int): CoderResult = { + @inline + def finalize(result: CoderResult): CoderResult = { + in.position(inPos - inOffset) + out.position(outPos - outOffset) + result + } + + if (inPos == inEnd) { + finalize(CoderResult.UNDERFLOW) + } else { + val leading = inArray(inPos).toInt + if (leading >= 0) { + // US-ASCII repertoire + if (outPos == outEnd) { + finalize(CoderResult.OVERFLOW) + } else { + outArray(outPos) = leading.toChar + loop(inPos+1, outPos+1) + } + } else { + // Multi-byte + val length = lengthByLeading(leading & 0x7f) + if (length == -1) { + finalize(CoderResult.malformedForLength(1)) + } else if (inPos + length > inEnd) { + finalize(CoderResult.UNDERFLOW) + } else { + val decoded = { + val b2 = inArray(inPos+1) + if (length == 2) decode2(leading, b2) + else if (length == 3) decode3(leading, b2, inArray(inPos+2)) + else decode4(leading, b2, inArray(inPos+2), inArray(inPos+3)) + } + + if (decoded.failure != null) { + finalize(decoded.failure) + } else if (decoded.low == 0) { + // not a surrogate pair + if (outPos == outEnd) + finalize(CoderResult.OVERFLOW) + else { + outArray(outPos) = decoded.high + loop(inPos+length, outPos+1) + } + } else { + // a surrogate pair + if (outPos + 2 > outEnd) + finalize(CoderResult.OVERFLOW) + else { + outArray(outPos) = decoded.high + outArray(outPos+1) = decoded.low + loop(inPos+length, outPos+2) + } + } + } + } + } + } + + loop(inStart, outStart) + } + + private def decodeLoopNoArray(in: ByteBuffer, out: CharBuffer): CoderResult = { + @inline + @tailrec + def loop(): CoderResult = { + @inline + def finalize(read: Int, result: CoderResult): CoderResult = { + in.position(in.position - read) + result + } + + if (!in.hasRemaining) { + CoderResult.UNDERFLOW + } else { + val leading = in.get().toInt + if (leading >= 0) { + // US-ASCII repertoire + if (!out.hasRemaining) { + finalize(1, CoderResult.OVERFLOW) + } else { + out.put(leading.toChar) + loop() + } + } else { + // Multi-byte + val length = lengthByLeading(leading & 0x7f) + if (length == -1) { + finalize(1, CoderResult.malformedForLength(1)) + } else if (in.remaining < length-1) { + finalize(1, CoderResult.UNDERFLOW) + } else { + val decoded = { + if (length == 2) decode2(leading, in.get()) + else if (length == 3) decode3(leading, in.get(), in.get()) + else decode4(leading, in.get(), in.get(), in.get()) + } + + if (decoded.failure != null) { + finalize(length, decoded.failure) + } else if (decoded.low == 0) { + // not a surrogate pair + if (!out.hasRemaining) + finalize(length, CoderResult.OVERFLOW) + else { + out.put(decoded.high) + loop() + } + } else { + // a surrogate pair + if (out.remaining < 2) + finalize(length, CoderResult.OVERFLOW) + else { + out.put(decoded.high) + out.put(decoded.low) + loop() + } + } + } + } + } + } + + loop() + } + + @inline private def isInvalidNextByte(b: Int): Boolean = + (b & 0xc0) != 0x80 + + @inline private def decode2(b1: Int, b2: Int): DecodedMultiByte = { + if (isInvalidNextByte(b2)) + DecodedMultiByte(CoderResult.malformedForLength(1)) + else { + val codePoint = (((b1 & 0x1f) << 6) | (b2 & 0x3f)) + // By construction, 0 <= codePoint <= 0x7ff < MIN_SURROGATE + if (codePoint < 0x80) { + // Should have been encoded with only 1 byte + DecodedMultiByte(CoderResult.malformedForLength(2)) + } else { + DecodedMultiByte(codePoint.toChar) + } + } + } + + @inline private def decode3(b1: Int, b2: Int, b3: Int): DecodedMultiByte = { + if (isInvalidNextByte(b2)) + DecodedMultiByte(CoderResult.malformedForLength(1)) + else if (isInvalidNextByte(b3)) + DecodedMultiByte(CoderResult.malformedForLength(2)) + else { + val codePoint = (((b1 & 0xf) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3f)) + // By construction, 0 <= codePoint <= 0xffff < MIN_SUPPLEMENTARY_CODE_POINT + if ((codePoint < 0x800) || + (codePoint >= MIN_SURROGATE && codePoint <= MAX_SURROGATE)) { + // Should have been encoded with only 1 or 2 bytes + // or it is a surrogate, which is not a valid code point + DecodedMultiByte(CoderResult.malformedForLength(3)) + } else { + DecodedMultiByte(codePoint.toChar) + } + } + } + + @inline private def decode4(b1: Int, b2: Int, b3: Int, b4: Int): DecodedMultiByte = { + if (isInvalidNextByte(b2)) + DecodedMultiByte(CoderResult.malformedForLength(1)) + else if (isInvalidNextByte(b3)) + DecodedMultiByte(CoderResult.malformedForLength(2)) + else if (isInvalidNextByte(b4)) + DecodedMultiByte(CoderResult.malformedForLength(3)) + else { + val codePoint = (((b1 & 0x7) << 18) | ((b2 & 0x3f) << 12) | + ((b3 & 0x3f) << 6) | (b4 & 0x3f)) + // By construction, 0 <= codePoint <= 0x1fffff + if (codePoint < 0x10000 || codePoint > MAX_CODE_POINT) { + // It should have been encoded with 1, 2, or 3 bytes + // or it is not a valid code point + DecodedMultiByte(CoderResult.malformedForLength(4)) + } else { + // Here, we need to encode the code point as a surrogate pair. + // http://en.wikipedia.org/wiki/UTF-16 + val offsetCodePoint = codePoint - 0x10000 + DecodedMultiByte( + ((offsetCodePoint >> 10) | 0xd800).toChar, + ((offsetCodePoint & 0x3ff) | 0xdc00).toChar) + } + } + } + } + + private class Encoder extends CharsetEncoder(UTF_8, 1.1f, 4.0f) { + def encodeLoop(in: CharBuffer, out: ByteBuffer): CoderResult = { + if (in.hasArray && out.hasArray) + encodeLoopArray(in, out) + else + encodeLoopNoArray(in, out) + } + + private def encodeLoopArray(in: CharBuffer, out: ByteBuffer): CoderResult = { + val inArray = in.array + val inOffset = in.arrayOffset + val inStart = in.position + inOffset + val inEnd = in.limit + inOffset + + val outArray = out.array + val outOffset = out.arrayOffset + val outStart = out.position + outOffset + val outEnd = out.limit + outOffset + + @inline + @tailrec + def loop(inPos: Int, outPos: Int): CoderResult = { + @inline + def finalize(result: CoderResult): CoderResult = { + in.position(inPos - inOffset) + out.position(outPos - outOffset) + result + } + + if (inPos == inEnd) { + finalize(CoderResult.UNDERFLOW) + } else { + val c1 = inArray(inPos) + + if (c1 < 0x80) { + // Encoding in one byte + if (outPos == outEnd) + finalize(CoderResult.OVERFLOW) + else { + outArray(outPos) = c1.toByte + loop(inPos+1, outPos+1) + } + } else if (c1 < 0x800) { + // Encoding in 2 bytes (by construction, not a surrogate) + if (outPos + 2 > outEnd) + finalize(CoderResult.OVERFLOW) + else { + outArray(outPos) = ((c1 >> 6) | 0xc0).toByte + outArray(outPos+1) = ((c1 & 0x3f) | 0x80).toByte + loop(inPos+1, outPos+2) + } + } else if (!isSurrogate(c1)) { + // Not a surrogate, encoding in 3 bytes + if (outPos + 3 > outEnd) + finalize(CoderResult.OVERFLOW) + else { + outArray(outPos) = ((c1 >> 12) | 0xe0).toByte + outArray(outPos+1) = (((c1 >> 6) & 0x3f) | 0x80).toByte + outArray(outPos+2) = ((c1 & 0x3f) | 0x80).toByte + loop(inPos+1, outPos+3) + } + } else if (isHighSurrogate(c1)) { + // Should have a low surrogate that follows + if (inPos + 1 == inEnd) + finalize(CoderResult.UNDERFLOW) + else { + val c2 = inArray(inPos+1) + if (!isLowSurrogate(c2)) { + finalize(CoderResult.malformedForLength(1)) + } else { + // Surrogate pair, encoding in 4 bytes + if (outPos + 4 > outEnd) + finalize(CoderResult.OVERFLOW) + else { + val cp = toCodePoint(c1, c2) + outArray(outPos) = ((cp >> 18) | 0xf0).toByte + outArray(outPos+1) = (((cp >> 12) & 0x3f) | 0x80).toByte + outArray(outPos+2) = (((cp >> 6) & 0x3f) | 0x80).toByte + outArray(outPos+3) = ((cp & 0x3f) | 0x80).toByte + loop(inPos+2, outPos+4) + } + } + } + } else { + finalize(CoderResult.malformedForLength(1)) + } + } + } + + loop(inStart, outStart) + } + + private def encodeLoopNoArray(in: CharBuffer, out: ByteBuffer): CoderResult = { + @inline + @tailrec + def loop(): CoderResult = { + @inline + def finalize(read: Int, result: CoderResult): CoderResult = { + in.position(in.position - read) + result + } + + if (!in.hasRemaining) { + CoderResult.UNDERFLOW + } else { + val c1 = in.get() + + if (c1 < 0x80) { + // Encoding in one byte + if (!out.hasRemaining) + finalize(1, CoderResult.OVERFLOW) + else { + out.put(c1.toByte) + loop() + } + } else if (c1 < 0x800) { + // Encoding in 2 bytes (by construction, not a surrogate) + if (out.remaining < 2) + finalize(1, CoderResult.OVERFLOW) + else { + out.put(((c1 >> 6) | 0xc0).toByte) + out.put(((c1 & 0x3f) | 0x80).toByte) + loop() + } + } else if (!isSurrogate(c1)) { + // Not a surrogate, encoding in 3 bytes + if (out.remaining < 3) + finalize(1, CoderResult.OVERFLOW) + else { + out.put(((c1 >> 12) | 0xe0).toByte) + out.put((((c1 >> 6) & 0x3f) | 0x80).toByte) + out.put(((c1 & 0x3f) | 0x80).toByte) + loop() + } + } else if (isHighSurrogate(c1)) { + // Should have a low surrogate that follows + if (!in.hasRemaining) + finalize(1, CoderResult.UNDERFLOW) + else { + val c2 = in.get() + if (!isLowSurrogate(c2)) { + finalize(2, CoderResult.malformedForLength(1)) + } else { + // Surrogate pair, encoding in 4 bytes + if (out.remaining < 4) + finalize(2, CoderResult.OVERFLOW) + else { + val cp = toCodePoint(c1, c2) + out.put(((cp >> 18) | 0xf0).toByte) + out.put((((cp >> 12) & 0x3f) | 0x80).toByte) + out.put((((cp >> 6) & 0x3f) | 0x80).toByte) + out.put(((cp & 0x3f) | 0x80).toByte) + loop() + } + } + } + } else { + finalize(1, CoderResult.malformedForLength(1)) + } + } + } + + loop() + } + } + + private final val SurrogateMask = 0xf800 // 11111 0 00 00000000 + private final val SurrogateID = 0xd800 // 11011 0 00 00000000 + + @inline private def isSurrogate(c: Char): Boolean = + (c & SurrogateMask) == SurrogateID +} |