From 0c9b2feed5d5d253181a49beb1e806ba9df3bd1b Mon Sep 17 00:00:00 2001
From: Mark Hatton
Date: Thu, 11 Jun 2015 19:14:11 +0100
Subject: = Fix decoding of 4-byte UTF-8 characters into UTF-16 surrogate pairs

---
 src/main/scala/spray/json/JsonParser.scala | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'src/main/scala')

diff --git a/src/main/scala/spray/json/JsonParser.scala b/src/main/scala/spray/json/JsonParser.scala
index 855ab21..b1e59d5 100644
--- a/src/main/scala/spray/json/JsonParser.scala
+++ b/src/main/scala/spray/json/JsonParser.scala
@@ -271,7 +271,7 @@ object ParserInput {
    */
   class ByteArrayBasedParserInput(bytes: Array[Byte]) extends DefaultParserInput {
     private val byteBuffer = ByteBuffer.allocate(4)
-    private val charBuffer = CharBuffer.allocate(1) // we currently don't support surrogate pairs!
+    private val charBuffer = CharBuffer.allocate(2)
     private val decoder = UTF8.newDecoder()
     def nextChar() = {
       _cursor += 1
@@ -289,20 +289,26 @@ object ParserInput {
           charBuffer.flip()
           val result = if (coderResult.isUnderflow & charBuffer.hasRemaining) charBuffer.get() else ErrorChar
           byteBuffer.clear()
-          charBuffer.clear()
+          if (!charBuffer.hasRemaining) charBuffer.clear()
           result
         }
       }
 
-      _cursor += 1
-      if (_cursor < bytes.length) {
-        val byte = bytes(_cursor)
-        if (byte >= 0) byte.toChar // 7-Bit ASCII
-        else if ((byte & 0xE0) == 0xC0) decode(byte, 1) // 2-byte UTF-8 sequence
-        else if ((byte & 0xF0) == 0xE0) decode(byte, 2) // 3-byte UTF-8 sequence
-        else if ((byte & 0xF8) == 0xF0) decode(byte, 3) // 4-byte UTF-8 sequence, will probably produce an (unsupported) surrogate pair
-        else ErrorChar
-      } else EOI
+      if (charBuffer.position() > 0) {
+        val result = charBuffer.get()
+        charBuffer.clear()
+        result
+      } else {
+        _cursor += 1
+        if (_cursor < bytes.length) {
+          val byte = bytes(_cursor)
+          if (byte >= 0) byte.toChar // 7-Bit ASCII
+          else if ((byte & 0xE0) == 0xC0) decode(byte, 1) // 2-byte UTF-8 sequence
+          else if ((byte & 0xF0) == 0xE0) decode(byte, 2) // 3-byte UTF-8 sequence
+          else if ((byte & 0xF8) == 0xF0) decode(byte, 3) // 4-byte UTF-8 sequence
+          else ErrorChar
+        } else EOI
+      }
     }
     def length = bytes.length
     def sliceString(start: Int, end: Int) = new String(bytes, start, end - start, UTF8)
-- 
cgit v1.2.3