From 43a2319910f1c58c044aadbbb726bdc8a9f9229c Mon Sep 17 00:00:00 2001 From: Mathias Date: Thu, 18 Sep 2014 13:33:50 +0200 Subject: Switch to fast, hand-written parser, remove parboiled dependency Closes #86, #108 --- src/main/scala/spray/json/JsValue.scala | 2 +- src/main/scala/spray/json/JsonParser.scala | 335 ++++++++++++++++++++----- src/test/scala/spray/json/JsonParserSpec.scala | 44 ++-- src/test/scala/spray/json/RoundTripSpecs.scala | 4 +- 4 files changed, 295 insertions(+), 90 deletions(-) (limited to 'src') diff --git a/src/main/scala/spray/json/JsValue.scala b/src/main/scala/spray/json/JsValue.scala index 8b25c98..7dfb06f 100644 --- a/src/main/scala/spray/json/JsValue.scala +++ b/src/main/scala/spray/json/JsValue.scala @@ -20,7 +20,6 @@ package spray.json import collection.immutable.ListMap - /** * The general type of a JSON AST node. */ @@ -89,6 +88,7 @@ object JsNumber { } def apply(n: BigInt) = new JsNumber(BigDecimal(n)) def apply(n: String) = new JsNumber(BigDecimal(n)) + def apply(n: Array[Char]) = new JsNumber(BigDecimal(n)) } /** diff --git a/src/main/scala/spray/json/JsonParser.scala b/src/main/scala/spray/json/JsonParser.scala index 0b408e8..36aa338 100644 --- a/src/main/scala/spray/json/JsonParser.scala +++ b/src/main/scala/spray/json/JsonParser.scala @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009-2011 Mathias Doenitz + * Copyright (C) 2014 Mathias Doenitz * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,106 +16,307 @@ package spray.json -import org.parboiled.scala._ -import org.parboiled.errors.{ErrorUtils, ParsingException} -import org.parboiled.Context -import java.lang.StringBuilder +import java.lang.{StringBuilder => JStringBuilder} +import java.nio.{CharBuffer, ByteBuffer} +import java.nio.charset.Charset +import scala.annotation.{switch, tailrec} +import scala.collection.immutable.ListMap /** - * This JSON parser is the almost direct implementation of the JSON grammar - * presented at http://www.json.org as a parboiled PEG parser. + * Fast, no-dependency parser for JSON as defined by http://tools.ietf.org/html/rfc4627. */ -object JsonParser extends Parser { +object JsonParser { + def apply(input: ParserInput): JsValue = new JsonParser(input).parseJsValue() - // the root rule - lazy val Json = rule { WhiteSpace ~ Value ~ EOI } + class ParsingException(val summary: String, val detail: String = "") + extends RuntimeException(if (summary.isEmpty) detail else if (detail.isEmpty) summary else summary + ": " + detail) +} - def JsonObject: Rule1[JsObject] = rule { - "{ " ~ zeroOrMore(Pair, separator = ", ") ~ "} " ~~> (JsObject(_ :_*)) - } +class JsonParser(input: ParserInput) { + import JsonParser.ParsingException - def Pair = rule { JsonStringUnwrapped ~ ": " ~ Value ~~> ((_, _)) } + private[this] val sb = new JStringBuilder + private[this] var cursorChar: Char = input.nextChar() + private[this] var jsValue: JsValue = _ - def Value: Rule1[JsValue] = rule { - JsonString | JsonNumber | JsonObject | JsonArray | JsonTrue | JsonFalse | JsonNull + def parseJsValue(): JsValue = { + ws() + `value`() + jsValue } - def JsonString = rule { JsonStringUnwrapped ~~> (JsString(_)) } - - def JsonStringUnwrapped = rule { "\"" ~ Characters ~ "\" " ~~> (_.toString) } + ////////////////////// GRAMMAR //////////////////////// - def JsonNumber = rule { group(Integer ~ optional(Frac) ~ optional(Exp)) ~> (JsNumber(_)) ~ WhiteSpace } + private final val EOI = '\uFFFF' // compile-time constant - def JsonArray = rule { "[ " ~ zeroOrMore(Value, separator = ", ") ~ "] " ~~> (JsArray(_)) } + // http://tools.ietf.org/html/rfc4627#section-2.1 + private def `value`(): Unit = { + val mark = input.cursor + def simpleValue(matched: Boolean, value: JsValue) = if (matched) jsValue = value else fail("JSON Value", mark) + (cursorChar: @switch) match { + case 'f' => simpleValue(`false`(), JsFalse) + case 'n' => simpleValue(`null`(), JsNull) + case 't' => simpleValue(`true`(), JsTrue) + case '{' => advance(); `object`() + case '[' => advance(); `array`() + case '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' | '-' => `number`() + case '"' => `string`(); jsValue = JsString(sb.toString) + case _ => fail("JSON Value") + } + } - def Characters = rule { push(new StringBuilder) ~ zeroOrMore("\\" ~ EscapedChar | NormalChar) } - - def EscapedChar = rule ( - anyOf("\"\\/") ~:% withContext(appendToSb(_)(_)) - | "b" ~ appendToSb('\b') - | "f" ~ appendToSb('\f') - | "n" ~ appendToSb('\n') - | "r" ~ appendToSb('\r') - | "t" ~ appendToSb('\t') - | Unicode ~~% withContext((code, ctx) => appendToSb(code.asInstanceOf[Char])(ctx)) - ) + private def `false`() = advance() && ch('a') && ch('l') && ch('s') && ws('e') + private def `null`() = advance() && ch('u') && ch('l') && ws('l') + private def `true`() = advance() && ch('r') && ch('u') && ws('e') - def NormalChar = rule { !anyOf("\"\\") ~ ANY ~:% (withContext(appendToSb(_)(_))) } + // http://tools.ietf.org/html/rfc4627#section-2.2 + private def `object`(): Unit = { + ws() + var map = ListMap.empty[String, JsValue] + @tailrec def members(): Unit = { + `string`() + require(':') + ws() + val key = sb.toString + `value`() + map = map.updated(key, jsValue) + if (ws(',')) members() + } + if (cursorChar != '}') members() + require('}') + ws() + jsValue = JsObject(map) + } - def Unicode = rule { "u" ~ group(HexDigit ~ HexDigit ~ HexDigit ~ HexDigit) ~> (java.lang.Integer.parseInt(_, 16)) } + // http://tools.ietf.org/html/rfc4627#section-2.3 + private def `array`(): Unit = { + ws() + var list = List.newBuilder[JsValue] + @tailrec def values(): Unit = { + `value`() + list += jsValue + if (ws(',')) values() + } + if (cursorChar != ']') values() + require(']') + ws() + jsValue = JsArray(list.result()) + } - def Integer = rule { optional("-") ~ (("1" - "9") ~ Digits | Digit) } + // http://tools.ietf.org/html/rfc4627#section-2.4 + private def `number`() = { + val start = input.cursor + ch('-') + `int`() + `frac`() + `exp`() + jsValue = JsNumber(input.sliceCharArray(start, input.cursor)) + ws() + } - def Digits = rule { oneOrMore(Digit) } + private def `int`(): Unit = if (!ch('0')) oneOrMoreDigits() + private def `frac`(): Unit = if (ch('.')) oneOrMoreDigits() + private def `exp`(): Unit = if ((ch('e') || ch('E')) && (ch('-') || ch('+') || true)) oneOrMoreDigits() - def Digit = rule { "0" - "9" } + private def oneOrMoreDigits(): Unit = if (DIGIT()) zeroOrMoreDigits() else fail("DIGIT") + @tailrec private def zeroOrMoreDigits(): Unit = if (DIGIT()) zeroOrMoreDigits() - def HexDigit = rule { "0" - "9" | "a" - "f" | "A" - "F" } + private def DIGIT(): Boolean = cursorChar >= '0' && cursorChar <= '9' && advance() - def Frac = rule { "." ~ Digits } + // http://tools.ietf.org/html/rfc4627#section-2.5 + private def `string`(): Unit = { + require('"') + sb.setLength(0) + while (`char`()) cursorChar = input.nextUtf8Char() + require('"') + ws() + } - def Exp = rule { ignoreCase("e") ~ optional(anyOf("+-")) ~ Digits } + private def `char`() = + cursorChar match { + case '"' => false + case '\\' => advance(); `escaped`() + case c if cursorChar >= ' ' => appendSB(cursorChar) + case _ => false + } + + private def `escaped`() = { + def hexValue(c: Char): Int = + if ('0' <= c && c <= '9') c - '0' + else if ('a' <= c && c <= 'f') c - 87 + else if ('A' <= c && c <= 'F') c - 55 + else fail("hex digit") + def unicode() = { + var value = hexValue(cursorChar) + advance() + value = (value << 4) + hexValue(cursorChar) + advance() + value = (value << 4) + hexValue(cursorChar) + advance() + value = (value << 4) + hexValue(cursorChar) + appendSB(value.toChar) + } + (cursorChar: @switch) match { + case '"' | '/' | '\\' => appendSB(cursorChar) + case 'b' => appendSB('\b') + case 'f' => appendSB('\f') + case 'n' => appendSB('\n') + case 'r' => appendSB('\r') + case 't' => appendSB('\t') + case 'u' => advance(); unicode() + case _ => fail("JSON escape sequence") + } + } - def JsonTrue = rule { "true " ~ push(JsTrue) } + @tailrec private def ws(): Unit = + // fast test whether cursorChar is one of " \n\r\t" + if (((1L << cursorChar) & ((cursorChar - 64) >> 31) & 0x100002600L) != 0L) { advance(); ws() } - def JsonFalse = rule { "false " ~ push(JsFalse) } + ////////////////////////// HELPERS ////////////////////////// - def JsonNull = rule { "null " ~ push(JsNull) } + private def ch(c: Char): Boolean = if (cursorChar == c) { advance(); true } else false + private def ws(c: Char): Boolean = if (ch(c)) { ws(); true } else false + private def advance(): Boolean = { cursorChar = input.nextChar(); true } + private def appendSB(c: Char): Boolean = { sb.append(c); true } + private def require(c: Char): Unit = if (!ch(c)) fail(s"'$c'") - def WhiteSpace: Rule0 = rule { zeroOrMore(anyOf(" \n\r\t\f")) } - - // helper method for fast string building - // for maximum performance we use a somewhat unorthodox parsing technique that is a bit more verbose (and somewhat - // less readable) but reduces object allocations during the parsing run to a minimum: - // the Characters rules pushes a StringBuilder object onto the stack which is then directly fed with matched - // and unescaped characters in the sub rules (i.e. no string allocations and value stack operation required) - def appendToSb(c: Char): Context[Any] => Unit = { ctx => - ctx.getValueStack.peek.asInstanceOf[StringBuilder].append(c) - () + private def fail(target: String, cursor: Int = input.cursor, errorChar: Char = cursorChar): Nothing = { + val ParserInput.Line(lineNr, col, text) = input.getLine(cursor) + val summary = { + val unexpected = + if (errorChar != EOI) { + val c = if (Character.isISOControl(errorChar)) "\\u%04x" format errorChar.toInt else errorChar.toString + s"character '$c'" + } else "end-of-input" + s"Unexpected $unexpected at input index $cursor (line $lineNr, position $col), expected $target" + } + val detail = { + val sanitizedText = text.map(c ⇒ if (Character.isISOControl(c)) '?' else c) + s"\n$sanitizedText\n${" " * col}^\n" + } + throw new ParsingException(summary, detail) } +} +trait ParserInput { /** - * We redefine the default string-to-rule conversion to also match trailing whitespace if the string ends with - * a blank, this keeps the rules free from most whitespace matching clutter + * Advance the cursor and get the next char. + * Since the char is required to be a 7-Bit ASCII char no decoding is required. */ - override implicit def toRule(string: String) = { - if (string.endsWith(" ")) str(string.trim) ~ WhiteSpace - else str(string) - } + def nextChar(): Char /** - * The main parsing method. Uses a ReportingParseRunner (which only reports the first error) for simplicity. + * Advance the cursor and get the next char, which could potentially be outside + * of the 7-Bit ASCII range. Therefore decoding might be required. */ - def apply(json: String): JsValue = apply(json.toCharArray) - + def nextUtf8Char(): Char + + def cursor: Int + def length: Int + def sliceString(start: Int, end: Int): String + def sliceCharArray(start: Int, end: Int): Array[Char] + def getLine(index: Int): ParserInput.Line +} + +object ParserInput { + private final val EOI = '\uFFFF' // compile-time constant + private final val ErrorChar = '\uFFFD' // compile-time constant, universal UTF-8 replacement character '�' + + implicit def apply(string: String): StringBasedParserInput = new StringBasedParserInput(string) + implicit def apply(chars: Array[Char]): CharArrayBasedParserInput = new CharArrayBasedParserInput(chars) + implicit def apply(bytes: Array[Byte]): ByteArrayBasedParserInput = new ByteArrayBasedParserInput(bytes) + + case class Line(lineNr: Int, column: Int, text: String) + + abstract class DefaultParserInput extends ParserInput { + protected var _cursor: Int = -1 + def cursor = _cursor + def getLine(index: Int): Line = { + val sb = new java.lang.StringBuilder + @tailrec def rec(ix: Int, lineStartIx: Int, lineNr: Int): Line = + nextUtf8Char() match { + case '\n' if index > ix => sb.setLength(0); rec(ix + 1, ix + 1, lineNr + 1) + case '\n' | EOI => Line(lineNr, index - lineStartIx, sb.toString) + case c => sb.append(c); rec(ix + 1, lineStartIx, lineNr) + } + val savedCursor = _cursor + _cursor = 0 + val line = rec(ix = 0, lineStartIx = 0, lineNr = 1) + _cursor = savedCursor + line + } + } + + private val UTF8 = Charset.forName("UTF-8") + /** - * The main parsing method. Uses a ReportingParseRunner (which only reports the first error) for simplicity. + * ParserInput reading directly off a byte array which is assumed to contain the UTF-8 encoded represenation + * of the JSON input, without requiring a separate decoding step. */ - def apply(json: Array[Char]): JsValue = { - val parsingResult = ReportingParseRunner(Json).run(json) - parsingResult.result.getOrElse { - throw new ParsingException("Invalid JSON source:\n" + ErrorUtils.printParseErrors(parsingResult)) + class ByteArrayBasedParserInput(bytes: Array[Byte]) extends DefaultParserInput { + private val byteBuffer = ByteBuffer.allocate(4) + private val charBuffer = CharBuffer.allocate(1) // we currently don't support surrogate pairs! + private val decoder = UTF8.newDecoder() + def nextChar() = { + _cursor += 1 + if (_cursor < bytes.length) (bytes(_cursor) & 0xFF).toChar else EOI + } + def nextUtf8Char() = { + @tailrec def decode(byte: Byte, remainingBytes: Int): Char = { + byteBuffer.put(byte) + if (remainingBytes > 0) { + _cursor += 1 + if (_cursor < bytes.length) decode(bytes(_cursor), remainingBytes - 1) else ErrorChar + } else { + byteBuffer.flip() + val coderResult = decoder.decode(byteBuffer, charBuffer, false) + charBuffer.flip() + val result = if (coderResult.isUnderflow & charBuffer.hasRemaining) charBuffer.get() else ErrorChar + byteBuffer.clear() + charBuffer.clear() + result + } + } + + _cursor += 1 + if (_cursor < bytes.length) { + val byte = bytes(_cursor) + if (byte >= 0) byte.toChar // 7-Bit ASCII + else if ((byte & 0xE0) == 0xC0) decode(byte, 1) // 2-byte UTF-8 sequence + else if ((byte & 0xF0) == 0xE0) decode(byte, 2) // 3-byte UTF-8 sequence + else if ((byte & 0xF8) == 0xF0) decode(byte, 3) // 4-byte UTF-8 sequence, will probably produce an (unsupported) surrogate pair + else ErrorChar + } else EOI + } + def length = bytes.length + def sliceString(start: Int, end: Int) = new String(bytes, start, end - start, UTF8) + def sliceCharArray(start: Int, end: Int) = + UTF8.decode(ByteBuffer.wrap(java.util.Arrays.copyOfRange(bytes, start, end))).array() + } + + class StringBasedParserInput(string: String) extends DefaultParserInput { + def nextChar(): Char = { + _cursor += 1 + if (_cursor < string.length) string.charAt(_cursor) else EOI + } + def nextUtf8Char() = nextChar() + def length = string.length + def sliceString(start: Int, end: Int) = string.substring(start, end) + def sliceCharArray(start: Int, end: Int) = { + val chars = new Array[Char](end - start) + string.getChars(start, end, chars, 0) + chars } } + class CharArrayBasedParserInput(chars: Array[Char]) extends DefaultParserInput { + def nextChar(): Char = { + _cursor += 1 + if (_cursor < chars.length) chars(_cursor) else EOI + } + def nextUtf8Char() = nextChar() + def length = chars.length + def sliceString(start: Int, end: Int) = new String(chars, start, end - start) + def sliceCharArray(start: Int, end: Int) = java.util.Arrays.copyOfRange(chars, start, end) + } } \ No newline at end of file diff --git a/src/test/scala/spray/json/JsonParserSpec.scala b/src/test/scala/spray/json/JsonParserSpec.scala index c6c1589..608898f 100644 --- a/src/test/scala/spray/json/JsonParserSpec.scala +++ b/src/test/scala/spray/json/JsonParserSpec.scala @@ -17,55 +17,61 @@ package spray.json import org.specs2.mutable._ -import org.parboiled.common.FileUtils class JsonParserSpec extends Specification { "The JsonParser" should { "parse 'null' to JsNull" in { - JsonParser("null") mustEqual JsNull + JsonParser("null") === JsNull } "parse 'true' to JsTrue" in { - JsonParser("true") mustEqual JsTrue + JsonParser("true") === JsTrue } "parse 'false' to JsFalse" in { - JsonParser("false") mustEqual JsFalse + JsonParser("false") === JsFalse } "parse '0' to JsNumber" in { - JsonParser("0") mustEqual JsNumber(0) + JsonParser("0") === JsNumber(0) } "parse '1.23' to JsNumber" in { - JsonParser("1.23") mustEqual JsNumber(1.23) + JsonParser("1.23") === JsNumber(1.23) } "parse '-1E10' to JsNumber" in { - JsonParser("-1E10") mustEqual JsNumber("-1E+10") + JsonParser("-1E10") === JsNumber("-1E+10") } "parse '12.34e-10' to JsNumber" in { - JsonParser("12.34e-10") mustEqual JsNumber("1.234E-9") + JsonParser("12.34e-10") === JsNumber("1.234E-9") } "parse \"xyz\" to JsString" in { - JsonParser("\"xyz\"") mustEqual JsString("xyz") + JsonParser("\"xyz\"") === JsString("xyz") } "parse escapes in a JsString" in { - JsonParser(""""\"\\/\b\f\n\r\t"""") mustEqual JsString("\"\\/\b\f\n\r\t") - JsonParser("\"L\\" + "u00e4nder\"") mustEqual JsString("Länder") + JsonParser(""""\"\\/\b\f\n\r\t"""") === JsString("\"\\/\b\f\n\r\t") + JsonParser("\"L\\" + "u00e4nder\"") === JsString("Länder") } "parse all representations of the slash (SOLIDUS) character in a JsString" in { - JsonParser( "\"" + "/\\/\\u002f" + "\"") mustEqual JsString("///") + JsonParser( "\"" + "/\\/\\u002f" + "\"") === JsString("///") } - "properly parse a simple JsObject" in ( - JsonParser(""" { "key" :42, "key2": "value" }""") mustEqual + "parse a simple JsObject" in ( + JsonParser(""" { "key" :42, "key2": "value" }""") === JsObject("key" -> JsNumber(42), "key2" -> JsString("value")) ) - "properly parse a simple JsArray" in ( - JsonParser("""[null, 1.23 ,{"key":true } ] """) mustEqual - JsArray(JsNull, JsNumber(1.23), JsObject("key" -> JsBoolean(true))) + "parse a simple JsArray" in ( + JsonParser("""[null, 1.23 ,{"key":true } ] """) === + JsArray(JsNull, JsNumber(1.23), JsObject("key" -> JsTrue)) ) + "parse directly from UTF-8 encoded bytes" in { + val json = JsObject( + "7-bit" -> JsString("This is regular 7-bit ASCII text."), + "2-bytes" -> JsString("2-byte UTF-8 chars like £, æ or Ö"), + "3-bytes" -> JsString("3-byte UTF-8 chars like ヨ, ᄅ or ᐁ.")) + JsonParser(json.prettyPrint.getBytes("UTF-8")) === json + } "be reentrant" in { - val largeJsonSource = FileUtils.readAllCharsFromResource("test.json") + val largeJsonSource = scala.io.Source.fromInputStream(getClass.getResourceAsStream("/test.json")).mkString List.fill(20)(largeJsonSource).par.map(JsonParser(_)).toList.map { _.asInstanceOf[JsObject].fields("questions").asInstanceOf[JsArray].elements.size - } mustEqual List.fill(20)(100) + } === List.fill(20)(100) } } diff --git a/src/test/scala/spray/json/RoundTripSpecs.scala b/src/test/scala/spray/json/RoundTripSpecs.scala index 51df48d..d9e16c1 100644 --- a/src/test/scala/spray/json/RoundTripSpecs.scala +++ b/src/test/scala/spray/json/RoundTripSpecs.scala @@ -8,9 +8,7 @@ object JsValueGenerators { import Gen._ import Arbitrary.arbitrary - // some characters have special meaning in parboiled - // see org.parboiled.support.Chars, we have to exclude those - val parseableString: Gen[String] = arbitrary[String].map(_.filterNot(_ > 0xfd00)) + val parseableString: Gen[String] = arbitrary[String] val genString: Gen[JsString] = parseableString.map(JsString(_)) val genBoolean: Gen[JsBoolean] = oneOf(JsFalse, JsTrue) val genLongNumber: Gen[JsNumber] = arbitrary[Long].map(JsNumber(_)) -- cgit v1.2.3