From 135d4f06b174aa585af64b5253aba647982ac4a2 Mon Sep 17 00:00:00 2001 From: Paul Phillips Date: Mon, 18 Jan 2010 21:18:36 +0000 Subject: More work consolidating the XML code needlessly... More work consolidating the XML code needlessly duplicated between the compiler and the library. Having to fix #2354 in two completely different places was I found very motivating. --- src/library/scala/xml/parsing/MarkupParser.scala | 178 +++----------------- .../scala/xml/parsing/MarkupParserCommon.scala | 180 ++++++++++++++++++++- 2 files changed, 192 insertions(+), 166 deletions(-) (limited to 'src/library') diff --git a/src/library/scala/xml/parsing/MarkupParser.scala b/src/library/scala/xml/parsing/MarkupParser.scala index a15cd0f7e4..2f7f48c765 100644 --- a/src/library/scala/xml/parsing/MarkupParser.scala +++ b/src/library/scala/xml/parsing/MarkupParser.scala @@ -32,7 +32,13 @@ trait MarkupParser extends MarkupParserCommon with TokenTests self: MarkupParser with MarkupHandler => type PositionType = Int - type InputType = Source + type InputType = Source + type ElementType = NodeSeq + type AttributesType = (MetaData, NamespaceBinding) + type NamespaceType = NamespaceBinding + + def truncatedError(msg: String): Nothing = throw FatalError(msg) + def errorNoEnd(tag: String) = throw FatalError("expected closing tag of " + tag) def xHandleError(that: Char, msg: String) = reportSyntaxError(msg) @@ -106,8 +112,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests * // this is a bit more lenient than necessary... */ def prolog(): Tuple3[Option[String], Option[String], Option[Boolean]] = { - - //Console.println("(DEBUG) prolog") var n = 0 var info_ver: Option[String] = None var info_enc: Option[String] = None @@ -176,7 +180,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests if (m.length - n != 0) { reportSyntaxError("VersionInfo EncodingDecl? or '?>' expected!"); } - //Console.println("[MarkupParser::textDecl] finished parsing textdecl"); Tuple2(info_ver, info_enc); } @@ -190,8 +193,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests */ def document(): Document = { - - //Console.println("(DEBUG) document") doc = new Document() this.dtd = null @@ -204,7 +205,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests nextch // is prolog ? var children: NodeSeq = null if ('?' == ch) { - //Console.println("[MarkupParser::document] starts with xml declaration"); nextch; info_prolog = prolog() doc.version = info_prolog._1 @@ -212,10 +212,8 @@ trait MarkupParser extends MarkupParserCommon with TokenTests doc.standAlone = info_prolog._3 children = content(TopScope) // DTD handled as side effect - } else { - //Console.println("[MarkupParser::document] does not start with xml declaration"); - // - + } + else { val ts = new NodeBuffer(); content1(TopScope, ts); // DTD handled as side effect ts &+ content(TopScope); @@ -257,6 +255,14 @@ trait MarkupParser extends MarkupParserCommon with TokenTests this } + def ch_returning_nextch = { val res = ch ; nextch ; res } + def mkProcInstr(position: Int, name: String, text: String): NodeSeq = + handle.procInstr(position, name, text) + + def mkAttributes(name: String, pscope: NamespaceBinding) = + if (isNameStart (ch)) xAttributes(pscope) + else (Null, pscope) + /** this method assign the next character to ch and advances in input */ def nextch = { if (curInput.hasNext) { @@ -315,27 +321,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests (aMap,scope) } - /** attribute value, terminated by either ' or ". 
value may not contain <. - * AttValue ::= `'` { _ } `'` - * | `"` { _ } `"` - */ - def xAttributeValue(): String = { - val endch = ch - nextch - while (ch != endch) { - if ('<' == ch) - reportSyntaxError( "'<' not allowed in attrib value" ); - putChar(ch) - nextch - } - nextch - val str = cbuf.toString() - cbuf.length = 0 - - // well-formedness constraint - normalizeAttributeValue(str) - } - /** entity value, terminated by either ' or ". value may not contain <. * AttValue ::= `'` { _ } `'` * | `"` { _ } `"` @@ -353,35 +338,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests str } - - /** parse a start or empty tag. - * [40] STag ::= '<' Name { S Attribute } [S] - * [44] EmptyElemTag ::= '<' Name { S Attribute } [S] - */ - protected def xTag(pscope:NamespaceBinding): (String, MetaData, NamespaceBinding) = { - val qname = xName - - xSpaceOpt - val (aMap: MetaData, scope: NamespaceBinding) = { - if (isNameStart(ch)) - xAttributes(pscope) - else - (Null, pscope) - } - (qname, aMap, scope) - } - - /** [42] '<' xmlEndTag ::= '<' '/' Name S? '>' - */ - def xEndTag(n: String) = { - xToken('/') - val m = xName - if (n != m) - reportSyntaxError("expected closing tag of " + n/* +", not "+m*/); - xSpaceOpt - xToken('>') - } - /** '<! CharData ::= [CDATA[ ( {char} - {char}"]]>"{char} ) ']]>' * * see [15] @@ -392,14 +348,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests xTakeUntil(mkResult, () => pos, "]]>") } - /** CharRef ::= "&#" '0'..'9' {'0'..'9'} ";" - * | "&#x" '0'..'9'|'A'..'F'|'a'..'f' { hexdigit } ";" - * - * see [66] - */ - def xCharRef(ch: () => Char, nextch: () => Unit): String = - Utility.parseCharRef(ch, nextch, reportSyntaxError _) - /** Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' * * see [15] @@ -576,7 +524,7 @@ trait MarkupParser extends MarkupParserCommon with TokenTests */ def element1(pscope: NamespaceBinding): NodeSeq = { val pos = this.pos - val (qname, aMap, scope) = xTag(pscope) + val (qname, (aMap, scope)) = xTag(pscope) val (pre, local) = Utility.prefix(qname) match { case Some(p) => (p, qname drop p.length+1) case _ => (null, qname) @@ -600,50 +548,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests res } - //def xEmbeddedExpr: MarkupType; - - /** Name ::= (Letter | '_' | ':') (NameChar)* - * - * see [5] of XML 1.0 specification - */ - def xName: String = { - if (isNameStart(ch)) { - while (isNameChar(ch)) { - putChar(ch) - nextch - } - val n = cbuf.toString().intern() - cbuf.length = 0 - n - } else { - reportSyntaxError("name expected") - "" - } - } - - /** '<?' ProcInstr ::= Name [S ({Char} - ({Char}'>?' {Char})]'?>' - * - * see [15] - */ - def xProcInstr: NodeSeq = { - val sb:StringBuilder = new StringBuilder() - val n = xName - if (isSpace(ch)) { - xSpace - while (true) { - if (ch == '?' && { sb.append( ch ); nextch; ch == '>' }) { - sb.length = sb.length - 1; - nextch; - return handle.procInstr(tmppos, n, sb.toString); - } else - sb.append(ch); - nextch - } - }; - xToken("?>") - handle.procInstr(tmppos, n, sb.toString) - } - /** parse character data. 
* precondition: xEmbeddedBlock == false (we are not in a scala block) */ @@ -996,50 +900,4 @@ trait MarkupParser extends MarkupParserCommon with TokenTests pos = curInput.pos eof = false // must be false, because of places where entity refs occur } - - /** for the moment, replace only character references - * see spec 3.3.3 - * precond: cbuf empty - */ - def normalizeAttributeValue(attval: String): String = { - val s: Seq[Char] = attval - val it = s.iterator - while (it.hasNext) { - it.next match { - case ' '|'\t'|'\n'|'\r' => - cbuf.append(' '); - case '&' => it.next match { - case '#' => - var c = it.next - val s = xCharRef ({ () => c }, { () => c = it.next }) - cbuf.append(s) - case nchar => - val nbuf = new StringBuilder() - var d = nchar - do { - nbuf.append(d) - d = it.next - } while(d != ';'); - nbuf.toString() match { - case "lt" => cbuf.append('<') - case "gt" => cbuf.append('>') - case "amp" => cbuf.append('&') - case "apos" => cbuf.append('\'') - case "quot" => cbuf.append('"') - case "quote" => cbuf.append('"') - case name => - cbuf.append('&') - cbuf.append(name) - cbuf.append(';') - } - } - case c => - cbuf.append(c) - } - } - val name = cbuf.toString() - cbuf.length = 0 - name - } - } diff --git a/src/library/scala/xml/parsing/MarkupParserCommon.scala b/src/library/scala/xml/parsing/MarkupParserCommon.scala index 57c46c4685..ba1402d55f 100644 --- a/src/library/scala/xml/parsing/MarkupParserCommon.scala +++ b/src/library/scala/xml/parsing/MarkupParserCommon.scala @@ -11,30 +11,191 @@ package parsing import scala.io.Source import scala.xml.dtd._ +import scala.annotation.switch import Utility.Escapes.{ pairs => unescape } +object MarkupParserCommon { + final val SU = '\u001A' +} +import MarkupParserCommon._ + /** This is not a public trait - it contains common code shared * between the library level XML parser and the compiler's. * All members should be accessed through those. */ private[scala] trait MarkupParserCommon extends TokenTests { - private final val SU: Char = 0x1A protected def unreachable = Predef.error("Cannot be reached.") - // type HandleType // MarkupHandler, SymbolicXMLBuilder - + // type HandleType // MarkupHandler, SymbolicXMLBuilder type InputType // Source, CharArrayReader type PositionType // Int, Position + type ElementType // NodeSeq, Tree + type NamespaceType // NamespaceBinding, Any + type AttributesType // (MetaData, NamespaceBinding), mutable.Map[String, Tree] + + def mkAttributes(name: String, pscope: NamespaceType): AttributesType + def mkProcInstr(position: PositionType, name: String, text: String): ElementType + + /** parse a start or empty tag. + * [40] STag ::= '<' Name { S Attribute } [S] + * [44] EmptyElemTag ::= '<' Name { S Attribute } [S] + */ + protected def xTag(pscope: NamespaceType): (String, AttributesType) = { + val name = xName + xSpaceOpt + + (name, mkAttributes(name, pscope)) + } + + /** '?' {Char})]'?>' + * + * see [15] + */ + def xProcInstr: ElementType = { + val n = xName + xSpaceOpt + xTakeUntil(mkProcInstr(_, n, _), () => tmppos, "?>") + } + + /** attribute value, terminated by either ' or ". value may not contain <. 
+ * @param endch either ' or " + */ + def xAttributeValue(endCh: Char): String = { + val buf = new StringBuilder + while (ch != endCh) { + // well-formedness constraint + if (ch == '<') return errorAndResult("'<' not allowed in attrib value", "") + else if (ch == SU) truncatedError("") + else buf append ch_returning_nextch + } + ch_returning_nextch + // @todo: normalize attribute value + buf.toString + } + + def xAttributeValue(): String = { + val str = xAttributeValue(ch_returning_nextch) + // well-formedness constraint + normalizeAttributeValue(str) + } + + private def takeUntilChar(it: Iterator[Char], end: Char): String = { + val buf = new StringBuilder + while (it.hasNext) it.next match { + case `end` => return buf.toString + case ch => buf append ch + } + error("Expected '%s'".format(end)) + } + + /** [42] '<' xmlEndTag ::= '<' '/' Name S? '>' + */ + def xEndTag(startName: String) { + xToken('/') + if (xName != startName) + errorNoEnd(startName) + + xSpaceOpt + xToken('>') + } + + /** actually, Name ::= (Letter | '_' | ':') (NameChar)* but starting with ':' cannot happen + * Name ::= (Letter | '_') (NameChar)* + * + * see [5] of XML 1.0 specification + * + * pre-condition: ch != ':' // assured by definition of XMLSTART token + * post-condition: name does neither start, nor end in ':' + */ + def xName: String = { + if (ch == SU) + truncatedError("") + else if (!isNameStart(ch)) + return errorAndResult("name expected, but char '%s' cannot start a name" format ch, "") + + val buf = new StringBuilder + + do buf append ch_returning_nextch + while (isNameChar(ch)) + + if (buf.last == ':') { + reportSyntaxError( "name cannot end in ':'" ) + buf.toString dropRight 1 + } + else buf.toString + } + + private def attr_unescape(s: String) = s match { + case "lt" => "<" + case "gt" => ">" + case "amp" => "&" + case "apos" => "'" + case "quot" => "\"" + case "quote" => "\"" + case _ => "&" + s + ";" + } + + /** Replaces only character references right now. + * see spec 3.3.3 + */ + private def normalizeAttributeValue(attval: String): String = { + val buf = new StringBuilder + val it = attval.iterator.buffered + + while (it.hasNext) buf append (it.next match { + case ' ' | '\t' | '\n' | '\r' => " " + case '&' if it.head == '#' => it.next ; xCharRef(it) + case '&' => attr_unescape(takeUntilChar(it, ';')) + case c => c + }) + + buf.toString + } + + /** CharRef ::= "&#" '0'..'9' {'0'..'9'} ";" + * | "&#x" '0'..'9'|'A'..'F'|'a'..'f' { hexdigit } ";" + * + * see [66] + */ + def xCharRef(ch: () => Char, nextch: () => Unit): String = + Utility.parseCharRef(ch, nextch, reportSyntaxError _) + + def xCharRef(it: Iterator[Char]): String = { + var c = it.next + Utility.parseCharRef(() => c, () => { c = it.next }, reportSyntaxError _) + } + + def xCharRef: String = xCharRef(() => ch, () => nextch) /** Create a lookahead reader which does not influence the input */ def lookahead(): BufferedIterator[Char] + /** The library and compiler parsers had the interesting distinction of + * different behavior for nextch (a function for which there are a total + * of two plausible behaviors, so we know the design space was fully + * explored.) One of them returned the value of nextch before the increment + * and one of them the new value. So to unify code we have to at least + * temporarily abstract over the nextchs. 
+ */ def ch: Char def nextch: Char + def ch_returning_nextch: Char + def eof: Boolean + + // def handle: HandleType + var tmppos: PositionType + def xHandleError(that: Char, msg: String): Unit def reportSyntaxError(str: String): Unit def reportSyntaxError(pos: Int, str: String): Unit - def eof: Boolean + + def truncatedError(msg: String): Nothing + def errorNoEnd(tag: String): Nothing + + protected def errorAndResult[T](msg: String, x: T): T = { + reportSyntaxError(msg) + x + } def xToken(that: Char) { if (ch == that) nextch @@ -53,9 +214,16 @@ private[scala] trait MarkupParserCommon extends TokenTests { if (isSpace(ch)) { nextch; xSpaceOpt } else xHandleError(ch, "whitespace expected") - // + /** Apply a function and return the passed value */ def returning[T](x: T)(f: T => Unit): T = { f(x) ; x } + /** Execute body with a variable saved and restored after execution */ + def saving[A,B](getter: A, setter: (A) => Unit)(body: => B): B = { + val saved = getter + try body + finally setter(saved) + } + /** Take characters from input stream until given String "until" * is seen. Once seen, the accumulated characters are passed * along with the current Position to the supplied handler function. @@ -73,7 +241,7 @@ private[scala] trait MarkupParserCommon extends TokenTests { if (ch == head && peek(rest)) return handler(positioner(), sb.toString) else if (ch == SU) - xHandleError(ch, "") // throws TruncatedXML in compiler + truncatedError("") // throws TruncatedXML in compiler sb append ch nextch -- cgit v1.2.3