More work consolidating the XML code needlessly...

More work consolidating the XML code needlessly duplicated between the compiler and the library. Having to fix #2354 in two completely different places was, I found, very motivating.
author: Paul Phillips <paulp@improving.org> 2010-01-18 21:18:36 +0000
committer: Paul Phillips <paulp@improving.org> 2010-01-18 21:18:36 +0000
commit: 135d4f06b174aa585af64b5253aba647982ac4a2 (patch)
tree: 73ca84ae254f4903feef03f7573172f992ca7e99 /src/library
parent: e83ad1e005d40738f87da8bb2d60cf9035cfb6ca (diff)
download: scala-135d4f06b174aa585af64b5253aba647982ac4a2.tar.gz
scala-135d4f06b174aa585af64b5253aba647982ac4a2.tar.bz2
scala-135d4f06b174aa585af64b5253aba647982ac4a2.zip
2 files changed, 192 insertions, 166 deletions
diff --git a/src/library/scala/xml/parsing/MarkupParser.scala b/src/library/scala/xml/parsing/MarkupParser.scala
index a15cd0f7e4..2f7f48c765 100644
--- a/src/library/scala/xml/parsing/MarkupParser.scala
+++ b/src/library/scala/xml/parsing/MarkupParser.scala
@@ -32,7 +32,13 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
   self: MarkupParser with MarkupHandler =>
 
   type PositionType = Int
-  type InputType = Source
+  type InputType    = Source
+  type ElementType  = NodeSeq
+  type AttributesType = (MetaData, NamespaceBinding)
+  type NamespaceType = NamespaceBinding
+
+  def truncatedError(msg: String): Nothing = throw FatalError(msg)
+  def errorNoEnd(tag: String) = throw FatalError("expected closing tag of " + tag)
 
   def xHandleError(that: Char, msg: String) = reportSyntaxError(msg)
 
@@ -106,8 +112,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
    *  // this is a bit more lenient than necessary...
    */
   def prolog(): Tuple3[Option[String], Option[String], Option[Boolean]] = {
-
-    //Console.println("(DEBUG) prolog")
     var n = 0
     var info_ver: Option[String] = None
     var info_enc: Option[String] = None
@@ -176,7 +180,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
     if (m.length - n != 0) {
       reportSyntaxError("VersionInfo EncodingDecl? or '?>' expected!");
     }
-    //Console.println("[MarkupParser::textDecl] finished parsing textdecl");
     Tuple2(info_ver, info_enc);
   }
 
@@ -190,8 +193,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
    */
 
   def document(): Document = {
-
-    //Console.println("(DEBUG) document")
     doc = new Document()
 
     this.dtd = null
@@ -204,7 +205,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
     nextch // is prolog ?
     var children: NodeSeq = null
     if ('?' == ch) {
-      //Console.println("[MarkupParser::document] starts with xml declaration");
       nextch;
       info_prolog = prolog()
       doc.version    = info_prolog._1
@@ -212,10 +212,8 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
       doc.standAlone = info_prolog._3
 
       children = content(TopScope) // DTD handled as side effect
-    } else {
-      //Console.println("[MarkupParser::document] does not start with xml declaration");
- //
-
+    }
+    else {
       val ts = new NodeBuffer();
       content1(TopScope, ts); // DTD handled as side effect
       ts &+ content(TopScope);
@@ -257,6 +255,14 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
     this
   }
 
+  def ch_returning_nextch = { val res = ch ; nextch ; res }
+  def mkProcInstr(position: Int, name: String, text: String): NodeSeq =
+    handle.procInstr(position, name, text)
+
+  def mkAttributes(name: String, pscope: NamespaceBinding) =
+    if (isNameStart (ch)) xAttributes(pscope)
+    else (Null, pscope)
+
   /** this method assign the next character to ch and advances in input */
   def nextch = {
     if (curInput.hasNext) {
@@ -315,27 +321,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
     (aMap,scope)
   }
 
-  /** attribute value, terminated by either ' or ". value may not contain &lt;.
-   *       AttValue     ::= `'` { _  } `'`
-   *                      | `"` { _ } `"`
-   */
-  def xAttributeValue(): String = {
-    val endch = ch
-    nextch
-    while (ch != endch) {
-      if ('<' == ch)
-        reportSyntaxError( "'<' not allowed in attrib value" );
-      putChar(ch)
-      nextch
-    }
-    nextch
-    val str = cbuf.toString()
-    cbuf.length = 0
-
-    // well-formedness constraint
-    normalizeAttributeValue(str)
-  }
-
   /** entity value, terminated by either ' or ". value may not contain &lt;.
    *       AttValue     ::= `'` { _  } `'`
    *                      | `"` { _ } `"`
@@ -353,35 +338,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
     str
   }
 
-
-  /** parse a start or empty tag.
-   *  [40] STag         ::= '&lt;' Name { S Attribute } [S]
-   *  [44] EmptyElemTag ::= '&lt;' Name { S Attribute } [S]
-   */
-  protected def xTag(pscope:NamespaceBinding): (String, MetaData, NamespaceBinding) = {
-    val qname = xName
-
-    xSpaceOpt
-    val (aMap: MetaData, scope: NamespaceBinding) = {
-      if (isNameStart(ch))
-        xAttributes(pscope)
-      else
-        (Null, pscope)
-    }
-    (qname, aMap, scope)
-  }
-
-  /** [42]  '&lt;' xmlEndTag ::=  '&lt;' '/' Name S? '&gt;'
-   */
-  def xEndTag(n: String) = {
-    xToken('/')
-    val m = xName
-    if (n != m)
-      reportSyntaxError("expected closing tag of " + n/* +", not "+m*/);
-    xSpaceOpt
-    xToken('>')
-  }
-
   /** '&lt;! CharData ::= [CDATA[ ( {char} - {char}"]]&gt;"{char} ) ']]&gt;'
    *
    * see [15]
@@ -392,14 +348,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
     xTakeUntil(mkResult, () => pos, "]]>")
   }
 
-  /** CharRef ::= "&amp;#" '0'..'9' {'0'..'9'} ";"
-   *            | "&amp;#x" '0'..'9'|'A'..'F'|'a'..'f' { hexdigit } ";"
-   *
-   * see [66]
-   */
-  def xCharRef(ch: () => Char, nextch: () => Unit): String =
-    Utility.parseCharRef(ch, nextch, reportSyntaxError _)
-
   /** Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* '--&gt;'
    *
    * see [15]
@@ -576,7 +524,7 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
    */
   def element1(pscope: NamespaceBinding): NodeSeq = {
     val pos = this.pos
-    val (qname, aMap, scope) = xTag(pscope)
+    val (qname, (aMap, scope)) = xTag(pscope)
     val (pre, local) = Utility.prefix(qname) match {
       case Some(p) => (p, qname drop p.length+1)
       case _       => (null, qname)
@@ -600,50 +548,6 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
     res
   }
 
-  //def xEmbeddedExpr: MarkupType;
-
-  /** Name ::= (Letter | '_' | ':') (NameChar)*
-   *
-   *  see  [5] of XML 1.0 specification
-   */
-  def xName: String = {
-    if (isNameStart(ch)) {
-      while (isNameChar(ch)) {
-        putChar(ch)
-        nextch
-      }
-      val n = cbuf.toString().intern()
-      cbuf.length = 0
-      n
-    } else {
-      reportSyntaxError("name expected")
-      ""
-    }
-  }
-
-  /** '&lt;?' ProcInstr ::= Name [S ({Char} - ({Char}'&gt;?' {Char})]'?&gt;'
-   *
-   * see [15]
-   */
-  def xProcInstr: NodeSeq = {
-    val sb:StringBuilder = new StringBuilder()
-    val n = xName
-    if (isSpace(ch)) {
-      xSpace
-      while (true) {
-        if (ch == '?' && { sb.append( ch ); nextch; ch == '>' }) {
-          sb.length = sb.length - 1;
-          nextch;
-          return handle.procInstr(tmppos, n, sb.toString);
-        } else
-          sb.append(ch);
-        nextch
-      }
-    };
-    xToken("?>")
-    handle.procInstr(tmppos, n, sb.toString)
-  }
-
   /** parse character data.
    *   precondition: xEmbeddedBlock == false (we are not in a scala block)
    */
@@ -996,50 +900,4 @@ trait MarkupParser extends MarkupParserCommon with TokenTests
     pos = curInput.pos
     eof = false // must be false, because of places where entity refs occur
   }
-
-  /** for the moment, replace only character references
-   *  see spec 3.3.3
-   *  precond: cbuf empty
-   */
-  def normalizeAttributeValue(attval: String): String = {
-    val s: Seq[Char] = attval
-    val it = s.iterator
-    while (it.hasNext) {
-      it.next match {
-        case ' '|'\t'|'\n'|'\r' =>
-          cbuf.append(' ');
-        case '&' => it.next match {
-          case '#' =>
-            var c = it.next
-            val s = xCharRef ({ () => c }, { () => c = it.next })
-            cbuf.append(s)
-          case nchar =>
-            val nbuf = new StringBuilder()
-            var d = nchar
-            do {
-              nbuf.append(d)
-              d = it.next
-            } while(d != ';');
-            nbuf.toString() match {
-              case "lt"    => cbuf.append('<')
-              case "gt"    => cbuf.append('>')
-              case "amp"   => cbuf.append('&')
-              case "apos"  => cbuf.append('\'')
-              case "quot"  => cbuf.append('"')
-              case "quote" => cbuf.append('"')
-              case name =>
-                cbuf.append('&')
-                cbuf.append(name)
-                cbuf.append(';')
-            }
-        }
-        case c =>
-          cbuf.append(c)
-      }
-    }
-    val name = cbuf.toString()
-    cbuf.length = 0
-    name
-  }
-
 }
diff --git a/src/library/scala/xml/parsing/MarkupParserCommon.scala b/src/library/scala/xml/parsing/MarkupParserCommon.scala
index 57c46c4685..ba1402d55f 100644
--- a/src/library/scala/xml/parsing/MarkupParserCommon.scala
+++ b/src/library/scala/xml/parsing/MarkupParserCommon.scala
@@ -11,30 +11,191 @@ package parsing
 
 import scala.io.Source
 import scala.xml.dtd._
+import scala.annotation.switch
 import Utility.Escapes.{ pairs => unescape }
 
+object MarkupParserCommon {
+  final val SU = '\u001A'
+}
+import MarkupParserCommon._
+
 /** This is not a public trait - it contains common code shared
  *  between the library level XML parser and the compiler's.
  *  All members should be accessed through those.
  */
 private[scala] trait MarkupParserCommon extends TokenTests {
-  private final val SU: Char = 0x1A
   protected def unreachable = Predef.error("Cannot be reached.")
 
-  // type HandleType   // MarkupHandler, SymbolicXMLBuilder
-
+  // type HandleType       // MarkupHandler, SymbolicXMLBuilder
   type InputType        // Source, CharArrayReader
   type PositionType     // Int, Position
+  type ElementType      // NodeSeq, Tree
+  type NamespaceType    // NamespaceBinding, Any
+  type AttributesType   // (MetaData, NamespaceBinding), mutable.Map[String, Tree]
+
+  def mkAttributes(name: String, pscope: NamespaceType): AttributesType
+  def mkProcInstr(position: PositionType, name: String, text: String): ElementType
+
+  /** parse a start or empty tag.
+   *  [40] STag         ::= '<' Name { S Attribute } [S]
+   *  [44] EmptyElemTag ::= '<' Name { S Attribute } [S]
+   */
+  protected def xTag(pscope: NamespaceType): (String, AttributesType) = {
+    val name = xName
+    xSpaceOpt
+
+    (name, mkAttributes(name, pscope))
+  }
+
+  /** '<?' ProcInstr ::= Name [S ({Char} - ({Char}'>?' {Char})]'?>'
+   *
+   * see [15]
+   */
+  def xProcInstr: ElementType = {
+    val n = xName
+    xSpaceOpt
+    xTakeUntil(mkProcInstr(_, n, _), () => tmppos, "?>")
+  }
+
+  /** attribute value, terminated by either ' or ". value may not contain <.
+   *  @param endch either ' or "
+   */
+  def xAttributeValue(endCh: Char): String = {
+    val buf = new StringBuilder
+    while (ch != endCh) {
+      // well-formedness constraint
+      if (ch == '<') return errorAndResult("'<' not allowed in attrib value", "")
+      else if (ch == SU) truncatedError("")
+      else buf append ch_returning_nextch
+    }
+    ch_returning_nextch
+    // @todo: normalize attribute value
+    buf.toString
+  }
+
+  def xAttributeValue(): String = {
+    val str = xAttributeValue(ch_returning_nextch)
+    // well-formedness constraint
+    normalizeAttributeValue(str)
+  }
+
+  private def takeUntilChar(it: Iterator[Char], end: Char): String = {
+    val buf = new StringBuilder
+    while (it.hasNext) it.next match {
+      case `end`  => return buf.toString
+      case ch     => buf append ch
+    }
+    error("Expected '%s'".format(end))
+  }
+
+  /** [42]  '<' xmlEndTag ::=  '<' '/' Name S? '>'
+   */
+  def xEndTag(startName: String) {
+    xToken('/')
+    if (xName != startName)
+      errorNoEnd(startName)
+
+    xSpaceOpt
+    xToken('>')
+  }
+
+  /** actually, Name ::= (Letter | '_' | ':') (NameChar)*  but starting with ':' cannot happen
+   *  Name ::= (Letter | '_') (NameChar)*
+   *
+   *  see  [5] of XML 1.0 specification
+   *
+   *  pre-condition:  ch != ':' // assured by definition of XMLSTART token
+   *  post-condition: name does neither start, nor end in ':'
+   */
+  def xName: String = {
+    if (ch == SU)
+      truncatedError("")
+    else if (!isNameStart(ch))
+      return errorAndResult("name expected, but char '%s' cannot start a name" format ch, "")
+
+    val buf = new StringBuilder
+
+    do buf append ch_returning_nextch
+    while (isNameChar(ch))
+
+    if (buf.last == ':') {
+      reportSyntaxError( "name cannot end in ':'" )
+      buf.toString dropRight 1
+    }
+    else buf.toString
+  }
+
+  private def attr_unescape(s: String) = s match {
+    case "lt"     => "<"
+    case "gt"     => ">"
+    case "amp"    => "&"
+    case "apos"   => "'"
+    case "quot"   => "\""
+    case "quote"  => "\""
+    case _        => "&" + s + ";"
+  }
+
+  /** Replaces only character references right now.
+   *  see spec 3.3.3
+   */
+  private def normalizeAttributeValue(attval: String): String = {
+    val buf = new StringBuilder
+    val it = attval.iterator.buffered
+
+    while (it.hasNext) buf append (it.next match {
+      case ' ' | '\t' | '\n' | '\r' => " "
+      case '&' if it.head == '#'    => it.next ; xCharRef(it)
+      case '&'                      => attr_unescape(takeUntilChar(it, ';'))
+      case c                        => c
+    })
+
+    buf.toString
+  }
+
+  /** CharRef ::= "&#" '0'..'9' {'0'..'9'} ";"
+   *            | "&#x" '0'..'9'|'A'..'F'|'a'..'f' { hexdigit } ";"
+   *
+   * see [66]
+   */
+  def xCharRef(ch: () => Char, nextch: () => Unit): String =
+    Utility.parseCharRef(ch, nextch, reportSyntaxError _)
+
+  def xCharRef(it: Iterator[Char]): String = {
+    var c = it.next
+    Utility.parseCharRef(() => c, () => { c = it.next }, reportSyntaxError _)
+  }
+
+  def xCharRef: String = xCharRef(() => ch, () => nextch)
 
   /** Create a lookahead reader which does not influence the input */
   def lookahead(): BufferedIterator[Char]
 
+  /** The library and compiler parsers had the interesting distinction of
+   *  different behavior for nextch (a function for which there are a total
+   *  of two plausible behaviors, so we know the design space was fully
+   *  explored.) One of them returned the value of nextch before the increment
+   *  and one of them the new value.  So to unify code we have to at least
+   *  temporarily abstract over the nextchs.
+   */
   def ch: Char
   def nextch: Char
+  def ch_returning_nextch: Char
+  def eof: Boolean
+
+  // def handle: HandleType
+  var tmppos: PositionType
+
   def xHandleError(that: Char, msg: String): Unit
   def reportSyntaxError(str: String): Unit
   def reportSyntaxError(pos: Int, str: String): Unit
-  def eof: Boolean
+
+  def truncatedError(msg: String): Nothing
+  def errorNoEnd(tag: String): Nothing
+
+  protected def errorAndResult[T](msg: String, x: T): T = {
+    reportSyntaxError(msg)
+    x
+  }
 
   def xToken(that: Char) {
     if (ch == that) nextch
@@ -53,9 +214,16 @@ private[scala] trait MarkupParserCommon extends TokenTests {
     if (isSpace(ch)) { nextch; xSpaceOpt }
     else xHandleError(ch, "whitespace expected")
 
-  //
+  /** Apply a function and return the passed value */
   def returning[T](x: T)(f: T => Unit): T = { f(x) ; x }
 
+  /** Execute body with a variable saved and restored after execution */
+  def saving[A,B](getter: A, setter: (A) => Unit)(body: => B): B = {
+    val saved = getter
+    try body
+    finally setter(saved)
+  }
+
   /** Take characters from input stream until given String "until"
    *  is seen.  Once seen, the accumulated characters are passed
    *  along with the current Position to the supplied handler function.
@@ -73,7 +241,7 @@ private[scala] trait MarkupParserCommon extends TokenTests {
       if (ch == head && peek(rest))
         return handler(positioner(), sb.toString)
       else if (ch == SU)
-        xHandleError(ch, "")  // throws TruncatedXML in compiler
+        truncatedError("")  // throws TruncatedXML in compiler
 
       sb append ch
       nextch
author	Paul Phillips <paulp@improving.org>	2010-01-18 21:18:36 +0000
committer	Paul Phillips <paulp@improving.org>	2010-01-18 21:18:36 +0000
commit	135d4f06b174aa585af64b5253aba647982ac4a2 (patch)
tree	73ca84ae254f4903feef03f7573172f992ca7e99 /src/library
parent	e83ad1e005d40738f87da8bb2d60cf9035cfb6ca (diff)
download	scala-135d4f06b174aa585af64b5253aba647982ac4a2.tar.gz scala-135d4f06b174aa585af64b5253aba647982ac4a2.tar.bz2 scala-135d4f06b174aa585af64b5253aba647982ac4a2.zip