SI-3368 CDATA gets a Node

XML Parser uses `scala.xml.PCData`. A compiler flag `-Yxml:coalescing`, analogous to `DocumentBuilderFactory.setCoalescing`, turns `PCData` nodes into `Text` nodes and coalesces sibling text nodes. This change also fixes parse errors such as rejecting a sequence of CDATA sections. A sequence of "top level" nodes are not coalesced. ``` scala> <a><b/>start<![CDATA[hi & bye]]><c/>world<d/>stuff<![CDATA[red & black]]></a> res0: scala.xml.Elem = <a><b/>start<![CDATA[hi & bye]]><c/>world<d/>stuff<![CDATA[red & black]]></a> scala> :replay -Yxml:coalescing Replaying: <a><b/>start<![CDATA[hi & bye]]><c/>world<d/>stuff<![CDATA[red & black]]></a> res0: scala.xml.Elem = <a><b/>starthi & bye<c/>world<d/>stuffred & black</a> ```
author: Som Snytt <som.snytt@gmail.com> 2014-12-20 02:28:44 -0800
committer: Som Snytt <som.snytt@gmail.com> 2015-04-08 09:20:24 -0700
commit: 4df81aab315e587d9c7e319c7a2ece0f0f6fbaf3 (patch)
tree: d83454b666d54549d88f52bd800cddb66bd46041 /src/compiler/scala/tools/nsc/ast/parser/MarkupParsers.scala
parent: 476eef82dfed90278de45739ca72819b1b4be5a4 (diff)
download: scala-4df81aab315e587d9c7e319c7a2ece0f0f6fbaf3.tar.gz
scala-4df81aab315e587d9c7e319c7a2ece0f0f6fbaf3.tar.bz2
scala-4df81aab315e587d9c7e319c7a2ece0f0f6fbaf3.zip
1 files changed, 78 insertions, 51 deletions
diff --git a/src/compiler/scala/tools/nsc/ast/parser/MarkupParsers.scala b/src/compiler/scala/tools/nsc/ast/parser/MarkupParsers.scala
index 96939e616c..edee1e296d 100755
--- a/src/compiler/scala/tools/nsc/ast/parser/MarkupParsers.scala
+++ b/src/compiler/scala/tools/nsc/ast/parser/MarkupParsers.scala
@@ -6,6 +6,7 @@
 package scala.tools.nsc
 package ast.parser
 
+import scala.annotation.tailrec
 import scala.collection.mutable
 import mutable.{ Buffer, ArrayBuffer, ListBuffer }
 import scala.util.control.ControlThrowable
@@ -172,20 +173,19 @@ trait MarkupParsers {
     }
 
     def appendText(pos: Position, ts: Buffer[Tree], txt: String): Unit = {
-      def append(t: String) = ts append handle.text(pos, t)
-
-      if (preserveWS) append(txt)
-      else {
+      def append(text: String): Unit = {
+        val tree = handle.text(pos, text)
+        ts append tree
+      }
+      val clean = if (preserveWS) txt else {
         val sb = new StringBuilder()
-
         txt foreach { c =>
           if (!isSpace(c)) sb append c
           else if (sb.isEmpty || !isSpace(sb.last)) sb append ' '
         }
-
-        val trimmed = sb.toString.trim
-        if (!trimmed.isEmpty) append(trimmed)
+        sb.toString.trim
       }
+      if (!clean.isEmpty) append(clean)
     }
 
     /** adds entity/character to ts as side-effect
@@ -216,44 +216,75 @@ trait MarkupParsers {
       if (xCheckEmbeddedBlock) ts append xEmbeddedExpr
       else appendText(p, ts, xText)
 
-    /** Returns true if it encounters an end tag (without consuming it),
-     *  appends trees to ts as side-effect.
+    /** At an open angle-bracket, detects an end tag
+     *  or consumes CDATA, comment, PI or element.
+     *  Trees are appended to `ts` as a side-effect.
+     *  @return true if an end tag (without consuming it)
      */
-    private def content_LT(ts: ArrayBuffer[Tree]): Boolean = {
-      if (ch == '/')
-        return true   // end tag
-
-      val toAppend = ch match {
-        case '!'    => nextch() ; if (ch =='[') xCharData else xComment // CDATA or Comment
-        case '?'    => nextch() ; xProcInstr                            // PI
-        case _      => element                                        // child node
+    private def content_LT(ts: ArrayBuffer[Tree]): Boolean =
+      (ch == '/') || {
+        val toAppend = ch match {
+          case '!' => nextch() ; if (ch =='[') xCharData else xComment // CDATA or Comment
+          case '?' => nextch() ; xProcInstr                            // PI
+          case _   => element                                          // child node
+        }
+        ts append toAppend
+        false
       }
 
-      ts append toAppend
-      false
-    }
-
     def content: Buffer[Tree] = {
       val ts = new ArrayBuffer[Tree]
-      while (true) {
-        if (xEmbeddedBlock)
+      val coalescing = settings.YxmlSettings.isCoalescing
+      @tailrec def loopContent(): Unit =
+        if (xEmbeddedBlock) {
           ts append xEmbeddedExpr
-        else {
+          loopContent()
+        } else {
           tmppos = o2p(curOffset)
           ch match {
-            // end tag, cdata, comment, pi or child node
-            case '<'  => nextch() ; if (content_LT(ts)) return ts
-            // either the character '{' or an embedded scala block }
-            case '{'  => content_BRACE(tmppos, ts)  // }
-            // EntityRef or CharRef
-            case '&'  => content_AMP(ts)
-            case SU   => return ts
-            // text content - here xEmbeddedBlock might be true
-            case _    => appendText(tmppos, ts, xText)
+            case '<' =>           // end tag, cdata, comment, pi or child node
+              nextch()
+              if (!content_LT(ts)) loopContent()
+            case '{'  =>          // } literal brace or embedded Scala block
+              content_BRACE(tmppos, ts)
+              loopContent()
+            case '&' =>           // EntityRef or CharRef
+              content_AMP(ts)
+              loopContent()
+            case SU  => ()
+            case _   =>           // text content - here xEmbeddedBlock might be true
+              appendText(tmppos, ts, xText)
+              loopContent()
           }
         }
+      // merge text sections and strip attachments
+      def coalesce(): ArrayBuffer[Tree] = {
+        def copy() = {
+          val buf = new ArrayBuffer[Tree]
+          var acc = new StringBuilder
+          var pos: Position = NoPosition
+          def emit() = if (acc.nonEmpty) {
+            appendText(pos, buf, acc.toString)
+            acc.clear()
+          }
+          for (t <- ts)
+            t.attachments.get[handle.TextAttache] match {
+              case Some(ta) =>
+                if (acc.isEmpty) pos = ta.pos
+                acc append ta.text
+              case _        =>
+                emit()
+                buf += t
+            }
+          emit()
+          buf
+        }
+        val res = if (ts.count(_.hasAttachment[handle.TextAttache]) > 1) copy() else ts
+        for (t <- res) t.removeAttachment[handle.TextAttache]
+        res
       }
-      unreachable
+      loopContent()
+      if (coalescing) coalesce() else ts
     }
 
     /** '<' element ::= xmlTag1 '>'  { xmlExpr | '{' simpleExpr '}' } ETag
@@ -289,20 +320,16 @@ trait MarkupParsers {
     private def xText: String = {
       assert(!xEmbeddedBlock, "internal error: encountered embedded block")
       val buf = new StringBuilder
-      def done = buf.toString
-
-      while (ch != SU) {
-        if (ch == '}') {
-          if (charComingAfter(nextch()) == '}') nextch()
-          else errorBraces()
-        }
-
-        buf append ch
-        nextch()
-        if (xCheckEmbeddedBlock || ch == '<' ||  ch == '&')
-          return done
-      }
-      done
+      if (ch != SU)
+        do {
+          if (ch == '}') {
+            if (charComingAfter(nextch()) == '}') nextch()
+            else errorBraces()
+          }
+          buf append ch
+          nextch()
+        } while (!(ch == SU || xCheckEmbeddedBlock || ch == '<' ||  ch == '&'))
+      buf.toString
     }
 
     /** Some try/catch/finally logic used by xLiteral and xLiteralPattern.  */
@@ -344,12 +371,12 @@ trait MarkupParsers {
         tmppos = o2p(curOffset)    // Iuli: added this line, as it seems content_LT uses tmppos when creating trees
         content_LT(ts)
 
-        // parse more XML ?
+        // parse more XML?
         if (charComingAfter(xSpaceOpt()) == '<') {
           do {
             xSpaceOpt()
             nextch()
-            ts append element
+            content_LT(ts)
           } while (charComingAfter(xSpaceOpt()) == '<')
           handle.makeXMLseq(r2p(start, start, curOffset), ts)
         }
author	Som Snytt <som.snytt@gmail.com>	2014-12-20 02:28:44 -0800
committer	Som Snytt <som.snytt@gmail.com>	2015-04-08 09:20:24 -0700
commit	4df81aab315e587d9c7e319c7a2ece0f0f6fbaf3 (patch)
tree	d83454b666d54549d88f52bd800cddb66bd46041 /src/compiler/scala/tools/nsc/ast/parser/MarkupParsers.scala
parent	476eef82dfed90278de45739ca72819b1b4be5a4 (diff)
download	scala-4df81aab315e587d9c7e319c7a2ece0f0f6fbaf3.tar.gz scala-4df81aab315e587d9c7e319c7a2ece0f0f6fbaf3.tar.bz2 scala-4df81aab315e587d9c7e319c7a2ece0f0f6fbaf3.zip