diff options
author | Burak Emir <emir@epfl.ch> | 2008-02-13 08:41:32 +0000 |
---|---|---|
committer | Burak Emir <emir@epfl.ch> | 2008-02-13 08:41:32 +0000 |
commit | bdf8585f76aa0fe7eab9b3121942e788894117c6 (patch) | |
tree | 6ae49ff89693d15d0f28efbc51a96e7ef3de960f /src | |
parent | 9481a6f1811e430651988ac595e302e2bd23eab0 (diff) | |
download | scala-bdf8585f76aa0fe7eab9b3121942e788894117c6.tar.gz scala-bdf8585f76aa0fe7eab9b3121942e788894117c6.tar.bz2 scala-bdf8585f76aa0fe7eab9b3121942e788894117c6.zip |
added DPP's code for xhtml parsing (cdata/entit...
added DPP's code for xhtml parsing (cdata/entity handling)
Diffstat (limited to 'src')
-rw-r--r-- | src/library/scala/xml/CData.scala | 29 | ||||
-rw-r--r-- | src/library/scala/xml/Xhtml.scala | 85 | ||||
-rw-r--r-- | src/library/scala/xml/parsing/XhtmlEntities.scala | 41 | ||||
-rw-r--r-- | src/library/scala/xml/parsing/XhtmlParser.scala | 54 |
4 files changed, 209 insertions, 0 deletions
diff --git a/src/library/scala/xml/CData.scala b/src/library/scala/xml/CData.scala
new file mode 100644
index 0000000000..0dbd3bf70a
--- /dev/null
+++ b/src/library/scala/xml/CData.scala
@@ -0,0 +1,29 @@
+package scala.xml
+
+/** This class (which is not used by all XML parsers, but always used by the XHTML one)
+ * represents parseable character data, which appeared as CDATA sections in the input
+ * and is to be preserved as CDATA section in the output.
+ */
+case class PCData(_data: String) extends Atom[String](_data) {
+  /* The following code is a derivative work of scala.xml.Text */
+  if (null == data)
+    throw new java.lang.NullPointerException("tried to construct PCData with null")
+
+  final override def equals(x: Any) = x match {
+    case s:String  => s.equals(data.toString())
+    case s:Atom[_] => data == s.data
+    case _         => false
+  }
+
+  /** Appends the data to the given builder wrapped in a CDATA section. The data is
+   *  not escaped; an embedded "]]>" is split over two adjacent CDATA sections.
+   *
+   *  @param  sb the string builder to append to
+   *  @return the result of the last `append` call
+   */
+  override def toString(sb: StringBuilder) = {
+    sb.append("<![CDATA[")
+    sb.append(data.replace("]]>", "]]]]><![CDATA[>"))
+    sb.append("]]>")
+  }
+}
diff --git a/src/library/scala/xml/Xhtml.scala b/src/library/scala/xml/Xhtml.scala
new file mode 100644
index 0000000000..980a7939f0
--- /dev/null
+++ b/src/library/scala/xml/Xhtml.scala
@@ -0,0 +1,85 @@
+package scala.xml;
+
+/* (c) David Pollak 2007 WorldWide Conferencing, LLC */
+
+object Xhtml {
+
+  def toXhtml(n: Node, stripComment: Boolean, convertAmp: Boolean): String = {
+    val sb = new StringBuilder()
+    toXhtml(n, TopScope, sb, stripComment, convertAmp)
+    sb.toString()
+  }
+
+  /**
+   * Appends a tree to the given string builder within the given namespace scope.
+   * @param x            the node to serialize
+   * @param pscope       the parent scope
+   * @param sb           string builder to append to
+   * @param stripComment if true, comments are omitted from the output
+   * @param convertAmp   if true, known entity references with code >= 128 are emitted as characters
+   */
+  def toXhtml(x: Node, pscope: NamespaceBinding, sb: StringBuilder, stripComment: Boolean, convertAmp: Boolean): Unit = {
+    x match {
+
+      case c: Comment => // test the flag here: a guard would let stripped comments reach the SpecialNode case
+        if (!stripComment) c.toString(sb)
+
+      case er: EntityRef if convertAmp =>
+        XhtmlEntities.entMap.get(er.entityName) match {
+          case Some(chr) if chr.toInt >= 128 => sb.append(chr)
+          case _ => er.toString(sb)
+        }
+
+      case x: SpecialNode =>
+        x.toString(sb)
+
+      case g: Group =>
+        for (c <- g.nodes) toXhtml(c, x.scope, sb, stripComment, convertAmp)
+
+      case _ =>
+        if (((x.child eq null) || (x.child.length == 0)) && x.label != "div" && x.label != "script" && x.label != "textarea") {
+          sb.append('<')
+          x.nameToString(sb)
+          if (x.attributes ne null) x.attributes.toString(sb)
+          x.scope.toString(sb, pscope)
+          sb.append(" />")
+        } else {
+          // print tag with namespace declarations
+          sb.append('<')
+          x.nameToString(sb)
+          if (x.attributes ne null) x.attributes.toString(sb)
+          x.scope.toString(sb, pscope)
+          sb.append('>')
+          sequenceToXML(x.child, x.scope, sb, stripComment, convertAmp)
+          sb.append("</")
+          x.nameToString(sb)
+          sb.append('>')
+        }
+    }
+  }
+
+  /** Appends the children in order; if all of them are non-Text atoms they are separated by a space.
+   * @param children     the nodes to serialize
+   * @param pscope       the parent scope
+   * @param sb           string builder to append to
+   * @param stripComment if true, strip comments (convertAmp is likewise passed on to toXhtml)
+   */
+  def sequenceToXML(children: Seq[Node], pscope: NamespaceBinding,
+                    sb: StringBuilder, stripComment: Boolean, convertAmp: Boolean): Unit = {
+    if (children.isEmpty)
+      return
+    else if (children forall { y => y.isInstanceOf[Atom[_]] && !y.isInstanceOf[Text] }) { // add space
+      val it = children.elements
+      val f = it.next
+      toXhtml(f, pscope, sb, stripComment, convertAmp)
+      while (it.hasNext) {
+        val x = it.next
+        sb.append(' ')
+        toXhtml(x, pscope, sb, stripComment, convertAmp)
+      }
+    } else {
+      for (c <- children) toXhtml(c, pscope, sb, stripComment, convertAmp)
+    }
+  }
+}
+
diff --git a/src/library/scala/xml/parsing/XhtmlEntities.scala b/src/library/scala/xml/parsing/XhtmlEntities.scala
new file mode 100644
index 0000000000..a59e6ab451
--- /dev/null
+++ b/src/library/scala/xml/parsing/XhtmlEntities.scala
@@ -0,0 +1,41 @@
+package scala.xml
+
+import scala.xml.dtd.{IntDef, ParsedEntityDecl}
+
+/* (c) David Pollak 2007 WorldWide Conferencing, LLC */
+object XhtmlEntities {
+  val entList = List(("quot",34), ("amp",38), ("lt",60), ("gt",62), ("nbsp",160), ("iexcl",161), ("cent",162), ("pound",163), ("curren",164), ("yen",165),
+    ("euro",8364), ("brvbar",166), ("sect",167), ("uml",168), ("copy",169), ("ordf",170), ("laquo",171), ("not",172), ("shy",173), ("reg",174), ("trade",8482),
+    ("macr",175), ("deg",176), ("plusmn",177), ("sup2",178), ("sup3",179), ("acute",180), ("micro",181), ("para",182), ("middot",183), ("cedil",184),
+    ("sup1",185), ("ordm",186), ("raquo",187), ("frac14",188), ("frac12",189), ("frac34",190), ("iquest",191), ("times",215), ("divide",247),
+    ("Agrave",192), ("Aacute",193), ("Acirc",194), ("Atilde",195), ("Auml",196), ("Aring",197), ("AElig",198), ("Ccedil",199), ("Egrave",200),
+    ("Eacute",201), ("Ecirc",202), ("Euml",203), ("Igrave",204), ("Iacute",205), ("Icirc",206), ("Iuml",207), ("ETH",208), ("Ntilde",209),
+    ("Ograve",210), ("Oacute",211), ("Ocirc",212), ("Otilde",213), ("Ouml",214), ("Oslash",216), ("Ugrave",217), ("Uacute",218), ("Ucirc",219),
+    ("Uuml",220), ("Yacute",221), ("THORN",222), ("szlig",223), ("agrave",224), ("aacute",225), ("acirc",226), ("atilde",227), ("auml",228),
+    ("aring",229), ("aelig",230), ("ccedil",231), ("egrave",232), ("eacute",233), ("ecirc",234), ("euml",235), ("igrave",236), ("iacute",237),
+    ("icirc",238), ("iuml",239), ("eth",240), ("ntilde",241), ("ograve",242), ("oacute",243), ("ocirc",244), ("otilde",245), ("ouml",246),
+    ("oslash",248), ("ugrave",249), ("uacute",250), ("ucirc",251), ("uuml",252), ("yacute",253), ("thorn",254), ("yuml",255), ("OElig",338),
+    ("oelig",339), ("Scaron",352), ("scaron",353), ("Yuml",376), ("circ",710), ("tilde",732), ("ensp",8194), ("emsp",8195), ("thinsp",8201), ("zwnj",8204), ("zwj",8205), ("lrm",8206),
+    ("rlm",8207), ("ndash",8211), ("mdash",8212), ("lsquo",8216), ("rsquo",8217), ("sbquo",8218), ("ldquo",8220), ("rdquo",8221), ("bdquo",8222),
+    ("dagger",8224), ("Dagger",8225), ("permil",8240), ("lsaquo",8249), ("rsaquo",8250), ("fnof",402), ("bull",8226), ("hellip",8230), ("prime",8242),
+    ("Prime",8243), ("oline",8254), ("frasl",8260), ("weierp",8472), ("image",8465), ("real",8476), ("alefsym",8501), ("larr",8592), ("uarr",8593),
+    ("rarr",8594), ("darr",8595), ("harr",8596), ("crarr",8629), ("lArr",8656), ("uArr",8657), ("rArr",8658), ("dArr",8659), ("hArr",8660),
+    ("forall",8704), ("part",8706), ("exist",8707), ("empty",8709), ("nabla",8711), ("isin",8712), ("notin",8713), ("ni",8715), ("prod",8719),
+    ("sum",8721), ("minus",8722), ("lowast",8727), ("radic",8730), ("prop",8733), ("infin",8734), ("ang",8736), ("and",8743), ("or",8744),
+    ("cap",8745), ("cup",8746), ("int",8747), ("there4",8756), ("sim",8764), ("cong",8773), ("asymp",8776), ("ne",8800), ("equiv",8801), ("le",8804),
+    ("ge",8805), ("sub",8834), ("sup",8835), ("nsub",8836), ("sube",8838), ("supe",8839), ("oplus",8853), ("otimes",8855), ("perp",8869), ("sdot",8901),
+    ("lceil",8968), ("rceil",8969), ("lfloor",8970), ("rfloor",8971), ("lang",9001), ("rang",9002), ("loz",9674), ("spades",9824), ("clubs",9827),
+    ("hearts",9829), ("diams",9830), ("Alpha",913), ("Beta",914), ("Gamma",915), ("Delta",916), ("Epsilon",917), ("Zeta",918), ("Eta",919),
+    ("Theta",920), ("Iota",921), ("Kappa",922), ("Lambda",923), ("Mu",924), ("Nu",925), ("Xi",926), ("Omicron",927), ("Pi",928), ("Rho",929),
+    ("Sigma",931), ("Tau",932), ("Upsilon",933), ("Phi",934), ("Chi",935), ("Psi",936), ("Omega",937), ("alpha",945), ("beta",946), ("gamma",947),
+    ("delta",948), ("epsilon",949), ("zeta",950), ("eta",951), ("theta",952), ("iota",953), ("kappa",954), ("lambda",955), ("mu",956), ("nu",957),
+    ("xi",958), ("omicron",959), ("pi",960), ("rho",961), ("sigmaf",962), ("sigma",963), ("tau",964), ("upsilon",965), ("phi",966), ("chi",967),
+    ("psi",968), ("omega",969), ("thetasym",977), ("upsih",978), ("piv",982))
+
+  val entMap: Map[String, Char] = Map.empty ++ entList.map { case (name, value) => (name, value.toChar)}
+
+  val entities = entList.
+    map { case (name, value) => (name, new ParsedEntityDecl(name, new IntDef(value.toChar.toString)))}
+
+  def apply() = entities
+}
diff --git a/src/library/scala/xml/parsing/XhtmlParser.scala b/src/library/scala/xml/parsing/XhtmlParser.scala
new file mode 100644
index 0000000000..ef285f4bc8
--- /dev/null
+++ b/src/library/scala/xml/parsing/XhtmlParser.scala
@@ -0,0 +1,54 @@
+package scala.xml
+
+import scala.xml.parsing.{MarkupParser, MarkupHandler, FatalError, ConstructingHandler, ExternalSources}
+import scala.io.{Source}
+
+/**
+ * Extends the Markup Parser to do the right thing (tm) with PCData blocks.
+ * (c) David Pollak, 2007 WorldWide Conferencing, LLC
+ */
+trait PCDataMarkupParser[PCM <: MarkupParser with MarkupHandler] extends MarkupParser { self: PCM =>
+
+  /** '<! CharData ::= [CDATA[ ( {char} - {char}"]]>"{char} ) ']]>'
+   *
+   * see [15]
+   */
+  override def xCharData: NodeSeq = {
+    xToken("[CDATA[")
+    val sb: StringBuilder = new StringBuilder()
+    while (true) {
+      // Buffer every character and test the tail of the buffer, so that a
+      // section whose content itself ends in ']' (e.g. "a[0]]]>") still
+      // terminates at the first "]]>".
+      sb.append(ch)
+      nextch
+      val n = sb.length
+      if (n >= 3 && sb.charAt(n - 1) == '>' && sb.charAt(n - 2) == ']' && sb.charAt(n - 3) == ']') {
+        sb.setLength(n - 3) // drop the terminator, keep only the section content
+        return PCData(sb.toString)
+      }
+    }
+    throw FatalError("this cannot happen");
+  }
+}
+
+/**
+ * An XML Parser that preserves CDATA blocks and knows about HtmlEntities.
+ * (c) David Pollak, 2007 WorldWide Conferencing, LLC
+ */
+class XhtmlParser(val input: Source) extends ConstructingHandler with PCDataMarkupParser[XhtmlParser] with ExternalSources {
+  val preserveWS = true
+  ent ++= XhtmlEntities()
+}
+
+/**
+ * Convenience method that instantiates, initializes and runs an XhtmlParser
+ * (c) Burak Emir
+ */
+object XhtmlParser {
+  def apply(source: Source): NodeSeq = {
+    val p = new XhtmlParser(source)
+    p.nextch
+    p.document
+  }
+}