From 6a415fa5cef84c98f3dbfb1a0e9d06a1751724c6 Mon Sep 17 00:00:00 2001 From: buraq Date: Tue, 26 Apr 2005 14:21:52 +0000 Subject: DTD parsing, representation --- config/list/library.lst | 2 +- sources/scala/xml/dtd/DTD.scala | 18 ++ sources/scala/xml/dtd/Decl.scala | 78 ++++-- sources/scala/xml/parsing/MarkupParser.scala | 394 ++++++++++++++++++++++++++- sources/scala/xml/parsing/TokenTests.scala | 28 ++ 5 files changed, 484 insertions(+), 36 deletions(-) create mode 100644 sources/scala/xml/dtd/DTD.scala diff --git a/config/list/library.lst b/config/list/library.lst index 2c7f810d67..54738aae04 100644 --- a/config/list/library.lst +++ b/config/list/library.lst @@ -239,7 +239,7 @@ xml/XML.scala xml/dtd/ContentModel.scala xml/dtd/DocType.scala -#xml/dtd/DTD.scala +xml/dtd/DTD.scala xml/dtd/Decl.scala xml/dtd/ExternalID.scala xml/dtd/Parser.scala diff --git a/sources/scala/xml/dtd/DTD.scala b/sources/scala/xml/dtd/DTD.scala new file mode 100644 index 0000000000..92d1f435be --- /dev/null +++ b/sources/scala/xml/dtd/DTD.scala @@ -0,0 +1,18 @@ +package scala.xml.dtd; + +/** a document type declaration */ +abstract class DTD { + + var externalID: ExternalID = null; + + def notations: Seq[NotationDecl] = Nil; + + def unparsedEntities: Seq[EntityDecl] = Nil; + + var decls: List[MarkupDecl] = Nil; + + //def getElemDecl(elem:String): ElemDecl; + + //def getAttribDecl(elem: String, attr: String): AttrDecl; + +} diff --git a/sources/scala/xml/dtd/Decl.scala b/sources/scala/xml/dtd/Decl.scala index 2b4c4a56e0..538e5101f4 100644 --- a/sources/scala/xml/dtd/Decl.scala +++ b/sources/scala/xml/dtd/Decl.scala @@ -16,27 +16,23 @@ abstract class Decl ; abstract class MarkupDecl extends Decl ; -case class ElemDecl( name:String , - contentModel:String , - attribs:Map[String,AttrDecl] ) - extends MarkupDecl { - - final val parsedContentModel:ContentModel.RegExp = { - try { - ContentModel.parse( contentModel ); - } catch { - case _:Error => - Console.println( "error parsing declaration of " + name ); - Console.println( "content model was:\n" + contentModel ); - null - } - } - - def containsText = contentModel.indexOf("#PCDATA") != -1 ; -}; - -/** an attribute declaration */ -case class AttrDecl( name:String, tpe:String, default:DefaultDecl ) extends MarkupDecl { +/** an element declaration + */ +case class ElemDecl(name: String, contentModel: ContentModel.RegExp, attList: AttListDecl) extends MarkupDecl { + + //def mixed = ; // to do + + def setAttList(nAttList:AttListDecl) = + ElemDecl(name, contentModel, nAttList); +} // ElemDecl + +case class AttListDecl(name: String, attrs:List[AttrDecl]) extends MarkupDecl; + +/** an attribute declaration. at this point, the tpe is a string. Future + * versions might provide a way to access the attribute types more + * directly. + */ +case class AttrDecl( name:String, tpe:String, default:DefaultDecl ) { final override def toString() = { val sb = new StringBuffer("AttrDecl("); sb.append('"'); @@ -53,14 +49,48 @@ case class AttrDecl( name:String, tpe:String, default:DefaultDecl ) extends Mark } } +class EntityDecl extends MarkupDecl; +/** an entity declaration */ + +case class ParsedEntityDecl( name:String, entdef:EntityDef ) + extends EntityDecl; + +case class ParameterEntityDecl(name: String, entdef: EntityDef) + extends EntityDecl; + +class EntityDef; + +case class IntDef(value:String) extends EntityDef { + private def validateValue(): Unit = { + var tmp = value; + var ix = tmp.indexOf('%'); + while( ix != -1) { + val iz = tmp.indexOf(';', ix); + if(iz == -1 && iz == ix + 1) + error("no % allowed in entity value, except for parameter-entity-references"); + else { + val n = tmp.substring(ix, iz); + + if( !Utility.isName( n )) + throw new IllegalArgumentException("ent must be an XML Name"); + + tmp = tmp.substring(iz+1, tmp.length()); + ix = tmp.indexOf('%'); + } + } + } + validateValue(); +} +case class ExtDef(extID:ExternalID) extends EntityDef; + /** an entity declaration */ -case class EntityDecl( name:String, tpe:String ) extends MarkupDecl; +case class UnparsedEntityDecl( name:String, extID:ExternalID, notation:String ) extends EntityDecl; /** a notation declaration */ -case class NotationDecl( name:String, tpe:String ) extends MarkupDecl; +case class NotationDecl( name:String, extID:ExternalID ) extends MarkupDecl; /** a parsed entity reference */ -case class PEReference(ent:String) extends Decl { +case class PEReference(ent:String) extends MarkupDecl { if( !Utility.isName( ent )) throw new IllegalArgumentException("ent must be an XML Name"); diff --git a/sources/scala/xml/parsing/MarkupParser.scala b/sources/scala/xml/parsing/MarkupParser.scala index 633c610a53..f41d0c3ccd 100644 --- a/sources/scala/xml/parsing/MarkupParser.scala +++ b/sources/scala/xml/parsing/MarkupParser.scala @@ -9,12 +9,17 @@ package scala.xml.parsing; -/** an xml parser. parses XML, invokes callback methods of a MarkupHandler +import scala.xml.dtd._ ; +/** an xml parser. parses XML 1.0, invokes callback methods of a MarkupHandler * and returns whatever the markup handler returns. Use ConstructingParser * if you just want to parse XML to construct instances of scala.xml.Node. */ abstract class MarkupParser with TokenTests { + // + // variables, values + // + /** the handler of the markup */ val handle: MarkupHandler; @@ -33,6 +38,118 @@ abstract class MarkupParser with TokenTests { /** character buffer, for names */ protected val cbuf = new StringBuffer(); + var dtd: DTD = null; + + var decls: List[scala.xml.dtd.Decl] = Nil; + + // + // methods + // + + /** <? prolog ::= xml S + */ + def prolog(): Tuple3[Option[String], Option[String], Option[Boolean]] = { + + var info_ver: Option[String] = None; + var info_enc: Option[String] = None; + var info_stdl: Option[Boolean] = None; + + xToken('x'); + xToken('m'); + xToken('l'); + xSpace; + val Pair(md,scp) = xAttributes(TopScope); + xToken('?'); + xToken('>'); + xSpace; + if(TopScope == scp) { + var m = md; + + if(!m.isPrefixed && m.key == "version") { + if(m.value == "1.0") { + info_ver = Some("1.0"); + m = m.next; + } else { + reportSyntaxError("cannot deal with versions != 1.0"); + } + } else + reportSyntaxError("VersionInfo expected!"); + + if(!m.isPrefixed && m.key == "encoding") { + val enc = m.value; + if(!isValidIANAEncoding(enc)) + reportSyntaxError("\""+enc+"\" is not a valid encoding"); + info_enc = Some(enc); + m = m.next + } + + if(!m.isPrefixed && m.key == "standalone") { + m.value.match { + case "yes" => + info_stdl = Some(true); + case "no" => + info_stdl = Some(false); + case _ => + reportSyntaxError("either 'yes' or 'no' expected"); + } + m = m.next + } + + if(m != Null) + reportSyntaxError("VersionInfo EncodingDecl? SDDecl? or '?>' expected!"); + } else + reportSyntaxError("no xmlns definitions here, please"); + + Tuple3(info_ver,info_enc,info_stdl) + } + + /** + *[22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? + *[23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' + *[24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') + *[25] Eq ::= S? '=' S? + *[26] VersionNum ::= '1.0' + *[27] Misc ::= Comment | PI | S + */ + + def document(): Document = { + this.dtd = null; + var info_prolog: Tuple3[Option[String], Option[String], Option[Boolean]] = + Tuple3(None,None,None); + if('<' != ch) { + reportSyntaxError("< expected"); + return null; + } + + nextch; // is prolog ? + if('?' == ch) + info_prolog = prolog(); + + val children = content(TopScope); // DTD handled as side effect + var elemCount = 0; + var theNode: Node = _; + for(val c <- children) c.match { + case _:ProcInstr => ; + case _:Comment => ; + case _:EntityRef => // todo: fix entities, shouldn't be "special" + reportSyntaxError("no entity references alllowed here"); + case m:Node => + elemCount = elemCount + 1; + theNode = m; + } + if(1 != elemCount) + reportSyntaxError("document should contain exactly one element"); + + val doc = new Document(); + doc.children = children; + doc.docElem = theNode; + doc.version = info_prolog._1; + doc.encoding = info_prolog._2; + doc.standAlone = info_prolog._3; + doc.dtd = this.dtd; + return doc + } + /** append Unicode character to name buffer*/ protected def putChar(c: Char) = cbuf.append(c); @@ -55,6 +172,12 @@ abstract class MarkupParser with TokenTests { reportSyntaxError("'" + that + "' expected instead of '" + ch + "'"); } + def xToken(that: Seq[Char]): Unit = { + val it = that.elements; + while(it.hasNext) + xToken(it.next); + } + /** checks whether next character starts a Scala block, if yes, skip it. * @return true if next character starts a scala block def xCheckEmbeddedBlock:Boolean = { @@ -93,7 +216,7 @@ abstract class MarkupParser with TokenTests { aMap = new UnprefixedAttribute(qname, value, aMap); } - if ((ch != '/') && (ch != '>')) + if ((ch != '/') && (ch != '>') && ('?' != ch)) xSpace; } @@ -158,13 +281,7 @@ abstract class MarkupParser with TokenTests { * see [15] */ def xCharData: NodeSeq = { - xToken('['); - xToken('C'); - xToken('D'); - xToken('A'); - xToken('T'); - xToken('A'); - xToken('['); + xToken("[CDATA["); val pos1 = pos; val sb:StringBuffer = new StringBuffer(); while (true) { @@ -257,7 +374,9 @@ abstract class MarkupParser with TokenTests { nextch; if ('[' == ch) // CDATA ts + xCharData; - else // comment + else if ('D' == ch) // doctypedecl, parse DTD + parseDTD(); + else // comment ts + xComment; case '?' => // PI nextch; @@ -298,7 +417,55 @@ abstract class MarkupParser with TokenTests { new NodeSeq { val theSeq = ts.toList; } - } /* end content */ + } // content(NamespaceBinding) + + /** externalID ::= SYSTEM S syslit + * PUBLIC S pubid S syslit + */ + + def externalID(): ExternalID = ch.match { + case 'S' => + nextch; + xToken("YSTEM"); + val sysID = systemLiteral(); + new SystemID(sysID); + case 'P' => + nextch; xToken("UBLIC"); + val pubID = pubidLiteral(); + xSpace; + val sysID = systemLiteral(); + new PublicID(pubID, sysID); + } + /** parses document type declaration and assigns it to instance variable + * dtd. + * + * + nextch; + decls = PEReference(xName) :: decls; + xToken(';') + //peReference + case '<' => + nextch; + + if('?' == ch) + xProcInstr; // simply ignore processing instructions! + else { + xToken('!'); + ch.match { + case '-' => + xComment ; // ignore comments + + case 'E' => + nextch; + if('L' == ch) { + nextch; + elementDecl() + } else + entityDecl(); + + case 'A' => + nextch; + attrDecl(); + + case 'N' => + nextch; + notationDecl(); + } + } + case _ => + reportSyntaxError("unexpected character"); + } + } + + /** ' != ch) { + putChar(ch); + nextch; + } + nextch; + val cmstr = cbuf.toString(); + cbuf.setLength( 0 ); + val cm = ContentModel.parse(cmstr); + decls = ElemDecl(n, cm, null)::decls; + } + + /** ' != ch) { + val aname = xName; + var defdecl: DefaultDecl = null; + xSpace; + while('"' != ch && '\'' != ch && '#' != ch && '<' != ch) { + if(!isSpace(ch)) + cbuf.append(ch); + nextch; + } + ch match { + case '\'' | '"' => + val defValue = xAttributeValue(); // default value + defdecl = DEFAULT(false, defValue); + + case '#' => xName.match { + case "FIXED" => + xSpace; + val defValue = xAttributeValue(); // default value + defdecl = DEFAULT(true, defValue); + case "IMPLIED" => + defdecl = IMPLIED + case "REQUIRED" => + defdecl = REQUIRED + } + case _ => + } + xSpaceOpt; + + attList = AttrDecl(xName, cbuf.toString(), defdecl) :: attList; + cbuf.setLength(0); + } + nextch; + decls = AttListDecl(n, attList.reverse) :: decls + } + + /** //sy + val extID = externalID(); + if(isParameterEntity) { + + + ParameterEntityDecl(n, ExtDef(extID)) + + } else { // notation? + + xSpace; + if('>' != ch) { + xToken("NDATA"); + xSpace; + val notat = xName; + xSpace; + UnparsedEntityDecl(n, extID, notat); + } else + + ParsedEntityDecl(n, ExtDef(extID)); + + } + + case '"' | '\'' => + val av = xAttributeValue(); + if(isParameterEntity) + ParameterEntityDecl(n, IntDef(av)) + else + ParsedEntityDecl(n, IntDef(av)); + } + decls = res :: decls; + } // entityDecl + + /** 'N' notationDecl ::= "OTATION" + */ + def notationDecl() = { + xToken("OTATION"); + xSpace; + val notat = xName; + xSpace; + val extID = externalID(); + xSpace; + xToken('>'); + decls = NotationDecl(notat, extID) :: decls; + } + } diff --git a/sources/scala/xml/parsing/TokenTests.scala b/sources/scala/xml/parsing/TokenTests.scala index 8d95424d05..c148d484f3 100644 --- a/sources/scala/xml/parsing/TokenTests.scala +++ b/sources/scala/xml/parsing/TokenTests.scala @@ -85,6 +85,34 @@ trait TokenTests { case _ => false; } + /** + * Returns true if the encoding name is a valid IANA encoding. + * This method does not verify that there is a decoder available + * for this encoding, only that the characters are valid for an + * IANA encoding name. + * + * @param ianaEncoding The IANA encoding name. + */ + def isValidIANAEncoding(ianaEncoding: Seq[Char]): Boolean = { + val it = ianaEncoding.elements; + if(!it.hasNext) + return false; + + var c = it.next; + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { + while(it.hasNext) { + c = it.next; + if ((c < 'A' || c > 'Z') && (c < 'a' || c > 'z') && + (c < '0' || c > '9') && c != '.' && c != '_' && + c != '-') { + return false; + } + } + return true; + } else + return false; + } // isValidIANAEncoding(String): Boolean + def checkSysID( s:String ):boolean = { s.indexOf('"') == -1 || s.indexOf('\'') == -1 } -- cgit v1.2.3