summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorburaq <buraq@epfl.ch>2005-04-26 14:21:52 +0000
committerburaq <buraq@epfl.ch>2005-04-26 14:21:52 +0000
commit6a415fa5cef84c98f3dbfb1a0e9d06a1751724c6 (patch)
treea5ebf0d04fa103136b2552838a6da1087f7e89b5
parentbdf2e9f7023a3916d1b41ff7deb0493082fae877 (diff)
downloadscala-6a415fa5cef84c98f3dbfb1a0e9d06a1751724c6.tar.gz
scala-6a415fa5cef84c98f3dbfb1a0e9d06a1751724c6.tar.bz2
scala-6a415fa5cef84c98f3dbfb1a0e9d06a1751724c6.zip
DTD parsing, representation
-rw-r--r--config/list/library.lst2
-rw-r--r--sources/scala/xml/dtd/DTD.scala18
-rw-r--r--sources/scala/xml/dtd/Decl.scala78
-rw-r--r--sources/scala/xml/parsing/MarkupParser.scala394
-rw-r--r--sources/scala/xml/parsing/TokenTests.scala28
5 files changed, 484 insertions, 36 deletions
diff --git a/config/list/library.lst b/config/list/library.lst
index 2c7f810d67..54738aae04 100644
--- a/config/list/library.lst
+++ b/config/list/library.lst
@@ -239,7 +239,7 @@ xml/XML.scala
xml/dtd/ContentModel.scala
xml/dtd/DocType.scala
-#xml/dtd/DTD.scala
+xml/dtd/DTD.scala
xml/dtd/Decl.scala
xml/dtd/ExternalID.scala
xml/dtd/Parser.scala
diff --git a/sources/scala/xml/dtd/DTD.scala b/sources/scala/xml/dtd/DTD.scala
new file mode 100644
index 0000000000..92d1f435be
--- /dev/null
+++ b/sources/scala/xml/dtd/DTD.scala
@@ -0,0 +1,18 @@
+package scala.xml.dtd;
+
+/** a document type declaration. Holds the external identifier and the
+ * markup declarations; notations and unparsedEntities default to empty
+ * sequences.
+ */
+abstract class DTD {
+
+ /** external identifier of this DTD; null unless assigned */
+ var externalID: ExternalID = null;
+
+ /** notation declarations; this default implementation returns Nil */
+ def notations: Seq[NotationDecl] = Nil;
+
+ /** unparsed entity declarations; this default implementation returns Nil */
+ def unparsedEntities: Seq[EntityDecl] = Nil;
+
+ /** markup declarations of this DTD; initially empty */
+ var decls: List[MarkupDecl] = Nil;
+
+ //def getElemDecl(elem:String): ElemDecl;
+
+ //def getAttribDecl(elem: String, attr: String): AttrDecl;
+
+}
diff --git a/sources/scala/xml/dtd/Decl.scala b/sources/scala/xml/dtd/Decl.scala
index 2b4c4a56e0..538e5101f4 100644
--- a/sources/scala/xml/dtd/Decl.scala
+++ b/sources/scala/xml/dtd/Decl.scala
@@ -16,27 +16,23 @@ abstract class Decl ;
abstract class MarkupDecl extends Decl ;
-case class ElemDecl( name:String ,
- contentModel:String ,
- attribs:Map[String,AttrDecl] )
- extends MarkupDecl {
-
- final val parsedContentModel:ContentModel.RegExp = {
- try {
- ContentModel.parse( contentModel );
- } catch {
- case _:Error =>
- Console.println( "error parsing declaration of " + name );
- Console.println( "content model was:\n" + contentModel );
- null
- }
- }
-
- def containsText = contentModel.indexOf("#PCDATA") != -1 ;
-};
-
-/** an attribute declaration */
-case class AttrDecl( name:String, tpe:String, default:DefaultDecl ) extends MarkupDecl {
+/** an element declaration: the element name, its parsed content model
+ * and the attribute list associated with it
+ */
+case class ElemDecl(name: String, contentModel: ContentModel.RegExp, attList: AttListDecl) extends MarkupDecl {
+
+ //def mixed = ; // to do
+
+ /** returns a new ElemDecl with the same name and content model, but
+  * with nAttList as its attribute list; this instance is not modified
+  */
+ def setAttList(nAttList:AttListDecl) =
+ ElemDecl(name, contentModel, nAttList);
+} // ElemDecl
+
+/** an attribute list declaration: the name given after ATTLIST together
+ * with the attribute declarations listed for it
+ */
+case class AttListDecl(name: String, attrs:List[AttrDecl]) extends MarkupDecl;
+
+/** an attribute declaration. at this point, the tpe is a string. Future
+ * versions might provide a way to access the attribute types more
+ * directly.
+ */
+case class AttrDecl( name:String, tpe:String, default:DefaultDecl ) {
final override def toString() = {
val sb = new StringBuffer("AttrDecl(");
sb.append('"');
@@ -53,14 +49,48 @@ case class AttrDecl( name:String, tpe:String, default:DefaultDecl ) extends Mark
}
}
+/** an entity declaration; superclass of the entity declaration variants */
+class EntityDecl extends MarkupDecl;
+
+/** a parsed (general) entity declaration: entity name and its definition */
+case class ParsedEntityDecl( name:String, entdef:EntityDef )
+ extends EntityDecl;
+
+/** a parameter entity declaration: entity name and its definition */
+case class ParameterEntityDecl(name: String, entdef: EntityDef)
+ extends EntityDecl;
+
+/** an entity definition; superclass of IntDef and ExtDef */
+class EntityDef;
+
+/** an internal entity definition, given by its literal value.
+ *
+ * The value is validated on construction: every '%' must start a
+ * parameter-entity reference of the form %Name; .
+ */
+case class IntDef(value:String) extends EntityDef {
+  /** checks every '%' in value. Calls error(...) if a '%' is not followed
+   * by a name and a ';', and throws IllegalArgumentException if the text
+   * between '%' and ';' is not an XML Name.
+   */
+  private def validateValue(): Unit = {
+    var tmp = value;
+    var ix = tmp.indexOf('%');
+    while( ix != -1) {
+      val iz = tmp.indexOf(';', ix);
+      // no terminating ';' at all, or an empty name ("%;")
+      if(iz == -1 || iz == ix + 1)
+        error("no % allowed in entity value, except for parameter-entity-references");
+      else {
+        // the referenced name lies strictly between '%' and ';'
+        val n = tmp.substring(ix + 1, iz);
+
+        if( !Utility.isName( n ))
+          throw new IllegalArgumentException("ent must be an XML Name");
+
+        // continue scanning behind the reference just checked
+        tmp = tmp.substring(iz+1, tmp.length());
+        ix = tmp.indexOf('%');
+      }
+    }
+  }
+  validateValue();
+}
+/** an external entity definition, given by an external identifier */
+case class ExtDef(extID:ExternalID) extends EntityDef;
+
/** an entity declaration */
-case class EntityDecl( name:String, tpe:String ) extends MarkupDecl;
+case class UnparsedEntityDecl( name:String, extID:ExternalID, notation:String ) extends EntityDecl;
/** a notation declaration */
-case class NotationDecl( name:String, tpe:String ) extends MarkupDecl;
+case class NotationDecl( name:String, extID:ExternalID ) extends MarkupDecl;
/** a parsed entity reference */
-case class PEReference(ent:String) extends Decl {
+case class PEReference(ent:String) extends MarkupDecl {
if( !Utility.isName( ent ))
throw new IllegalArgumentException("ent must be an XML Name");
diff --git a/sources/scala/xml/parsing/MarkupParser.scala b/sources/scala/xml/parsing/MarkupParser.scala
index 633c610a53..f41d0c3ccd 100644
--- a/sources/scala/xml/parsing/MarkupParser.scala
+++ b/sources/scala/xml/parsing/MarkupParser.scala
@@ -9,12 +9,17 @@
package scala.xml.parsing;
-/** an xml parser. parses XML, invokes callback methods of a MarkupHandler
+import scala.xml.dtd._ ;
+/** an xml parser. parses XML 1.0, invokes callback methods of a MarkupHandler
* and returns whatever the markup handler returns. Use ConstructingParser
* if you just want to parse XML to construct instances of scala.xml.Node.
*/
abstract class MarkupParser with TokenTests {
+ //
+ // variables, values
+ //
+
/** the handler of the markup */
val handle: MarkupHandler;
@@ -33,6 +38,118 @@ abstract class MarkupParser with TokenTests {
/** character buffer, for names */
protected val cbuf = new StringBuffer();
+ /** the document type declaration; reset to null by document() and
+  * assigned by parseDTD()
+  */
+ var dtd: DTD = null;
+
+ /** declarations collected by the DTD parsing methods; each new
+  * declaration is prepended, so the most recent one comes first
+  */
+ var decls: List[scala.xml.dtd.Decl] = Nil;
+
+ //
+ // methods
+ //
+
+ /** &lt;? prolog ::= xml S VersionInfo EncodingDecl? SDDecl? S? '?>'
+  *
+  * Parses the XML declaration; the leading "&lt;?" must already have been
+  * consumed. Problems are reported through reportSyntaxError.
+  *
+  * @return version, encoding and standalone flag of the declaration.
+  *         The version is only set for "1.0" and the standalone flag only
+  *         for "yes"/"no"; the encoding is returned even if it was
+  *         reported as invalid.
+  */
+ def prolog(): Tuple3[Option[String], Option[String], Option[Boolean]] = {
+
+   var info_ver: Option[String] = None;
+   var info_enc: Option[String] = None;
+   var info_stdl: Option[Boolean] = None;
+
+   xToken('x');
+   xToken('m');
+   xToken('l');
+   xSpace;
+   val Pair(md,scp) = xAttributes(TopScope);
+   xToken('?');
+   xToken('>');
+   // whitespace after the declaration is optional: "<?xml ...?><a/>" is legal
+   xSpaceOpt;
+   if(TopScope == scp) {
+     var m = md;
+
+     if(!m.isPrefixed && m.key == "version") {
+       if(m.value == "1.0") {
+         info_ver = Some("1.0");
+         m = m.next;
+       } else {
+         reportSyntaxError("cannot deal with versions != 1.0");
+       }
+     } else
+       reportSyntaxError("VersionInfo expected!");
+
+     if(!m.isPrefixed && m.key == "encoding") {
+       val enc = m.value;
+       if(!isValidIANAEncoding(enc))
+         reportSyntaxError("\""+enc+"\" is not a valid encoding");
+       info_enc = Some(enc);
+       m = m.next
+     }
+
+     if(!m.isPrefixed && m.key == "standalone") {
+       m.value.match {
+         case "yes" =>
+           info_stdl = Some(true);
+         case "no" =>
+           info_stdl = Some(false);
+         case _ =>
+           reportSyntaxError("either 'yes' or 'no' expected");
+       }
+       m = m.next
+     }
+
+     // anything left over is not part of an XML declaration
+     if(m != Null)
+       reportSyntaxError("VersionInfo EncodingDecl? SDDecl? or '?>' expected!");
+   } else
+     // the declaration introduced namespace bindings
+     reportSyntaxError("no xmlns definitions here, please");
+
+   Tuple3(info_ver,info_enc,info_stdl)
+ }
+
+ /** parses a whole document.
+  *
+  *[22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
+  *[23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
+  *[24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
+  *[25] Eq ::= S? '=' S?
+  *[26] VersionNum ::= '1.0'
+  *[27] Misc ::= Comment | PI | S
+  *
+  * Resets this.dtd, parses an optional XML declaration and the content,
+  * and checks that the content holds exactly one node besides comments
+  * and processing instructions.
+  *
+  * @return the Document, or null if the input does not start with '&lt;'
+  */
+
+ def document(): Document = {
+   this.dtd = null;
+   var info_prolog: Tuple3[Option[String], Option[String], Option[Boolean]] =
+     Tuple3(None,None,None);
+   if('<' != ch) {
+     reportSyntaxError("< expected");
+     return null;
+   }
+
+   nextch; // is prolog ?
+   if('?' == ch) {
+     // prolog() starts at "xml", so the '?' has to be consumed first
+     nextch;
+     info_prolog = prolog();
+   }
+   // NOTE(review): without a prolog the '<' consumed above is lost
+   // before content() runs -- confirm how content() copes with that.
+
+   val children = content(TopScope); // DTD handled as side effect
+   var elemCount = 0;
+   var theNode: Node = _;
+   for(val c <- children) c.match {
+     case _:ProcInstr => ;
+     case _:Comment => ;
+     case _:EntityRef => // todo: fix entities, shouldn't be "special"
+       reportSyntaxError("no entity references alllowed here");
+     case m:Node =>
+       // every other node counts; the last one seen becomes docElem
+       elemCount = elemCount + 1;
+       theNode = m;
+   }
+   if(1 != elemCount)
+     reportSyntaxError("document should contain exactly one element");
+
+   val doc = new Document();
+   doc.children = children;
+   doc.docElem = theNode;
+   doc.version = info_prolog._1;
+   doc.encoding = info_prolog._2;
+   doc.standAlone = info_prolog._3;
+   doc.dtd = this.dtd;
+   return doc
+ }
+
/** append Unicode character to name buffer*/
protected def putChar(c: Char) = cbuf.append(c);
@@ -55,6 +172,12 @@ abstract class MarkupParser with TokenTests {
reportSyntaxError("'" + that + "' expected instead of '" + ch + "'");
}
+ /** consumes the given characters one after another, in order, each
+  * through xToken(Char)
+  */
+ def xToken(that: Seq[Char]): Unit = {
+   for(val expected <- that)
+     xToken(expected);
+ }
+
/** checks whether next character starts a Scala block, if yes, skip it.
* @return true if next character starts a scala block
def xCheckEmbeddedBlock:Boolean = {
@@ -93,7 +216,7 @@ abstract class MarkupParser with TokenTests {
aMap = new UnprefixedAttribute(qname, value, aMap);
}
- if ((ch != '/') && (ch != '>'))
+ if ((ch != '/') && (ch != '>') && ('?' != ch))
xSpace;
}
@@ -158,13 +281,7 @@ abstract class MarkupParser with TokenTests {
* see [15]
*/
def xCharData: NodeSeq = {
- xToken('[');
- xToken('C');
- xToken('D');
- xToken('A');
- xToken('T');
- xToken('A');
- xToken('[');
+ xToken("[CDATA[");
val pos1 = pos;
val sb:StringBuffer = new StringBuffer();
while (true) {
@@ -257,7 +374,9 @@ abstract class MarkupParser with TokenTests {
nextch;
if ('[' == ch) // CDATA
ts + xCharData;
- else // comment
+ else if ('D' == ch) // doctypedecl, parse DTD
+ parseDTD();
+ else // comment
ts + xComment;
case '?' => // PI
nextch;
@@ -298,7 +417,55 @@ abstract class MarkupParser with TokenTests {
new NodeSeq {
val theSeq = ts.toList;
}
- } /* end content */
+ } // content(NamespaceBinding)
+
+ /** externalID ::= SYSTEM S syslit
+  *                PUBLIC S pubid S syslit
+  *
+  * Must be called with ch on the initial 'S' or 'P'; any other
+  * character is not matched by a case.
+  *
+  * @return a SystemID or PublicID built from the parsed literals
+  */
+
+ def externalID(): ExternalID = ch.match {
+   case 'S' =>
+     nextch;
+     xToken("YSTEM");
+     xSpace; // the S between the keyword and the literal
+     val sysID = systemLiteral();
+     new SystemID(sysID);
+   case 'P' =>
+     nextch; xToken("UBLIC");
+     xSpace; // the S between the keyword and the public id
+     val pubID = pubidLiteral();
+     xSpace;
+     val sysID = systemLiteral();
+     new PublicID(pubID, sysID);
+ }
+ /** parses document type declaration and assigns it to instance variable
+  * dtd.
+  *
+  * <! parseDTD ::= DOCTYPE S name (S externalID)? S? ('[' ... ']' S?)? '>'
+  *
+  * The leading "<!" must already be consumed. The internal subset is
+  * skipped, not parsed. Reports a syntax error if a DTD was seen before.
+  */
+ def parseDTD(): Unit = { // dirty but fast
+   var extID: ExternalID = null;
+   if(this.dtd != null)
+     reportSyntaxError("unexpected character");
+   xToken("DOCTYPE");
+   xSpace;
+   val n = xName; // the name is parsed but not stored
+   xSpaceOpt; // "<!DOCTYPE a>" has nothing between name and '>'
+   //external ID
+   if('S' == ch || 'P' == ch) {
+     extID = externalID();
+     xSpaceOpt; // '[' or '>' may follow the literal directly
+   }
+   if('[' == ch) { // internal subset
+     nextch;
+     /* TODO */
+     while(']' != ch)
+       nextch;
+     // TODO: do the DTD parsing?? ?!?!?!?!!
+     xToken(']');
+     xSpaceOpt;
+   }
+   // consume the end of the declaration so the caller resumes behind it
+   xToken('>');
+   this.dtd = new DTD {
+     override var externalID = extID;
+   }
+ }
def element(pscope: NamespaceBinding): NodeSeq = {
xToken('<');
@@ -415,4 +582,209 @@ abstract class MarkupParser with TokenTests {
/*}*/
}
+ /** system literal, terminated by the quote character it started with.
+  *  SystemLiteral ::= `'` { _ } `'`
+  *                 |  `"` { _ } `"`
+  *
+  * Reports a syntax error if ch is not a quote; the character found is
+  * then used as terminator all the same.
+  *
+  * @return the text between the delimiters
+  */
+ def systemLiteral(): String = {
+   val delim = ch;
+   if(!(delim == '\'' || delim == '"'))
+     reportSyntaxError("quote ' or \" expected");
+   nextch;
+   // collect everything up to the closing delimiter in the name buffer
+   while (delim != ch) {
+     putChar(ch);
+     nextch;
+   }
+   nextch; // skip the closing delimiter
+   val literal = cbuf.toString();
+   cbuf.setLength( 0 );
+   literal
+ }
+
+
+ /* [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
+  *
+  * Reads up to the quote character the literal started with. Characters
+  * rejected by isPubIDChar are reported but still part of the result.
+  */
+ def pubidLiteral(): String = {
+   val delim = ch;
+   if(!(delim == '\'' || delim == '"'))
+     reportSyntaxError("quote ' or \" expected");
+   nextch;
+   while (delim != ch) {
+     putChar(ch);
+     if(!isPubIDChar(ch))
+       reportSyntaxError("char '"+ch+"' is not allowed in public id");
+     nextch;
+   }
+   nextch; // skip the closing delimiter
+   val literal = cbuf.toString();
+   cbuf.setLength( 0 );
+   literal
+ }
+
+ //
+ // dtd parsing
+ //
+
+ /** parses the declarations of an internal subset up to, but not
+  * including, the closing ']'.
+  *
+  * Parameter-entity references are prepended to decls; processing
+  * instructions and comments are skipped; element, entity, attribute
+  * list and notation declarations go to their respective methods.
+  * Whitespace between declarations is skipped.
+  */
+ def intSubset(): Unit = {
+   xSpaceOpt; // the subset may start right after '['
+   while(']' != ch) {
+     ch match {
+       case '%' =>
+         nextch;
+         decls = PEReference(xName) :: decls;
+         xToken(';')
+       //peReference
+       case '<' =>
+         nextch;
+
+         if('?' == ch) {
+           // NOTE(review): content() consumes the '?' before parsing a
+           // PI, so the same is done here -- confirm against xProcInstr
+           nextch;
+           xProcInstr; // simply ignore processing instructions!
+         } else {
+           xToken('!');
+           ch.match {
+             case '-' =>
+               xComment ; // ignore comments
+
+             case 'E' =>
+               nextch;
+               if('L' == ch) {
+                 nextch;
+                 elementDecl()
+               } else
+                 entityDecl();
+
+             case 'A' =>
+               nextch;
+               attrDecl();
+
+             case 'N' =>
+               nextch;
+               notationDecl();
+
+             case _ =>
+               // unknown declaration: report it and move on
+               reportSyntaxError("unexpected character");
+               nextch;
+           }
+         }
+       case _ =>
+         reportSyntaxError("unexpected character");
+         nextch; // advance, otherwise the loop would never leave this char
+     }
+     // declarations are usually separated by whitespace
+     xSpaceOpt;
+   }
+ }
+
+ /** <! elementDecl ::= ELEMENT S Name S contentspec '>'
+  *
+  * Called with "<!EL" already consumed. The text up to the closing '>'
+  * is handed to ContentModel.parse; the resulting ElemDecl (with a null
+  * attribute list) is prepended to decls.
+  */
+ def elementDecl(): Unit = {
+   xToken("EMENT");
+   xSpace;
+   val elemName = xName;
+   xSpace;
+   // copy the raw content model text into the buffer
+   while(ch != '>') {
+     putChar(ch);
+     nextch;
+   }
+   nextch; // skip '>'
+   val modelText = cbuf.toString();
+   cbuf.setLength( 0 );
+   decls = ElemDecl(elemName, ContentModel.parse(modelText), null) :: decls;
+ }
+
+ /** <! attrDecl ::= ATTLIST S Name (S Name S AttType S DefaultDecl)* S? '>'
+  *
+  * Called with "<!A" already consumed. The attribute type is kept as the
+  * raw text (whitespace removed) between the attribute name and its
+  * default declaration. The resulting AttListDecl is prepended to decls.
+  */
+ def attrDecl() = {
+   xToken("TTLIST");
+   xSpace;
+   val n = xName;
+   xSpaceOpt; // separates the element name from the first attribute
+   var attList: List[AttrDecl] = Nil;
+   // later: find the elemDecl for n
+   while('>' != ch) {
+     val aname = xName;
+     var defdecl: DefaultDecl = null;
+     xSpace;
+     while('"' != ch && '\'' != ch && '#' != ch && '<' != ch) {
+       if(!isSpace(ch))
+         cbuf.append(ch);
+       nextch;
+     }
+     // take the type text now: cbuf is the shared name buffer and the
+     // scanners called below may use it (presumably xName does)
+     val atpe = cbuf.toString();
+     cbuf.setLength(0);
+     ch match {
+       case '\'' | '"' =>
+         val defValue = xAttributeValue(); // default value
+         defdecl = DEFAULT(false, defValue);
+
+       case '#' =>
+         nextch; // '#' is not part of the keyword name
+         xName.match {
+           case "FIXED" =>
+             xSpace;
+             val defValue = xAttributeValue(); // default value
+             defdecl = DEFAULT(true, defValue);
+           case "IMPLIED" =>
+             defdecl = IMPLIED
+           case "REQUIRED" =>
+             defdecl = REQUIRED
+         }
+       case _ =>
+     }
+     xSpaceOpt;
+
+     // use the name parsed at the top of the iteration
+     attList = AttrDecl(aname, atpe, defdecl) :: attList;
+   }
+   nextch; // skip '>'
+   decls = AttListDecl(n, attList.reverse) :: decls
+ }
+
+ /** <! entityDecl ::= ENTITY S ('%' S)? Name S
+  *                    (externalID (S NDATA S Name)? | entity value) S? '>'
+  *
+  * Called with "<!E" already consumed. Prepends a ParameterEntityDecl,
+  * UnparsedEntityDecl or ParsedEntityDecl to decls and consumes the
+  * closing '>'.
+  */
+ def entityDecl() = {
+   var isParameterEntity = false;
+   xToken("NTITY");
+   xSpace;
+   if('%' == ch) {
+     nextch; // consume the '%' marking a parameter entity
+     isParameterEntity = true;
+     xSpace;
+   }
+   val n = xName;
+   xSpace;
+
+   val res = ch match {
+     case 'S' | 'P' => //sy
+       val extID = externalID();
+       // '>' may follow the literal directly
+       xSpaceOpt;
+       if(isParameterEntity) {
+         ParameterEntityDecl(n, ExtDef(extID))
+       } else { // notation?
+         if('>' != ch) {
+           xToken("NDATA");
+           xSpace;
+           val notat = xName;
+           UnparsedEntityDecl(n, extID, notat)
+         } else
+           ParsedEntityDecl(n, ExtDef(extID))
+       }
+
+     case '"' | '\'' =>
+       val av = xAttributeValue();
+       if(isParameterEntity)
+         ParameterEntityDecl(n, IntDef(av))
+       else
+         ParsedEntityDecl(n, IntDef(av))
+   }
+   // every form ends with optional whitespace and '>'
+   xSpaceOpt;
+   xToken('>');
+   decls = res :: decls;
+ } // entityDecl
+
+ /** 'N' notationDecl ::= "OTATION" S Name S externalID S? '>'
+  *
+  * Called with "<!N" already consumed. Prepends the NotationDecl to decls.
+  */
+ def notationDecl() = {
+   xToken("OTATION");
+   xSpace;
+   val notat = xName;
+   xSpace;
+   val extID = externalID();
+   // whitespace before the closing '>' is optional
+   xSpaceOpt;
+   xToken('>');
+   decls = NotationDecl(notat, extID) :: decls;
+ }
+
}
diff --git a/sources/scala/xml/parsing/TokenTests.scala b/sources/scala/xml/parsing/TokenTests.scala
index 8d95424d05..c148d484f3 100644
--- a/sources/scala/xml/parsing/TokenTests.scala
+++ b/sources/scala/xml/parsing/TokenTests.scala
@@ -85,6 +85,34 @@ trait TokenTests {
case _ => false;
}
+ /**
+  * Checks whether a name is well-formed as an IANA encoding name: a
+  * letter (A-Z, a-z) followed by any number of letters, digits, '.',
+  * '_' or '-'. Whether a decoder exists for the encoding is not checked.
+  *
+  * @param ianaEncoding The IANA encoding name.
+  * @return true iff the name is non-empty and well-formed
+  */
+ def isValidIANAEncoding(ianaEncoding: Seq[Char]): Boolean = {
+   def isLetter(c: Char): Boolean =
+     (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+   def isTailChar(c: Char): Boolean =
+     isLetter(c) || (c >= '0' && c <= '9') || c == '.' || c == '_' || c == '-';
+
+   val chars = ianaEncoding.elements;
+   // an empty name, or one not starting with a letter, is invalid
+   if(!chars.hasNext || !isLetter(chars.next))
+     false
+   else {
+     var valid = true;
+     while(valid && chars.hasNext)
+       valid = isTailChar(chars.next);
+     valid
+   }
+ } // isValidIANAEncoding(String): Boolean
+
def checkSysID( s:String ):boolean = {
s.indexOf('"') == -1 || s.indexOf('\'') == -1
}