diff options
author | Martin Odersky <odersky@gmail.com> | 2013-04-17 09:48:22 +0200 |
---|---|---|
committer | Martin Odersky <odersky@gmail.com> | 2013-04-17 10:16:22 +0200 |
commit | ca8dc7ada663e44aafe470944dd17256dbde151c (patch) | |
tree | d15939e204042e358e0c83064250f1f18c1c4f25 /src/dotty/tools/dotc/parsing | |
parent | e32fedb6844eab11a27e365a570b2033a0f6f78d (diff) | |
download | dotty-ca8dc7ada663e44aafe470944dd17256dbde151c.tar.gz dotty-ca8dc7ada663e44aafe470944dd17256dbde151c.tar.bz2 dotty-ca8dc7ada663e44aafe470944dd17256dbde151c.zip |
Scanners added.
Moving Positions, Chars to new packages.
Added Source positions.
Added untyped trees module.
Factored out behavior between typed and untyped trees.
Diffstat (limited to 'src/dotty/tools/dotc/parsing')
-rw-r--r-- | src/dotty/tools/dotc/parsing/CharArrayReader.scala | 131 | ||||
-rw-r--r-- | src/dotty/tools/dotc/parsing/Scanners.scala | 958 | ||||
-rw-r--r-- | src/dotty/tools/dotc/parsing/Tokens.scala | 171 |
3 files changed, 1260 insertions, 0 deletions
diff --git a/src/dotty/tools/dotc/parsing/CharArrayReader.scala b/src/dotty/tools/dotc/parsing/CharArrayReader.scala new file mode 100644 index 000000000..29346b78a --- /dev/null +++ b/src/dotty/tools/dotc/parsing/CharArrayReader.scala @@ -0,0 +1,131 @@ +package dotty.tools +package dotc +package parsing + +import scala.reflect.internal.Chars._ + +abstract class CharArrayReader { self => + + val buf: Array[Char] + + /** Switch whether unicode should be decoded */ + protected def decodeUni: Boolean = true + + /** An error routine to call on bad unicode escapes \\uxxxx. */ + protected def error(msg: String, offset: Int): Unit + + /** the last read character */ + var ch: Char = _ + + /** The offset one past the last read character */ + var charOffset: Int = 0 + + /** The offset before the last read character */ + var lastCharOffset: Int = 0 + + /** The start offset of the current line */ + var lineStartOffset: Int = 0 + + /** The start offset of the line before the current one */ + var lastLineStartOffset: Int = 0 + + private var lastUnicodeOffset = -1 + + /** Is last character a unicode escape \\uxxxx? */ + def isUnicodeEscape = charOffset == lastUnicodeOffset + + /** Advance one character; reducing CR;LF pairs to just LF */ + final def nextChar(): Unit = { + val idx = charOffset + lastCharOffset = idx + if (idx >= buf.length) { + ch = SU + } else { + val c = buf(idx) + ch = c + charOffset = idx + 1 + if (c == '\\') potentialUnicode() + else if (c < ' ') { skipCR(); potentialLineEnd() } + } + } + + def getc() = { nextChar() ; ch } + + /** Advance one character, leaving CR;LF pairs intact. + * This is for use in multi-line strings, so there are no + * "potential line ends" here. 
+ */ + final def nextRawChar(): Unit = { + val idx = charOffset + lastCharOffset = idx + if (idx >= buf.length) { + ch = SU + } else { + val c = buf(charOffset) + ch = c + charOffset = idx + 1 + if (c == '\\') potentialUnicode() + } + } + + /** Interpret \\uxxxx escapes */ + private def potentialUnicode() { + def evenSlashPrefix: Boolean = { + var p = charOffset - 2 + while (p >= 0 && buf(p) == '\\') p -= 1 + (charOffset - p) % 2 == 0 + } + def udigit: Int = { + if (charOffset >= buf.length) { + // Since the positioning code is very insistent about throwing exceptions, + // we have to decrement the position so our error message can be seen, since + // we are one past EOF. This happens with e.g. val x = \ u 1 <EOF> + error("incomplete unicode escape", charOffset - 1) + SU + } + else { + val d = digit2int(buf(charOffset), 16) + if (d >= 0) charOffset += 1 + else error("error in unicode escape", charOffset) + d + } + } + if (charOffset < buf.length && buf(charOffset) == 'u' && decodeUni && evenSlashPrefix) { + do charOffset += 1 + while (charOffset < buf.length && buf(charOffset) == 'u') + val code = udigit << 12 | udigit << 8 | udigit << 4 | udigit + lastUnicodeOffset = charOffset + ch = code.toChar + } + } + + /** replace CR;LF by LF */ + private def skipCR() { + if (ch == CR) + if (charOffset < buf.length && buf(charOffset) == LF) { + charOffset += 1 + ch = LF + } + } + + /** Handle line ends */ + private def potentialLineEnd() { + if (ch == LF || ch == FF) { + lastLineStartOffset = lineStartOffset + lineStartOffset = charOffset + } + } + + def isAtEnd = charOffset >= buf.length + + /** A new reader that takes off at the current character position */ + def lookaheadReader = new CharArrayLookaheadReader + + class CharArrayLookaheadReader extends CharArrayReader { + val buf = self.buf + charOffset = self.charOffset + ch = self.ch + override def decodeUni = self.decodeUni + def error(msg: String, offset: Int) = self.error(msg, offset) + } +} diff --git 
a/src/dotty/tools/dotc/parsing/Scanners.scala b/src/dotty/tools/dotc/parsing/Scanners.scala new file mode 100644 index 000000000..2b3ec9bc2 --- /dev/null +++ b/src/dotty/tools/dotc/parsing/Scanners.scala @@ -0,0 +1,958 @@ +package dotty.tools +package dotc +package parsing + +import Tokens._ +import core.Names._, core.Contexts._, core.Decorators._, util.Positions._ +import core.StdNames._ +import util.SourceFile +import java.lang.Character.isDigit +import scala.reflect.internal.Chars._ +import Tokens._ +import scala.annotation.{ switch, tailrec } +import scala.collection.{ mutable, immutable } +import mutable.{ ListBuffer, ArrayBuffer } +import scala.xml.Utility.isNameStart + +object Scanners { + + /** Offset into source character array */ + type Offset = Int + + /** An undefined offset */ + val NoOffset: Offset = -1 + + case class Comment(pos: Position, chrs: String) { + def isDocComment = chrs.startsWith("/**") + } + + type Token = Int + + trait TokenData { + + /** the next token */ + var token: Token = EMPTY + + /** the offset of the first character of the current token */ + var offset: Offset = 0 + + /** the offset of the character following the token preceding this one */ + var lastOffset: Offset = 0 + + /** the name of an identifier */ + var name: TermName = null + + /** the string value of a literal */ + var strVal: String = null + + /** the base of a number */ + var base: Int = 0 + + def copyFrom(td: TokenData) = { + this.token = td.token + this.offset = td.offset + this.lastOffset = td.lastOffset + this.name = td.name + this.strVal = td.strVal + this.base = td.base + } + } + + class Scanner(source: SourceFile)(implicit ctx: Context) extends CharArrayReader with TokenData { + + val buf = source.content + + var keepComments = false + + /** All comments in the reverse order of their position in the source. + * set only when `keepComments` is true. 
+ */ + var revComments: List[Comment] = Nil + + /** the last error offset + */ + var errOffset: Offset = NoOffset + + /** A buffer for comments */ + val commentBuf = new StringBuilder + + /** A character buffer for literals + */ + val litBuf = new StringBuilder + + /** append Unicode character to "litBuf" buffer + */ + protected def putChar(c: Char): Unit = litBuf.append(c) + + /** Clear buffer and set string */ + private def setStrVal() = flushBuf(litBuf) + + private class TokenData0 extends TokenData + + /** we need one token lookahead and one token history + */ + private val next : TokenData = new TokenData0 + private val prev : TokenData = new TokenData0 + + /** a stack of tokens which indicates whether line-ends can be statement separators + * also used for keeping track of nesting levels. + * We keep track of the closing symbol of a region. This can be + * RPAREN if region starts with '(' + * RBRACKET if region starts with '[' + * RBRACE if region starts with '{' + * ARROW if region starts with `case' + * STRINGLIT if region is a string interpolation expression starting with '${' + * (the STRINGLIT appears twice in succession on the stack iff the + * expression is a multiline string literal). + */ + var sepRegions: List[Token] = List() + +// Get next token ------------------------------------------------------------ + + /** Are we directly in a string interpolation expression? + */ + private def inStringInterpolation = + sepRegions.nonEmpty && sepRegions.head == STRINGLIT + + /** Are we directly in a multiline string interpolation expression? 
+ * @pre inStringInterpolation + */ + private def inMultiLineInterpolation = + inStringInterpolation && sepRegions.tail.nonEmpty && sepRegions.tail.head == STRINGPART + + /** read next token and return last offset + */ + def skipToken(): Offset = { + val off = offset + nextToken() + off + } + + def adjustSepRegions(lastToken: Token): Unit = (lastToken: @switch) match { + case LPAREN => + sepRegions = RPAREN :: sepRegions + case LBRACKET => + sepRegions = RBRACKET :: sepRegions + case LBRACE => + sepRegions = RBRACE :: sepRegions + case CASE => + sepRegions = ARROW :: sepRegions + case RBRACE => + while (!sepRegions.isEmpty && sepRegions.head != RBRACE) + sepRegions = sepRegions.tail + if (!sepRegions.isEmpty) sepRegions = sepRegions.tail + case RBRACKET | RPAREN => + if (!sepRegions.isEmpty && sepRegions.head == lastToken) + sepRegions = sepRegions.tail + case ARROW => + if (!sepRegions.isEmpty && sepRegions.head == lastToken) + sepRegions = sepRegions.tail + case STRINGLIT => + if (inMultiLineInterpolation) + sepRegions = sepRegions.tail.tail + else if (inStringInterpolation) + sepRegions = sepRegions.tail + case _ => + } + + /** Produce next token, filling TokenData fields of Scanner. + */ + def nextToken() { + val lastToken = token + adjustSepRegions(lastToken) + + // Read a token or copy it from `next` tokenData + if (next.token == EMPTY) { + lastOffset = lastCharOffset + if (inStringInterpolation) fetchStringPart() + else fetchToken() + if (token == ERROR) adjustSepRegions(STRINGLIT) + } else { + this copyFrom next + next.token = EMPTY + } + + /** Insert NEWLINE or NEWLINES if + * - we are after a newline + * - we are within a { ... 
} or on toplevel (wrt sepRegions) + * - the current token can start a statement and the one before can end it + * insert NEWLINES if we are past a blank line, NEWLINE otherwise + */ + if (isAfterLineEnd() && + (canEndStatTokens contains lastToken) && + (canStartStatTokens contains token) && + (sepRegions.isEmpty || sepRegions.head == RBRACE)) { + next copyFrom this + offset = lineStartOffset min lastLineStartOffset + token = if (pastBlankLine()) NEWLINES else NEWLINE + } + + postProcessToken() +// print("["+this+"]") + } + + def postProcessToken() = { + // Join CASE + CLASS => CASECLASS, CASE + OBJECT => CASEOBJECT, SEMI + ELSE => ELSE + if (token == CASE) { + prev copyFrom this + val nextLastOffset = lastCharOffset + fetchToken() + def resetOffset() { + offset = prev.offset + lastOffset = prev.lastOffset + } + if (token == CLASS) { + token = CASECLASS + resetOffset() + } else if (token == OBJECT) { + token = CASEOBJECT + resetOffset() + } else { + lastOffset = nextLastOffset + next copyFrom this + this copyFrom prev + } + } else if (token == SEMI) { + prev copyFrom this + fetchToken() + if (token != ELSE) { + next copyFrom this + this copyFrom prev + } + } + } + + /** Is current token first one after a newline? */ + def isAfterLineEnd(): Boolean = + lastOffset < lineStartOffset && + (lineStartOffset <= offset || + lastOffset < lastLineStartOffset && lastLineStartOffset <= offset) + + /** Is there a blank line between the current token and the last one? + * @pre afterLineEnd(). + */ + private def pastBlankLine(): Boolean = { + val end = offset + def recur(idx: Offset, isBlank: Boolean): Boolean = + idx < end && { + val ch = buf(idx) + if (ch == LF || ch == FF) isBlank || recur(idx + 1, true) + else recur(idx + 1, isBlank && ch <= ' ') + } + recur(lastOffset, false) + } + + /** read next token, filling TokenData fields of Scanner. 
+ */ + protected final def fetchToken() { + offset = charOffset - 1 + (ch: @switch) match { + case ' ' | '\t' | CR | LF | FF => + nextChar() + fetchToken() + case 'A' | 'B' | 'C' | 'D' | 'E' | + 'F' | 'G' | 'H' | 'I' | 'J' | + 'K' | 'L' | 'M' | 'N' | 'O' | + 'P' | 'Q' | 'R' | 'S' | 'T' | + 'U' | 'V' | 'W' | 'X' | 'Y' | + 'Z' | '$' | '_' | + 'a' | 'b' | 'c' | 'd' | 'e' | + 'f' | 'g' | 'h' | 'i' | 'j' | + 'k' | 'l' | 'm' | 'n' | 'o' | + 'p' | 'q' | 'r' | 's' | 't' | + 'u' | 'v' | 'w' | 'x' | 'y' | + 'z' => + putChar(ch) + nextChar() + getIdentRest() + if (ch == '"' && token == IDENTIFIER) + token = INTERPOLATIONID + case '<' => // is XMLSTART? + def fetchLT() = { + val last = if (charOffset >= 2) buf(charOffset - 2) else ' ' + nextChar() + last match { + case ' ' | '\t' | '\n' | '{' | '(' | '>' if isNameStart(ch) || ch == '!' || ch == '?' => + token = XMLSTART + case _ => + // Console.println("found '<', but last is '"+in.last+"'"); // DEBUG + putChar('<') + getOperatorRest() + } + } + fetchLT + case '~' | '!' | '@' | '#' | '%' | + '^' | '*' | '+' | '-' | /*'<' | */ + '>' | '?' | ':' | '=' | '&' | + '|' | '\\' => + putChar(ch) + nextChar() + getOperatorRest() + case '/' => + if (skipComment()) { + fetchToken() + } else { + putChar('/') + getOperatorRest() + } + case '0' => + def fetchZero() = { + putChar(ch) + nextChar() + if (ch == 'x' || ch == 'X') { + nextChar() + base = 16 + } else { + /** + * What should leading 0 be in the future? It is potentially dangerous + * to let it be base-10 because of history. Should it be an error? Is + * there a realistic situation where one would need it? 
+ */ + if (isDigit(ch)) + error("Non-zero numbers may not have a leading zero.") + } + getNumber() + } + fetchZero + case '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => + base = 10 + getNumber() + case '`' => + getBackquotedIdent() + case '\"' => + def fetchDoubleQuote() = { + if (token == INTERPOLATIONID) { + nextRawChar() + if (ch == '\"') { + nextRawChar() + if (ch == '\"') { + nextRawChar() + getStringPart(multiLine = true) + sepRegions = STRINGPART :: sepRegions // indicate string part + sepRegions = STRINGLIT :: sepRegions // once more to indicate multi line string part + } else { + token = STRINGLIT + strVal = "" + } + } else { + getStringPart(multiLine = false) + sepRegions = STRINGLIT :: sepRegions // indicate single line string part + } + } else { + nextChar() + if (ch == '\"') { + nextChar() + if (ch == '\"') { + nextRawChar() + getRawStringLit() + } else { + token = STRINGLIT + strVal = "" + } + } else { + getStringLit() + } + } + } + fetchDoubleQuote + case '\'' => + def fetchSingleQuote() = { + nextChar() + if (isIdentifierStart(ch)) + charLitOr(getIdentRest) + else if (isOperatorPart(ch) && (ch != '\\')) + charLitOr(getOperatorRest) + else { + getLitChar() + if (ch == '\'') { + nextChar() + token = CHARLIT + setStrVal() + } else { + error("unclosed character literal") + } + } + } + fetchSingleQuote + case '.' 
=> + nextChar() + if ('0' <= ch && ch <= '9') { + putChar('.'); getFraction(); setStrVal() + } else { + token = DOT + } + case ';' => + nextChar(); token = SEMI + case ',' => + nextChar(); token = COMMA + case '(' => + nextChar(); token = LPAREN + case '{' => + nextChar(); token = LBRACE + case ')' => + nextChar(); token = RPAREN + case '}' => + nextChar(); token = RBRACE + case '[' => + nextChar(); token = LBRACKET + case ']' => + nextChar(); token = RBRACKET + case SU => + if (isAtEnd) token = EOF + else { + error("illegal character") + nextChar() + } + case _ => + def fetchOther() = { + if (ch == '\u21D2') { + nextChar(); token = ARROW + } else if (ch == '\u2190') { + nextChar(); token = LARROW + } else if (Character.isUnicodeIdentifierStart(ch)) { + putChar(ch) + nextChar() + getIdentRest() + } else if (isSpecial(ch)) { + putChar(ch) + nextChar() + getOperatorRest() + } else { + error(f"illegal character '\\u${ch: Int}%04x'") + nextChar() + } + } + fetchOther + } + } + + private def skipComment(): Boolean = { + def appendToComment(ch: Char) = + if (keepComments) commentBuf.append(ch) + def nextChar() = { + appendToComment(ch) + Scanner.this.nextChar() + } + def skipLine(): Unit = { + nextChar() + if ((ch != CR) && (ch != LF) && (ch != SU)) skipLine() + } + @tailrec + def skipBlock(openComments: Int): Unit = { + val last = ch + nextChar() + if (ch == '/') + if (last == '*') { + if (openComments > 0) skipBlock(openComments - 1) + } else { + nextChar() + if (ch == '*') { nextChar(); skipBlock(openComments + 1) } + else skipBlock(openComments) + } + else if (ch == SU) incompleteInputError("unclosed comment") + else skipBlock(openComments) + } + val start = lastCharOffset + def finishComment(): Boolean = { + if (keepComments) { + val pos = Position(start, charOffset) + nextChar() + revComments = Comment(pos, flushBuf(commentBuf)) :: revComments + } + true + } + nextChar() + if (ch == '/') { skipLine(); finishComment() } + else if (ch == '*') { nextChar(); 
skipBlock(0); finishComment() } + else false + } + +// Identifiers --------------------------------------------------------------- + + private def getBackquotedIdent() { + nextChar() + getLitChars('`') + if (ch == '`') { + nextChar() + finishNamed(BACKQUOTED_IDENT) + if (name.length == 0) + error("empty quoted identifier") + else if (name == nme.WILDCARD) + error("wildcard invalid as backquoted identifier") + } + else error("unclosed quoted identifier") + } + + private def getIdentRest(): Unit = (ch: @switch) match { + case 'A' | 'B' | 'C' | 'D' | 'E' | + 'F' | 'G' | 'H' | 'I' | 'J' | + 'K' | 'L' | 'M' | 'N' | 'O' | + 'P' | 'Q' | 'R' | 'S' | 'T' | + 'U' | 'V' | 'W' | 'X' | 'Y' | + 'Z' | '$' | + 'a' | 'b' | 'c' | 'd' | 'e' | + 'f' | 'g' | 'h' | 'i' | 'j' | + 'k' | 'l' | 'm' | 'n' | 'o' | + 'p' | 'q' | 'r' | 's' | 't' | + 'u' | 'v' | 'w' | 'x' | 'y' | + 'z' | + '0' | '1' | '2' | '3' | '4' | + '5' | '6' | '7' | '8' | '9' => + putChar(ch) + nextChar() + getIdentRest() + case '_' => + putChar(ch) + nextChar() + getIdentOrOperatorRest() + case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true! + finishNamed() + case _ => + if (Character.isUnicodeIdentifierPart(ch)) { + putChar(ch) + nextChar() + getIdentRest() + } else { + finishNamed() + } + } + + private def getOperatorRest(): Unit = (ch: @switch) match { + case '~' | '!' | '@' | '#' | '%' | + '^' | '*' | '+' | '-' | '<' | + '>' | '?' | ':' | '=' | '&' | + '|' | '\\' => + putChar(ch); nextChar(); getOperatorRest() + case '/' => + if (skipComment()) finishNamed() + else { putChar('/'); getOperatorRest() } + case _ => + if (isSpecial(ch)) { putChar(ch); nextChar(); getOperatorRest() } + else finishNamed() + } + + private def getIdentOrOperatorRest() { + if (isIdentifierPart(ch)) + getIdentRest() + else ch match { + case '~' | '!' | '@' | '#' | '%' | + '^' | '*' | '+' | '-' | '<' | + '>' | '?' 
| ':' | '=' | '&' | + '|' | '\\' | '/' => + getOperatorRest() + case _ => + if (isSpecial(ch)) getOperatorRest() + else finishNamed() + } + } + + +// Literals ----------------------------------------------------------------- + + private def getStringLit() = { + getLitChars('"') + if (ch == '"') { + setStrVal() + nextChar() + token = STRINGLIT + } else error("unclosed string literal") + } + + private def getRawStringLit(): Unit = { + if (ch == '\"') { + nextRawChar() + if (isTripleQuote()) { + setStrVal() + token = STRINGLIT + } else + getRawStringLit() + } else if (ch == SU) { + incompleteInputError("unclosed multi-line string literal") + } else { + putChar(ch) + nextRawChar() + getRawStringLit() + } + } + + @annotation.tailrec private def getStringPart(multiLine: Boolean): Unit = { + def finishStringPart() = { + setStrVal() + token = STRINGPART + next.lastOffset = charOffset - 1 + next.offset = charOffset - 1 + } + if (ch == '"') { + if (multiLine) { + nextRawChar() + if (isTripleQuote()) { + setStrVal() + token = STRINGLIT + } else + getStringPart(multiLine) + } else { + nextChar() + setStrVal() + token = STRINGLIT + } + } else if (ch == '$') { + nextRawChar() + if (ch == '$') { + putChar(ch) + nextRawChar() + getStringPart(multiLine) + } else if (ch == '{') { + finishStringPart() + nextRawChar() + next.token = LBRACE + } else if (Character.isUnicodeIdentifierStart(ch)) { + finishStringPart() + do { + putChar(ch) + nextRawChar() + } while (ch != SU && Character.isUnicodeIdentifierPart(ch)) + finishNamed(target = next) + } else { + error("invalid string interpolation: `$$', `$'ident or `$'BlockExpr expected") + } + } else { + val isUnclosedLiteral = !isUnicodeEscape && (ch == SU || (!multiLine && (ch == CR || ch == LF))) + if (isUnclosedLiteral) { + if (multiLine) + incompleteInputError("unclosed multi-line string literal") + else + error("unclosed string literal") + } + else { + putChar(ch) + nextRawChar() + getStringPart(multiLine) + } + } + } + + private def 
fetchStringPart() = { + offset = charOffset - 1 + getStringPart(multiLine = inMultiLineInterpolation) + } + + private def isTripleQuote(): Boolean = + if (ch == '"') { + nextRawChar() + if (ch == '"') { + nextChar() + while (ch == '"') { + putChar('"') + nextChar() + } + true + } else { + putChar('"') + putChar('"') + false + } + } else { + putChar('"') + false + } + + /** copy current character into litBuf, interpreting any escape sequences, + * and advance to next character. + */ + protected def getLitChar(): Unit = + if (ch == '\\') { + nextChar() + if ('0' <= ch && ch <= '7') { + val leadch: Char = ch + var oct: Int = digit2int(ch, 8) + nextChar() + if ('0' <= ch && ch <= '7') { + oct = oct * 8 + digit2int(ch, 8) + nextChar() + if (leadch <= '3' && '0' <= ch && ch <= '7') { + oct = oct * 8 + digit2int(ch, 8) + nextChar() + } + } + putChar(oct.toChar) + } else { + ch match { + case 'b' => putChar('\b') + case 't' => putChar('\t') + case 'n' => putChar('\n') + case 'f' => putChar('\f') + case 'r' => putChar('\r') + case '\"' => putChar('\"') + case '\'' => putChar('\'') + case '\\' => putChar('\\') + case _ => invalidEscape() + } + nextChar() + } + } else { + putChar(ch) + nextChar() + } + + protected def invalidEscape(): Unit = { + error("invalid escape character", charOffset - 1) + putChar(ch) + } + + private def getLitChars(delimiter: Char) = { + while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape)) + getLitChar() + } + + /** read fractional part and exponent of floating point number + * if one is present. 
+ */ + protected def getFraction() { + token = DOUBLELIT + while ('0' <= ch && ch <= '9') { + putChar(ch) + nextChar() + } + if (ch == 'e' || ch == 'E') { + val lookahead = lookaheadReader + lookahead.nextChar() + if (lookahead.ch == '+' || lookahead.ch == '-') { + lookahead.nextChar() + } + if ('0' <= lookahead.ch && lookahead.ch <= '9') { + putChar(ch) + nextChar() + if (ch == '+' || ch == '-') { + putChar(ch) + nextChar() + } + while ('0' <= ch && ch <= '9') { + putChar(ch) + nextChar() + } + } + token = DOUBLELIT + } + if (ch == 'd' || ch == 'D') { + putChar(ch) + nextChar() + token = DOUBLELIT + } else if (ch == 'f' || ch == 'F') { + putChar(ch) + nextChar() + token = FLOATLIT + } + checkNoLetter() + } + def checkNoLetter() { + if (isIdentifierPart(ch) && ch >= ' ') + error("Invalid literal number") + } + + /** Read a number into strVal and set base + */ + protected def getNumber() { + while (digit2int(ch, base) >= 0) { + putChar(ch) + nextChar() + } + token = INTLIT + if (base == 10 && ch == '.') { + val isDefinitelyNumber = { + val lookahead = lookaheadReader + val c = lookahead.getc() + (c: @switch) match { + /** Another digit is a giveaway. */ + case '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => + true + + /** Backquoted idents like 22.`foo`. */ + case '`' => + false + + /** These letters may be part of a literal, or a method invocation on an Int. + */ + case 'd' | 'D' | 'f' | 'F' => + !isIdentifierPart(lookahead.getc()) + + /** A little more special handling for e.g. 
5e7 */ + case 'e' | 'E' => + val ch = lookahead.getc() + !isIdentifierPart(ch) || (isDigit(ch) || ch == '+' || ch == '-') + + case x => + !isIdentifierStart(x) + } + } + if (isDefinitelyNumber) { + putChar(ch) + nextChar() + getFraction() + } + } else (ch: @switch) match { + case 'e' | 'E' | 'f' | 'F' | 'd' | 'D' if base == 10 => + getFraction() + case 'l' | 'L' => + nextChar() + token = LONGLIT + case _ => + } + setStrVal() + } + + /** Parse character literal if current character is followed by \', + * or follow with given op and return a symbol literal token + */ + def charLitOr(op: () => Unit) { + putChar(ch) + nextChar() + if (ch == '\'') { + nextChar() + token = CHARLIT + setStrVal() + } else { + op() + token = SYMBOLLIT + strVal = name.toString + } + } + +// Setting token data ---------------------------------------------------- + + /** Clear buffer and set name and token */ + def finishNamed(idtoken: Token = IDENTIFIER, target: TokenData = this): Unit = { + target.name = flushBuf(litBuf).toTermName + target.token = idtoken + if (idtoken == IDENTIFIER) { + val idx = target.name.start + if (idx >= 0 && idx < kwArray.length) target.token = kwArray(idx) + } + } + + /** Return buffer contents and clear */ + def flushBuf(buf: StringBuilder): String = { + val str = buf.toString + buf.clear() + str + } + + /** Convert current strVal to char value + */ + def charVal: Char = if (strVal.length > 0) strVal.charAt(0) else 0 + + /** Convert current strVal, base to long value + * This is tricky because of max negative value. 
+ */ + def intVal(negated: Boolean): Long = { + if (token == CHARLIT && !negated) { + charVal + } else { + var value: Long = 0 + val divider = if (base == 10) 1 else 2 + val limit: Long = + if (token == LONGLIT) Long.MaxValue else Int.MaxValue + var i = 0 + val len = strVal.length + while (i < len) { + val d = digit2int(strVal charAt i, base) + if (d < 0) { + error("malformed integer number") + return 0 + } + if (value < 0 || + limit / (base / divider) < value || + limit - (d / divider) < value * (base / divider) && + !(negated && limit == value * base - 1 + d)) { + error("integer number too large") + return 0 + } + value = value * base + d + i += 1 + } + if (negated) -value else value + } + } + + def intVal: Long = intVal(false) + + /** Convert current strVal, base to double value + */ + def floatVal(negated: Boolean): Double = { + val limit: Double = + if (token == DOUBLELIT) Double.MaxValue else Float.MaxValue + try { + val value: Double = java.lang.Double.valueOf(strVal).doubleValue() + if (value > limit) + error("floating point number too large") + if (negated) -value else value + } catch { + case _: NumberFormatException => + error("malformed floating point number") + 0.0 + } + } + + def floatVal: Double = floatVal(false) + + override def toString = showTokenDetailed(token) + + def show: String = token match { + case IDENTIFIER | BACKQUOTED_IDENT => s"id($name)" + case CHARLIT => s"char($intVal)" + case INTLIT => s"int($intVal)" + case LONGLIT => s"long($intVal)" + case FLOATLIT => s"float($floatVal)" + case DOUBLELIT => s"double($floatVal)" + case STRINGLIT => s"string($strVal)" + case STRINGPART => s"stringpart($strVal)" + case INTERPOLATIONID => s"interpolationid($name)" + case SEMI => ";" + case NEWLINE => ";" + case NEWLINES => ";;" + case COMMA => "," + case _ => showToken(token) + } + +// (does not seem to be needed) def flush = { charOffset = offset; nextChar(); this } + + /* Resume normal scanning after XML */ + def resume(lastToken: Token) = { + 
token = lastToken + if (next.token != EMPTY && !ctx.reporter.hasErrors) + error("unexpected end of input: possible missing '}' in XML block") + + nextToken() + } + +// Errors ----------------------------------------------------------------- + + /** Generate an error at the given offset */ + def error(msg: String, off: Offset = offset) = { + ctx.error(msg, source atPos Position(off)) + token = ERROR + errOffset = off + } + + /** signal an error where the input ended in the middle of a token */ + def incompleteInputError(msg: String) { + ctx.reporter.incompleteInputError(msg, source atPos Position(offset)) + token = EOF + errOffset = offset + } + + /* Initialization: read first char, then first token */ + nextChar() + nextToken() + } // end Scanner + + // ------------- keyword configuration ----------------------------------- + + private val kwArray: Array[Token] = { + def start(tok: Token) = tok.toString.toTermName.start + val sourceKeywords = keywords.filterNot(_.toString contains " ") + val lastIdx = sourceKeywords.map(start).max + val arr = Array.fill(lastIdx + 1)(IDENTIFIER) + for (kw <- sourceKeywords) arr(start(kw)) = kw + arr + } +} diff --git a/src/dotty/tools/dotc/parsing/Tokens.scala b/src/dotty/tools/dotc/parsing/Tokens.scala new file mode 100644 index 000000000..f573df49d --- /dev/null +++ b/src/dotty/tools/dotc/parsing/Tokens.scala @@ -0,0 +1,171 @@ +package dotty.tools +package dotc +package parsing + +import collection.mutable +import collection.immutable.BitSet +import scala.annotation.switch + +object Tokens { + + final val minToken = EMPTY + final val maxToken = XMLSTART + + type TokenSet = BitSet + + def tokenRange(lo: Int, hi: Int): TokenSet = BitSet(lo to hi: _*) + + def showTokenDetailed(token: Int) = debugString(token) + + def showToken(token: Int) = { + val str = tokenString(token) + if (keywords contains token) s"'$str'" else str + } + + val tokenString, debugString = new Array[String](maxToken + 1) + + def enter(token: Int, str: String, 
debugStr: String = ""): Unit = { + tokenString(token) = str + debugString(token) = if (debugStr.isEmpty) str else debugStr + } + + /** special tokens */ + final val EMPTY = 0; enter(EMPTY, "<empty>") // a missing token, used in lookahead + final val ERROR = 1; enter(ERROR, "erroneous token") // an erroneous token + final val EOF = 2; enter(EOF, "eof") + + /** literals */ + final val CHARLIT = 3; enter(CHARLIT, "character literal") + final val INTLIT = 4; enter(INTLIT, "integer literal") + final val LONGLIT = 5; enter(LONGLIT, "long literal") + final val FLOATLIT = 6; enter(FLOATLIT, "float literal") + final val DOUBLELIT = 7; enter(DOUBLELIT, "double literal") + final val STRINGLIT = 8; enter(STRINGLIT, "string literal") + final val STRINGPART = 9; enter(STRINGPART, "string literal", "string literal part") + final val INTERPOLATIONID = 10; enter(INTERPOLATIONID, "string interpolator") + final val SYMBOLLIT = 11; enter(SYMBOLLIT, "symbol literal") // TODO: deprecate + + final val literalTokens = tokenRange(CHARLIT, SYMBOLLIT) + + /** identifiers */ + final val IDENTIFIER = 12; enter(IDENTIFIER, "identifier") + final val BACKQUOTED_IDENT = 13; enter(BACKQUOTED_IDENT, "identifier", "backquoted ident") + + final val identifierTokens = BitSet(IDENTIFIER, BACKQUOTED_IDENT) + + def isIdentifier(token : Int) = + token >= IDENTIFIER && token <= BACKQUOTED_IDENT + + /** alphabetic keywords */ + final val IF = 20; enter(IF, "if") + final val FOR = 21; enter(FOR, "for") + final val ELSE = 22; enter(ELSE, "else") + final val THIS = 23; enter(THIS, "this") + final val NULL = 24; enter(NULL, "null") + final val NEW = 25; enter(NEW, "new") + final val WITH = 26; enter(WITH, "with") + final val SUPER = 27; enter(SUPER, "super") + final val CASE = 28; enter(CASE, "case") + final val CASECLASS = 29; enter(CASECLASS, "case class") + final val CASEOBJECT = 30; enter(CASEOBJECT, "case object") + final val VAL = 31; enter(VAL, "val") + final val ABSTRACT = 32; enter(ABSTRACT, "abstract") 
+ final val FINAL = 33; enter(FINAL, "final") + final val PRIVATE = 34; enter(PRIVATE, "private") + final val PROTECTED = 35; enter(PROTECTED, "protected") + final val OVERRIDE = 36; enter(OVERRIDE, "override") + final val IMPLICIT = 37; enter(IMPLICIT, "implicit") + final val VAR = 38; enter(VAR, "var") + final val DEF = 39; enter(DEF, "def") + final val TYPE = 40; enter(TYPE, "type") + final val EXTENDS = 41; enter(EXTENDS, "extends") + final val TRUE = 42; enter(TRUE, "true") + final val FALSE = 43; enter(FALSE, "false") + final val OBJECT = 44; enter(OBJECT, "object") + final val CLASS = 45; enter(CLASS, "class") + final val IMPORT = 46; enter(IMPORT, "import") + final val PACKAGE = 47; enter(PACKAGE, "package") + final val YIELD = 48; enter(YIELD, "yield") + final val DO = 49; enter(DO, "do") + final val TRAIT = 50; enter(TRAIT, "trait") + final val SEALED = 51; enter(SEALED, "sealed") + final val THROW = 52; enter(THROW, "throw") + final val TRY = 53; enter(TRY, "try") + final val CATCH = 54; enter(CATCH, "catch") + final val FINALLY = 55; enter(FINALLY, "finally") + final val WHILE = 56; enter(WHILE, "while") + final val RETURN = 57; enter(RETURN, "return") + final val MATCH = 58; enter(MATCH, "match") + final val FORSOME = 59; enter(FORSOME, "forSome") // TODO: deprecate + final val LAZY = 61; enter(LAZY, "lazy") + final val THEN = 62; enter(THEN, "then") + + final val alphaKeywords = tokenRange(IF, LAZY) + + /** special symbols */ + final val COMMA = 70; enter(COMMA, "','") + final val SEMI = 71; enter(SEMI, "';'") // register each token on its own declaration line (was cross-registered with DOT) + final val DOT = 72; enter(DOT, "'.'") + final val NEWLINE = 78; enter(NEWLINE, "';'", "new line") + final val NEWLINES = 79; enter(NEWLINES, "';'", "new lines") + + /** special keywords */ + final val USCORE = 73; enter(USCORE, "_") + final val COLON = 74; enter(COLON, ":") + final val EQUALS = 75; enter(EQUALS, "==") + final val LARROW = 76; enter(LARROW, "<-") + final val ARROW = 77; enter(ARROW, "=>") + final val SUBTYPE = 80; 
enter(SUBTYPE, "<:") + final val SUPERTYPE = 81; enter(SUPERTYPE, ">:") + final val HASH = 82; enter(HASH, "#") + final val AT = 83; enter(AT, "@") + final val VIEWBOUND = 84; enter(VIEWBOUND, "<%") // TODO: deprecate + + final val symbolicKeywords = tokenRange(USCORE, VIEWBOUND) + final val symbolicTokens = tokenRange(COMMA, VIEWBOUND) + final val keywords = alphaKeywords | symbolicKeywords + + /** parentheses */ + final val LPAREN = 90; enter(LPAREN, "'('") + final val RPAREN = 91; enter(RPAREN, "')'") + final val LBRACKET = 92; enter(LBRACKET, "'['") + final val RBRACKET = 93; enter(RBRACKET, "']'") + final val LBRACE = 94; enter(LBRACE, "'{'") + final val RBRACE = 95; enter(RBRACE, "'}'") + + /** XML mode */ + final val XMLSTART = 96; enter(XMLSTART, "$XMLSTART$<") // TODO: deprecate + + final val allTokens = tokenRange(minToken, maxToken) + + final val atomicExprTokens = literalTokens | identifierTokens | BitSet( + USCORE, NULL, THIS, SUPER, TRUE, FALSE, RETURN, XMLSTART) + + final val canStartExpressionTokens = atomicExprTokens | BitSet( + LBRACE, LPAREN, IF, DO, WHILE, FOR, NEW, TRY, THROW) + + final val canStartTypeTokens = literalTokens | identifierTokens | BitSet( + THIS, SUPER, USCORE, LPAREN, AT) + + final val templateIntroTokens = BitSet(CLASS, TRAIT, OBJECT, CASECLASS, CASEOBJECT) + + final val dclIntroTokens = BitSet(DEF, VAL, VAR, TYPE) + + final val defIntroTokens = templateIntroTokens | dclIntroTokens + + final val localModifierTokens = BitSet( + ABSTRACT, FINAL, SEALED, IMPLICIT, LAZY) + + final val modifierTokens = localModifierTokens | BitSet( + PRIVATE, PROTECTED, OVERRIDE) + + /** Is token only legal as start of statement (eof also included)? */ + final val mustStartStatTokens = defIntroTokens | modifierTokens | BitSet( + CASE, IMPORT, PACKAGE) + + final val canStartStatTokens = canStartExpressionTokens | mustStartStatTokens | BitSet( + AT) + + final val canEndStatTokens = atomicExprTokens | BitSet( + TYPE, RPAREN, RBRACE, RBRACKET) +} |