From 5b1da4217f7f36eab1ba14b5b95667de5bda09ed Mon Sep 17 00:00:00 2001 From: Tiark Rompf Date: Wed, 6 May 2009 14:51:51 +0000 Subject: packrat parsing --- .../util/parsing/combinator/PackratParsers.scala | 328 +++++++++++++++++++++ .../scala/util/parsing/combinator/Parsers.scala | 19 +- .../syntactical/StandardTokenParsers.scala | 6 + .../combinator/syntactical/StdTokenParsers.scala | 7 +- 4 files changed, 356 insertions(+), 4 deletions(-) create mode 100644 src/library/scala/util/parsing/combinator/PackratParsers.scala (limited to 'src') diff --git a/src/library/scala/util/parsing/combinator/PackratParsers.scala b/src/library/scala/util/parsing/combinator/PackratParsers.scala new file mode 100644 index 0000000000..4b8477eada --- /dev/null +++ b/src/library/scala/util/parsing/combinator/PackratParsers.scala @@ -0,0 +1,328 @@ +/* __ *\ +** ________ ___ / / ___ Scala API ** +** / __/ __// _ | / / / _ | (c) 2006-2009, LAMP/EPFL ** +** __\ \/ /__/ __ |/ /__/ __ | http://scala-lang.org/ ** +** /____/\___/_/ |_/____/_/ | | ** +** |/ ** +\* */ + +// $Id:$ + +package scala.util.parsing.combinator + +import scala.collection.mutable._ +import scala.util.parsing.combinator._ +import scala.util.parsing.input.Position +import scala.util.parsing.input.Reader + +/** + *

+ * PackratParsers is a component that extends the parser combinators + * provided by Parsers with a memoization facility + * (``Packrat Parsing''). + *

+ *

+ * Packrat Parsing is a technique for implementing backtracking, recursive-descent parsers with the
+ * advantage that it guarantees unlimited lookahead and linear parse time. Using this technique,
+ * left-recursive grammars can also be accepted.
+ *
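As a concrete illustration of the left-recursion claim above, here is a rough sketch written against this API; it is not part of this patch, and the object and rule names (Arith, expr, parse) are made up:

    import scala.util.parsing.combinator.PackratParsers
    import scala.util.parsing.combinator.syntactical.StandardTokenParsers

    object Arith extends StandardTokenParsers with PackratParsers {
      lexical.delimiters ++= List("+", "-")

      // A left-recursive production: with plain Parsers this would recurse forever,
      // but the seed-and-grow machinery below lets it terminate and yields the
      // expected left-associative result for input such as "5 - 2 - 1".
      lazy val expr: PackratParser[Int] =
          expr ~ ("+" ~> numericLit) ^^ { case acc ~ n => acc + n.toInt } |
          expr ~ ("-" ~> numericLit) ^^ { case acc ~ n => acc - n.toInt } |
          numericLit ^^ (_.toInt)

      def parse(s: String) = phrase(expr)(new lexical.Scanner(s))
    }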

+ *

+ * Using PackratParsers is very similar to using Parsers: + *
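Judging from the code in this file, the recipe is roughly: mix PackratParsers into any Parsers subclass, declare recursive productions as lazy vals, and ascribe them the type PackratParser[T] so that the implicit parser2packrat conversion defined below applies. A minimal, hypothetical sketch (Grammar and cond are illustrative names):

    import scala.util.parsing.combinator.PackratParsers
    import scala.util.parsing.combinator.syntactical.StandardTokenParsers

    object Grammar extends StandardTokenParsers with PackratParsers {
      lexical.reserved += "if"
      lexical.delimiters ++= List("(", ")")

      // the PackratParser[...] type ascription triggers parser2packrat, so the
      // production is memoized; lazy val allows it to refer to itself or to
      // productions that are defined later
      lazy val cond: PackratParser[String] = "if" ~> "(" ~> ident <~ ")"
    }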

+ *

+ *

+ * Cached parse results are attached to the input, not the grammar.
+ * Therefore, PackratParsers requires a PackratReader as input, which
+ * adds memoization to an underlying Reader. Programmers can create PackratReader
+ * objects manually, as in production(new PackratReader(new lexical.Scanner("input"))),
+ * but the more common way is to rely on the combinator phrase to wrap a given
+ * input in a PackratReader if the input is not one already.
+ *
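A hedged sketch of the two ways of supplying the memoizing reader described above, continuing the hypothetical Grammar object from the previous sketch (parseExplicit and parseViaPhrase are illustrative names):

    // inside the hypothetical Grammar object:
    def parseExplicit(s: String) =
      cond(new PackratReader(new lexical.Scanner(s)))  // wrap the reader by hand

    def parseViaPhrase(s: String) =
      phrase(cond)(new lexical.Scanner(s))             // phrase wraps plain readers itself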

+ * + * @see Bryan Ford: "Packrat Parsing: Simple, Powerful, Lazy, Linear Time." ICFP'02 + * @see Alessandro Warth, James R. Douglass, Todd Millstein: "Packrat Parsers Can Support Left Recursion." PEPM'08 + * + * @since 2.8 + * @author Manohar Jonnalagedda, Tiark Rompf + */ + +trait PackratParsers extends Parsers { + + //type Input = PackratReader[Elem] + + /** + * A specialized Reader class that wraps an underlying Reader + * and provides memoization of parse results. + */ + class PackratReader[+T](underlying: Reader[T]) extends Reader[T] { outer => + + /* + * caching of intermediate parse results and information about recursion + */ + + private[PackratParsers] val cache: HashMap[(Parser[_], Position), MemoEntry[_]] = HashMap.empty + + + private[PackratParsers] def getFromCache[T](p: Parser[T]): Option[MemoEntry[T]] = { + cache.get((p, pos)).asInstanceOf[Option[MemoEntry[T]]] + } + + private[PackratParsers] def updateCacheAndGet[T](p: Parser[T], w: MemoEntry[T]): MemoEntry[T] = { + cache.put((p, pos),w) + w + } + + /* a cache for storing parser heads: allows to know which parser is involved + in a recursion*/ + private[PackratParsers] val recursionHeads: HashMap[Position, Head] = HashMap.empty + + //a stack that keeps a list of all involved rules + private[PackratParsers] var lrStack: List[LR] = Nil + + + + + override def source: java.lang.CharSequence = underlying.source + override def offset: Int = underlying.offset + + def first: T = underlying.first + def rest: Reader[T] = new PackratReader(underlying.rest) { + override private[PackratParsers] val cache = outer.cache + override private[PackratParsers] val recursionHeads = outer.recursionHeads + lrStack = outer.lrStack + } + + def pos: Position = underlying.pos + def atEnd: Boolean = underlying.atEnd + } + + + /** + *

+ * A parser generator delimiting whole phrases (i.e. programs). + *

+ *

+ * Overridden to make sure any input passed to the argument parser + * is wrapped in a PackratReader. + *

+ */ + override def phrase[T](p: Parser[T]) = { + val q = super.phrase(p) + new PackratParser[T] { + def apply(in: Input) = in match { + case in: PackratReader[_] => q(in) + case in => p(new PackratReader(in)) + } + } + } + + + private def getPosFromResult(r: ParseResult[_]): Position = r.next.pos + + // auxiliary data structures + + private case class MemoEntry[+T](var r: Either[LR,ParseResult[_]]){ + def getResult: ParseResult[T] = r match { + case Left(LR(res,_,_)) => res + case Right(res) => res + } + } + + private case class LR(var seed: ParseResult[_], var rule: Parser[_], var head: Option[Head]){ + def getPos: Position = getPosFromResult(seed) + } + + private case class Head(var headParser: Parser[_], var involvedSet: List[Parser[_]], var evalSet: List[Parser[_]]){ + def getHead = headParser + } + + /** + * The root class of packrat parsers. + */ + abstract class PackratParser[+T] extends super.Parser[T] + + /** + * Implicitly convert a parser to a packrat parser. + * The conversion is triggered by giving the appropriate target type: + * val myParser: PackratParser[MyResult] = aParser + */ + implicit def parser2packrat[T](p: => super.Parser[T]): PackratParser[T] = { + lazy val q = p + memo(super.Parser {in => q(in)}) + } + + + /* + * An unspecified function that is called when a packrat reader is applied. + * It verifies whether we are in the process of growing a parse or not. + * In the former case, it makes sure that rules involved in the recursion are evaluated. + * It also prevents non-involved rules from getting evaluated further + */ + + private def recall(p: super.Parser[_], in: PackratReader[Elem]): Option[MemoEntry[_]] = { + val cached = in.getFromCache(p) + val head = in.recursionHeads.get(in.pos) + + head match { + case None => /*no heads*/ cached + case Some(h@Head(hp, involved, evalSet)) => { + //heads found + if(cached == None && !(hp::involved contains p)) { + //Nothing in the cache, and p is not involved + return Some(MemoEntry(Right(Failure("dummy ",in)))) + } + if(evalSet contains p){ + //something in cache, and p is in the evalSet + //remove the rule from the evalSet of the Head + h.evalSet = h.evalSet.remove(_==p) + val tempRes = p(in) + //we know that cached has an entry here + val tempEntry: MemoEntry[_] = cached.get // match {case Some(x: MemoEntry[_]) => x} + //cache is modified + tempEntry.r = Right(tempRes) + } + cached + } + } + } + + /* + * setting up the left-recursion. We have the LR for the rule head + * we modify the involvedSets of all LRs in the stack, till we see + * the current parser again + */ + private def setupLR(p: Parser[_], in: PackratReader[_], recDetect: LR): Unit = { + if(recDetect.head == None) recDetect.head = Some(Head(p, Nil, Nil)) + + in.lrStack.takeWhile(_.rule != p).foreach {x => + x.head = recDetect.head + recDetect.head.map(h => h.involvedSet = x.rule::h.involvedSet) + } + } + + /* + * growing, if needed the recursion + * check whether the parser we are growing is the head of the rule. + * Not => no grow + */ + + /* + * Once the result of the recall function is known, if it is nil, then we need to store a dummy +failure into the cache (much like in the previous listings) and compute the future parse. If it +is not, however, this means we have detected a recursion, and we use the setupLR function +to update each parser involved in the recursion. 
+ */ + + private def lrAnswer[T](p: Parser[T], in: PackratReader[Elem], growable: LR): ParseResult[T] = growable match { + //growable will always be having a head, we can't enter lrAnswer otherwise + case LR(seed ,rule, Some(head)) => + if(head.getHead != p) /*not head rule, so not growing*/ seed + else { + in.updateCacheAndGet(p, MemoEntry(Right[LR, ParseResult[T]](seed))) + seed match { + case f@Failure(_,_) => f + case e@Error(_,_) => e + case s@Success(_,_) => /*growing*/ grow(p, in, head) + } + } + case _=> throw new Exception("lrAnswer with no head !!") + } + + //p here should be strict (cannot be non-strict) !! + //failing left-recursive grammars: This is done by simply storing a failure if nothing is found + + /** + * Explicitly convert a given parser to a memoizing packrat parser. + * In most cases, client code should avoid calling memo directly + * and rely on implicit conversion instead. + */ + def memo[T](p: super.Parser[T]): PackratParser[T] = { + new PackratParser[T] { + def apply(in: Input) = { + /* + * transformed reader + */ + val inMem = in.asInstanceOf[PackratReader[Elem]] + + //look in the global cache if in a recursion + val m = recall(p, inMem) + m match { + //nothing has been done due to recall + case None => + val base = LR(Failure("Base Failure",in), p, None) + inMem.lrStack = base::inMem.lrStack + //cache base result + inMem.updateCacheAndGet(p,MemoEntry(Left(base))) + //parse the input + val tempRes = p(in) + //the base variable has passed equality tests with the cache + inMem.lrStack = inMem.lrStack.tail + //check whether base has changed, if yes, we will have a head + base.head match { + case None => + /*simple result*/ + inMem.updateCacheAndGet(p,MemoEntry(Right(tempRes))) + tempRes + case s@Some(_) => + /*non simple result*/ + base.seed = tempRes + //the base variable has passed equality tests with the cache + val res = lrAnswer(p, inMem, base) + res + } + + case Some(mEntry) => { + //entry found in cache + mEntry match { + case MemoEntry(Left(recDetect)) => { + setupLR(p, inMem, recDetect) + //all setupLR does is change the heads of the recursions, so the seed will stay the same + recDetect match {case LR(seed, _, _) => seed} + } + case MemoEntry(Right(res: ParseResult[T])) => res + } + } + } + } + } + } + + private def grow[T](p: super.Parser[T], rest: PackratReader[Elem], head: Head): ParseResult[T] = { + //store the head into the recursionHeads + rest.recursionHeads.put(rest.pos, head /*match {case Head(hp,involved,_) => Head(hp,involved,involved)}*/) + val oldRes: ParseResult[T] = rest.getFromCache(p).get match { + case MemoEntry(Right(x)) => x + case _ => throw new Exception("impossible match") + } + + //resetting the evalSet of the head of the recursion at each beginning of growth + head.evalSet = head.involvedSet + val tempRes = p(rest); tempRes match { + case s@Success(_,_) => + if(getPosFromResult(oldRes) < getPosFromResult(tempRes)) { + rest.updateCacheAndGet(p, MemoEntry(Right(s))) + grow(p, rest, head) + } else { + //we're done with growing, we can remove data from recursion head + rest.recursionHeads -= rest.pos + rest.getFromCache(p).get match { + case MemoEntry(Right(x: ParseResult[T])) => x + case _ => throw new Exception("impossible match") + } + } + case f => + rest.recursionHeads -= rest.pos + /*rest.updateCacheAndGet(p, MemoEntry(Right(f)));*/oldRes + } + } +} diff --git a/src/library/scala/util/parsing/combinator/Parsers.scala b/src/library/scala/util/parsing/combinator/Parsers.scala index 772202b548..29fbbebc28 100644 --- 
a/src/library/scala/util/parsing/combinator/Parsers.scala +++ b/src/library/scala/util/parsing/combinator/Parsers.scala @@ -14,7 +14,6 @@ import scala.util.parsing.input._ import scala.collection.mutable.{Map=>MutableMap} // TODO: better error handling (labelling like parsec's ) -// TODO: memoisation (like packrat parsers?) /**

* Parsers is a component that provides generic @@ -638,8 +637,9 @@ trait Parsers { * (and that only succeeds if `p' matches at least once). * The results of `p' are collected in a list. The results of `q' are discarded. */ - def rep1sep[T](p: => Parser[T], q: => Parser[Any]): Parser[List[T]] = - p ~ (q ~ rep1sep(p, q) ^^ { case x ~ y => y } | success(List())) ^^ { case x ~ y => x :: y } + def rep1sep[T](p : => Parser[T], q : => Parser[Any]): Parser[List[T]] = + p ~ rep(q ~> p) ^^ {case x~y => x::y} + /** A parser generator that, roughly, generalises the rep1sep generator so that `q', which parses the separator, * produces a left-associative function that combines the elements it separates. @@ -705,6 +705,19 @@ trait Parsers { } } + /** A parser generator for guard expressions. The resulting parser will fail or succeed + * just like the one given as parameter but it will not consume any input. + * + * @param p a `Parser' that is to be applied to the input + * @return A parser that returns success if and only if 'p' succeeds but never consumes any input + */ + def guard[T](p: => Parser[T]): Parser[T] = Parser { in => + p(in) match{ + case s@ Success(s1,_) => Success(s1, in) + case e => e + } + } + /** `positioned' decorates a parser's result with the start position of the input it consumed. * diff --git a/src/library/scala/util/parsing/combinator/syntactical/StandardTokenParsers.scala b/src/library/scala/util/parsing/combinator/syntactical/StandardTokenParsers.scala index 18fef4fa08..bed664d71e 100644 --- a/src/library/scala/util/parsing/combinator/syntactical/StandardTokenParsers.scala +++ b/src/library/scala/util/parsing/combinator/syntactical/StandardTokenParsers.scala @@ -21,4 +21,10 @@ import scala.util.parsing.combinator.lexical.StdLexical class StandardTokenParsers extends StdTokenParsers { type Tokens = StdTokens val lexical = new StdLexical + + //an implicit keyword function that gives a warning when a given word is not in the reserved/delimiters list + override implicit def keyword(chars : String): Parser[String] = + if(lexical.reserved.contains(chars) || lexical.delimiters.contains(chars)) super.keyword(chars) + else failure("You are trying to parse \""+chars+"\", but it is neither contained in the delimiters list, nor in the reserved keyword list of your lexical object") + } diff --git a/src/library/scala/util/parsing/combinator/syntactical/StdTokenParsers.scala b/src/library/scala/util/parsing/combinator/syntactical/StdTokenParsers.scala index edd4f286ca..42e910134b 100644 --- a/src/library/scala/util/parsing/combinator/syntactical/StdTokenParsers.scala +++ b/src/library/scala/util/parsing/combinator/syntactical/StdTokenParsers.scala @@ -12,6 +12,7 @@ package scala.util.parsing.combinator.syntactical import scala.util.parsing.syntax._ +import scala.collection.mutable.HashMap /** This component provides primitive parsers for the standard tokens defined in `StdTokens'. * @@ -21,12 +22,16 @@ trait StdTokenParsers extends TokenParsers { type Tokens <: StdTokens import lexical.{Keyword, NumericLit, StringLit, Identifier} + protected val keywordCache : HashMap[String, Parser[String]] = HashMap.empty + /** A parser which matches a single keyword token. * * @param chars The character string making up the matched keyword. 
* @return a `Parser' that matches the given string */ - implicit def keyword(chars: String): Parser[String] = accept(Keyword(chars)) ^^ (_.chars) +// implicit def keyword(chars: String): Parser[String] = accept(Keyword(chars)) ^^ (_.chars) + implicit def keyword(chars: String): Parser[String] = + keywordCache.getOrElseUpdate(chars, accept(Keyword(chars)) ^^ (_.chars)) /** A parser which matches a numeric literal */ def numericLit: Parser[String] = -- cgit v1.2.3
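The smaller changes above also merit a quick illustration. The new guard combinator in Parsers.scala is a positive-lookahead parser: it succeeds or fails exactly like its argument but never consumes input. The keyword override in StandardTokenParsers now fails with a descriptive message for literals that are neither reserved words nor delimiters, and StdTokenParsers caches keyword parsers so repeated literals reuse a single Parser instance. A rough, hypothetical sketch of guard in use (Stmts and stmt are made-up names):

    import scala.util.parsing.combinator.syntactical.StandardTokenParsers

    object Stmts extends StandardTokenParsers {
      lexical.reserved += "return"
      lexical.delimiters += "="

      // guard(ident) checks that an identifier comes next without consuming it,
      // so the assignment branch can still read that same identifier afterwards
      def stmt: Parser[Any] =
        guard(ident) ~> (ident ~ ("=" ~> numericLit)) |
        "return" ~> numericLit
    }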