diff options
Diffstat (limited to 'src/library/scala/util/matching')
-rw-r--r-- | src/library/scala/util/matching/Regex.scala | 197 |
1 files changed, 134 insertions, 63 deletions
diff --git a/src/library/scala/util/matching/Regex.scala b/src/library/scala/util/matching/Regex.scala index 6d3d015b1a..4822fe02b4 100644 --- a/src/library/scala/util/matching/Regex.scala +++ b/src/library/scala/util/matching/Regex.scala @@ -11,21 +11,14 @@ * with the main goal of pulling out information from those matches, or replacing * them with something else. * - * There are four classes and three objects, with most of them being members of - * Regex companion object. [[scala.util.matching.Regex]] is the class users instantiate - * to do regular expression matching. + * [[scala.util.matching.Regex]] is the class users instantiate to do regular expression matching. * - * The remaining classes and objects in the package are used in the following way: - * - * * The companion object to [[scala.util.matching.Regex]] just contains the other members. + * The companion object to [[scala.util.matching.Regex]] contains supporting members: * * [[scala.util.matching.Regex.Match]] makes more information about a match available. - * * [[scala.util.matching.Regex.MatchIterator]] is used to iterate over multiple matches. + * * [[scala.util.matching.Regex.MatchIterator]] is used to iterate over matched strings. * * [[scala.util.matching.Regex.MatchData]] is just a base trait for the above classes. * * [[scala.util.matching.Regex.Groups]] extracts group from a [[scala.util.matching.Regex.Match]] * without recomputing the match. - * * [[scala.util.matching.Regex.Match]] converts a [[scala.util.matching.Regex.Match]] - * into a [[java.lang.String]]. - * */ package scala.util.matching @@ -35,6 +28,7 @@ import java.util.regex.{ Pattern, Matcher } /** A regular expression is used to determine whether a string matches a pattern * and, if it does, to extract or transform the parts that match. * + * === Usage === * This class delegates to the [[java.util.regex]] package of the Java Platform. * See the documentation for [[java.util.regex.Pattern]] for details about * the regular expression syntax for pattern strings. @@ -47,12 +41,15 @@ import java.util.regex.{ Pattern, Matcher } * implicitly for strings: * * {{{ - * val date = """(\d\d\d\d)-(\d\d)-(\d\d)""".r + * val date = raw"(\d{4})-(\d{2})-(\d{2})".r * }}} * * Since escapes are not processed in multi-line string literals, using triple quotes * avoids having to escape the backslash character, so that `"\\d"` can be written `"""\d"""`. + * The same result is achieved with certain interpolators, such as `raw"\d".r` or + * a custom interpolator `r"\d"` that also compiles the `Regex`. * + * === Extraction === * To extract the capturing groups when a `Regex` is matched, use it as * an extractor in a pattern match: * @@ -92,48 +89,80 @@ import java.util.regex.{ Pattern, Matcher } * } * }}} * + * === Find Matches === * To find or replace matches of the pattern, use the various find and replace methods. - * There is a flavor of each method that produces matched strings and - * another that produces `Match` objects. + * For each method, there is a version for working with matched strings and + * another for working with `Match` objects. * * For example, pattern matching with an unanchored `Regex`, as in the previous example, - * is the same as using `findFirstMatchIn`, except that the findFirst methods return an `Option`, - * or `None` for no match: + * can also be accomplished using `findFirstMatchIn`. The `findFirst` methods return an `Option` + * which is non-empty if a match is found, or `None` for no match: * * {{{ * val dates = "Important dates in history: 2004-01-20, 1958-09-05, 2010-10-06, 2011-07-15" - * val firstDate = date findFirstIn dates getOrElse "No date found." - * val firstYear = for (m <- date findFirstMatchIn dates) yield m group 1 + * val firstDate = date.findFirstIn(dates).getOrElse("No date found.") + * val firstYear = for (m <- date.findFirstMatchIn(dates)) yield m.group(1) * }}} * * To find all matches: * * {{{ - * val allYears = for (m <- date findAllMatchIn dates) yield m group 1 + * val allYears = for (m <- date.findAllMatchIn(dates)) yield m.group(1) + * }}} + * + * To iterate over the matched strings, use `findAllIn`, which returns a special iterator + * that can be queried for the `MatchData` of the last match: + * + * {{{ + * val mi = date.findAllIn(dates) + * while (mi.hasNext) { + * val d = mi.next + * if (mi.group(1).toInt < 1960) println(s"$d: An oldie but goodie.") + * } * }}} * - * But `findAllIn` returns a special iterator of strings that can be queried for the `MatchData` - * of the last match: + * Although the `MatchIterator` returned by `findAllIn` is used like any `Iterator`, + * with alternating calls to `hasNext` and `next`, `hasNext` has the additional + * side effect of advancing the underlying matcher to the next unconsumed match. + * This effect is visible in the `MatchData` representing the "current match". * * {{{ - * val mi = date findAllIn dates - * val oldies = mi filter (_ => (mi group 1).toInt < 1960) map (s => s"$s: An oldie but goodie.") + * val r = "(ab+c)".r + * val s = "xxxabcyyyabbczzz" + * r.findAllIn(s).start // 3 + * val mi = r.findAllIn(s) + * mi.hasNext // true + * mi.start // 3 + * mi.next() // "abc" + * mi.start // 3 + * mi.hasNext // true + * mi.start // 9 + * mi.next() // "abbc" * }}} * + * The example shows that methods on `MatchData` such as `start` will advance to + * the first match, if necessary. It also shows that `hasNext` will advance to + * the next unconsumed match, if `next` has already returned the current match. + * + * The current `MatchData` can be captured using the `matchData` method. + * Alternatively, `findAllMatchIn` returns an `Iterator[Match]`, where there + * is no interaction between the iterator and `Match` objects it has already produced. + * * Note that `findAllIn` finds matches that don't overlap. (See [[findAllIn]] for more examples.) * * {{{ - * val num = """(\d+)""".r - * val all = (num findAllIn "123").toList // List("123"), not List("123", "23", "3") + * val num = raw"(\d+)".r + * val all = num.findAllIn("123").toList // List("123"), not List("123", "23", "3") * }}} * + * === Replace Text === * Text replacement can be performed unconditionally or as a function of the current match: * * {{{ - * val redacted = date replaceAllIn (dates, "XXXX-XX-XX") - * val yearsOnly = date replaceAllIn (dates, m => m group 1) - * val months = (0 to 11) map { i => val c = Calendar.getInstance; c.set(2014, i, 1); f"$c%tb" } - * val reformatted = date replaceAllIn (dates, _ match { case date(y,m,d) => f"${months(m.toInt - 1)} $d, $y" }) + * val redacted = date.replaceAllIn(dates, "XXXX-XX-XX") + * val yearsOnly = date.replaceAllIn(dates, m => m.group(1)) + * val months = (0 to 11).map { i => val c = Calendar.getInstance; c.set(2014, i, 1); f"$c%tb" } + * val reformatted = date.replaceAllIn(dates, _ match { case date(y,m,d) => f"${months(m.toInt - 1)} $d, $y" }) * }}} * * Pattern matching the `Match` against the `Regex` that created it does not reapply the `Regex`. @@ -142,7 +171,7 @@ import java.util.regex.{ Pattern, Matcher } * * {{{ * val docSpree = """2011(?:-\d{2}){2}""".r - * val docView = date replaceAllIn (dates, _ match { + * val docView = date.replaceAllIn(dates, _ match { * case docSpree() => "Historic doc spree!" * case _ => "Something else happened" * }) @@ -182,6 +211,9 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends * val namedYears = for (m <- namedDate findAllMatchIn dates) yield m group "year" * }}} * + * Group names supplied to the constructor are preferred to inline group names + * when retrieving matched groups by name. Not all platforms support inline names. + * * This constructor does not support options as flags, which must be * supplied as inline flags in the pattern string: `(?idmsux-idmsux)`. * @@ -305,7 +337,7 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends * @param target The string to match * @return The matches */ - @deprecated("Extracting a match result from anything but a CharSequence or Match is deprecated", "2.11.0") + @deprecated("extracting a match result from anything but a CharSequence or Match is deprecated", "2.11.0") def unapplySeq(target: Any): Option[List[String]] = target match { case s: CharSequence => val m = pattern matcher s @@ -318,16 +350,16 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends // @see UnanchoredRegex protected def runMatcher(m: Matcher) = m.matches() - /** Return all non-overlapping matches of this `Regex` in the given character + /** Return all non-overlapping matches of this `Regex` in the given character * sequence as a [[scala.util.matching.Regex.MatchIterator]], * which is a special [[scala.collection.Iterator]] that returns the * matched strings but can also be queried for more data about the last match, * such as capturing groups and start position. - * + * * A `MatchIterator` can also be converted into an iterator * that returns objects of type [[scala.util.matching.Regex.Match]], * such as is normally returned by `findAllMatchIn`. - * + * * Where potential matches overlap, the first possible match is returned, * followed by the next match that follows the input consumed by the * first match: @@ -335,8 +367,8 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends * {{{ * val hat = "hat[^a]+".r * val hathaway = "hathatthattthatttt" - * val hats = (hat findAllIn hathaway).toList // List(hath, hattth) - * val pos = (hat findAllMatchIn hathaway map (_.start)).toList // List(0, 7) + * val hats = hat.findAllIn(hathaway).toList // List(hath, hattth) + * val pos = hat.findAllMatchIn(hathaway).map(_.start).toList // List(0, 7) * }}} * * To return overlapping matches, it is possible to formulate a regular expression @@ -344,13 +376,13 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends * * {{{ * val madhatter = "(h)(?=(at[^a]+))".r - * val madhats = (madhatter findAllMatchIn hathaway map { + * val madhats = madhatter.findAllMatchIn(hathaway).map { * case madhatter(x,y) => s"$x$y" - * }).toList // List(hath, hatth, hattth, hatttt) + * }.toList // List(hath, hatth, hattth, hatttt) * }}} * - * Attempting to retrieve match information before performing the first match - * or after exhausting the iterator results in [[java.lang.IllegalStateException]]. + * Attempting to retrieve match information after exhausting the iterator + * results in [[java.lang.IllegalStateException]]. * See [[scala.util.matching.Regex.MatchIterator]] for details. * * @param source The text to match against. @@ -578,6 +610,9 @@ object Regex { */ trait MatchData { + /** Basically, wraps a platform Matcher. */ + protected def matcher: Matcher + /** The source from which the match originated */ val source: CharSequence @@ -650,16 +685,25 @@ object Regex { private lazy val nameToIndex: Map[String, Int] = Map[String, Int]() ++ ("" :: groupNames.toList).zipWithIndex - /** Returns the group with given name. + /** Returns the group with the given name. + * + * Uses explicit group names when supplied; otherwise, + * queries the underlying implementation for inline named groups. + * Not all platforms support inline group names. * * @param id The group name * @return The requested group - * @throws NoSuchElementException if the requested group name is not defined + * @throws IllegalArgumentException if the requested group name is not defined */ - def group(id: String): String = nameToIndex.get(id) match { - case None => throw new NoSuchElementException("group name "+id+" not defined") - case Some(index) => group(index) - } + def group(id: String): String = ( + if (groupNames.isEmpty) + matcher group id + else + nameToIndex.get(id) match { + case Some(index) => group(index) + case None => matcher group id + } + ) /** The matched string; equivalent to `matched.toString`. */ override def toString = matched @@ -667,7 +711,7 @@ object Regex { /** Provides information about a successful match. */ class Match(val source: CharSequence, - private[matching] val matcher: Matcher, + protected[matching] val matcher: Matcher, val groupNames: Seq[String]) extends MatchData { /** The index of the first matched character. */ @@ -728,11 +772,13 @@ object Regex { /** A class to step through a sequence of regex matches. * - * All methods inherited from [[scala.util.matching.Regex.MatchData]] will throw - * a [[java.lang.IllegalStateException]] until the matcher is initialized. The - * matcher can be initialized by calling `hasNext` or `next()` or causing these - * methods to be called, such as by invoking `toString` or iterating through - * the iterator's elements. + * This is an iterator that returns the matched strings. + * + * Queries about match data pertain to the current state of the underlying + * matcher, which is advanced by calling `hasNext` or `next`. + * + * When matches are exhausted, queries about match data will throw + * [[java.lang.IllegalStateException]]. * * @see [[java.util.regex.Matcher]] */ @@ -740,37 +786,62 @@ object Regex { extends AbstractIterator[String] with Iterator[String] with MatchData { self => protected[Regex] val matcher = regex.pattern.matcher(source) - private var nextSeen = false - /** Is there another match? */ + // 0 = not yet matched, 1 = matched, 2 = advanced to match, 3 = no more matches + private[this] var nextSeen = 0 + + /** Return true if `next` will find a match. + * As a side effect, advance the underlying matcher if necessary; + * queries about the current match data pertain to the underlying matcher. + */ def hasNext: Boolean = { - if (!nextSeen) nextSeen = matcher.find() - nextSeen + nextSeen match { + case 0 => nextSeen = if (matcher.find()) 1 else 3 + case 1 => () + case 2 => nextSeen = 0 ; hasNext + case 3 => () + } + nextSeen == 1 // otherwise, 3 } - /** The next matched substring of `source`. */ + /** The next matched substring of `source`. + * As a side effect, advance the underlying matcher if necessary. + */ def next(): String = { - if (!hasNext) throw new NoSuchElementException - nextSeen = false + nextSeen match { + case 0 => if (!hasNext) throw new NoSuchElementException ; next() + case 1 => nextSeen = 2 + case 2 => nextSeen = 0 ; next() + case 3 => throw new NoSuchElementException + } matcher.group } + /** Report emptiness. */ override def toString = super[AbstractIterator].toString + // ensure we're at a match + private[this] def ensure(): Unit = nextSeen match { + case 0 => if (!hasNext) throw new IllegalStateException + case 1 => () + case 2 => () + case 3 => throw new IllegalStateException + } + /** The index of the first matched character. */ - def start: Int = matcher.start + def start: Int = { ensure() ; matcher.start } /** The index of the first matched character in group `i`. */ - def start(i: Int): Int = matcher.start(i) + def start(i: Int): Int = { ensure() ; matcher.start(i) } /** The index of the last matched character. */ - def end: Int = matcher.end + def end: Int = { ensure() ; matcher.end } /** The index following the last matched character in group `i`. */ - def end(i: Int): Int = matcher.end(i) + def end(i: Int): Int = { ensure() ; matcher.end(i) } /** The number of subgroups. */ - def groupCount = matcher.groupCount + def groupCount = { ensure() ; matcher.groupCount } /** Convert to an iterator that yields MatchData elements instead of Strings. */ def matchData: Iterator[Match] = new AbstractIterator[Match] { |