From 0e26910372d349c6ff7bbaa17fc8fe0bf568c5fe Mon Sep 17 00:00:00 2001 From: Som Snytt Date: Fri, 8 Aug 2014 22:19:59 -0700 Subject: SI-8787 Update doc for Regex Simplify the exposition and update the examples. Emphasize pattern matching, especially using `r.unanchored` instead of `for (r(x,y,z) <- r findFirstIn text)`. Certain details are moved to method docs. It would be nice to fix matching package doc, but the doc must attach to the package object, it seems. Introducing a package object is not binary-compatible. Includes a doc line edit on 2.12, anticipating the merge. --- src/library/scala/util/matching/Regex.scala | 309 ++++++++++++++++------------ 1 file changed, 175 insertions(+), 134 deletions(-) diff --git a/src/library/scala/util/matching/Regex.scala b/src/library/scala/util/matching/Regex.scala index 3d77105a1e..3fdd256b86 100644 --- a/src/library/scala/util/matching/Regex.scala +++ b/src/library/scala/util/matching/Regex.scala @@ -6,7 +6,6 @@ ** |/ ** \* */ - /** * This package is concerned with regular expression (regex) matching against strings, * with the main goal of pulling out information from those matches, or replacing @@ -28,117 +27,109 @@ * into a [[java.lang.String]]. * */ -package scala -package util.matching +package scala.util.matching import scala.collection.AbstractIterator import java.util.regex.{ Pattern, Matcher } -/** This class provides methods for creating and using regular expressions. - * It is based on the regular expressions of the JDK since 1.4. +/** A regular expression is used to determine whether a string matches a pattern + * and, if it does, to extract or transform the parts that match. + * + * This class delegates to the [[java.util.regex]] package of the Java Platform. + * See the documentation for [[java.util.regex.Pattern]] for details about + * the regular expression syntax for pattern strings. * - * Its main goal is to extract strings that match a pattern, or the subgroups - * that make it up. For that reason, it is usually used with for comprehensions - * and matching (see methods for examples). + * An instance of `Regex` represents a compiled regular expression pattern. + * Since compilation is expensive, frequently used `Regex`es should be constructed + * once, outside of loops and perhaps in a companion object. * - * A Regex is created from a [[java.lang.String]] representation of the - * regular expression pattern^1^. That pattern is compiled - * during construction, so frequently used patterns should be declared outside - * loops if performance is of concern. Possibly, they might be declared on a - * companion object, so that they need only to be initialized once. + * The canonical way to create a `Regex` is by using the method `r`, provided + * implicitly for strings: * - * The canonical way of creating regex patterns is by using the method `r`, provided - * on [[java.lang.String]] through an implicit conversion into - * [[scala.collection.immutable.WrappedString]]. Using triple quotes to write these - * strings avoids having to quote the backslash character (`\`). + * {{{ + * val date = """(\d\d\d\d)-(\d\d)-(\d\d)""".r + * }}} * - * Using the constructor directly, on the other hand, makes - * it possible to declare names for subgroups in the pattern. + * Since escapes are not processed in multi-line string literals, using triple quotes + * avoids having to escape the backslash character, so that `"\\d"` can be written `"""\d"""`. * - * For example, both declarations below generate the same regex, but the second - * one associate names with the subgroups. + * To extract the capturing groups when a `Regex` is matched, use it as + * an extractor in a pattern match: * * {{{ - * val dateP1 = """(\d\d\d\d)-(\d\d)-(\d\d)""".r - * val dateP2 = new scala.util.matching.Regex("""(\d\d\d\d)-(\d\d)-(\d\d)""", "year", "month", "day") + * "2004-01-20" match { + * case date(year, month, day) => s"$year was a good year for PLs." + * } * }}} * - * There are two ways of using a `Regex` to find a pattern: calling methods on - * Regex, such as `findFirstIn` or `findAllIn`, or using it as an extractor in a - * pattern match. - * - * Note that, when calling `findAllIn`, the resulting [[scala.util.matching.Regex.MatchIterator]] - * needs to be initialized (by calling `hasNext` or `next()`, or causing these to be - * called) before information about a match can be retrieved: + * To check only whether the `Regex` matches, ignoring any groups: * * {{{ - * val msg = "I love Scala" + * "2004-01-20" match { + * case date(_*) => "It's a date!" + * } + * }}} * - * // val start = " ".r.findAllIn(msg).start // throws an IllegalStateException + * In a pattern match, `Regex` normally matches the entire input. + * However, an unanchored `Regex` finds the pattern anywhere + * in the input. * - * val matches = " ".r.findAllIn(msg) - * matches.hasNext // initializes the matcher - * val start = matches.start + * {{{ + * val embeddedDate = date.unanchored + * "Date: 2004-01-20 17:25:18 GMT (10 years, 28 weeks, 5 days, 17 hours and 51 minutes ago)" match { + * case embeddedDate("2004", "01", "20") => "A Scala is born." + * } * }}} * - * When Regex is used as an extractor in a pattern match, note that it - * only succeeds if the whole text can be matched. For this reason, one usually - * calls a method to find the matching substrings, and then use it as an extractor - * to break match into subgroups. + * To find or replace matches of the pattern, use the various find and replace methods. + * There is a flavor of each method that produces matched strings and + * another that produces `Match` objects. * - * As an example, the above patterns can be used like this: + * For example, pattern matching with an unanchored `Regex`, as in the previous example, + * is the same as using `findFirstMatchIn`, except that the findFirst methods return an `Option`, + * or `None` for no match: * * {{{ - * val dateP1(year, month, day) = "2011-07-15" + * val dates = "Important dates in history: 2004-01-20, 1958-09-05, 2010-10-06, 2011-07-15" + * val firstDate = date findFirstIn dates getOrElse "No date found." + * val firstYear = for (m <- date findFirstMatchIn dates) yield m group 1 + * }}} * - * // val dateP1(year, month, day) = "Date 2011-07-15" // throws an exception at runtime + * To find all matches: * - * val copyright: String = dateP1 findFirstIn "Date of this document: 2011-07-15" match { - * case Some(dateP1(year, month, day)) => "Copyright "+year - * case None => "No copyright" - * } + * {{{ + * val allYears = for (m <- date findAllMatchIn dates) yield m group 1 + * }}} * - * val copyright: Option[String] = for { - * dateP1(year, month, day) <- dateP1 findFirstIn "Last modified 2011-07-15" - * } yield year + * But `findAllIn` returns a special iterator of strings that can be queried for the `MatchData` + * of the last match: * - * def getYears(text: String): Iterator[String] = for (dateP1(year, _, _) <- dateP1 findAllIn text) yield year - * def getFirstDay(text: String): Option[String] = for (m <- dateP2 findFirstMatchIn text) yield m group "day" + * {{{ + * val mi = date findAllIn dates + * val oldies = mi filter (_ => (mi group 1).toInt < 1960) map (s => s"$s: An oldie but goodie.") * }}} * - * Regex does not provide a method that returns a [[scala.Boolean]]. One can - * use [[java.lang.String]] `matches` method, or, if `Regex` is preferred, - * either ignore the return value or test the `Option` for emptyness. For example: + * Text replacement can be performed unconditionally or as a function of the current match: * * {{{ - * def hasDate(text: String): Boolean = (dateP1 findFirstIn text).nonEmpty - * def printLinesWithDates(lines: Traversable[String]) { - * lines foreach { line => - * dateP1 findFirstIn line foreach { _ => println(line) } - * } - * } + * val redacted = date replaceAllIn (dates, "XXXX-XX-XX") + * val yearsOnly = date replaceAllIn (dates, m => m group 1) + * val months = (0 to 11) map { i => val c = Calendar.getInstance; c.set(2014, i, 1); f"$c%tb" } + * val reformatted = date replaceAllIn (dates, _ match { case date(y,m,d) => f"${months(m.toInt - 1)} $d, $y" }) * }}} * - * There are also methods that can be used to replace the patterns - * on a text. The substitutions can be simple replacements, or more - * complex functions. For example: + * Pattern matching the `Match` against the `Regex` that created it does not reapply the `Regex`. + * In the expression for `reformatted`, each `date` match is computed once. But it is possible to apply a + * `Regex` to a `Match` resulting from a different pattern: * * {{{ - * val months = Map( 1 -> "Jan", 2 -> "Feb", 3 -> "Mar", - * 4 -> "Apr", 5 -> "May", 6 -> "Jun", - * 7 -> "Jul", 8 -> "Aug", 9 -> "Sep", - * 10 -> "Oct", 11 -> "Nov", 12 -> "Dec") - * - * import scala.util.matching.Regex.Match - * def reformatDate(text: String) = dateP2 replaceAllIn ( text, (m: Match) => - * "%s %s, %s" format (months(m group "month" toInt), m group "day", m group "year") - * ) + * val docSpree = """2011(?:-\d{2}){2}""".r + * val docView = date replaceAllIn (dates, _ match { + * case docSpree() => "Historic doc spree!" + * case _ => "Something else happened" + * }) * }}} * - * You can use special pattern syntax constructs like `(?idmsux-idmsux)`¹ to switch - * various regex compilation options like `CASE_INSENSITIVE` or `UNICODE_CASE`. - * - * @note ¹ A detailed description is available in [[java.util.regex.Pattern]]. * @see [[java.util.regex.Pattern]] * * @author Thibaud Hottelier @@ -155,8 +146,7 @@ import java.util.regex.{ Pattern, Matcher } * 1 through 9 corresponding to the first nine groups, and 0 standing for the * whole match. Any other character is an error. The backslash (`\`) character * will be interpreted as an escape character and can be used to escape the - * dollar sign. One can use [[scala.util.matching.Regex]]'s `quoteReplacement` - * to automatically escape these characters. + * dollar sign. Use `Regex.quoteReplacement` to escape these characters. */ @SerialVersionUID(-2094783597747625537L) class Regex private[matching](val pattern: Pattern, groupNames: String*) extends Serializable { @@ -164,20 +154,33 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends import Regex._ - /** - * @param regex A string representing a regular expression - * @param groupNames A mapping from names to indices in capture groups - */ + /** Compile a regular expression, supplied as a string, into a pattern that + * can be matched against inputs. + * + * If group names are supplied, they can be used this way: + * + * {{{ + * val namedDate = new Regex("""(\d\d\d\d)-(\d\d)-(\d\d)""", "year", "month", "day") + * val namedYears = for (m <- namedDate findAllMatchIn dates) yield m group "year" + * }}} + * + * This constructor does not support options as flags, which must be + * supplied as inline flags in the pattern string: `(?idmsux-idmsux)`. + * + * @param regex The regular expression to compile. + * @param groupNames Names of capturing groups. + */ def this(regex: String, groupNames: String*) = this(Pattern.compile(regex), groupNames: _*) /** Tries to match a [[java.lang.CharSequence]]. + * * If the match succeeds, the result is a list of the matching * groups (or a `null` element if a group did not match any input). * If the pattern specifies no groups, then the result will be an empty list * on a successful match. * * This method attempts to match the entire input by default; to find the next - * matching subsequence, use an unanchored Regex. + * matching subsequence, use an unanchored `Regex`. * * For example: * @@ -188,6 +191,10 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends * case _ => false * } * val p2 = "a(b*)c".r + * val p2Matches = "abbbc" match { + * case p2(_*) => true + * case _ => false + * } * val numberOfB = "abbbc" match { * case p2(b) => Some(b.length) * case _ => None @@ -211,6 +218,7 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends } /** Tries to match the String representation of a [[scala.Char]]. + * * If the match succeeds, the result is the first matching * group if any groups are defined, or an empty Sequence otherwise. * @@ -249,8 +257,11 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends } /** Tries to match on a [[scala.util.matching.Regex.Match]]. + * * A previously failed match results in None. + * * If a successful match was made against the current pattern, then that result is used. + * * Otherwise, this Regex is applied to the previously matched input, * and the result of that match is used. */ @@ -276,25 +287,43 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends // @see UnanchoredRegex protected def runMatcher(m: Matcher) = m.matches() - /** Return all non-overlapping matches of this regexp in given character + /** Return all non-overlapping matches of this `Regex` in the given character * sequence as a [[scala.util.matching.Regex.MatchIterator]], * which is a special [[scala.collection.Iterator]] that returns the - * matched strings, but can also be converted into a normal iterator - * that returns objects of type [[scala.util.matching.Regex.Match]] - * that can be queried for data such as the text that precedes the - * match, subgroups, etc. + * matched strings but can also be queried for more data about the last match, + * such as capturing groups and start position. + * + * A `MatchIterator` can also be converted into an iterator + * that returns objects of type [[scala.util.matching.Regex.Match]], + * such as is normally returned by `findAllMatchIn`. * * Where potential matches overlap, the first possible match is returned, - * followed by the next match that is completely after the first. For - * instance, `"hat[^a]+".r` will match `hath` and `hattth` in the string - * `"hathatthattthatttt"`. + * followed by the next match that follows the input consumed by the + * first match: + * + * {{{ + * val hat = "hat[^a]+".r + * val hathaway = "hathatthattthatttt" + * val hats = (hat findAllIn hathaway).toList // List(hath, hattth) + * val pos = (hat findAllMatchIn hathaway map (_.start)).toList // List(0, 7) + * }}} * - * Attempting to retrieve information about a match before initializing - * the iterator can result in [[java.lang.IllegalStateException]]s. See - * [[scala.util.matching.Regex.MatchIterator]] for details. + * To return overlapping matches, it is possible to formulate a regular expression + * that does not consume the overlapping region. + * + * {{{ + * val madhatter = "(h)(?=(at[^a]+))".r + * val madhats = (madhatter findAllMatchIn hathaway map { + * case madhatter(x,y) => s"$x$y" + * }).toList // List(hath, hatth, hattth, hatttt) + * }}} + * + * Attempting to retrieve match information before performing the first match + * or after exhausting the iterator results in [[java.lang.IllegalStateException]]. + * See [[scala.util.matching.Regex.MatchIterator]] for details. * * @param source The text to match against. - * @return A [[scala.util.matching.Regex.MatchIterator]] of all matches. + * @return A [[scala.util.matching.Regex.MatchIterator]] of matched substrings. * @example {{{for (words <- """\w+""".r findAllIn "A simple example.") yield words}}} */ def findAllIn(source: CharSequence) = new Regex.MatchIterator(source, this, groupNames) @@ -317,8 +346,8 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends } } - /** Return optionally first matching string of this regexp in given character sequence, - * or None if it does not exist. + /** Return an optional first matching string of this `Regex` in the given character sequence, + * or None if there is no match. * * @param source The text to match against. * @return An [[scala.Option]] of the first matching string in the text. @@ -329,13 +358,11 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends if (m.find) Some(m.group) else None } - /** Return optionally first match of this regexp in given character sequence, + /** Return an optional first match of this `Regex` in the given character sequence, * or None if it does not exist. * - * The main difference between this method and `findFirstIn` is that the (optional) return - * type for this is [[scala.util.matching.Regex.Match]], through which more - * data can be obtained about the match, such as the strings that precede and follow it, - * or subgroups. + * If the match is successful, the [[scala.util.matching.Regex.Match]] can be queried for + * more data. * * @param source The text to match against. * @return A [[scala.Option]] of [[scala.util.matching.Regex.Match]] of the first matching string in the text. @@ -346,30 +373,28 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends if (m.find) Some(new Match(source, m, groupNames)) else None } - /** Return optionally match of this regexp at the beginning of the - * given character sequence, or None if regexp matches no prefix + /** Return an optional match of this `Regex` at the beginning of the + * given character sequence, or None if it matches no prefix * of the character sequence. * - * The main difference from this method to `findFirstIn` is that this - * method will not return any matches that do not begin at the start - * of the text being matched against. + * Unlike `findFirstIn`, this method will only return a match at + * the beginning of the input. * * @param source The text to match against. * @return A [[scala.Option]] of the matched prefix. - * @example {{{"""[a-z]""".r findPrefixOf "A simple example." // returns None, since the text does not begin with a lowercase letter}}} + * @example {{{"""\p{Lower}""".r findPrefixOf "A simple example." // returns None, since the text does not begin with a lowercase letter}}} */ def findPrefixOf(source: CharSequence): Option[String] = { val m = pattern.matcher(source) if (m.lookingAt) Some(m.group) else None } - /** Return optionally match of this regexp at the beginning of the - * given character sequence, or None if regexp matches no prefix + /** Return an optional match of this `Regex` at the beginning of the + * given character sequence, or None if it matches no prefix * of the character sequence. * - * The main difference from this method to `findFirstMatchIn` is that - * this method will not return any matches that do not begin at the - * start of the text being matched against. + * Unlike `findFirstMatchIn`, this method will only return a match at + * the beginning of the input. * * @param source The text to match against. * @return A [[scala.Option]] of the [[scala.util.matching.Regex.Match]] of the matched string. @@ -403,7 +428,7 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends * import scala.util.matching.Regex * val datePattern = new Regex("""(\d\d\d\d)-(\d\d)-(\d\d)""", "year", "month", "day") * val text = "From 2011-07-15 to 2011-07-17" - * val repl = datePattern replaceAllIn (text, m => m.group("month")+"/"+m.group("day")) + * val repl = datePattern replaceAllIn (text, m => s"${m group "month"}/${m group "day"}") * }}} * * $replacementString @@ -426,10 +451,10 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends * {{{ * import scala.util.matching.Regex._ * - * val map = Map("x" -> "a var", "y" -> """some $ and \ signs""") + * val vars = Map("x" -> "a var", "y" -> """some $ and \ signs""") * val text = "A text with variables %x, %y and %z." * val varPattern = """%(\w+)""".r - * val mapper = (m: Match) => map get (m group 1) map (quoteReplacement(_)) + * val mapper = (m: Match) => vars get (m group 1) map (quoteReplacement(_)) * val repl = varPattern replaceSomeIn (text, mapper) * }}} * @@ -470,17 +495,25 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends pattern.split(toSplit) /** Create a new Regex with the same pattern, but no requirement that - * the entire String matches in extractor patterns. For instance, the strings - * shown below lead to successful matches, where they would not otherwise. + * the entire String matches in extractor patterns. + * + * Normally, matching on `date` behaves as though the pattern were + * enclosed in anchors, `"^pattern$"`. + * + * The unanchored `Regex` behaves as though those anchors were removed. + * + * Note that this method does not actually strip any matchers from the pattern. + * + * Calling `anchored` returns the original `Regex`. * * {{{ - * val dateP1 = """(\d\d\d\d)-(\d\d)-(\d\d)""".r.unanchored + * val date = """(\d\d\d\d)-(\d\d)-(\d\d)""".r.unanchored * - * val dateP1(year, month, day) = "Date 2011-07-15" + * val date(year, month, day) = "Date 2011-07-15" // OK * * val copyright: String = "Date of this document: 2011-07-15" match { - * case dateP1(year, month, day) => "Copyright "+year - * case _ => "No copyright" + * case date(year, month, day) => s"Copyright $year" // OK + * case _ => "No copyright" * } * }}} * @@ -495,6 +528,10 @@ class Regex private[matching](val pattern: Pattern, groupNames: String*) extends override def toString = regex } +/** A [[Regex]] that finds the first match when used in a pattern match. + * + * @see [[Regex#unanchored]] + */ trait UnanchoredRegex extends Regex { override protected def runMatcher(m: Matcher) = m.find() override def unanchored = this @@ -510,30 +547,34 @@ object Regex { */ trait MatchData { - /** The source from where the match originated */ + /** The source from which the match originated */ val source: CharSequence - /** The names of the groups, or some empty sequence if one defined */ + /** The names of the groups, or an empty sequence if none defined */ val groupNames: Seq[String] - /** The number of subgroups in the pattern (not all of these need to match!) */ + /** The number of capturing groups in the pattern. + * (For a given successful match, some of those groups may not have matched any input.) + */ def groupCount: Int /** The index of the first matched character, or -1 if nothing was matched */ def start: Int /** The index of the first matched character in group `i`, - * or -1 if nothing was matched for that group */ + * or -1 if nothing was matched for that group. + */ def start(i: Int): Int - /** The index of the last matched character, or -1 if nothing was matched */ + /** The index following the last matched character, or -1 if nothing was matched. */ def end: Int /** The index following the last matched character in group `i`, - * or -1 if nothing was matched for that group */ + * or -1 if nothing was matched for that group. + */ def end(i: Int): Int - /** The matched string, or `null` if nothing was matched */ + /** The matched string, or `null` if nothing was matched. */ def matched: String = if (start >= 0) source.subSequence(start, end).toString else null @@ -544,7 +585,7 @@ object Regex { if (start(i) >= 0) source.subSequence(start(i), end(i)).toString else null - /** All matched subgroups, i.e. not including group(0) */ + /** All capturing groups, i.e., not including group(0). */ def subgroups: List[String] = (1 to groupCount).toList map group /** The char sequence before first character of match, @@ -636,15 +677,15 @@ object Regex { def unapply(m: Match): Some[String] = Some(m.matched) } - /** An extractor object that yields the groups in the match. Using an extractor - * rather than the original regex avoids recomputing the match. + /** An extractor object that yields the groups in the match. Using this extractor + * rather than the original `Regex` ensures that the match is not recomputed. * * {{{ * import scala.util.matching.Regex.Groups * - * val datePattern = """(\d\d\d\d)-(\d\d)-(\d\d)""".r + * val date = """(\d\d\d\d)-(\d\d)-(\d\d)""".r * val text = "The doc spree happened on 2011-07-15." - * val day = datePattern replaceAllIn(text, _ match { case Groups(year, month, day) => month+"/"+day }) + * val day = date replaceAllIn(text, _ match { case Groups(_, month, day) => s"$month/$day" }) * }}} */ object Groups { -- cgit v1.2.3