diff options
Diffstat (limited to 'csharp/ProtocolBuffers/TextTokenizer.cs')
-rw-r--r-- | csharp/ProtocolBuffers/TextTokenizer.cs | 341 |
1 files changed, 341 insertions, 0 deletions
diff --git a/csharp/ProtocolBuffers/TextTokenizer.cs b/csharp/ProtocolBuffers/TextTokenizer.cs new file mode 100644 index 00000000..a3585ab0 --- /dev/null +++ b/csharp/ProtocolBuffers/TextTokenizer.cs @@ -0,0 +1,341 @@ +using System; +using System.Globalization; +using System.Text.RegularExpressions; + +namespace Google.ProtocolBuffers { + /// <summary> + /// Represents a stream of tokens parsed from a string. + /// </summary> + internal sealed class TextTokenizer { + private readonly string text; + private string currentToken; + + /// <summary> + /// The character index within the text to perform the next regex match at. + /// </summary> + private int matchPos = 0; + + /// <summary> + /// The character index within the text at which the current token begins. + /// </summary> + private int pos = 0; + + /// <summary> + /// The line number of the current token. + /// </summary> + private int line = 0; + /// <summary> + /// The column number of the current token. + /// </summary> + private int column = 0; + + /// <summary> + /// The line number of the previous token. + /// </summary> + private int previousLine = 0; + /// <summary> + /// The column number of the previous token. + /// </summary> + private int previousColumn = 0; + + private static Regex WhitespaceAndCommentPattern = new Regex("\\G(\\s|(#[^\\\n]*\\n))+", RegexOptions.Compiled); + private static Regex TokenPattern = new Regex( + "\\G[a-zA-Z_][0-9a-zA-Z_+-]*|" + // an identifier + "\\G[0-9+-][0-9a-zA-Z_.+-]*|" + // a number + "\\G\"([^\"\\\n\\\\]|\\\\[^\\\n])*(\"|\\\\?$)|" + // a double-quoted string + "\\G\'([^\"\\\n\\\\]|\\\\[^\\\n])*(\'|\\\\?$)", // a single-quoted string + RegexOptions.Compiled); + + /** Construct a tokenizer that parses tokens from the given text. */ + public TextTokenizer(string text) { + this.text = text; + SkipWhitespace(); + NextToken(); + } + + /// <summary> + /// Are we at the end of the input? + /// </summary> + public bool AtEnd { + get { return currentToken.Length == 0; } + } + + /// <summary> + /// Advances to the next token. + /// </summary> + public void NextToken() { + previousLine = line; + previousColumn = column; + + // Advance the line counter to the current position. + while (pos < matchPos) { + if (text[pos] == '\n') { + ++line; + column = 0; + } else { + ++column; + } + ++pos; + } + + // Match the next token. + if (matchPos == text.Length) { + // EOF + currentToken = ""; + } else { + Match match = TokenPattern.Match(text, matchPos); + if (match.Success) { + currentToken = match.Value; + matchPos += match.Length; + } else { + // Take one character. + currentToken = text[matchPos].ToString(); + matchPos++; + } + + SkipWhitespace(); + } + } + + /// <summary> + /// Skip over any whitespace so that matchPos starts at the next token. + /// </summary> + private void SkipWhitespace() { + Match match = WhitespaceAndCommentPattern.Match(text, matchPos); + if (match.Success) { + matchPos += match.Length; + } + } + + /// <summary> + /// If the next token exactly matches the given token, consume it and return + /// true. Otherwise, return false without doing anything. + /// </summary> + public bool TryConsume(string token) { + if (currentToken == token) { + NextToken(); + return true; + } + return false; + } + + /* + * If the next token exactly matches {@code token}, consume it. Otherwise, + * throw a {@link ParseException}. + */ + /// <summary> + /// If the next token exactly matches the specified one, consume it. + /// Otherwise, throw a FormatException. + /// </summary> + /// <param name="token"></param> + public void Consume(string token) { + if (!TryConsume(token)) { + throw CreateFormatException("Expected \"" + token + "\"."); + } + } + + /// <summary> + /// Returns true if the next token is an integer, but does not consume it. + /// </summary> + public bool LookingAtInteger() { + if (currentToken.Length == 0) { + return false; + } + + char c = currentToken[0]; + return ('0' <= c && c <= '9') || c == '-' || c == '+'; + } + + /// <summary> + /// If the next token is an identifier, consume it and return its value. + /// Otherwise, throw a FormatException. + /// </summary> + public string ConsumeIdentifier() { + foreach (char c in currentToken) { + if (('a' <= c && c <= 'z') || + ('A' <= c && c <= 'Z') || + ('0' <= c && c <= '9') || + (c == '_') || (c == '.')) { + // OK + } else { + throw CreateFormatException("Expected identifier."); + } + } + + string result = currentToken; + NextToken(); + return result; + } + + /// <summary> + /// If the next token is a 32-bit signed integer, consume it and return its + /// value. Otherwise, throw a FormatException. + /// </summary> + public int ConsumeInt32() { + try { + int result = TextFormat.ParseInt32(currentToken); + NextToken(); + return result; + } catch (FormatException e) { + throw CreateIntegerParseException(e); + } + } + + /// <summary> + /// If the next token is a 32-bit unsigned integer, consume it and return its + /// value. Otherwise, throw a FormatException. + /// </summary> + public uint ConsumeUInt32() { + try { + uint result = TextFormat.ParseUInt32(currentToken); + NextToken(); + return result; + } catch (FormatException e) { + throw CreateIntegerParseException(e); + } + } + + /// <summary> + /// If the next token is a 64-bit signed integer, consume it and return its + /// value. Otherwise, throw a FormatException. + /// </summary> + public long ConsumeInt64() { + try { + long result = TextFormat.ParseInt64(currentToken); + NextToken(); + return result; + } catch (FormatException e) { + throw CreateIntegerParseException(e); + } + } + + /// <summary> + /// If the next token is a 64-bit unsigned integer, consume it and return its + /// value. Otherwise, throw a FormatException. + /// </summary> + public ulong ConsumeUInt64() { + try { + ulong result = TextFormat.ParseUInt64(currentToken); + NextToken(); + return result; + } catch (FormatException e) { + throw CreateIntegerParseException(e); + } + } + + /// <summary> + /// If the next token is a double, consume it and return its value. + /// Otherwise, throw a FormatException. + /// </summary> + public double ConsumeDouble() { + try { + double result = double.Parse(currentToken, CultureInfo.InvariantCulture); + NextToken(); + return result; + } catch (FormatException e) { + throw CreateFloatParseException(e); + } catch (OverflowException e) { + throw CreateFloatParseException(e); + } + } + + /// <summary> + /// If the next token is a float, consume it and return its value. + /// Otherwise, throw a FormatException. + /// </summary> + public float consumeFloat() { + try { + float result = float.Parse(currentToken, CultureInfo.InvariantCulture); + NextToken(); + return result; + } catch (FormatException e) { + throw CreateFloatParseException(e); + } catch (OverflowException e) { + throw CreateFloatParseException(e); + } + } + + /// <summary> + /// If the next token is a Boolean, consume it and return its value. + /// Otherwise, throw a FormatException. + /// </summary> + public bool ConsumeBoolean() { + if (currentToken == "true") { + NextToken(); + return true; + } + if (currentToken == "false") { + NextToken(); + return false; + } + throw CreateFormatException("Expected \"true\" or \"false\"."); + } + + /// <summary> + /// If the next token is a string, consume it and return its (unescaped) value. + /// Otherwise, throw a FormatException. + /// </summary> + public string ConsumeString() { + return ConsumeByteString().ToStringUtf8(); + } + + /// <summary> + /// If the next token is a string, consume it, unescape it as a + /// ByteString and return it. Otherwise, throw a FormatException. + /// </summary> + public ByteString ConsumeByteString() { + char quote = currentToken.Length > 0 ? currentToken[0] : '\0'; + if (quote != '\"' && quote != '\'') { + throw CreateFormatException("Expected string."); + } + + if (currentToken.Length < 2 || + currentToken[currentToken.Length-1] != quote) { + throw CreateFormatException("String missing ending quote."); + } + + try { + string escaped = currentToken.Substring(1, currentToken.Length - 2); + ByteString result = TextFormat.UnescapeBytes(escaped); + NextToken(); + return result; + } catch (FormatException e) { + throw CreateFormatException(e.Message); + } + } + + /// <summary> + /// Returns a format exception with the current line and column numbers + /// in the description, suitable for throwing. + /// </summary> + public FormatException CreateFormatException(string description) { + // Note: People generally prefer one-based line and column numbers. + return new FormatException((line + 1) + ":" + (column + 1) + ": " + description); + } + + /// <summary> + /// Returns a format exception with the line and column numbers of the + /// previous token in the description, suitable for throwing. + /// </summary> + public FormatException CreateFormatExceptionPreviousToken(string description) { + // Note: People generally prefer one-based line and column numbers. + return new FormatException((previousLine + 1) + ":" + (previousColumn + 1) + ": " + description); + } + + /// <summary> + /// Constructs an appropriate FormatException for the given existing exception + /// when trying to parse an integer. + /// </summary> + private FormatException CreateIntegerParseException(FormatException e) { + return CreateFormatException("Couldn't parse integer: " + e.Message); + } + + /// <summary> + /// Constructs an appropriate FormatException for the given existing exception + /// when trying to parse a float or double. + /// </summary> + private FormatException CreateFloatParseException(Exception e) { + return CreateFormatException("Couldn't parse number: " + e.Message); + } + } +} |