aboutsummaryrefslogblamecommitdiff
path: root/src/ProtocolBuffers/TextTokenizer.cs
blob: 5bb27fd0eba5f456a59302a7c2719744d830cfa1 (plain) (tree)
1
2
                                     
 






























                                                                          
 





                                      

                                 
                  
                                                            
                   




































                                                                                                       
                                                                                                     







                                                                                                                 
                                                                                        

                                                                                    
                                                                                                                    


                                                                                           
                                                                                                                   


                                                                                          
                                                                                                              








                                                                                     
          






                                                     

          














































                                                                  
 










                                                                                
 












                                                                                     
 
           


                                                                                
 











                                                                             
 











                                                                                   
          
 
























                                                                                 
 
















                                                                                     
 
















                                                                                      
 
















                                                                                    
 
















                                                                                      
 













                                                                                     
                                                                                






                                   
                                                                                                   











                                                    
 


























                                                                                   
                                                                                                 











                                                    
 

















                                                                             
 







                                                                                        
 





























                                                                                     
 








                                                                                              
 








                                                                                                              
 







                                                                                       
 







                                                                                       
      
 
#region Copyright notice and license

// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://github.com/jskeet/dotnet-protobufs/
// Original C++/Java/Python code:
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#endregion

using System;
using System.Globalization;
using System.Text.RegularExpressions;

namespace Google.ProtocolBuffers
{
    /// <summary>
    /// Represents a stream of tokens parsed from a string.
    /// </summary>
    internal sealed class TextTokenizer
    {
        /// <summary>
        /// The complete input text being tokenized.
        /// </summary>
        private readonly string text;

        /// <summary>
        /// The token most recently read by NextToken; set to the empty string
        /// once the end of the text has been reached.
        /// </summary>
        private string currentToken;

        /// <summary>
        /// The character index within the text to perform the next regex match at.
        /// </summary>
        private int matchPos = 0;

        /// <summary>
        /// The character index within the text at which the current token begins.
        /// </summary>
        private int pos = 0;

        /// <summary>
        /// The line number of the current token (zero-based; one is added when
        /// it is reported in an exception message).
        /// </summary>
        private int line = 0;

        /// <summary>
        /// The column number of the current token (zero-based; one is added when
        /// it is reported in an exception message).
        /// </summary>
        private int column = 0;

        /// <summary>
        /// The line number of the previous token (zero-based).
        /// </summary>
        private int previousLine = 0;

        /// <summary>
        /// The column number of the previous token (zero-based).
        /// </summary>
        private int previousColumn = 0;

        // Note: atomic groups used to mimic possessive quantifiers in Java in both of these regexes.
        // Every alternative starts with \G so that a match can only begin exactly at the
        // start index handed to Regex.Match, never further along in the text.

        /// <summary>
        /// Matches a run of whitespace characters and '#' comments, where each comment
        /// extends to the end of its line.
        /// </summary>
        internal static readonly Regex WhitespaceAndCommentPattern = new Regex("\\G(?>(\\s|(#.*$))+)",
                                                                               FrameworkPortability.
                                                                                   CompiledRegexWhereAvailable |
                                                                               RegexOptions.Multiline);

        /// <summary>
        /// Matches a single token: an identifier, a number, a double-quoted string or a
        /// single-quoted string. A string body may contain backslash escapes but neither
        /// a newline nor an unescaped occurrence of its own quote character. A string
        /// that reaches the end of the line without a closing quote is still matched, so
        /// that the missing quote can be reported when the token is consumed.
        /// </summary>
        private static readonly Regex TokenPattern = new Regex(
            "\\G[a-zA-Z_](?>[0-9a-zA-Z_+-]*)|" + // an identifier
            "\\G[0-9+-](?>[0-9a-zA-Z_.+-]*)|" + // a number
            "\\G\"(?>([^\"\\\n\\\\]|\\\\.)*)(\"|\\\\?$)|" + // a double-quoted string
            // The body must exclude the single quote (not the double quote): the atomic
            // group never backtracks, so it would otherwise swallow the closing quote
            // and everything after it up to the end of the line.
            "\\G\'(?>([^\'\\\n\\\\]|\\\\.)*)(\'|\\\\?$)", // a single-quoted string
            FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.Multiline);

        /// <summary>
        /// Matches the textual forms of infinity accepted for a double:
        /// "inf" or "infinity", optionally preceded by '-', in any letter case.
        /// </summary>
        private static readonly Regex DoubleInfinity = new Regex("^-?inf(inity)?$",
                                                                 FrameworkPortability.CompiledRegexWhereAvailable |
                                                                 RegexOptions.IgnoreCase);

        /// <summary>
        /// Matches the textual forms of infinity accepted for a float: the same as for
        /// a double, optionally followed by an 'f' suffix.
        /// </summary>
        private static readonly Regex FloatInfinity = new Regex("^-?inf(inity)?f?$",
                                                                FrameworkPortability.CompiledRegexWhereAvailable |
                                                                RegexOptions.IgnoreCase);

        /// <summary>
        /// Matches "nan", optionally followed by an 'f' suffix, in any letter case.
        /// </summary>
        private static readonly Regex FloatNan = new Regex("^nanf?$",
                                                           FrameworkPortability.CompiledRegexWhereAvailable |
                                                           RegexOptions.IgnoreCase);

        /// <summary>
        /// Constructs a tokenizer that parses tokens from the given text.
        /// Leading whitespace and comments are skipped and the first token is read
        /// immediately, so the tokenizer is ready for use once construction completes.
        /// </summary>
        /// <param name="text">The text to split into tokens.</param>
        public TextTokenizer(string text)
        {
            this.text = text;
            SkipWhitespace();
            NextToken();
        }

        /// <summary>
        /// Gets whether the input has been exhausted, i.e. whether the current
        /// token is the empty string.
        /// </summary>
        public bool AtEnd
        {
            get
            {
                bool exhausted = currentToken.Length == 0;
                return exhausted;
            }
        }

        /// <summary>
        /// Moves on to the next token in the text. The position of the token being
        /// left behind is remembered as the "previous" line and column. At the end
        /// of the text the current token becomes the empty string.
        /// </summary>
        public void NextToken()
        {
            // Remember where the outgoing token was located.
            previousLine = line;
            previousColumn = column;

            // Bring the line/column counters up to the start of the next match.
            for (; pos < matchPos; ++pos)
            {
                if (text[pos] != '\n')
                {
                    ++column;
                    continue;
                }

                ++line;
                column = 0;
            }

            // Nothing left to read: signal EOF with an empty token.
            if (matchPos == text.Length)
            {
                currentToken = "";
                return;
            }

            Match tokenMatch = TokenPattern.Match(text, matchPos);
            if (!tokenMatch.Success)
            {
                // Not a recognised token shape; treat a single character as the token.
                currentToken = text[matchPos].ToString();
                matchPos++;
            }
            else
            {
                currentToken = tokenMatch.Value;
                matchPos += tokenMatch.Length;
            }

            SkipWhitespace();
        }

        /// <summary>
        /// Moves matchPos past any whitespace and comments found at the current
        /// match position, so that it points at the start of the next token.
        /// </summary>
        private void SkipWhitespace()
        {
            Match skipped = WhitespaceAndCommentPattern.Match(text, matchPos);

            // A failed match means there is nothing to skip here.
            matchPos += skipped.Success ? skipped.Length : 0;
        }

        /// <summary>
        /// Consumes the current token if it is exactly equal to the given token.
        /// </summary>
        /// <param name="token">The token text to compare against.</param>
        /// <returns>
        /// true if the token matched and was consumed; false if it did not match,
        /// in which case the tokenizer is left unchanged.
        /// </returns>
        public bool TryConsume(string token)
        {
            if (currentToken != token)
            {
                return false;
            }

            NextToken();
            return true;
        }

        /*
         * If the next token exactly matches the given token, consume it.  Otherwise,
         * throw a FormatException.
         */

        /// <summary>
        /// Consumes the current token, which must be exactly equal to the given token.
        /// </summary>
        /// <param name="token">The token text that is required at this point.</param>
        /// <exception cref="FormatException">The current token is not equal to
        /// <paramref name="token"/>.</exception>
        public void Consume(string token)
        {
            if (TryConsume(token))
            {
                return;
            }

            throw CreateFormatException("Expected \"" + token + "\".");
        }

        /// <summary>
        /// Checks, without consuming anything, whether the current token starts
        /// like an integer: with an ASCII digit or with a '-' or '+' sign.
        /// </summary>
        /// <returns>true if the current token begins with a digit or a sign;
        /// false otherwise, including at the end of the input.</returns>
        public bool LookingAtInteger()
        {
            if (currentToken.Length == 0)
            {
                return false;
            }

            char first = currentToken[0];
            if (first == '-' || first == '+')
            {
                return true;
            }

            return first >= '0' && first <= '9';
        }

        /// <summary>
        /// If the next token is an identifier, consume it and return its value.
        /// Otherwise, throw a FormatException.
        /// </summary>
        /// <remarks>
        /// A token counts as an identifier here when it is non-empty and consists
        /// solely of ASCII letters, ASCII digits, underscores and dots.
        /// </remarks>
        /// <returns>The text of the consumed identifier.</returns>
        /// <exception cref="FormatException">The input is exhausted, or the current
        /// token contains a character that is not allowed in an identifier.</exception>
        public string ConsumeIdentifier()
        {
            // At the end of the input the current token is empty. The character check
            // below would pass vacuously, so reject that case explicitly rather than
            // handing back an empty "identifier".
            if (currentToken.Length == 0)
            {
                throw CreateFormatException("Expected identifier.");
            }

            foreach (char c in currentToken)
            {
                bool allowed = ('a' <= c && c <= 'z') ||
                               ('A' <= c && c <= 'Z') ||
                               ('0' <= c && c <= '9') ||
                               (c == '_') || (c == '.');
                if (!allowed)
                {
                    throw CreateFormatException("Expected identifier.");
                }
            }

            string result = currentToken;
            NextToken();
            return result;
        }

        /// <summary>
        /// Consumes the current token as a 32-bit signed integer.
        /// </summary>
        /// <returns>The parsed value.</returns>
        /// <exception cref="FormatException">The current token could not be parsed
        /// as a 32-bit signed integer; the token is not consumed.</exception>
        public int ConsumeInt32()
        {
            int value;
            try
            {
                value = TextFormat.ParseInt32(currentToken);
            }
            catch (FormatException e)
            {
                throw CreateIntegerParseException(e);
            }

            // Only move on once the token has parsed successfully.
            NextToken();
            return value;
        }

        /// <summary>
        /// Consumes the current token as a 32-bit unsigned integer.
        /// </summary>
        /// <returns>The parsed value.</returns>
        /// <exception cref="FormatException">The current token could not be parsed
        /// as a 32-bit unsigned integer; the token is not consumed.</exception>
        public uint ConsumeUInt32()
        {
            uint value;
            try
            {
                value = TextFormat.ParseUInt32(currentToken);
            }
            catch (FormatException e)
            {
                throw CreateIntegerParseException(e);
            }

            // Only move on once the token has parsed successfully.
            NextToken();
            return value;
        }

        /// <summary>
        /// Consumes the current token as a 64-bit signed integer.
        /// </summary>
        /// <returns>The parsed value.</returns>
        /// <exception cref="FormatException">The current token could not be parsed
        /// as a 64-bit signed integer; the token is not consumed.</exception>
        public long ConsumeInt64()
        {
            long value;
            try
            {
                value = TextFormat.ParseInt64(currentToken);
            }
            catch (FormatException e)
            {
                throw CreateIntegerParseException(e);
            }

            // Only move on once the token has parsed successfully.
            NextToken();
            return value;
        }

        /// <summary>
        /// Consumes the current token as a 64-bit unsigned integer.
        /// </summary>
        /// <returns>The parsed value.</returns>
        /// <exception cref="FormatException">The current token could not be parsed
        /// as a 64-bit unsigned integer; the token is not consumed.</exception>
        public ulong ConsumeUInt64()
        {
            ulong value;
            try
            {
                value = TextFormat.ParseUInt64(currentToken);
            }
            catch (FormatException e)
            {
                throw CreateIntegerParseException(e);
            }

            // Only move on once the token has parsed successfully.
            NextToken();
            return value;
        }

        /// <summary>
        /// Consumes the current token as a double. The spellings "inf", "infinity"
        /// (optionally negated) and "nan" are recognised in any letter case.
        /// </summary>
        /// <returns>The parsed value.</returns>
        /// <exception cref="FormatException">The current token could not be parsed
        /// as a double; the token is not consumed.</exception>
        public double ConsumeDouble()
        {
            // double.Parse() rejects "inf", "infinity" and "nan", so those
            // spellings are recognised here before falling back to it.
            if (DoubleInfinity.IsMatch(currentToken))
            {
                double infinity = currentToken.StartsWith("-")
                                      ? double.NegativeInfinity
                                      : double.PositiveInfinity;
                NextToken();
                return infinity;
            }

            if (currentToken.Equals("nan", StringComparison.OrdinalIgnoreCase))
            {
                NextToken();
                return Double.NaN;
            }

            double parsed;
            try
            {
                parsed = double.Parse(currentToken, FrameworkPortability.InvariantCulture);
            }
            catch (FormatException e)
            {
                throw CreateFloatParseException(e);
            }
            catch (OverflowException e)
            {
                throw CreateFloatParseException(e);
            }

            NextToken();
            return parsed;
        }

        /// <summary>
        /// If the next token is a float, consume it and return its value.
        /// Otherwise, throw a FormatException.
        /// </summary>
        /// <remarks>
        /// The spellings "inf", "infinity" (optionally negated) and "nan" are
        /// recognised in any letter case. A single trailing 'f' or 'F' suffix is
        /// accepted on any of these forms and on ordinary numbers.
        /// </remarks>
        /// <returns>The parsed value.</returns>
        /// <exception cref="FormatException">The current token could not be parsed
        /// as a float; the token is left unchanged and is not consumed.</exception>
        public float ConsumeFloat()
        {
            // We need to parse infinity and nan separately because
            // float.Parse() does not accept "inf", "infinity", or "nan".
            if (FloatInfinity.IsMatch(currentToken))
            {
                // The pattern guarantees a non-empty token, so indexing is safe.
                bool negative = currentToken[0] == '-';
                NextToken();
                return negative ? float.NegativeInfinity : float.PositiveInfinity;
            }
            if (FloatNan.IsMatch(currentToken))
            {
                NextToken();
                return float.NaN;
            }

            // Strip at most one float suffix, working on a local copy: currentToken
            // must stay intact so that a failed parse leaves the tokenizer state
            // (including AtEnd) unchanged. The suffix is matched in either case,
            // in line with the case-insensitive inf/nan patterns above.
            string number = currentToken;
            if (number.Length > 0)
            {
                char last = number[number.Length - 1];
                if (last == 'f' || last == 'F')
                {
                    number = number.Substring(0, number.Length - 1);
                }
            }

            try
            {
                float result = float.Parse(number, FrameworkPortability.InvariantCulture);
                NextToken();
                return result;
            }
            catch (FormatException e)
            {
                throw CreateFloatParseException(e);
            }
            catch (OverflowException e)
            {
                throw CreateFloatParseException(e);
            }
        }

        /// <summary>
        /// Consumes the current token as a Boolean. Only the exact tokens
        /// "true" and "false" are accepted.
        /// </summary>
        /// <returns>The Boolean value of the consumed token.</returns>
        /// <exception cref="FormatException">The current token is neither "true"
        /// nor "false"; the token is not consumed.</exception>
        public bool ConsumeBoolean()
        {
            switch (currentToken)
            {
                case "true":
                    NextToken();
                    return true;
                case "false":
                    NextToken();
                    return false;
                default:
                    throw CreateFormatException("Expected \"true\" or \"false\".");
            }
        }

        /// <summary>
        /// Consumes the current token as a quoted string and returns its unescaped
        /// content, converted from the unescaped ByteString via ToStringUtf8.
        /// </summary>
        /// <returns>The unescaped string value.</returns>
        /// <exception cref="FormatException">The current token is not a valid
        /// quoted string.</exception>
        public string ConsumeString()
        {
            ByteString unescaped = ConsumeByteString();
            return unescaped.ToStringUtf8();
        }

        /// <summary>
        /// Consumes the current token as a quoted string and returns its content,
        /// unescaped, as a ByteString. Both single and double quotes are accepted,
        /// provided the closing quote is the same character as the opening one.
        /// </summary>
        /// <returns>The unescaped bytes between the quotes.</returns>
        /// <exception cref="FormatException">The current token does not start with
        /// a quote, lacks the matching closing quote, or its content cannot be
        /// unescaped; the token is not consumed.</exception>
        public ByteString ConsumeByteString()
        {
            // '\0' stands in for "no first character" at the end of the input.
            char opening = currentToken.Length == 0 ? '\0' : currentToken[0];
            bool startsWithQuote = opening == '\"' || opening == '\'';
            if (!startsWithQuote)
            {
                throw CreateFormatException("Expected string.");
            }

            // The closing quote must be a separate character from the opening one.
            int lastIndex = currentToken.Length - 1;
            bool terminated = lastIndex >= 1 && currentToken[lastIndex] == opening;
            if (!terminated)
            {
                throw CreateFormatException("String missing ending quote.");
            }

            ByteString unescaped;
            try
            {
                string body = currentToken.Substring(1, lastIndex - 1);
                unescaped = TextFormat.UnescapeBytes(body);
            }
            catch (FormatException e)
            {
                // Re-throw with the current line and column prepended.
                throw CreateFormatException(e.Message);
            }

            NextToken();
            return unescaped;
        }

        /// <summary>
        /// Builds (but does not throw) a FormatException whose message is the given
        /// description prefixed with the position of the current token, in the
        /// form "line:column: description".
        /// </summary>
        /// <param name="description">The text describing the problem.</param>
        /// <returns>The exception, ready to be thrown by the caller.</returns>
        public FormatException CreateFormatException(string description)
        {
            // Positions are tracked zero-based but reported one-based,
            // which is what people generally expect.
            int reportedLine = line + 1;
            int reportedColumn = column + 1;
            string message = reportedLine + ":" + reportedColumn + ": " + description;
            return new FormatException(message);
        }

        /// <summary>
        /// Builds (but does not throw) a FormatException whose message is the given
        /// description prefixed with the position of the previous token, in the
        /// form "line:column: description".
        /// </summary>
        /// <param name="description">The text describing the problem.</param>
        /// <returns>The exception, ready to be thrown by the caller.</returns>
        public FormatException CreateFormatExceptionPreviousToken(string description)
        {
            // Positions are tracked zero-based but reported one-based,
            // which is what people generally expect.
            int reportedLine = previousLine + 1;
            int reportedColumn = previousColumn + 1;
            string message = reportedLine + ":" + reportedColumn + ": " + description;
            return new FormatException(message);
        }

        /// <summary>
        /// Wraps the message of an integer parsing failure in a FormatException
        /// that carries the position of the current token.
        /// </summary>
        /// <param name="e">The exception raised while parsing the integer.</param>
        /// <returns>The exception, ready to be thrown by the caller.</returns>
        private FormatException CreateIntegerParseException(FormatException e)
        {
            string description = "Couldn't parse integer: " + e.Message;
            return CreateFormatException(description);
        }

        /// <summary>
        /// Wraps the message of a float or double parsing failure in a
        /// FormatException that carries the position of the current token.
        /// </summary>
        /// <param name="e">The exception raised while parsing the number.</param>
        /// <returns>The exception, ready to be thrown by the caller.</returns>
        private FormatException CreateFloatParseException(Exception e)
        {
            string description = "Couldn't parse number: " + e.Message;
            return CreateFormatException(description);
        }
    }
}