From 1e42fdde2ebf34dddf1297bbecc56439ecef323f Mon Sep 17 00:00:00 2001 From: Jon Skeet Date: Thu, 14 Aug 2008 20:35:30 +0100 Subject: Implemented text parsing. --- csharp/ProtocolBuffers.Test/TextFormatTest.cs | 26 +- csharp/ProtocolBuffers/ProtocolBuffers.csproj | 1 + csharp/ProtocolBuffers/TextFormat.cs | 202 ++++++++++++++- csharp/ProtocolBuffers/TextTokenizer.cs | 341 ++++++++++++++++++++++++++ 4 files changed, 536 insertions(+), 34 deletions(-) create mode 100644 csharp/ProtocolBuffers/TextTokenizer.cs (limited to 'csharp') diff --git a/csharp/ProtocolBuffers.Test/TextFormatTest.cs b/csharp/ProtocolBuffers.Test/TextFormatTest.cs index f3ac0a0e..fa12e45d 100644 --- a/csharp/ProtocolBuffers.Test/TextFormatTest.cs +++ b/csharp/ProtocolBuffers.Test/TextFormatTest.cs @@ -8,18 +8,6 @@ namespace Google.ProtocolBuffers { [TestFixture] public class TextFormatTest { - /// - /// A basic string with different escapable characters for testing. - /// - private const string EscapeTestString = "\"A string with ' characters \n and \r newlines and \t tabs and \001 " - + "slashes \\"; - - /// - /// A representation of the above string with all the characters escaped. 
- /// - private const string EscapeTestStringEscaped = "\"\\\"A string with \\' characters \\n and \\r newlines " - + "and \\t tabs and \\001 slashes \\\\\""; - private static readonly string AllFieldsSetText = TestUtil.ReadTextFromFile("text_format_unittest_data.txt"); private static readonly string AllExtensionsSetText = TestUtil.ReadTextFromFile("text_format_unittest_extensions_data.txt"); @@ -193,7 +181,6 @@ namespace Google.ProtocolBuffers { // ================================================================= [Test] - [Ignore("Parsing not implemented")] public void Parse() { TestAllTypes.Builder builder = TestAllTypes.CreateBuilder(); TextFormat.Merge(AllFieldsSetText, builder); @@ -201,7 +188,6 @@ namespace Google.ProtocolBuffers { } [Test] - [Ignore("Parsing not implemented")] public void ParseReader() { TestAllTypes.Builder builder = TestAllTypes.CreateBuilder(); TextFormat.Merge(new StringReader(AllFieldsSetText), builder); @@ -209,7 +195,6 @@ namespace Google.ProtocolBuffers { } [Test] - [Ignore("Parsing not implemented")] public void ParseExtensions() { TestAllExtensions.Builder builder = TestAllExtensions.CreateBuilder(); TextFormat.Merge(AllExtensionsSetText, @@ -219,7 +204,6 @@ namespace Google.ProtocolBuffers { } [Test] - [Ignore("Parsing not implemented")] public void ParseExotic() { TestAllTypes.Builder builder = TestAllTypes.CreateBuilder(); TextFormat.Merge(ExoticText, builder); @@ -230,7 +214,6 @@ namespace Google.ProtocolBuffers { } [Test] - [Ignore("Parsing not implemented")] public void ParseMessageSet() { ExtensionRegistry extensionRegistry = ExtensionRegistry.CreateInstance(); extensionRegistry.Add(TestMessageSetExtension1.MessageSetExtension); @@ -247,7 +230,6 @@ namespace Google.ProtocolBuffers { } [Test] - [Ignore("Parsing not implemented")] public void ParseNumericEnum() { TestAllTypes.Builder builder = TestAllTypes.CreateBuilder(); TextFormat.Merge("optional_nested_enum: 2", builder); @@ -255,7 +237,6 @@ namespace 
Google.ProtocolBuffers { } [Test] - [Ignore("Parsing not implemented")] public void ParseAngleBrackets() { TestAllTypes.Builder builder = TestAllTypes.CreateBuilder(); TextFormat.Merge("OptionalGroup: < a: 1 >", builder); @@ -274,7 +255,6 @@ namespace Google.ProtocolBuffers { } [Test] - [Ignore("Parsing not implemented")] public void ParseErrors() { AssertParseError( "1:16: Expected \":\".", @@ -296,17 +276,17 @@ namespace Google.ProtocolBuffers { "1:18: Expected string.", "optional_string: 123"); AssertParseError( - "1:18: string missing ending quote.", + "1:18: String missing ending quote.", "optional_string: \"ueoauaoe"); AssertParseError( - "1:18: string missing ending quote.", + "1:18: String missing ending quote.", "optional_string: \"ueoauaoe\n" + "optional_int32: 123"); AssertParseError( "1:18: Invalid escape sequence: '\\z'", "optional_string: \"\\z\""); AssertParseError( - "1:18: string missing ending quote.", + "1:18: String missing ending quote.", "optional_string: \"ueoauaoe\n" + "optional_int32: 123"); AssertParseError( diff --git a/csharp/ProtocolBuffers/ProtocolBuffers.csproj b/csharp/ProtocolBuffers/ProtocolBuffers.csproj index 6c97e8d1..cf490a73 100644 --- a/csharp/ProtocolBuffers/ProtocolBuffers.csproj +++ b/csharp/ProtocolBuffers/ProtocolBuffers.csproj @@ -97,6 +97,7 @@ + diff --git a/csharp/ProtocolBuffers/TextFormat.cs b/csharp/ProtocolBuffers/TextFormat.cs index e088d080..3b717df4 100644 --- a/csharp/ProtocolBuffers/TextFormat.cs +++ b/csharp/ProtocolBuffers/TextFormat.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Globalization; using System.IO; using System.Text; using Google.ProtocolBuffers.Descriptors; @@ -116,9 +117,9 @@ namespace Google.ProtocolBuffers { case FieldType.UInt64: case FieldType.Fixed32: case FieldType.Fixed64: - // Good old ToString() does what we want for these types. (Including the - // unsigned ones, unlike with Java.) 
- generator.Print(value.ToString()); + // The simple Object.ToString converts using the current culture. + // We want to always use the invariant culture so it's predictable. + generator.Print(((IConvertible) value).ToString(CultureInfo.InvariantCulture)); break; case FieldType.Bool: // Explicitly use the Java true/false @@ -237,13 +238,15 @@ namespace Google.ProtocolBuffers { result = radix == 10 ? ulong.Parse(text) : Convert.ToUInt64(text, radix); } catch (OverflowException) { // Convert OverflowException to FormatException so there's a single exception type this method can throw. - throw new FormatException("Number of out range: " + original); + string numberDescription = string.Format("{0}-bit {1}signed integer", isLong ? 64 : 32, isSigned ? "" : "un"); + throw new FormatException("Number out of range for " + numberDescription + ": " + original); } if (negative) { ulong max = isLong ? 0x8000000000000000UL : 0x80000000L; if (result > max) { - throw new FormatException("Number of out range: " + original); + string numberDescription = string.Format("{0}-bit signed integer", isLong ? 64 : 32); + throw new FormatException("Number out of range for " + numberDescription + ": " + original); } return -((long) result); } else { @@ -251,7 +254,8 @@ namespace Google.ProtocolBuffers { ? (isLong ? (ulong) long.MaxValue : int.MaxValue) : (isLong ? ulong.MaxValue : uint.MaxValue); if (result > max) { - throw new FormatException("Number of out range: " + original); + string numberDescription = string.Format("{0}-bit {1}signed integer", isLong ? 64 : 32, isSigned ? 
"" : "un"); + throw new FormatException("Number out of range for " + numberDescription + ": " + original); } return (long) result; } @@ -418,19 +422,195 @@ namespace Google.ProtocolBuffers { } public static void Merge(string text, IBuilder builder) { - throw new NotImplementedException(); + Merge(text, ExtensionRegistry.Empty, builder); } public static void Merge(TextReader reader, IBuilder builder) { - throw new NotImplementedException(); + Merge(reader, ExtensionRegistry.Empty, builder); + } + + public static void Merge(TextReader reader, ExtensionRegistry registry, IBuilder builder) { + Merge(reader.ReadToEnd(), registry, builder); } public static void Merge(string text, ExtensionRegistry registry, IBuilder builder) { - throw new NotImplementedException(); + TextTokenizer tokenizer = new TextTokenizer(text); + + while (!tokenizer.AtEnd) { + MergeField(tokenizer, registry, builder); + } } - public static void Merge(TextReader reader, ExtensionRegistry registry, IBuilder builder) { - throw new NotImplementedException(); + /// + /// Parses a single field from the specified tokenizer and merges it into + /// the builder. + /// + private static void MergeField(TextTokenizer tokenizer, ExtensionRegistry extensionRegistry, + IBuilder builder) { + + FieldDescriptor field; + MessageDescriptor type = builder.DescriptorForType; + ExtensionInfo extension = null; + + if (tokenizer.TryConsume("[")) { + // An extension. 
+ StringBuilder name = new StringBuilder(tokenizer.ConsumeIdentifier()); + while (tokenizer.TryConsume(".")) { + name.Append("."); + name.Append(tokenizer.ConsumeIdentifier()); + } + + extension = extensionRegistry[name.ToString()]; + + if (extension == null) { + throw tokenizer.CreateFormatExceptionPreviousToken("Extension \"" + name + "\" not found in the ExtensionRegistry."); + } else if (extension.Descriptor.ContainingType != type) { + throw tokenizer.CreateFormatExceptionPreviousToken("Extension \"" + name + "\" does not extend message type \"" + + type.FullName + "\"."); + } + + tokenizer.Consume("]"); + + field = extension.Descriptor; + } else { + String name = tokenizer.ConsumeIdentifier(); + field = type.FindDescriptor(name); + + // Group names are expected to be capitalized as they appear in the + // .proto file, which actually matches their type names, not their field + // names. + if (field == null) { + // Explicitly specify the invariant culture so that this code does not break when + // executing in Turkey. + String lowerName = name.ToLowerInvariant(); + field = type.FindDescriptor(lowerName); + // If the case-insensitive match worked but the field is NOT a group, + // TODO(jonskeet): What? Java comment ends here! + if (field != null && field.FieldType != FieldType.Group) { + field = null; + } + } + // Again, special-case group names as described above. 
+ if (field != null && field.FieldType == FieldType.Group && field.MessageType.Name != name) { + field = null; + } + + if (field == null) { + throw tokenizer.CreateFormatExceptionPreviousToken( + "Message type \"" + type.FullName + "\" has no field named \"" + name + "\"."); + } + } + + object value = null; + + if (field.MappedType == MappedType.Message) { + tokenizer.TryConsume(":"); // optional + + String endToken; + if (tokenizer.TryConsume("<")) { + endToken = ">"; + } else { + tokenizer.Consume("{"); + endToken = "}"; + } + + IBuilder subBuilder; + if (extension == null) { + subBuilder = builder.CreateBuilderForField(field); + } else { + subBuilder = extension.DefaultInstance.WeakCreateBuilderForType(); + } + + while (!tokenizer.TryConsume(endToken)) { + if (tokenizer.AtEnd) { + throw tokenizer.CreateFormatException("Expected \"" + endToken + "\"."); + } + MergeField(tokenizer, extensionRegistry, subBuilder); + } + + value = subBuilder.WeakBuild(); + + } else { + tokenizer.Consume(":"); + + switch (field.FieldType) { + case FieldType.Int32: + case FieldType.SInt32: + case FieldType.SFixed32: + value = tokenizer.ConsumeInt32(); + break; + + case FieldType.Int64: + case FieldType.SInt64: + case FieldType.SFixed64: + value = tokenizer.ConsumeInt64(); + break; + + case FieldType.UInt32: + case FieldType.Fixed32: + value = tokenizer.ConsumeUInt32(); + break; + + case FieldType.UInt64: + case FieldType.Fixed64: + value = tokenizer.ConsumeUInt64(); + break; + + case FieldType.Float: + value = tokenizer.consumeFloat(); + break; + + case FieldType.Double: + value = tokenizer.ConsumeDouble(); + break; + + case FieldType.Bool: + value = tokenizer.ConsumeBoolean(); + break; + + case FieldType.String: + value = tokenizer.ConsumeString(); + break; + + case FieldType.Bytes: + value = tokenizer.ConsumeByteString(); + break; + + case FieldType.Enum: { + EnumDescriptor enumType = field.EnumType; + + if (tokenizer.LookingAtInteger()) { + int number = tokenizer.ConsumeInt32(); + 
value = enumType.FindValueByNumber(number); + if (value == null) { + throw tokenizer.CreateFormatExceptionPreviousToken( + "Enum type \"" + enumType.FullName + + "\" has no value with number " + number + "."); + } + } else { + String id = tokenizer.ConsumeIdentifier(); + value = enumType.FindValueByName(id); + if (value == null) { + throw tokenizer.CreateFormatExceptionPreviousToken( + "Enum type \"" + enumType.FullName + + "\" has no value named \"" + id + "\"."); + } + } + + break; + } + + case FieldType.Message: + case FieldType.Group: + throw new InvalidOperationException("Can't get here."); + } + } + + if (field.IsRepeated) { + builder.WeakAddRepeatedField(field, value); + } else { + builder.SetField(field, value); + } } } } diff --git a/csharp/ProtocolBuffers/TextTokenizer.cs b/csharp/ProtocolBuffers/TextTokenizer.cs new file mode 100644 index 00000000..a3585ab0 --- /dev/null +++ b/csharp/ProtocolBuffers/TextTokenizer.cs @@ -0,0 +1,341 @@ +using System; +using System.Globalization; +using System.Text.RegularExpressions; + +namespace Google.ProtocolBuffers { + /// + /// Represents a stream of tokens parsed from a string. + /// + internal sealed class TextTokenizer { + private readonly string text; + private string currentToken; + + /// + /// The character index within the text to perform the next regex match at. + /// + private int matchPos = 0; + + /// + /// The character index within the text at which the current token begins. + /// + private int pos = 0; + + /// + /// The line number of the current token. + /// + private int line = 0; + /// + /// The column number of the current token. + /// + private int column = 0; + + /// + /// The line number of the previous token. + /// + private int previousLine = 0; + /// + /// The column number of the previous token. 
///
private int previousColumn = 0;

/// <summary>
/// Matches a run of whitespace and/or '#' comments starting exactly at the
/// match position (\G).
/// </summary>
/// <remarks>
/// A comment runs from '#' up to, but not including, the end of the line; the
/// line break itself is then consumed as ordinary whitespace by the next
/// iteration. Because no trailing newline is required, a comment on the last
/// line of the input is skipped even when the text does not end with a line
/// break.
/// </remarks>
private static readonly Regex WhitespaceAndCommentPattern =
    new Regex(@"\G(\s|#[^\n]*)+", RegexOptions.Compiled);

/// <summary>
/// Matches a single token starting exactly at the match position (\G).
/// </summary>
/// <remarks>
/// Each quoted-string alternative excludes its *own* quote character (plus
/// newline and backslash) from the unescaped body, so a double quote may
/// appear inside a single-quoted string and vice versa. A string that is
/// missing its closing quote is still matched when it runs to the end of the
/// input, so that ConsumeByteString can report it.
/// </remarks>
private static readonly Regex TokenPattern = new Regex(
    @"\G[a-zA-Z_][0-9a-zA-Z_+-]*|" +          // an identifier
    @"\G[0-9+-][0-9a-zA-Z_.+-]*|" +           // a number
    @"\G""([^""\n\\]|\\[^\n])*(""|\\?$)|" +   // a double-quoted string
    @"\G'([^'\n\\]|\\[^\n])*('|\\?$)",        // a single-quoted string
    RegexOptions.Compiled);

/// <summary>
/// Constructs a tokenizer that parses tokens from the given text and
/// positions it on the first token.
/// </summary>
/// <param name="text">The complete text to tokenize.</param>
public TextTokenizer(string text) {
  this.text = text;
  // Skip leading whitespace/comments so the first NextToken() call records
  // the line and column of the first real token.
  SkipWhitespace();
  NextToken();
}

/// <summary>
/// Are we at the end of the input? True when the current token is empty,
/// which is what NextToken() produces once all text has been consumed.
/// </summary>
public bool AtEnd {
  get { return currentToken.Length == 0; }
}

/// <summary>
/// Advances to the next token. The position of the token being left is
/// remembered as the "previous" line and column.
/// </summary>
public void NextToken() {
  previousLine = line;
  previousColumn = column;

  // Advance the line/column counters up to the start of the token that is
  // about to be read (matchPos already points past any whitespace).
  while (pos < matchPos) {
    if (text[pos] == '\n') {
      ++line;
      column = 0;
    } else {
      ++column;
    }
    ++pos;
  }

  if (matchPos == text.Length) {
    // EOF: represented by an empty token.
    currentToken = "";
    return;
  }

  Match match = TokenPattern.Match(text, matchPos);
  if (match.Success) {
    currentToken = match.Value;
    matchPos += match.Length;
  } else {
    // Not an identifier, number or string: take a single character
    // (punctuation such as ':' or '{').
    currentToken = text[matchPos].ToString();
    matchPos++;
  }

  SkipWhitespace();
}

/// <summary>
/// Skip over any whitespace so that matchPos starts at the next token.
/// </summary>
private void SkipWhitespace() {
  Match match = WhitespaceAndCommentPattern.Match(text, matchPos);
  if (match.Success) {
    matchPos += match.Length;
  }
}

///
/// If the next token exactly matches the given token, consume it and return
/// true. Otherwise, return false without doing anything.
///
public bool TryConsume(string token) {
  if (currentToken == token) {
    NextToken();
    return true;
  }
  return false;
}

/// <summary>
/// If the next token exactly matches the specified one, consume it.
/// Otherwise, throw a FormatException.
/// </summary>
/// <param name="token">The exact token text required at this point.</param>
/// <exception cref="FormatException">The current token differs from
/// <paramref name="token"/>.</exception>
public void Consume(string token) {
  if (!TryConsume(token)) {
    throw CreateFormatException("Expected \"" + token + "\".");
  }
}

/// <summary>
/// Returns true if the next token is an integer, but does not consume it.
/// </summary>
/// <remarks>
/// Only the first character is inspected (a digit, '-' or '+'); the token is
/// not validated any further here.
/// </remarks>
public bool LookingAtInteger() {
  if (currentToken.Length == 0) {
    return false;
  }

  char c = currentToken[0];
  return ('0' <= c && c <= '9') || c == '-' || c == '+';
}

/// <summary>
/// If the next token is an identifier, consume it and return its value.
/// Otherwise, throw a FormatException.
/// </summary>
/// <returns>The text of the identifier token.</returns>
/// <exception cref="FormatException">The current token is empty (end of
/// input) or contains a character other than an ASCII letter, digit, '_'
/// or '.'.</exception>
public string ConsumeIdentifier() {
  // The empty token marks end of input; it must not be accepted as an
  // identifier (the character loop below would pass it vacuously).
  if (currentToken.Length == 0) {
    throw CreateFormatException("Expected identifier.");
  }

  foreach (char c in currentToken) {
    bool isIdentifierChar =
        ('a' <= c && c <= 'z') ||
        ('A' <= c && c <= 'Z') ||
        ('0' <= c && c <= '9') ||
        c == '_' || c == '.';
    if (!isIdentifierChar) {
      throw CreateFormatException("Expected identifier.");
    }
  }

  string result = currentToken;
  NextToken();
  return result;
}

/// <summary>
/// If the next token is a 32-bit signed integer, consume it and return its
/// value. Otherwise, throw a FormatException.
/// </summary>
/// <returns>The value produced by TextFormat.ParseInt32 for the token.</returns>
/// <exception cref="FormatException">The token could not be parsed; the
/// message carries the token's line and column.</exception>
public int ConsumeInt32() {
  try {
    int result = TextFormat.ParseInt32(currentToken);
    NextToken();
    return result;
  } catch (FormatException e) {
    throw CreateIntegerParseException(e);
  }
}

///
/// If the next token is a 32-bit unsigned integer, consume it and return its
/// value. Otherwise, throw a FormatException.
///
public uint ConsumeUInt32() {
  uint parsed;
  try {
    parsed = TextFormat.ParseUInt32(currentToken);
  } catch (FormatException failure) {
    throw CreateIntegerParseException(failure);
  }
  // Only move on once the token has been accepted.
  NextToken();
  return parsed;
}

/// <summary>
/// Reads the current token as a 64-bit signed integer and advances past it.
/// A token that cannot be parsed results in a FormatException that carries
/// the token's position.
/// </summary>
public long ConsumeInt64() {
  long parsed;
  try {
    parsed = TextFormat.ParseInt64(currentToken);
  } catch (FormatException failure) {
    throw CreateIntegerParseException(failure);
  }
  NextToken();
  return parsed;
}

/// <summary>
/// Reads the current token as a 64-bit unsigned integer and advances past
/// it. A token that cannot be parsed results in a FormatException that
/// carries the token's position.
/// </summary>
public ulong ConsumeUInt64() {
  ulong parsed;
  try {
    parsed = TextFormat.ParseUInt64(currentToken);
  } catch (FormatException failure) {
    throw CreateIntegerParseException(failure);
  }
  NextToken();
  return parsed;
}

/// <summary>
/// Reads the current token as a double (parsed with the invariant culture)
/// and advances past it. Both malformed and out-of-range input are reported
/// as a FormatException that carries the token's position.
/// </summary>
public double ConsumeDouble() {
  double parsed;
  try {
    parsed = double.Parse(currentToken, CultureInfo.InvariantCulture);
  } catch (FormatException failure) {
    throw CreateFloatParseException(failure);
  } catch (OverflowException failure) {
    throw CreateFloatParseException(failure);
  }
  NextToken();
  return parsed;
}

/// <summary>
/// Reads the current token as a float (parsed with the invariant culture)
/// and advances past it. Both malformed and out-of-range input are reported
/// as a FormatException that carries the token's position.
/// </summary>
public float consumeFloat() {
  float parsed;
  try {
    parsed = float.Parse(currentToken, CultureInfo.InvariantCulture);
  } catch (FormatException failure) {
    throw CreateFloatParseException(failure);
  } catch (OverflowException failure) {
    throw CreateFloatParseException(failure);
  }
  NextToken();
  return parsed;
}

///
/// If the next token is a Boolean, consume it and return its value.
/// Otherwise, throw a FormatException.
///
public bool ConsumeBoolean() {
  bool parsed;
  switch (currentToken) {
    case "true":
      parsed = true;
      break;
    case "false":
      parsed = false;
      break;
    default:
      throw CreateFormatException("Expected \"true\" or \"false\".");
  }
  NextToken();
  return parsed;
}

/// <summary>
/// Reads the current token as a quoted string and returns its unescaped
/// value, decoded via ByteString.ToStringUtf8. Throws a FormatException if
/// the token is not a valid string.
/// </summary>
public string ConsumeString() {
  ByteString bytes = ConsumeByteString();
  return bytes.ToStringUtf8();
}

/// <summary>
/// Reads the current token as a quoted string, unescapes it into a
/// ByteString and advances past it. Throws a FormatException if the token
/// does not start with a quote, lacks the matching closing quote, or
/// cannot be unescaped.
/// </summary>
public ByteString ConsumeByteString() {
  bool startsWithQuote = currentToken.Length > 0
      && (currentToken[0] == '\"' || currentToken[0] == '\'');
  if (!startsWithQuote) {
    throw CreateFormatException("Expected string.");
  }

  char quote = currentToken[0];
  int lastIndex = currentToken.Length - 1;
  // A lone quote character (lastIndex == 0) has no terminator either.
  if (lastIndex < 1 || currentToken[lastIndex] != quote) {
    throw CreateFormatException("String missing ending quote.");
  }

  ByteString unescaped;
  try {
    // Strip the surrounding quotes before unescaping.
    unescaped = TextFormat.UnescapeBytes(currentToken.Substring(1, lastIndex - 1));
  } catch (FormatException failure) {
    // Re-throw with the token's position prepended to the message.
    throw CreateFormatException(failure.Message);
  }
  NextToken();
  return unescaped;
}

/// <summary>
/// Builds (but does not throw) a FormatException whose message is prefixed
/// with the line and column of the current token.
/// </summary>
public FormatException CreateFormatException(string description) {
  return FormatExceptionAt(line, column, description);
}

/// <summary>
/// Builds (but does not throw) a FormatException whose message is prefixed
/// with the line and column of the previous token.
/// </summary>
public FormatException CreateFormatExceptionPreviousToken(string description) {
  return FormatExceptionAt(previousLine, previousColumn, description);
}

/// <summary>
/// Shared message builder for the two factory methods above. Positions are
/// tracked zero-based internally but reported one-based, since that is what
/// people generally expect from line and column numbers.
/// </summary>
private static FormatException FormatExceptionAt(int zeroBasedLine, int zeroBasedColumn,
    string description) {
  int displayLine = zeroBasedLine + 1;
  int displayColumn = zeroBasedColumn + 1;
  return new FormatException(displayLine + ":" + displayColumn + ": " + description);
}

/// <summary>
/// Wraps a failure from integer parsing in a FormatException positioned at
/// the current token.
/// </summary>
private FormatException CreateIntegerParseException(FormatException e) {
  string description = "Couldn't parse integer: " + e.Message;
  return CreateFormatException(description);
}

/// <summary>
/// Wraps a failure from float or double parsing in a FormatException
/// positioned at the current token.
/// </summary>
private FormatException CreateFloatParseException(Exception e) {
  string description = "Couldn't parse number: " + e.Message;
  return CreateFormatException(description);
}
}
}
-- cgit v1.2.3