Implemented text parsing.

17 years ago · 1e42fdde2e
parent feb9385b04
commit 1e42fdde2e
4 changed files with 536 additions and 34 deletions
--- a/csharp/ProtocolBuffers.Test/TextFormatTest.cs
+++ b/csharp/ProtocolBuffers.Test/TextFormatTest.cs
@ -8,18 +8,6 @@ namespace Google.ProtocolBuffers {
  [TestFixture]
  public class TextFormatTest {

-    /// <summary>
-    /// A basic string with different escapable characters for testing.
-    /// </summary>
-    private const string EscapeTestString = "\"A string with ' characters \n and \r newlines and \t tabs and \001 "
-            + "slashes \\";
-
-    /// <summary>
-    /// A representation of the above string with all the characters escaped.
-    /// </summary>
-    private const string EscapeTestStringEscaped = "\"\\\"A string with \\' characters \\n and \\r newlines "
-            + "and \\t tabs and \\001 slashes \\\\\"";
-
    private static readonly string AllFieldsSetText = TestUtil.ReadTextFromFile("text_format_unittest_data.txt");
    private static readonly string AllExtensionsSetText = TestUtil.ReadTextFromFile("text_format_unittest_extensions_data.txt");

@ -193,7 +181,6 @@ namespace Google.ProtocolBuffers {
    // =================================================================

    [Test]
-    [Ignore("Parsing not implemented")]
    public void Parse() {
      TestAllTypes.Builder builder = TestAllTypes.CreateBuilder();
      TextFormat.Merge(AllFieldsSetText, builder);
@ -201,7 +188,6 @@ namespace Google.ProtocolBuffers {
    }

    [Test]
-    [Ignore("Parsing not implemented")]
    public void ParseReader() {
      TestAllTypes.Builder builder = TestAllTypes.CreateBuilder();
      TextFormat.Merge(new StringReader(AllFieldsSetText), builder);
@ -209,7 +195,6 @@ namespace Google.ProtocolBuffers {
    }

    [Test]
-    [Ignore("Parsing not implemented")]
    public void ParseExtensions() {
      TestAllExtensions.Builder builder = TestAllExtensions.CreateBuilder();
      TextFormat.Merge(AllExtensionsSetText,
@ -219,7 +204,6 @@ namespace Google.ProtocolBuffers {
    }

    [Test]
-    [Ignore("Parsing not implemented")]
    public void ParseExotic() {
      TestAllTypes.Builder builder = TestAllTypes.CreateBuilder();
      TextFormat.Merge(ExoticText, builder);
@ -230,7 +214,6 @@ namespace Google.ProtocolBuffers {
    }

    [Test]
-    [Ignore("Parsing not implemented")]
    public void ParseMessageSet() {
      ExtensionRegistry extensionRegistry = ExtensionRegistry.CreateInstance();
      extensionRegistry.Add(TestMessageSetExtension1.MessageSetExtension);
@ -247,7 +230,6 @@ namespace Google.ProtocolBuffers {
    }

    [Test]
-    [Ignore("Parsing not implemented")]
    public void ParseNumericEnum() {
      TestAllTypes.Builder builder = TestAllTypes.CreateBuilder();
      TextFormat.Merge("optional_nested_enum: 2", builder);
@ -255,7 +237,6 @@ namespace Google.ProtocolBuffers {
    }

    [Test]
-    [Ignore("Parsing not implemented")]
    public void ParseAngleBrackets() {
      TestAllTypes.Builder builder = TestAllTypes.CreateBuilder();
      TextFormat.Merge("OptionalGroup: < a: 1 >", builder);
@ -274,7 +255,6 @@ namespace Google.ProtocolBuffers {
    }

    [Test]
-    [Ignore("Parsing not implemented")]
    public void ParseErrors() {
      AssertParseError(
        "1:16: Expected \":\".",
@ -296,17 +276,17 @@ namespace Google.ProtocolBuffers {
        "1:18: Expected string.",
        "optional_string: 123");
      AssertParseError(
-        "1:18: string missing ending quote.",
+        "1:18: String missing ending quote.",
        "optional_string: \"ueoauaoe");
      AssertParseError(
-        "1:18: string missing ending quote.",
+        "1:18: String missing ending quote.",
        "optional_string: \"ueoauaoe\n" +
        "optional_int32: 123");
      AssertParseError(
        "1:18: Invalid escape sequence: '\\z'",
        "optional_string: \"\\z\"");
      AssertParseError(
-        "1:18: string missing ending quote.",
+        "1:18: String missing ending quote.",
        "optional_string: \"ueoauaoe\n" +
        "optional_int32: 123");
      AssertParseError(
--- a/csharp/ProtocolBuffers/ProtocolBuffers.csproj
+++ b/csharp/ProtocolBuffers/ProtocolBuffers.csproj
@ -97,6 +97,7 @@
    <Compile Include="RpcUtil.cs" />
    <Compile Include="TextFormat.cs" />
    <Compile Include="TextGenerator.cs" />
+    <Compile Include="TextTokenizer.cs" />
    <Compile Include="UninitializedMessageException.cs" />
    <Compile Include="UnknownField.cs" />
    <Compile Include="UnknownFieldSet.cs" />
--- a/csharp/ProtocolBuffers/TextFormat.cs
+++ b/csharp/ProtocolBuffers/TextFormat.cs
@ -1,5 +1,6 @@
 using System;
 using System.Collections.Generic;
+using System.Globalization;
 using System.IO;
 using System.Text;
 using Google.ProtocolBuffers.Descriptors;
@ -116,9 +117,9 @@ namespace Google.ProtocolBuffers {
        case FieldType.UInt64:
        case FieldType.Fixed32:
        case FieldType.Fixed64:
-          // Good old ToString() does what we want for these types. (Including the
-          // unsigned ones, unlike with Java.)
-          generator.Print(value.ToString());
+          // The simple Object.ToString converts using the current culture.
+          // We want to always use the invariant culture so it's predictable.
+          generator.Print(((IConvertible) value).ToString(CultureInfo.InvariantCulture));
          break;
        case FieldType.Bool:
          // Explicitly use the Java true/false
@ -237,13 +238,15 @@ namespace Google.ProtocolBuffers {
        result = radix == 10 ? ulong.Parse(text) : Convert.ToUInt64(text, radix);
      } catch (OverflowException) {
        // Convert OverflowException to FormatException so there's a single exception type this method can throw.
-        throw new FormatException("Number of out range: " + original);
+        string numberDescription = string.Format("{0}-bit {1}signed integer", isLong ? 64 : 32, isSigned ? "" : "un");
+        throw new FormatException("Number out of range for " + numberDescription + ": " + original);
      }

      if (negative) {
        ulong max = isLong ? 0x8000000000000000UL : 0x80000000L;
        if (result > max) {
-          throw new FormatException("Number of out range: " + original);
+          string numberDescription = string.Format("{0}-bit signed integer", isLong ? 64 : 32);
+          throw new FormatException("Number out of range for " + numberDescription + ": " + original);
        }
        return -((long) result);
      } else {
@ -251,7 +254,8 @@ namespace Google.ProtocolBuffers {
            ? (isLong ? (ulong) long.MaxValue : int.MaxValue)
            : (isLong ? ulong.MaxValue : uint.MaxValue);
        if (result > max) {
-          throw new FormatException("Number of out range: " + original);
+          string numberDescription = string.Format("{0}-bit {1}signed integer", isLong ? 64 : 32, isSigned ? "" : "un");
+          throw new FormatException("Number out of range for " + numberDescription + ": " + original);
        }
        return (long) result;
      }
@ -418,19 +422,195 @@ namespace Google.ProtocolBuffers {
    }

    public static void Merge(string text, IBuilder builder) {
-      throw new NotImplementedException();
+      Merge(text, ExtensionRegistry.Empty, builder);
    }

    public static void Merge(TextReader reader, IBuilder builder) {
-      throw new NotImplementedException();
+      Merge(reader, ExtensionRegistry.Empty, builder);
+    }
+
+    public static void Merge(TextReader reader, ExtensionRegistry registry, IBuilder builder) {
+      Merge(reader.ReadToEnd(), registry, builder);
    }

    public static void Merge(string text, ExtensionRegistry registry, IBuilder builder) {
-      throw new NotImplementedException();
+      TextTokenizer tokenizer = new TextTokenizer(text);
+
+      while (!tokenizer.AtEnd) {
+        MergeField(tokenizer, registry, builder);
+      }
    }

-    public static void Merge(TextReader reader, ExtensionRegistry registry, IBuilder builder) {
-      throw new NotImplementedException();
+    /// <summary>
+    /// Parses a single field from the specified tokenizer and merges it into
+    /// the builder.
+    /// </summary>
+    /// <remarks>
+    /// The field is named either by an identifier, which is looked up in the
+    /// builder's message descriptor, or by a dotted extension name enclosed in
+    /// square brackets, which is looked up in the extension registry.
+    /// Message-typed fields take an optional ":" followed by a body delimited by
+    /// "&lt;" ... "&gt;" or "{" ... "}", which is parsed by calling this method
+    /// recursively on a sub-builder. All other fields require ":" followed by a
+    /// single value. The parsed value is appended when the field is repeated and
+    /// set otherwise.
+    /// </remarks>
+    /// <param name="tokenizer">Token stream positioned at the start of a field.</param>
+    /// <param name="extensionRegistry">Registry used to resolve bracketed extension names.</param>
+    /// <param name="builder">Builder that receives the parsed field value.</param>
+    /// <exception cref="FormatException">
+    /// The field or extension name cannot be resolved, an enum value is unknown,
+    /// or the tokenizer rejects the input.
+    /// </exception>
+    private static void MergeField(TextTokenizer tokenizer, ExtensionRegistry extensionRegistry,
+        IBuilder builder) {
+
+      FieldDescriptor field;
+      MessageDescriptor type = builder.DescriptorForType;
+      ExtensionInfo extension = null;
+
+      if (tokenizer.TryConsume("[")) {
+        // An extension.
+        // The name may be qualified, so gather identifiers separated by ".".
+        StringBuilder name = new StringBuilder(tokenizer.ConsumeIdentifier());
+        while (tokenizer.TryConsume(".")) {
+          name.Append(".");
+          name.Append(tokenizer.ConsumeIdentifier());
+        }
+
+        extension = extensionRegistry[name.ToString()];
+
+        if (extension == null) {
+          throw tokenizer.CreateFormatExceptionPreviousToken("Extension \"" + name + "\" not found in the ExtensionRegistry.");
+        } else if (extension.Descriptor.ContainingType != type) {
+          throw tokenizer.CreateFormatExceptionPreviousToken("Extension \"" + name + "\" does not extend message type \"" +
+            type.FullName + "\".");
+        }
+
+        tokenizer.Consume("]");
+
+        field = extension.Descriptor;
+      } else {
+        String name = tokenizer.ConsumeIdentifier();
+        field = type.FindDescriptor<FieldDescriptor>(name);
+
+        // Group names are expected to be capitalized as they appear in the
+        // .proto file, which actually matches their type names, not their field
+        // names.
+        if (field == null) {
+          // Explicitly specify the invariant culture so that this code does not break when
+          // executing in Turkey.
+          String lowerName = name.ToLowerInvariant();
+          field = type.FindDescriptor<FieldDescriptor>(lowerName);
+          // If the case-insensitive match worked but the field is NOT a group,
+          // discard it: only group fields may be found through the lower-cased name.
+          if (field != null && field.FieldType != FieldType.Group) {
+            field = null;
+          }
+        }
+        // Again, special-case group names as described above.
+        // A group only matches when the identifier equals its message type name exactly.
+        if (field != null && field.FieldType == FieldType.Group && field.MessageType.Name != name) {
+          field = null;
+        }
+
+        if (field == null) {
+          throw tokenizer.CreateFormatExceptionPreviousToken(
+              "Message type \"" + type.FullName + "\" has no field named \"" + name + "\".");
+        }
+      }
+
+      object value = null;
+
+      if (field.MappedType == MappedType.Message) {
+        tokenizer.TryConsume(":");  // optional
+
+        // The body may be delimited by angle brackets or by braces; remember
+        // which closing token to look for.
+        String endToken;
+        if (tokenizer.TryConsume("<")) {
+          endToken = ">";
+        } else {
+          tokenizer.Consume("{");
+          endToken = "}";
+        }
+
+        // Extensions take their sub-builder from the extension's default
+        // instance; regular fields ask the parent builder for one.
+        IBuilder subBuilder;
+        if (extension == null) {
+          subBuilder = builder.CreateBuilderForField(field);
+        } else {
+          subBuilder = extension.DefaultInstance.WeakCreateBuilderForType();
+        }
+
+        // Parse nested fields until the closing token; running out of input
+        // first is an error.
+        while (!tokenizer.TryConsume(endToken)) {
+          if (tokenizer.AtEnd) {
+            throw tokenizer.CreateFormatException("Expected \"" + endToken + "\".");
+          }
+          MergeField(tokenizer, extensionRegistry, subBuilder);
+        }
+
+        value = subBuilder.WeakBuild();
+
+      } else {
+        tokenizer.Consume(":");
+
+        switch (field.FieldType) {
+          case FieldType.Int32:
+          case FieldType.SInt32:
+          case FieldType.SFixed32:
+            value = tokenizer.ConsumeInt32();
+            break;
+
+          case FieldType.Int64:
+          case FieldType.SInt64:
+          case FieldType.SFixed64:
+            value = tokenizer.ConsumeInt64();
+            break;
+
+          case FieldType.UInt32:
+          case FieldType.Fixed32:
+            value = tokenizer.ConsumeUInt32();
+            break;
+
+          case FieldType.UInt64:
+          case FieldType.Fixed64:
+            value = tokenizer.ConsumeUInt64();
+            break;
+
+          case FieldType.Float:
+            value = tokenizer.consumeFloat();
+            break;
+
+          case FieldType.Double:
+            value = tokenizer.ConsumeDouble();
+            break;
+
+          case FieldType.Bool:
+            value = tokenizer.ConsumeBoolean();
+            break;
+
+          case FieldType.String:
+            value = tokenizer.ConsumeString();
+            break;
+
+          case FieldType.Bytes:
+            value = tokenizer.ConsumeByteString();
+            break;
+
+          case FieldType.Enum: {
+            EnumDescriptor enumType = field.EnumType;
+
+            // Enum values may be written either as their number or as their name.
+            if (tokenizer.LookingAtInteger()) {
+              int number = tokenizer.ConsumeInt32();
+              value = enumType.FindValueByNumber(number);
+              if (value == null) {
+                throw tokenizer.CreateFormatExceptionPreviousToken(
+                  "Enum type \"" + enumType.FullName +
+                  "\" has no value with number " + number + ".");
+              }
+            } else {
+              String id = tokenizer.ConsumeIdentifier();
+              value = enumType.FindValueByName(id);
+              if (value == null) {
+                throw tokenizer.CreateFormatExceptionPreviousToken(
+                  "Enum type \"" + enumType.FullName +
+                  "\" has no value named \"" + id + "\".");
+              }
+            }
+
+            break;
+          }
+
+          // Message-typed fields were handled by the MappedType.Message branch above.
+          case FieldType.Message:
+          case FieldType.Group:
+            throw new InvalidOperationException("Can't get here.");
+        }
+      }
+
+      if (field.IsRepeated) {
+        builder.WeakAddRepeatedField(field, value);
+      } else {
+        builder.SetField(field, value);
+      }
     }
  }
 }
--- a/csharp/ProtocolBuffers/TextTokenizer.cs
+++ b/csharp/ProtocolBuffers/TextTokenizer.cs
@ -0,0 +1,341 @@
+using System;
+using System.Globalization;
+using System.Text.RegularExpressions;
+
+namespace Google.ProtocolBuffers {
+  /// <summary>
+  /// Represents a stream of tokens parsed from a string.
+  /// </summary>
+  internal sealed class TextTokenizer {
+    /// <summary>
+    /// The complete input being tokenized.
+    /// </summary>
+    private readonly string text;
+
+    /// <summary>
+    /// The token most recently read by NextToken; the empty string once the
+    /// end of the input has been reached.
+    /// </summary>
+    private string currentToken;
+
+    /// <summary>
+    /// The character index within the text to perform the next regex match at.
+    /// </summary>
+    private int matchPos = 0;
+
+    /// <summary>
+    /// The character index within the text at which the current token begins.
+    /// </summary>
+    private int pos = 0;
+
+    /// <summary>
+    /// The zero-based line number of the current token. (Error messages add one.)
+    /// </summary>
+    private int line = 0;
+    /// <summary>
+    /// The zero-based column number of the current token. (Error messages add one.)
+    /// </summary>
+    private int column = 0;
+
+    /// <summary>
+    /// The zero-based line number of the previous token.
+    /// </summary>
+    private int previousLine = 0;
+    /// <summary>
+    /// The zero-based column number of the previous token.
+    /// </summary>
+    private int previousColumn = 0;
+
+    // Every alternative is anchored with \G so that a match can only start
+    // exactly at the index passed to Regex.Match, never further along.
+
+    /// <summary>
+    /// Matches a run of whitespace and '#' comments. A comment extends up to,
+    /// but does not include, the end of its line; the newline itself is consumed
+    /// as whitespace. This lets a comment on the last line be skipped even when
+    /// the input does not end with a newline.
+    /// </summary>
+    private static readonly Regex WhitespaceAndCommentPattern = new Regex("\\G(\\s|(#[^\\n]*))+", RegexOptions.Compiled);
+
+    /// <summary>
+    /// Matches one identifier, number or quoted string. A string body excludes
+    /// its own quote character, newlines and lone backslashes, so a string
+    /// ends at the first unescaped matching quote.
+    /// </summary>
+    private static readonly Regex TokenPattern = new Regex(
+      "\\G[a-zA-Z_][0-9a-zA-Z_+-]*|" +              // an identifier
+      "\\G[0-9+-][0-9a-zA-Z_.+-]*|" +                  // a number
+      "\\G\"([^\"\\\n\\\\]|\\\\[^\\\n])*(\"|\\\\?$)|" +    // a double-quoted string
+      "\\G\'([^\'\\\n\\\\]|\\\\[^\\\n])*(\'|\\\\?$)",      // a single-quoted string
+      RegexOptions.Compiled);
+
+    /// <summary>
+    /// Constructs a tokenizer that parses tokens from the given text.
+    /// </summary>
+    /// <param name="text">The text to split into tokens.</param>
+    public TextTokenizer(string text) {
+      this.text = text;
+      // Skip any leading whitespace and comments, then load the first token so
+      // that AtEnd and the Consume methods can be used straight away.
+      SkipWhitespace();
+      NextToken();
+    }
+
+    /// <summary>
+    /// True once every token has been consumed, i.e. the current token is the
+    /// empty string that NextToken stores at the end of the text.
+    /// </summary>
+    public bool AtEnd {
+      get {
+        bool noTokenLeft = currentToken.Length == 0;
+        return noTokenLeft;
+      }
+    }
+
+    /// <summary>
+    /// Advances to the next token.
+    /// </summary>
+    /// <remarks>
+    /// Saves the position of the token being left as the "previous" position,
+    /// moves pos forward to matchPos while keeping line and column up to date,
+    /// and then reads the next token starting at matchPos. When matchPos is at
+    /// the end of the text the current token becomes the empty string.
+    /// </remarks>
+    public void NextToken() {
+      // Remember where the token being left started; this is what
+      // CreateFormatExceptionPreviousToken reports.
+      previousLine = line;
+      previousColumn = column;
+
+      // Advance the line counter to the current position.
+      while (pos < matchPos) {
+        if (text[pos] == '\n') {
+          ++line;
+          column = 0;
+        } else {
+          ++column;
+        }
+        ++pos;
+      }
+
+      // Match the next token.
+      if (matchPos == text.Length) {
+        // EOF
+        currentToken = "";
+      } else {
+        Match match = TokenPattern.Match(text, matchPos);
+        if (match.Success) {
+          currentToken = match.Value;
+          matchPos += match.Length;
+        } else {
+          // Take one character.
+          // Nothing in TokenPattern matched here, so the single character at
+          // matchPos becomes a token of its own.
+          currentToken = text[matchPos].ToString();
+          matchPos++;
+        }
+
+        // Leave matchPos at the start of the following token.
+        SkipWhitespace();
+      }
+    }
+
+    /// <summary>
+    /// Moves matchPos past any whitespace and comments found at its current
+    /// position, so that it points at the start of the next token.
+    /// </summary>
+    private void SkipWhitespace() {
+      Match skipped = WhitespaceAndCommentPattern.Match(text, matchPos);
+      if (!skipped.Success) {
+        // Nothing to skip; matchPos is already at a token (or at the end).
+        return;
+      }
+      matchPos += skipped.Length;
+    }
+
+    /// <summary>
+    /// Consumes the current token if it is exactly equal to the given one.
+    /// </summary>
+    /// <param name="token">The token text to look for.</param>
+    /// <returns>
+    /// True if the token matched and was consumed; false if it did not match,
+    /// in which case the tokenizer is left untouched.
+    /// </returns>
+    public bool TryConsume(string token) {
+      if (currentToken != token) {
+        return false;
+      }
+      NextToken();
+      return true;
+    }
+
+    /// <summary>
+    /// Consumes the current token, which must be exactly equal to the given one.
+    /// </summary>
+    /// <param name="token">The token text that is required at this point.</param>
+    /// <exception cref="FormatException">The current token is different.</exception>
+    public void Consume(string token) {
+      bool consumed = TryConsume(token);
+      if (consumed) {
+        return;
+      }
+      throw CreateFormatException("Expected \"" + token + "\".");
+    }
+
+    /// <summary>
+    /// Reports whether the current token starts like an integer, that is with
+    /// an ASCII digit or a '+' or '-' sign. The token is not consumed.
+    /// </summary>
+    public bool LookingAtInteger() {
+      if (currentToken.Length > 0) {
+        char first = currentToken[0];
+        bool isAsciiDigit = first >= '0' && first <= '9';
+        return isAsciiDigit || first == '-' || first == '+';
+      }
+      // An empty token (end of input) is never an integer.
+      return false;
+    }
+
+    /// <summary>
+    /// Consumes the current token and returns it, provided that every character
+    /// in it is an ASCII letter, an ASCII digit, '_' or '.'.
+    /// </summary>
+    /// <returns>The text of the consumed token.</returns>
+    /// <exception cref="FormatException">The token contains any other character.</exception>
+    public string ConsumeIdentifier() {
+      for (int i = 0; i < currentToken.Length; i++) {
+        char ch = currentToken[i];
+        bool allowed = (ch >= 'a' && ch <= 'z')
+            || (ch >= 'A' && ch <= 'Z')
+            || (ch >= '0' && ch <= '9')
+            || ch == '_'
+            || ch == '.';
+        if (!allowed) {
+          throw CreateFormatException("Expected identifier.");
+        }
+      }
+
+      string identifier = currentToken;
+      NextToken();
+      return identifier;
+    }
+
+    /// <summary>
+    /// Parses the current token as a 32-bit signed integer and consumes it.
+    /// </summary>
+    /// <returns>The parsed value.</returns>
+    /// <exception cref="FormatException">The token is not a valid 32-bit signed integer.</exception>
+    public int ConsumeInt32()  {
+      int parsed;
+      try {
+        parsed = TextFormat.ParseInt32(currentToken);
+      } catch (FormatException parseFailure) {
+        // Re-throw with the token's line and column in the message.
+        throw CreateIntegerParseException(parseFailure);
+      }
+      NextToken();
+      return parsed;
+    }
+
+    /// <summary>
+    /// Parses the current token as a 32-bit unsigned integer and consumes it.
+    /// </summary>
+    /// <returns>The parsed value.</returns>
+    /// <exception cref="FormatException">The token is not a valid 32-bit unsigned integer.</exception>
+    public uint ConsumeUInt32() {
+      uint parsed;
+      try {
+        parsed = TextFormat.ParseUInt32(currentToken);
+      } catch (FormatException parseFailure) {
+        // Re-throw with the token's line and column in the message.
+        throw CreateIntegerParseException(parseFailure);
+      }
+      NextToken();
+      return parsed;
+    }
+
+    /// <summary>
+    /// Parses the current token as a 64-bit signed integer and consumes it.
+    /// </summary>
+    /// <returns>The parsed value.</returns>
+    /// <exception cref="FormatException">The token is not a valid 64-bit signed integer.</exception>
+    public long ConsumeInt64() {
+      long parsed;
+      try {
+        parsed = TextFormat.ParseInt64(currentToken);
+      } catch (FormatException parseFailure) {
+        // Re-throw with the token's line and column in the message.
+        throw CreateIntegerParseException(parseFailure);
+      }
+      NextToken();
+      return parsed;
+    }
+
+    /// <summary>
+    /// Parses the current token as a 64-bit unsigned integer and consumes it.
+    /// </summary>
+    /// <returns>The parsed value.</returns>
+    /// <exception cref="FormatException">The token is not a valid 64-bit unsigned integer.</exception>
+    public ulong ConsumeUInt64() {
+      ulong parsed;
+      try {
+        parsed = TextFormat.ParseUInt64(currentToken);
+      } catch (FormatException parseFailure) {
+        // Re-throw with the token's line and column in the message.
+        throw CreateIntegerParseException(parseFailure);
+      }
+      NextToken();
+      return parsed;
+    }
+
+    /// <summary>
+    /// Parses the current token as a double using the invariant culture and
+    /// consumes it.
+    /// </summary>
+    /// <returns>The parsed value.</returns>
+    /// <exception cref="FormatException">The token cannot be parsed as a double.</exception>
+    public double ConsumeDouble() {
+      double parsed;
+      try {
+        parsed = double.Parse(currentToken, CultureInfo.InvariantCulture);
+      } catch (FormatException badFormat) {
+        throw CreateFloatParseException(badFormat);
+      } catch (OverflowException outOfRange) {
+        // Reported to callers as a FormatException as well.
+        throw CreateFloatParseException(outOfRange);
+      }
+      NextToken();
+      return parsed;
+    }
+
+    /// <summary>
+    /// Parses the current token as a float using the invariant culture and
+    /// consumes it.
+    /// </summary>
+    /// <returns>The parsed value.</returns>
+    /// <exception cref="FormatException">The token cannot be parsed as a float.</exception>
+    public float consumeFloat() {
+      float parsed;
+      try {
+        parsed = float.Parse(currentToken, CultureInfo.InvariantCulture);
+      } catch (FormatException badFormat) {
+        throw CreateFloatParseException(badFormat);
+      } catch (OverflowException outOfRange) {
+        // Reported to callers as a FormatException as well.
+        throw CreateFloatParseException(outOfRange);
+      }
+      NextToken();
+      return parsed;
+    }
+
+    /// <summary>
+    /// Consumes the current token, which must be exactly "true" or "false",
+    /// and returns the corresponding Boolean value.
+    /// </summary>
+    /// <exception cref="FormatException">The token is neither "true" nor "false".</exception>
+    public bool ConsumeBoolean() {
+      switch (currentToken) {
+        case "true":
+          NextToken();
+          return true;
+        case "false":
+          NextToken();
+          return false;
+        default:
+          throw CreateFormatException("Expected \"true\" or \"false\".");
+      }
+    }
+
+    /// <summary>
+    /// Consumes a quoted string token and returns its unescaped bytes decoded
+    /// as UTF-8 text.
+    /// </summary>
+    /// <exception cref="FormatException">The current token is not a well-formed string.</exception>
+    public string ConsumeString() {
+      ByteString unescapedBytes = ConsumeByteString();
+      return unescapedBytes.ToStringUtf8();
+    }
+
+    /// <summary>
+    /// Consumes a quoted string token, strips the surrounding quotes and
+    /// returns the unescaped contents as a ByteString.
+    /// </summary>
+    /// <exception cref="FormatException">
+    /// The current token does not start with a single or double quote, does not
+    /// end with the same quote, or contains an escape that cannot be unescaped.
+    /// </exception>
+    public ByteString ConsumeByteString() {
+      char quote = '\0';
+      if (currentToken.Length > 0) {
+        quote = currentToken[0];
+      }
+      bool startsWithQuote = quote == '\"' || quote == '\'';
+      if (!startsWithQuote) {
+        throw CreateFormatException("Expected string.");
+      }
+
+      // A lone quote character is an opening quote with nothing closing it.
+      int lastIndex = currentToken.Length - 1;
+      bool terminated = currentToken.Length >= 2 && currentToken[lastIndex] == quote;
+      if (!terminated) {
+        throw CreateFormatException("String missing ending quote.");
+      }
+
+      ByteString unescaped;
+      try {
+        string escaped = currentToken.Substring(1, lastIndex - 1);
+        unescaped = TextFormat.UnescapeBytes(escaped);
+      } catch (FormatException e) {
+        // Re-throw with the token's line and column in the message.
+        throw CreateFormatException(e.Message);
+      }
+      NextToken();
+      return unescaped;
+    }
+
+    /// <summary>
+    /// Builds (but does not throw) a FormatException whose message starts with
+    /// the one-based line and column of the current token.
+    /// </summary>
+    /// <param name="description">The text to place after the position.</param>
+    public FormatException CreateFormatException(string description) {
+      // The counters are zero-based; people generally prefer one-based numbers.
+      int displayLine = line + 1;
+      int displayColumn = column + 1;
+      string message = displayLine + ":" + displayColumn + ": " + description;
+      return new FormatException(message);
+    }
+
+    /// <summary>
+    /// Builds (but does not throw) a FormatException whose message starts with
+    /// the one-based line and column of the previous token.
+    /// </summary>
+    /// <param name="description">The text to place after the position.</param>
+    public FormatException CreateFormatExceptionPreviousToken(string description) {
+      // The counters are zero-based; people generally prefer one-based numbers.
+      int displayLine = previousLine + 1;
+      int displayColumn = previousColumn + 1;
+      string message = displayLine + ":" + displayColumn + ": " + description;
+      return new FormatException(message);
+    }
+
+    /// <summary>
+    /// Wraps the message of an integer parsing failure in a FormatException
+    /// that carries the position of the current token.
+    /// </summary>
+    /// <param name="e">The exception raised while parsing the integer.</param>
+    private FormatException CreateIntegerParseException(FormatException e) {
+      string description = "Couldn't parse integer: " + e.Message;
+      return CreateFormatException(description);
+    }
+
+    /// <summary>
+    /// Wraps the message of a float or double parsing failure in a
+    /// FormatException that carries the position of the current token.
+    /// </summary>
+    /// <param name="e">The exception raised while parsing the number.</param>
+    private FormatException CreateFloatParseException(Exception e) {
+      string description = "Couldn't parse number: " + e.Message;
+      return CreateFormatException(description);
+    }
+  }
+}