Implemented text parsing.

pull/288/head
Jon Skeet 17 years ago
parent feb9385b04
commit 1e42fdde2e
  1. 26
      csharp/ProtocolBuffers.Test/TextFormatTest.cs
  2. 1
      csharp/ProtocolBuffers/ProtocolBuffers.csproj
  3. 202
      csharp/ProtocolBuffers/TextFormat.cs
  4. 341
      csharp/ProtocolBuffers/TextTokenizer.cs

@ -8,18 +8,6 @@ namespace Google.ProtocolBuffers {
[TestFixture]
public class TextFormatTest {
/// <summary>
/// A basic string with different escapable characters for testing.
/// </summary>
private const string EscapeTestString = "\"A string with ' characters \n and \r newlines and \t tabs and \001 "
+ "slashes \\";
/// <summary>
/// A representation of the above string with all the characters escaped.
/// </summary>
private const string EscapeTestStringEscaped = "\"\\\"A string with \\' characters \\n and \\r newlines "
+ "and \\t tabs and \\001 slashes \\\\\"";
private static readonly string AllFieldsSetText = TestUtil.ReadTextFromFile("text_format_unittest_data.txt");
private static readonly string AllExtensionsSetText = TestUtil.ReadTextFromFile("text_format_unittest_extensions_data.txt");
@ -193,7 +181,6 @@ namespace Google.ProtocolBuffers {
// =================================================================
[Test]
[Ignore("Parsing not implemented")]
public void Parse() {
TestAllTypes.Builder builder = TestAllTypes.CreateBuilder();
TextFormat.Merge(AllFieldsSetText, builder);
@ -201,7 +188,6 @@ namespace Google.ProtocolBuffers {
}
[Test]
[Ignore("Parsing not implemented")]
public void ParseReader() {
TestAllTypes.Builder builder = TestAllTypes.CreateBuilder();
TextFormat.Merge(new StringReader(AllFieldsSetText), builder);
@ -209,7 +195,6 @@ namespace Google.ProtocolBuffers {
}
[Test]
[Ignore("Parsing not implemented")]
public void ParseExtensions() {
TestAllExtensions.Builder builder = TestAllExtensions.CreateBuilder();
TextFormat.Merge(AllExtensionsSetText,
@ -219,7 +204,6 @@ namespace Google.ProtocolBuffers {
}
[Test]
[Ignore("Parsing not implemented")]
public void ParseExotic() {
TestAllTypes.Builder builder = TestAllTypes.CreateBuilder();
TextFormat.Merge(ExoticText, builder);
@ -230,7 +214,6 @@ namespace Google.ProtocolBuffers {
}
[Test]
[Ignore("Parsing not implemented")]
public void ParseMessageSet() {
ExtensionRegistry extensionRegistry = ExtensionRegistry.CreateInstance();
extensionRegistry.Add(TestMessageSetExtension1.MessageSetExtension);
@ -247,7 +230,6 @@ namespace Google.ProtocolBuffers {
}
[Test]
[Ignore("Parsing not implemented")]
public void ParseNumericEnum() {
TestAllTypes.Builder builder = TestAllTypes.CreateBuilder();
TextFormat.Merge("optional_nested_enum: 2", builder);
@ -255,7 +237,6 @@ namespace Google.ProtocolBuffers {
}
[Test]
[Ignore("Parsing not implemented")]
public void ParseAngleBrackets() {
TestAllTypes.Builder builder = TestAllTypes.CreateBuilder();
TextFormat.Merge("OptionalGroup: < a: 1 >", builder);
@ -274,7 +255,6 @@ namespace Google.ProtocolBuffers {
}
[Test]
[Ignore("Parsing not implemented")]
public void ParseErrors() {
AssertParseError(
"1:16: Expected \":\".",
@ -296,17 +276,17 @@ namespace Google.ProtocolBuffers {
"1:18: Expected string.",
"optional_string: 123");
AssertParseError(
"1:18: string missing ending quote.",
"1:18: String missing ending quote.",
"optional_string: \"ueoauaoe");
AssertParseError(
"1:18: string missing ending quote.",
"1:18: String missing ending quote.",
"optional_string: \"ueoauaoe\n" +
"optional_int32: 123");
AssertParseError(
"1:18: Invalid escape sequence: '\\z'",
"optional_string: \"\\z\"");
AssertParseError(
"1:18: string missing ending quote.",
"1:18: String missing ending quote.",
"optional_string: \"ueoauaoe\n" +
"optional_int32: 123");
AssertParseError(

@ -97,6 +97,7 @@
<Compile Include="RpcUtil.cs" />
<Compile Include="TextFormat.cs" />
<Compile Include="TextGenerator.cs" />
<Compile Include="TextTokenizer.cs" />
<Compile Include="UninitializedMessageException.cs" />
<Compile Include="UnknownField.cs" />
<Compile Include="UnknownFieldSet.cs" />

@ -1,5 +1,6 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
using Google.ProtocolBuffers.Descriptors;
@ -116,9 +117,9 @@ namespace Google.ProtocolBuffers {
case FieldType.UInt64:
case FieldType.Fixed32:
case FieldType.Fixed64:
// Good old ToString() does what we want for these types. (Including the
// unsigned ones, unlike with Java.)
generator.Print(value.ToString());
// The simple Object.ToString converts using the current culture.
// We want to always use the invariant culture so it's predictable.
generator.Print(((IConvertible) value).ToString(CultureInfo.InvariantCulture));
break;
case FieldType.Bool:
// Explicitly use the Java true/false
@ -237,13 +238,15 @@ namespace Google.ProtocolBuffers {
result = radix == 10 ? ulong.Parse(text) : Convert.ToUInt64(text, radix);
} catch (OverflowException) {
// Convert OverflowException to FormatException so there's a single exception type this method can throw.
throw new FormatException("Number of out range: " + original);
string numberDescription = string.Format("{0}-bit {1}signed integer", isLong ? 64 : 32, isSigned ? "" : "un");
throw new FormatException("Number out of range for " + numberDescription + ": " + original);
}
if (negative) {
ulong max = isLong ? 0x8000000000000000UL : 0x80000000L;
if (result > max) {
throw new FormatException("Number of out range: " + original);
string numberDescription = string.Format("{0}-bit signed integer", isLong ? 64 : 32);
throw new FormatException("Number out of range for " + numberDescription + ": " + original);
}
return -((long) result);
} else {
@ -251,7 +254,8 @@ namespace Google.ProtocolBuffers {
? (isLong ? (ulong) long.MaxValue : int.MaxValue)
: (isLong ? ulong.MaxValue : uint.MaxValue);
if (result > max) {
throw new FormatException("Number of out range: " + original);
string numberDescription = string.Format("{0}-bit {1}signed integer", isLong ? 64 : 32, isSigned ? "" : "un");
throw new FormatException("Number out of range for " + numberDescription + ": " + original);
}
return (long) result;
}
@ -418,19 +422,195 @@ namespace Google.ProtocolBuffers {
}
public static void Merge(string text, IBuilder builder) {
throw new NotImplementedException();
Merge(text, ExtensionRegistry.Empty, builder);
}
public static void Merge(TextReader reader, IBuilder builder) {
throw new NotImplementedException();
Merge(reader, ExtensionRegistry.Empty, builder);
}
/// <summary>
/// Reads everything remaining in <paramref name="reader"/> and merges it,
/// as text-format input, into <paramref name="builder"/> by delegating to
/// the string-based overload.
/// </summary>
/// <param name="reader">Source of the text to parse; it is read to its end.</param>
/// <param name="registry">Registry passed through to the string-based overload.</param>
/// <param name="builder">Builder that receives the parsed fields.</param>
public static void Merge(TextReader reader, ExtensionRegistry registry, IBuilder builder) {
  string text = reader.ReadToEnd();
  Merge(text, registry, builder);
}
public static void Merge(string text, ExtensionRegistry registry, IBuilder builder) {
throw new NotImplementedException();
TextTokenizer tokenizer = new TextTokenizer(text);
while (!tokenizer.AtEnd) {
MergeField(tokenizer, registry, builder);
}
}
public static void Merge(TextReader reader, ExtensionRegistry registry, IBuilder builder) {
throw new NotImplementedException();
/// <summary>
/// Parses a single field from the specified tokenizer and merges it into
/// the builder.
/// </summary>
/// <remarks>
/// The field is either an extension written as <c>[full.name]</c> or a plain
/// field name. Message-typed fields are parsed recursively from a
/// <c>{ ... }</c> or <c>&lt; ... &gt;</c> body; every other field is a
/// <c>name: value</c> pair. Repeated fields are appended to, singular fields
/// are overwritten.
/// </remarks>
/// <param name="tokenizer">Token stream positioned at the start of a field.</param>
/// <param name="extensionRegistry">Registry used to resolve bracketed extension names.</param>
/// <param name="builder">Builder that receives the parsed value.</param>
/// <exception cref="FormatException">The input does not describe a valid field for the
/// builder's message type (as reported by the tokenizer's exception factories).</exception>
private static void MergeField(TextTokenizer tokenizer, ExtensionRegistry extensionRegistry,
    IBuilder builder) {
  MessageDescriptor type = builder.DescriptorForType;

  // Step 1: work out which field the input is talking about.
  ExtensionInfo extension = null;
  FieldDescriptor field;
  if (tokenizer.TryConsume("[")) {
    // An extension.
    extension = ConsumeExtensionReference(tokenizer, extensionRegistry, type);
    field = extension.Descriptor;
  } else {
    field = ConsumeFieldReference(tokenizer, type);
  }

  // Step 2: parse the value that follows.
  object value;
  if (field.MappedType == MappedType.Message) {
    value = ConsumeMessageValue(tokenizer, extensionRegistry, builder, field, extension);
  } else {
    tokenizer.Consume(":");
    value = ConsumeScalarValue(tokenizer, field);
  }

  // Step 3: store it.
  if (field.IsRepeated) {
    builder.WeakAddRepeatedField(field, value);
  } else {
    builder.SetField(field, value);
  }
}

/// <summary>
/// Consumes a dotted extension name and the closing "]" (the opening "[" has
/// already been consumed) and looks the name up in the registry.
/// </summary>
/// <returns>The registered extension; never null.</returns>
private static ExtensionInfo ConsumeExtensionReference(TextTokenizer tokenizer,
    ExtensionRegistry extensionRegistry, MessageDescriptor type) {
  StringBuilder nameBuilder = new StringBuilder(tokenizer.ConsumeIdentifier());
  while (tokenizer.TryConsume(".")) {
    nameBuilder.Append(".");
    nameBuilder.Append(tokenizer.ConsumeIdentifier());
  }
  string name = nameBuilder.ToString();

  ExtensionInfo extension = extensionRegistry[name];
  if (extension == null) {
    throw tokenizer.CreateFormatExceptionPreviousToken(
        "Extension \"" + name + "\" not found in the ExtensionRegistry.");
  }
  if (extension.Descriptor.ContainingType != type) {
    throw tokenizer.CreateFormatExceptionPreviousToken(
        "Extension \"" + name + "\" does not extend message type \"" + type.FullName + "\".");
  }

  // The errors above are deliberately raised before the closing bracket is
  // consumed, so they are reported at the position of the name.
  tokenizer.Consume("]");
  return extension;
}

/// <summary>
/// Consumes a plain field name and resolves it against the message type.
/// </summary>
/// <returns>The matching field descriptor; never null.</returns>
private static FieldDescriptor ConsumeFieldReference(TextTokenizer tokenizer,
    MessageDescriptor type) {
  string name = tokenizer.ConsumeIdentifier();
  FieldDescriptor field = type.FindDescriptor<FieldDescriptor>(name);

  // Group names are expected to be capitalized as they appear in the
  // .proto file, which actually matches their type names, not their field
  // names.
  if (field == null) {
    // Explicitly specify the invariant culture so that this code does not break when
    // executing in Turkey.
    string lowerName = name.ToLowerInvariant();
    field = type.FindDescriptor<FieldDescriptor>(lowerName);
    // If the case-insensitive match worked but the field is NOT a group,
    // ignore it: only groups may be referred to by their capitalized name.
    if (field != null && field.FieldType != FieldType.Group) {
      field = null;
    }
  }

  // Again, special-case group names as described above: a group must be
  // written exactly as its message type name.
  if (field != null && field.FieldType == FieldType.Group && field.MessageType.Name != name) {
    field = null;
  }

  if (field == null) {
    throw tokenizer.CreateFormatExceptionPreviousToken(
        "Message type \"" + type.FullName + "\" has no field named \"" + name + "\".");
  }
  return field;
}

/// <summary>
/// Parses a nested message body delimited by <c>&lt; &gt;</c> or <c>{ }</c>,
/// merging each contained field into a fresh sub-builder, and returns the
/// built message.
/// </summary>
private static object ConsumeMessageValue(TextTokenizer tokenizer,
    ExtensionRegistry extensionRegistry, IBuilder builder, FieldDescriptor field,
    ExtensionInfo extension) {
  tokenizer.TryConsume(":");  // optional

  string endToken;
  if (tokenizer.TryConsume("<")) {
    endToken = ">";
  } else {
    tokenizer.Consume("{");
    endToken = "}";
  }

  // Extensions supply their own default instance; ordinary fields ask the
  // containing builder for a suitable sub-builder.
  IBuilder subBuilder;
  if (extension == null) {
    subBuilder = builder.CreateBuilderForField(field);
  } else {
    subBuilder = extension.DefaultInstance.WeakCreateBuilderForType();
  }

  while (!tokenizer.TryConsume(endToken)) {
    if (tokenizer.AtEnd) {
      throw tokenizer.CreateFormatException("Expected \"" + endToken + "\".");
    }
    MergeField(tokenizer, extensionRegistry, subBuilder);
  }

  return subBuilder.WeakBuild();
}

/// <summary>
/// Parses the value of a non-message field (the ":" has already been
/// consumed), choosing the tokenizer method from the field's declared type.
/// </summary>
/// <returns>The parsed value, or null when the field type matches no case.</returns>
private static object ConsumeScalarValue(TextTokenizer tokenizer, FieldDescriptor field) {
  switch (field.FieldType) {
    case FieldType.Int32:
    case FieldType.SInt32:
    case FieldType.SFixed32:
      return tokenizer.ConsumeInt32();

    case FieldType.Int64:
    case FieldType.SInt64:
    case FieldType.SFixed64:
      return tokenizer.ConsumeInt64();

    case FieldType.UInt32:
    case FieldType.Fixed32:
      return tokenizer.ConsumeUInt32();

    case FieldType.UInt64:
    case FieldType.Fixed64:
      return tokenizer.ConsumeUInt64();

    case FieldType.Float:
      return tokenizer.consumeFloat();

    case FieldType.Double:
      return tokenizer.ConsumeDouble();

    case FieldType.Bool:
      return tokenizer.ConsumeBoolean();

    case FieldType.String:
      return tokenizer.ConsumeString();

    case FieldType.Bytes:
      return tokenizer.ConsumeByteString();

    case FieldType.Enum:
      return ConsumeEnumValue(tokenizer, field.EnumType);

    case FieldType.Message:
    case FieldType.Group:
      // Message-mapped fields are handled by the caller before this point.
      throw new InvalidOperationException("Can't get here.");
  }
  return null;
}

/// <summary>
/// Parses an enum value given either as its number or as its name.
/// </summary>
private static object ConsumeEnumValue(TextTokenizer tokenizer, EnumDescriptor enumType) {
  if (tokenizer.LookingAtInteger()) {
    int number = tokenizer.ConsumeInt32();
    object byNumber = enumType.FindValueByNumber(number);
    if (byNumber == null) {
      throw tokenizer.CreateFormatExceptionPreviousToken(
          "Enum type \"" + enumType.FullName +
          "\" has no value with number " + number + ".");
    }
    return byNumber;
  }

  string id = tokenizer.ConsumeIdentifier();
  object byName = enumType.FindValueByName(id);
  if (byName == null) {
    throw tokenizer.CreateFormatExceptionPreviousToken(
        "Enum type \"" + enumType.FullName +
        "\" has no value named \"" + id + "\".");
  }
  return byName;
}
}
}

@ -0,0 +1,341 @@
using System;
using System.Globalization;
using System.Text.RegularExpressions;
namespace Google.ProtocolBuffers {
/// <summary>
/// Represents a stream of tokens parsed from a string.
/// </summary>
internal sealed class TextTokenizer {
/// <summary>
/// The complete input being tokenized, as passed to the constructor.
/// </summary>
private readonly string text;
/// <summary>
/// The token most recently read by NextToken; the empty string once the
/// end of the input has been reached.
/// </summary>
private string currentToken;
/// <summary>
/// The character index within the text to perform the next regex match at.
/// </summary>
private int matchPos = 0;
/// <summary>
/// The character index within the text at which the current token begins.
/// </summary>
private int pos = 0;
/// <summary>
/// The line number of the current token (zero-based; error messages add one).
/// </summary>
private int line = 0;
/// <summary>
/// The column number of the current token (zero-based; error messages add one).
/// </summary>
private int column = 0;
/// <summary>
/// The line number of the previous token.
/// </summary>
private int previousLine = 0;
/// <summary>
/// The column number of the previous token.
/// </summary>
private int previousColumn = 0;
/// <summary>
/// Matches, at the match start position, a run of whitespace and/or
/// '#' comments. A comment extends to the end of its line; the final line of
/// the input does not need a terminating newline.
/// </summary>
private static readonly Regex WhitespaceAndCommentPattern =
    new Regex("\\G(\\s|(#[^\\n]*(\\n|\\z)))+", RegexOptions.Compiled);
/// <summary>
/// Matches, at the match start position, one identifier, number, or quoted
/// string. A string may be left unterminated (ending at "$", optionally after
/// a lone backslash) so that the caller can report the missing quote.
/// Inside a string, the only characters that need escaping are the
/// backslash and the quote character that opened the string.
/// </summary>
private static readonly Regex TokenPattern = new Regex(
    "\\G[a-zA-Z_][0-9a-zA-Z_+-]*|" +                  // an identifier
    "\\G[0-9+-][0-9a-zA-Z_.+-]*|" +                   // a number
    "\\G\"([^\"\\n\\\\]|\\\\[^\\n])*(\"|\\\\?$)|" +   // a double-quoted string
    "\\G'([^'\\n\\\\]|\\\\[^\\n])*('|\\\\?$)",        // a single-quoted string
    RegexOptions.Compiled);
/// <summary>
/// Constructs a tokenizer that parses tokens from the given text. Leading
/// whitespace and comments are skipped and the first token is read
/// immediately, so the tokenizer is ready for use on return.
/// </summary>
/// <param name="text">The text to tokenize; must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public TextTokenizer(string text) {
  // Fail here, naming the right parameter, rather than inside the first
  // regex match.
  if (text == null) {
    throw new ArgumentNullException("text");
  }
  this.text = text;
  SkipWhitespace();
  NextToken();
}
/// <summary>
/// True once the input is exhausted, i.e. the current token is empty.
/// </summary>
public bool AtEnd {
  get {
    bool noTokenLeft = currentToken.Length == 0;
    return noTokenLeft;
  }
}
/// <summary>
/// Moves on to the next token: records the position of the token being left
/// behind, brings the line/column counters up to the start of the new token,
/// reads it, and skips any whitespace or comments that follow it.
/// At the end of the input the current token becomes the empty string.
/// </summary>
public void NextToken() {
  // The token we are leaving becomes the "previous" token for error reporting.
  previousLine = line;
  previousColumn = column;

  // Walk the characters consumed since the last call, tracking newlines so
  // that line/column describe the start of the upcoming token.
  for (; pos < matchPos; ++pos) {
    if (text[pos] != '\n') {
      ++column;
      continue;
    }
    ++line;
    column = 0;
  }

  // Nothing left to read.
  if (matchPos == text.Length) {
    currentToken = "";
    return;
  }

  Match tokenMatch = TokenPattern.Match(text, matchPos);
  if (tokenMatch.Success) {
    currentToken = tokenMatch.Value;
    matchPos += tokenMatch.Length;
  } else {
    // Not an identifier, number or string: treat the single character as a
    // token of its own (punctuation such as ':' or '{').
    currentToken = text[matchPos].ToString();
    matchPos++;
  }
  SkipWhitespace();
}
/// <summary>
/// Advances matchPos past any whitespace and comments so that it points at
/// the start of the next token (or the end of the input).
/// </summary>
private void SkipWhitespace() {
  Match skipped = WhitespaceAndCommentPattern.Match(text, matchPos);
  if (!skipped.Success) {
    return;
  }
  matchPos += skipped.Length;
}
/// <summary>
/// Consumes the current token if, and only if, it is exactly
/// <paramref name="token"/>.
/// </summary>
/// <param name="token">The exact token text to look for.</param>
/// <returns>True if the token matched and was consumed; false if the
/// tokenizer was left untouched.</returns>
public bool TryConsume(string token) {
  if (currentToken != token) {
    return false;
  }
  NextToken();
  return true;
}
/// <summary>
/// Consumes the current token, which must be exactly
/// <paramref name="token"/>.
/// </summary>
/// <param name="token">The exact token text that is required here.</param>
/// <exception cref="FormatException">The current token is something else; the
/// message names the expected token and the current position.</exception>
public void Consume(string token) {
  if (TryConsume(token)) {
    return;
  }
  throw CreateFormatException("Expected \"" + token + "\".");
}
/// <summary>
/// Reports whether the current token looks like an integer, judged by its
/// first character only (an ASCII digit, '+' or '-'). Nothing is consumed.
/// </summary>
/// <returns>False at the end of the input or when the token starts with any
/// other character.</returns>
public bool LookingAtInteger() {
  if (currentToken.Length == 0) {
    return false;
  }
  char first = currentToken[0];
  if (first == '-' || first == '+') {
    return true;
  }
  return first >= '0' && first <= '9';
}
/// <summary>
/// If the next token is an identifier, consume it and return its value.
/// Otherwise, throw a FormatException.
/// </summary>
/// <remarks>
/// A token is accepted when it is non-empty and consists solely of ASCII
/// letters, ASCII digits, '_' and '.'.
/// </remarks>
/// <returns>The text of the consumed token.</returns>
/// <exception cref="FormatException">The input is exhausted, or the current
/// token contains a character that is not allowed in an identifier.</exception>
public string ConsumeIdentifier() {
  // At the end of the input the current token is empty; without this check
  // the loop below would accept it and hand "" back as an identifier.
  if (currentToken.Length == 0) {
    throw CreateFormatException("Expected identifier.");
  }
  foreach (char c in currentToken) {
    bool allowed = ('a' <= c && c <= 'z') ||
                   ('A' <= c && c <= 'Z') ||
                   ('0' <= c && c <= '9') ||
                   (c == '_') || (c == '.');
    if (!allowed) {
      throw CreateFormatException("Expected identifier.");
    }
  }
  string result = currentToken;
  NextToken();
  return result;
}
/// <summary>
/// Parses the current token with TextFormat.ParseInt32 and, on success,
/// consumes it.
/// </summary>
/// <returns>The parsed 32-bit signed value.</returns>
/// <exception cref="FormatException">The token could not be parsed; the
/// message carries the current position and the parser's own message.</exception>
public int ConsumeInt32() {
  int parsed;
  try {
    parsed = TextFormat.ParseInt32(currentToken);
  } catch (FormatException failure) {
    throw CreateIntegerParseException(failure);
  }
  NextToken();
  return parsed;
}
/// <summary>
/// Parses the current token with TextFormat.ParseUInt32 and, on success,
/// consumes it.
/// </summary>
/// <returns>The parsed 32-bit unsigned value.</returns>
/// <exception cref="FormatException">The token could not be parsed; the
/// message carries the current position and the parser's own message.</exception>
public uint ConsumeUInt32() {
  uint parsed;
  try {
    parsed = TextFormat.ParseUInt32(currentToken);
  } catch (FormatException failure) {
    throw CreateIntegerParseException(failure);
  }
  NextToken();
  return parsed;
}
/// <summary>
/// Parses the current token with TextFormat.ParseInt64 and, on success,
/// consumes it.
/// </summary>
/// <returns>The parsed 64-bit signed value.</returns>
/// <exception cref="FormatException">The token could not be parsed; the
/// message carries the current position and the parser's own message.</exception>
public long ConsumeInt64() {
  long parsed;
  try {
    parsed = TextFormat.ParseInt64(currentToken);
  } catch (FormatException failure) {
    throw CreateIntegerParseException(failure);
  }
  NextToken();
  return parsed;
}
/// <summary>
/// Parses the current token with TextFormat.ParseUInt64 and, on success,
/// consumes it.
/// </summary>
/// <returns>The parsed 64-bit unsigned value.</returns>
/// <exception cref="FormatException">The token could not be parsed; the
/// message carries the current position and the parser's own message.</exception>
public ulong ConsumeUInt64() {
  ulong parsed;
  try {
    parsed = TextFormat.ParseUInt64(currentToken);
  } catch (FormatException failure) {
    throw CreateIntegerParseException(failure);
  }
  NextToken();
  return parsed;
}
/// <summary>
/// Parses the current token as a double using the invariant culture and, on
/// success, consumes it.
/// </summary>
/// <returns>The parsed value.</returns>
/// <exception cref="FormatException">The token is not a valid number or is
/// out of range; the message carries the current position.</exception>
public double ConsumeDouble() {
  double parsed;
  try {
    parsed = double.Parse(currentToken, CultureInfo.InvariantCulture);
  } catch (FormatException malformed) {
    throw CreateFloatParseException(malformed);
  } catch (OverflowException outOfRange) {
    // Reported as a FormatException too, so callers see a single error type.
    throw CreateFloatParseException(outOfRange);
  }
  NextToken();
  return parsed;
}
/// <summary>
/// Parses the current token as a float using the invariant culture and, on
/// success, consumes it.
/// </summary>
/// <returns>The parsed value.</returns>
/// <exception cref="FormatException">The token is not a valid number or is
/// out of range; the message carries the current position.</exception>
public float ConsumeFloat() {
  float parsed;
  try {
    parsed = float.Parse(currentToken, CultureInfo.InvariantCulture);
  } catch (FormatException malformed) {
    throw CreateFloatParseException(malformed);
  } catch (OverflowException outOfRange) {
    // Reported as a FormatException too, so callers see a single error type.
    throw CreateFloatParseException(outOfRange);
  }
  NextToken();
  return parsed;
}
/// <summary>
/// Original camel-case spelling of <see cref="ConsumeFloat"/>, retained so
/// that existing callers keep compiling. Behaves identically.
/// </summary>
public float consumeFloat() {
  return ConsumeFloat();
}
/// <summary>
/// Consumes the current token, which must be the literal <c>true</c> or
/// <c>false</c> (case-sensitive), and returns the corresponding value.
/// </summary>
/// <exception cref="FormatException">The current token is neither literal.</exception>
public bool ConsumeBoolean() {
  bool value;
  switch (currentToken) {
    case "true":
      value = true;
      break;
    case "false":
      value = false;
      break;
    default:
      throw CreateFormatException("Expected \"true\" or \"false\".");
  }
  NextToken();
  return value;
}
/// <summary>
/// Consumes a quoted string token and returns its unescaped contents,
/// obtained by converting the result of ConsumeByteString with ToStringUtf8.
/// </summary>
/// <exception cref="FormatException">Thrown by ConsumeByteString when the
/// current token is not a well-formed string.</exception>
public string ConsumeString() {
  ByteString bytes = ConsumeByteString();
  return bytes.ToStringUtf8();
}
/// <summary>
/// Consumes a quoted string token, strips its quotes, and unescapes the
/// contents via TextFormat.UnescapeBytes.
/// </summary>
/// <returns>The unescaped contents of the string.</returns>
/// <exception cref="FormatException">The current token does not start with a
/// quote, lacks the matching closing quote, or contains an escape sequence
/// that UnescapeBytes rejects.</exception>
public ByteString ConsumeByteString() {
  bool startsWithQuote = currentToken.Length > 0
      && (currentToken[0] == '\"' || currentToken[0] == '\'');
  if (!startsWithQuote) {
    throw CreateFormatException("Expected string.");
  }

  // The closing quote must be a separate character from the opening one and
  // must be of the same kind.
  char delimiter = currentToken[0];
  int lastIndex = currentToken.Length - 1;
  if (lastIndex < 1 || currentToken[lastIndex] != delimiter) {
    throw CreateFormatException("String missing ending quote.");
  }

  ByteString unescaped;
  try {
    string escaped = currentToken.Substring(1, lastIndex - 1);
    unescaped = TextFormat.UnescapeBytes(escaped);
  } catch (FormatException failure) {
    // Re-throw with the current position prepended to the message.
    throw CreateFormatException(failure.Message);
  }
  NextToken();
  return unescaped;
}
/// <summary>
/// Builds (but does not throw) a FormatException whose message is
/// "line:column: description", using the position of the current token.
/// </summary>
/// <param name="description">Text describing what went wrong.</param>
public FormatException CreateFormatException(string description) {
  // Positions are tracked zero-based; people generally prefer one-based
  // line and column numbers in messages.
  int displayLine = line + 1;
  int displayColumn = column + 1;
  return new FormatException(displayLine + ":" + displayColumn + ": " + description);
}
/// <summary>
/// Builds (but does not throw) a FormatException whose message is
/// "line:column: description", using the position of the previous token.
/// Useful when the problem is only detected after a token has been consumed.
/// </summary>
/// <param name="description">Text describing what went wrong.</param>
public FormatException CreateFormatExceptionPreviousToken(string description) {
  // Positions are tracked zero-based; people generally prefer one-based
  // line and column numbers in messages.
  int displayLine = previousLine + 1;
  int displayColumn = previousColumn + 1;
  return new FormatException(displayLine + ":" + displayColumn + ": " + description);
}
/// <summary>
/// Wraps the message of an integer-parsing failure in a positioned
/// FormatException.
/// </summary>
/// <param name="e">The exception raised while parsing the integer.</param>
private FormatException CreateIntegerParseException(FormatException e) {
  string description = "Couldn't parse integer: " + e.Message;
  return CreateFormatException(description);
}
/// <summary>
/// Wraps the message of a float- or double-parsing failure in a positioned
/// FormatException.
/// </summary>
/// <param name="e">The exception raised while parsing the number.</param>
private FormatException CreateFloatParseException(Exception e) {
  string description = "Couldn't parse number: " + e.Message;
  return CreateFormatException(description);
}
}
}
Loading…
Cancel
Save