Throw InvalidProtocolBufferException when parsing invalid UTF-8 in C#.

Uses UTF8Encoding configured with throwOnInvalidBytes=True instead of default Encoding.UTF8 per https://stackoverflow.com/questions/62762770/forcing-encoding-utf8-getstring-to-throw-an-argumentexception/62763077#62763077

This surfaces errors for bad UTF-8 earlier in C# for consistency across languages as announced in https://protobuf.dev/news/2024-10-02/#utf-8-enforcement

PiperOrigin-RevId: 694491987
pull/19056/head
Sandy Zhang 4 months ago committed by Copybara-Service
parent fa3790f67a
commit db9b2c8e9f
  1. 16
      csharp/src/Google.Protobuf.Test/CodedInputStreamTest.cs
  2. 16
      csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs
  3. 11
      csharp/src/Google.Protobuf.Test/ParsingPrimitivesTest.cs
  4. 6
      csharp/src/Google.Protobuf/InvalidProtocolBufferException.cs
  5. 35
      csharp/src/Google.Protobuf/ParsingPrimitives.cs
  6. 17
      csharp/src/Google.Protobuf/Reflection/FeatureSetDescriptor.g.cs
  7. 4
      csharp/src/Google.Protobuf/WritingPrimitives.cs

@ -7,12 +7,12 @@
// https://developers.google.com/open-source/licenses/bsd
#endregion
using System;
using System.Buffers;
using System.IO;
using Google.Protobuf.TestProtos;
using Proto2 = Google.Protobuf.TestProtos.Proto2;
using NUnit.Framework;
using System;
using System.Buffers;
using System.IO;
namespace Google.Protobuf
{
@ -577,12 +577,11 @@ namespace Google.Protobuf
}
/// <summary>
/// Tests that if we read an string that contains invalid UTF-8, no exception
/// is thrown. Instead, the invalid bytes are replaced with the Unicode
/// "replacement character" U+FFFD.
/// Tests that if we read a string that contains invalid UTF-8, an exception
/// is thrown.
/// </summary>
[Test]
public void ReadInvalidUtf8()
public void ReadInvalidUtf8ThrowsInvalidProtocolBufferException()
{
MemoryStream ms = new MemoryStream();
CodedOutputStream output = new CodedOutputStream(ms);
@ -597,8 +596,7 @@ namespace Google.Protobuf
CodedInputStream input = new CodedInputStream(ms);
Assert.AreEqual(tag, input.ReadTag());
string text = input.ReadString();
Assert.AreEqual('\ufffd', text[0]);
Assert.Throws<InvalidProtocolBufferException>(() => input.ReadString());
}
[Test]

@ -7,13 +7,13 @@
// https://developers.google.com/open-source/licenses/bsd
#endregion
using System;
using System.IO;
using Google.Protobuf.Collections;
using Google.Protobuf.TestProtos;
using Google.Protobuf.WellKnownTypes;
using NUnit.Framework;
using System;
using System.IO;
using System.Linq;
using Google.Protobuf.WellKnownTypes;
using Google.Protobuf.Collections;
namespace Google.Protobuf
{
@ -112,14 +112,10 @@ namespace Google.Protobuf
}
[Test]
public void InvalidUtf8ParsesAsReplacementChars()
public void ParseInvalidUtf8Rejected()
{
var payload = new byte[] { 0x72, 1, 0x80 };
// We would prefer to have this parse operation fail, but at the moment it substitutes
// the replacement character.
var message = TestAllTypes.Parser.ParseFrom(payload);
Assert.AreEqual("\ufffd", message.SingleString);
Assert.Throws<InvalidProtocolBufferException>(() => TestAllTypes.Parser.ParseFrom(payload));
}
[Test]

@ -1,4 +1,4 @@
#region Copyright notice and license
#region Copyright notice and license
// Protocol Buffers - Google's data interchange format
// Copyright 2022 Google Inc. All rights reserved.
//
@ -24,11 +24,12 @@ internal class ParsingPrimitivesTest
[TestCase("A\ufffd\ufffdB", 65, 255, 255, 66)]
// Overlong form of "space"
[TestCase("\ufffd\ufffd", 0xc0, 0xa0)]
public void ReadRawString_NonUtf8(string expectedText, params int[] bytes)
public void ReadRawString_NonUtf8ThrowsInvalidProtocolBufferException(string expectedText, params int[] bytes)
{
var context = CreateContext(bytes);
string text = ParsingPrimitives.ReadRawString(ref context.buffer, ref context.state, bytes.Length);
Assert.AreEqual(expectedText, text);
Assert.Throws<InvalidProtocolBufferException>(() => {
var context = CreateContext(bytes);
ParsingPrimitives.ReadRawString(ref context.buffer, ref context.state, bytes.Length);
});
}
private static ParseContext CreateContext(int[] bytes)

@ -76,6 +76,12 @@ namespace Google.Protobuf
return new InvalidProtocolBufferException("Invalid base64 data", innerException);
}
internal static InvalidProtocolBufferException InvalidUtf8(Exception innerException)
{
return new InvalidProtocolBufferException(
"String is invalid UTF-8.", innerException);
}
internal static InvalidProtocolBufferException InvalidEndTag()
{
return new InvalidProtocolBufferException(

@ -1,4 +1,4 @@
#region Copyright notice and license
#region Copyright notice and license
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
//
@ -15,6 +15,7 @@ using System.IO;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Security;
using System.Text;
namespace Google.Protobuf
{
@ -24,6 +25,9 @@ namespace Google.Protobuf
[SecuritySafeCritical]
internal static class ParsingPrimitives
{
internal static readonly Encoding Utf8Encoding =
new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
private const int StackallocThreshold = 256;
/// <summary>
@ -576,7 +580,14 @@ namespace Google.Protobuf
{
fixed (byte* sourceBytes = &MemoryMarshal.GetReference(data))
{
value = WritingPrimitives.Utf8Encoding.GetString(sourceBytes, length);
try
{
value = Utf8Encoding.GetString(sourceBytes, length);
}
catch (DecoderFallbackException e)
{
throw InvalidProtocolBufferException.InvalidUtf8(e);
}
}
}
@ -618,8 +629,14 @@ namespace Google.Protobuf
// Make compiler happy by passing a new span created from pointer.
var tempSpan = new Span<byte>(pByteSpan, byteSpan.Length);
ReadRawBytesIntoSpan(ref buffer, ref state, length, tempSpan);
return WritingPrimitives.Utf8Encoding.GetString(pByteSpan, length);
try
{
return Utf8Encoding.GetString(pByteSpan, length);
}
catch (DecoderFallbackException e)
{
throw InvalidProtocolBufferException.InvalidUtf8(e);
}
}
}
}
@ -637,7 +654,15 @@ namespace Google.Protobuf
// This will be called when reading from a Stream because we don't know the length of the stream,
// or there is not enough data in the sequence. If there is not enough data then ReadRawBytes will
// throw an exception.
return WritingPrimitives.Utf8Encoding.GetString(ReadRawBytes(ref buffer, ref state, length), 0, length);
byte[] bytes = ReadRawBytes(ref buffer, ref state, length);
try
{
return Utf8Encoding.GetString(bytes, 0, length);
}
catch (DecoderFallbackException e)
{
throw InvalidProtocolBufferException.InvalidUtf8(e);
}
}
/// <summary>

@ -1,17 +0,0 @@
#region Copyright notice and license
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd
#endregion
namespace Google.Protobuf.Reflection;
internal sealed partial class FeatureSetDescriptor
{
// Canonical serialized form of the edition defaults, generated by embed_edition_defaults.
private const string DefaultsBase64 =
"ChMYhAciACoMCAEQAhgCIAMoATACChMY5wciACoMCAIQARgBIAIoATABChMY6AciDAgBEAEYASACKAEwASoAIOYHKOgH";
}

@ -27,6 +27,8 @@ namespace Google.Protobuf
[SecuritySafeCritical]
internal static class WritingPrimitives
{
// Ideally we would use the same UTF8Encoding as parse, but we should be able to serialize
// invalid UTF-8 that make it in via unvalidated setters without throwing an exception.
#if NET5_0_OR_GREATER
internal static Encoding Utf8Encoding => Encoding.UTF8; // allows JIT to devirtualize
#else
@ -35,7 +37,7 @@ namespace Google.Protobuf
// difference.)
#endif
#region Writing of values (not including tags)
#region Writing of values (not including tags)
/// <summary>
/// Writes a double field value, without a tag, to the stream.

Loading…
Cancel
Save