Throw InvalidProtocolBufferException when parsing invalid UTF-8 in C#.

Uses UTF8Encoding configured with throwOnInvalidBytes=True instead of default Encoding.UTF8 per https://stackoverflow.com/questions/62762770/forcing-encoding-utf8-getstring-to-throw-an-argumentexception/62763077#62763077

This surfaces errors for bad UTF-8 earlier in C# for consistency across languages as announced in https://protobuf.dev/news/2024-10-02/#utf-8-enforcement

PiperOrigin-RevId: 694491987
pull/19056/head
Sandy Zhang 5 months ago committed by Copybara-Service
parent fa3790f67a
commit db9b2c8e9f
  1. 16
      csharp/src/Google.Protobuf.Test/CodedInputStreamTest.cs
  2. 16
      csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs
  3. 9
      csharp/src/Google.Protobuf.Test/ParsingPrimitivesTest.cs
  4. 6
      csharp/src/Google.Protobuf/InvalidProtocolBufferException.cs
  5. 35
      csharp/src/Google.Protobuf/ParsingPrimitives.cs
  6. 17
      csharp/src/Google.Protobuf/Reflection/FeatureSetDescriptor.g.cs
  7. 2
      csharp/src/Google.Protobuf/WritingPrimitives.cs

@ -7,12 +7,12 @@
// https://developers.google.com/open-source/licenses/bsd // https://developers.google.com/open-source/licenses/bsd
#endregion #endregion
using System;
using System.Buffers;
using System.IO;
using Google.Protobuf.TestProtos; using Google.Protobuf.TestProtos;
using Proto2 = Google.Protobuf.TestProtos.Proto2; using Proto2 = Google.Protobuf.TestProtos.Proto2;
using NUnit.Framework; using NUnit.Framework;
using System;
using System.Buffers;
using System.IO;
namespace Google.Protobuf namespace Google.Protobuf
{ {
@ -577,12 +577,11 @@ namespace Google.Protobuf
} }
/// <summary> /// <summary>
/// Tests that if we read an string that contains invalid UTF-8, no exception /// Tests that if we read a string that contains invalid UTF-8, an exception
/// is thrown. Instead, the invalid bytes are replaced with the Unicode /// is thrown.
/// "replacement character" U+FFFD.
/// </summary> /// </summary>
[Test] [Test]
public void ReadInvalidUtf8() public void ReadInvalidUtf8ThrowsInvalidProtocolBufferException()
{ {
MemoryStream ms = new MemoryStream(); MemoryStream ms = new MemoryStream();
CodedOutputStream output = new CodedOutputStream(ms); CodedOutputStream output = new CodedOutputStream(ms);
@ -597,8 +596,7 @@ namespace Google.Protobuf
CodedInputStream input = new CodedInputStream(ms); CodedInputStream input = new CodedInputStream(ms);
Assert.AreEqual(tag, input.ReadTag()); Assert.AreEqual(tag, input.ReadTag());
string text = input.ReadString(); Assert.Throws<InvalidProtocolBufferException>(() => input.ReadString());
Assert.AreEqual('\ufffd', text[0]);
} }
[Test] [Test]

@ -7,13 +7,13 @@
// https://developers.google.com/open-source/licenses/bsd // https://developers.google.com/open-source/licenses/bsd
#endregion #endregion
using System; using Google.Protobuf.Collections;
using System.IO;
using Google.Protobuf.TestProtos; using Google.Protobuf.TestProtos;
using Google.Protobuf.WellKnownTypes;
using NUnit.Framework; using NUnit.Framework;
using System;
using System.IO;
using System.Linq; using System.Linq;
using Google.Protobuf.WellKnownTypes;
using Google.Protobuf.Collections;
namespace Google.Protobuf namespace Google.Protobuf
{ {
@ -112,14 +112,10 @@ namespace Google.Protobuf
} }
[Test] [Test]
public void InvalidUtf8ParsesAsReplacementChars() public void ParseInvalidUtf8Rejected()
{ {
var payload = new byte[] { 0x72, 1, 0x80 }; var payload = new byte[] { 0x72, 1, 0x80 };
Assert.Throws<InvalidProtocolBufferException>(() => TestAllTypes.Parser.ParseFrom(payload));
// We would prefer to have this parse operation fail, but at the moment it substitutes
// the replacement character.
var message = TestAllTypes.Parser.ParseFrom(payload);
Assert.AreEqual("\ufffd", message.SingleString);
} }
[Test] [Test]

@ -1,4 +1,4 @@
#region Copyright notice and license #region Copyright notice and license
// Protocol Buffers - Google's data interchange format // Protocol Buffers - Google's data interchange format
// Copyright 2022 Google Inc. All rights reserved. // Copyright 2022 Google Inc. All rights reserved.
// //
@ -24,11 +24,12 @@ internal class ParsingPrimitivesTest
[TestCase("A\ufffd\ufffdB", 65, 255, 255, 66)] [TestCase("A\ufffd\ufffdB", 65, 255, 255, 66)]
// Overlong form of "space" // Overlong form of "space"
[TestCase("\ufffd\ufffd", 0xc0, 0xa0)] [TestCase("\ufffd\ufffd", 0xc0, 0xa0)]
public void ReadRawString_NonUtf8(string expectedText, params int[] bytes) public void ReadRawString_NonUtf8ThrowsInvalidProtocolBufferException(string expectedText, params int[] bytes)
{ {
Assert.Throws<InvalidProtocolBufferException>(() => {
var context = CreateContext(bytes); var context = CreateContext(bytes);
string text = ParsingPrimitives.ReadRawString(ref context.buffer, ref context.state, bytes.Length); ParsingPrimitives.ReadRawString(ref context.buffer, ref context.state, bytes.Length);
Assert.AreEqual(expectedText, text); });
} }
private static ParseContext CreateContext(int[] bytes) private static ParseContext CreateContext(int[] bytes)

@ -76,6 +76,12 @@ namespace Google.Protobuf
return new InvalidProtocolBufferException("Invalid base64 data", innerException); return new InvalidProtocolBufferException("Invalid base64 data", innerException);
} }
internal static InvalidProtocolBufferException InvalidUtf8(Exception innerException)
{
return new InvalidProtocolBufferException(
"String is invalid UTF-8.", innerException);
}
internal static InvalidProtocolBufferException InvalidEndTag() internal static InvalidProtocolBufferException InvalidEndTag()
{ {
return new InvalidProtocolBufferException( return new InvalidProtocolBufferException(

@ -1,4 +1,4 @@
#region Copyright notice and license #region Copyright notice and license
// Protocol Buffers - Google's data interchange format // Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved. // Copyright 2008 Google Inc. All rights reserved.
// //
@ -15,6 +15,7 @@ using System.IO;
using System.Runtime.CompilerServices; using System.Runtime.CompilerServices;
using System.Runtime.InteropServices; using System.Runtime.InteropServices;
using System.Security; using System.Security;
using System.Text;
namespace Google.Protobuf namespace Google.Protobuf
{ {
@ -24,6 +25,9 @@ namespace Google.Protobuf
[SecuritySafeCritical] [SecuritySafeCritical]
internal static class ParsingPrimitives internal static class ParsingPrimitives
{ {
internal static readonly Encoding Utf8Encoding =
new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
private const int StackallocThreshold = 256; private const int StackallocThreshold = 256;
/// <summary> /// <summary>
@ -576,7 +580,14 @@ namespace Google.Protobuf
{ {
fixed (byte* sourceBytes = &MemoryMarshal.GetReference(data)) fixed (byte* sourceBytes = &MemoryMarshal.GetReference(data))
{ {
value = WritingPrimitives.Utf8Encoding.GetString(sourceBytes, length); try
{
value = Utf8Encoding.GetString(sourceBytes, length);
}
catch (DecoderFallbackException e)
{
throw InvalidProtocolBufferException.InvalidUtf8(e);
}
} }
} }
@ -618,8 +629,14 @@ namespace Google.Protobuf
// Make compiler happy by passing a new span created from pointer. // Make compiler happy by passing a new span created from pointer.
var tempSpan = new Span<byte>(pByteSpan, byteSpan.Length); var tempSpan = new Span<byte>(pByteSpan, byteSpan.Length);
ReadRawBytesIntoSpan(ref buffer, ref state, length, tempSpan); ReadRawBytesIntoSpan(ref buffer, ref state, length, tempSpan);
try
return WritingPrimitives.Utf8Encoding.GetString(pByteSpan, length); {
return Utf8Encoding.GetString(pByteSpan, length);
}
catch (DecoderFallbackException e)
{
throw InvalidProtocolBufferException.InvalidUtf8(e);
}
} }
} }
} }
@ -637,7 +654,15 @@ namespace Google.Protobuf
// This will be called when reading from a Stream because we don't know the length of the stream, // This will be called when reading from a Stream because we don't know the length of the stream,
// or there is not enough data in the sequence. If there is not enough data then ReadRawBytes will // or there is not enough data in the sequence. If there is not enough data then ReadRawBytes will
// throw an exception. // throw an exception.
return WritingPrimitives.Utf8Encoding.GetString(ReadRawBytes(ref buffer, ref state, length), 0, length); byte[] bytes = ReadRawBytes(ref buffer, ref state, length);
try
{
return Utf8Encoding.GetString(bytes, 0, length);
}
catch (DecoderFallbackException e)
{
throw InvalidProtocolBufferException.InvalidUtf8(e);
}
} }
/// <summary> /// <summary>

@ -1,17 +0,0 @@
#region Copyright notice and license
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd
#endregion
namespace Google.Protobuf.Reflection;
internal sealed partial class FeatureSetDescriptor
{
// Canonical serialized form of the edition defaults, generated by embed_edition_defaults.
private const string DefaultsBase64 =
"ChMYhAciACoMCAEQAhgCIAMoATACChMY5wciACoMCAIQARgBIAIoATABChMY6AciDAgBEAEYASACKAEwASoAIOYHKOgH";
}

@ -27,6 +27,8 @@ namespace Google.Protobuf
[SecuritySafeCritical] [SecuritySafeCritical]
internal static class WritingPrimitives internal static class WritingPrimitives
{ {
// Ideally we would use the same UTF8Encoding as parse, but we should be able to serialize
// invalid UTF-8 that make it in via unvalidated setters without throwing an exception.
#if NET5_0_OR_GREATER #if NET5_0_OR_GREATER
internal static Encoding Utf8Encoding => Encoding.UTF8; // allows JIT to devirtualize internal static Encoding Utf8Encoding => Encoding.UTF8; // allows JIT to devirtualize
#else #else

Loading…
Cancel
Save