diff --git a/csharp/src/Google.Protobuf.Test/CodedInputStreamTest.cs b/csharp/src/Google.Protobuf.Test/CodedInputStreamTest.cs
index 2020de5da3..e76af650f6 100644
--- a/csharp/src/Google.Protobuf.Test/CodedInputStreamTest.cs
+++ b/csharp/src/Google.Protobuf.Test/CodedInputStreamTest.cs
@@ -7,12 +7,12 @@
// https://developers.google.com/open-source/licenses/bsd
#endregion
-using System;
-using System.Buffers;
-using System.IO;
using Google.Protobuf.TestProtos;
using Proto2 = Google.Protobuf.TestProtos.Proto2;
using NUnit.Framework;
+using System;
+using System.Buffers;
+using System.IO;
namespace Google.Protobuf
{
@@ -577,12 +577,11 @@ namespace Google.Protobuf
}
///
- /// Tests that if we read an string that contains invalid UTF-8, no exception
- /// is thrown. Instead, the invalid bytes are replaced with the Unicode
- /// "replacement character" U+FFFD.
+ /// Tests that if we read a string that contains invalid UTF-8, an exception
+ /// is thrown.
///
[Test]
- public void ReadInvalidUtf8()
+ public void ReadInvalidUtf8ThrowsInvalidProtocolBufferException()
{
MemoryStream ms = new MemoryStream();
CodedOutputStream output = new CodedOutputStream(ms);
@@ -597,8 +596,7 @@ namespace Google.Protobuf
CodedInputStream input = new CodedInputStream(ms);
Assert.AreEqual(tag, input.ReadTag());
- string text = input.ReadString();
- Assert.AreEqual('\ufffd', text[0]);
+ Assert.Throws(() => input.ReadString());
}
[Test]
diff --git a/csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs b/csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs
index e10d136a17..734c1d0555 100644
--- a/csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs
+++ b/csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs
@@ -7,13 +7,13 @@
// https://developers.google.com/open-source/licenses/bsd
#endregion
-using System;
-using System.IO;
+using Google.Protobuf.Collections;
using Google.Protobuf.TestProtos;
+using Google.Protobuf.WellKnownTypes;
using NUnit.Framework;
+using System;
+using System.IO;
using System.Linq;
-using Google.Protobuf.WellKnownTypes;
-using Google.Protobuf.Collections;
namespace Google.Protobuf
{
@@ -112,14 +112,10 @@ namespace Google.Protobuf
}
[Test]
- public void InvalidUtf8ParsesAsReplacementChars()
+ public void ParseInvalidUtf8Rejected()
{
var payload = new byte[] { 0x72, 1, 0x80 };
-
- // We would prefer to have this parse operation fail, but at the moment it substitutes
- // the replacement character.
- var message = TestAllTypes.Parser.ParseFrom(payload);
- Assert.AreEqual("\ufffd", message.SingleString);
+ Assert.Throws(() => TestAllTypes.Parser.ParseFrom(payload));
}
[Test]
diff --git a/csharp/src/Google.Protobuf.Test/ParsingPrimitivesTest.cs b/csharp/src/Google.Protobuf.Test/ParsingPrimitivesTest.cs
index c1ca6b58f6..408e9ac736 100644
--- a/csharp/src/Google.Protobuf.Test/ParsingPrimitivesTest.cs
+++ b/csharp/src/Google.Protobuf.Test/ParsingPrimitivesTest.cs
@@ -1,4 +1,4 @@
-#region Copyright notice and license
+#region Copyright notice and license
// Protocol Buffers - Google's data interchange format
// Copyright 2022 Google Inc. All rights reserved.
//
@@ -24,11 +24,12 @@ internal class ParsingPrimitivesTest
[TestCase("A\ufffd\ufffdB", 65, 255, 255, 66)]
// Overlong form of "space"
[TestCase("\ufffd\ufffd", 0xc0, 0xa0)]
- public void ReadRawString_NonUtf8(string expectedText, params int[] bytes)
+ public void ReadRawString_NonUtf8ThrowsInvalidProtocolBufferException(string expectedText, params int[] bytes)
{
- var context = CreateContext(bytes);
- string text = ParsingPrimitives.ReadRawString(ref context.buffer, ref context.state, bytes.Length);
- Assert.AreEqual(expectedText, text);
+ Assert.Throws(() => {
+ var context = CreateContext(bytes);
+ ParsingPrimitives.ReadRawString(ref context.buffer, ref context.state, bytes.Length);
+ });
}
private static ParseContext CreateContext(int[] bytes)
diff --git a/csharp/src/Google.Protobuf/InvalidProtocolBufferException.cs b/csharp/src/Google.Protobuf/InvalidProtocolBufferException.cs
index b023d8a15c..ea39624a59 100644
--- a/csharp/src/Google.Protobuf/InvalidProtocolBufferException.cs
+++ b/csharp/src/Google.Protobuf/InvalidProtocolBufferException.cs
@@ -76,6 +76,12 @@ namespace Google.Protobuf
return new InvalidProtocolBufferException("Invalid base64 data", innerException);
}
+ internal static InvalidProtocolBufferException InvalidUtf8(Exception innerException)
+ {
+ return new InvalidProtocolBufferException(
+ "String is invalid UTF-8.", innerException);
+ }
+
internal static InvalidProtocolBufferException InvalidEndTag()
{
return new InvalidProtocolBufferException(
diff --git a/csharp/src/Google.Protobuf/ParsingPrimitives.cs b/csharp/src/Google.Protobuf/ParsingPrimitives.cs
index 0c3420492f..1615ec832f 100644
--- a/csharp/src/Google.Protobuf/ParsingPrimitives.cs
+++ b/csharp/src/Google.Protobuf/ParsingPrimitives.cs
@@ -1,4 +1,4 @@
-#region Copyright notice and license
+#region Copyright notice and license
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
//
@@ -15,6 +15,7 @@ using System.IO;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Security;
+using System.Text;
namespace Google.Protobuf
{
@@ -24,6 +25,9 @@ namespace Google.Protobuf
[SecuritySafeCritical]
internal static class ParsingPrimitives
{
+ internal static readonly Encoding Utf8Encoding =
+ new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
+
private const int StackallocThreshold = 256;
///
@@ -576,7 +580,14 @@ namespace Google.Protobuf
{
fixed (byte* sourceBytes = &MemoryMarshal.GetReference(data))
{
- value = WritingPrimitives.Utf8Encoding.GetString(sourceBytes, length);
+ try
+ {
+ value = Utf8Encoding.GetString(sourceBytes, length);
+ }
+ catch (DecoderFallbackException e)
+ {
+ throw InvalidProtocolBufferException.InvalidUtf8(e);
+ }
}
}
@@ -618,8 +629,14 @@ namespace Google.Protobuf
// Make compiler happy by passing a new span created from pointer.
var tempSpan = new Span(pByteSpan, byteSpan.Length);
ReadRawBytesIntoSpan(ref buffer, ref state, length, tempSpan);
-
- return WritingPrimitives.Utf8Encoding.GetString(pByteSpan, length);
+ try
+ {
+ return Utf8Encoding.GetString(pByteSpan, length);
+ }
+ catch (DecoderFallbackException e)
+ {
+ throw InvalidProtocolBufferException.InvalidUtf8(e);
+ }
}
}
}
@@ -637,7 +654,15 @@ namespace Google.Protobuf
// This will be called when reading from a Stream because we don't know the length of the stream,
// or there is not enough data in the sequence. If there is not enough data then ReadRawBytes will
// throw an exception.
- return WritingPrimitives.Utf8Encoding.GetString(ReadRawBytes(ref buffer, ref state, length), 0, length);
+ byte[] bytes = ReadRawBytes(ref buffer, ref state, length);
+ try
+ {
+ return Utf8Encoding.GetString(bytes, 0, length);
+ }
+ catch (DecoderFallbackException e)
+ {
+ throw InvalidProtocolBufferException.InvalidUtf8(e);
+ }
}
///
diff --git a/csharp/src/Google.Protobuf/Reflection/FeatureSetDescriptor.g.cs b/csharp/src/Google.Protobuf/Reflection/FeatureSetDescriptor.g.cs
deleted file mode 100644
index 208ce1fcb6..0000000000
--- a/csharp/src/Google.Protobuf/Reflection/FeatureSetDescriptor.g.cs
+++ /dev/null
@@ -1,17 +0,0 @@
-#region Copyright notice and license
-// Protocol Buffers - Google's data interchange format
-// Copyright 2008 Google Inc. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file or at
-// https://developers.google.com/open-source/licenses/bsd
-#endregion
-
-namespace Google.Protobuf.Reflection;
-
-internal sealed partial class FeatureSetDescriptor
-{
- // Canonical serialized form of the edition defaults, generated by embed_edition_defaults.
- private const string DefaultsBase64 =
- "ChMYhAciACoMCAEQAhgCIAMoATACChMY5wciACoMCAIQARgBIAIoATABChMY6AciDAgBEAEYASACKAEwASoAIOYHKOgH";
-}
diff --git a/csharp/src/Google.Protobuf/WritingPrimitives.cs b/csharp/src/Google.Protobuf/WritingPrimitives.cs
index 7e77321404..372c03ef4f 100644
--- a/csharp/src/Google.Protobuf/WritingPrimitives.cs
+++ b/csharp/src/Google.Protobuf/WritingPrimitives.cs
@@ -27,6 +27,8 @@ namespace Google.Protobuf
[SecuritySafeCritical]
internal static class WritingPrimitives
{
+ // Ideally we would use the same UTF8Encoding as parse, but we should be able to serialize
+ // invalid UTF-8 that make it in via unvalidated setters without throwing an exception.
#if NET5_0_OR_GREATER
internal static Encoding Utf8Encoding => Encoding.UTF8; // allows JIT to devirtualize
#else
@@ -35,7 +37,7 @@ namespace Google.Protobuf
// difference.)
#endif
- #region Writing of values (not including tags)
+#region Writing of values (not including tags)
///
/// Writes a double field value, without a tag, to the stream.