diff --git a/csharp/src/Google.Protobuf.Test/CodedInputStreamTest.cs b/csharp/src/Google.Protobuf.Test/CodedInputStreamTest.cs index 2020de5da3..e76af650f6 100644 --- a/csharp/src/Google.Protobuf.Test/CodedInputStreamTest.cs +++ b/csharp/src/Google.Protobuf.Test/CodedInputStreamTest.cs @@ -7,12 +7,12 @@ // https://developers.google.com/open-source/licenses/bsd #endregion -using System; -using System.Buffers; -using System.IO; using Google.Protobuf.TestProtos; using Proto2 = Google.Protobuf.TestProtos.Proto2; using NUnit.Framework; +using System; +using System.Buffers; +using System.IO; namespace Google.Protobuf { @@ -577,12 +577,11 @@ namespace Google.Protobuf } /// - /// Tests that if we read an string that contains invalid UTF-8, no exception - /// is thrown. Instead, the invalid bytes are replaced with the Unicode - /// "replacement character" U+FFFD. + /// Tests that if we read a string that contains invalid UTF-8, an exception + /// is thrown. /// [Test] - public void ReadInvalidUtf8() + public void ReadInvalidUtf8ThrowsInvalidProtocolBufferException() { MemoryStream ms = new MemoryStream(); CodedOutputStream output = new CodedOutputStream(ms); @@ -597,8 +596,7 @@ namespace Google.Protobuf CodedInputStream input = new CodedInputStream(ms); Assert.AreEqual(tag, input.ReadTag()); - string text = input.ReadString(); - Assert.AreEqual('\ufffd', text[0]); + Assert.Throws(() => input.ReadString()); } [Test] diff --git a/csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs b/csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs index e10d136a17..734c1d0555 100644 --- a/csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs +++ b/csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs @@ -7,13 +7,13 @@ // https://developers.google.com/open-source/licenses/bsd #endregion -using System; -using System.IO; +using Google.Protobuf.Collections; using Google.Protobuf.TestProtos; +using Google.Protobuf.WellKnownTypes; using NUnit.Framework; +using System; +using System.IO; using System.Linq; -using Google.Protobuf.WellKnownTypes; -using Google.Protobuf.Collections; namespace Google.Protobuf { @@ -112,14 +112,10 @@ namespace Google.Protobuf } [Test] - public void InvalidUtf8ParsesAsReplacementChars() + public void ParseInvalidUtf8Rejected() { var payload = new byte[] { 0x72, 1, 0x80 }; - - // We would prefer to have this parse operation fail, but at the moment it substitutes - // the replacement character. - var message = TestAllTypes.Parser.ParseFrom(payload); - Assert.AreEqual("\ufffd", message.SingleString); + Assert.Throws(() => TestAllTypes.Parser.ParseFrom(payload)); } [Test] diff --git a/csharp/src/Google.Protobuf.Test/ParsingPrimitivesTest.cs b/csharp/src/Google.Protobuf.Test/ParsingPrimitivesTest.cs index c1ca6b58f6..408e9ac736 100644 --- a/csharp/src/Google.Protobuf.Test/ParsingPrimitivesTest.cs +++ b/csharp/src/Google.Protobuf.Test/ParsingPrimitivesTest.cs @@ -1,4 +1,4 @@ -#region Copyright notice and license +#region Copyright notice and license // Protocol Buffers - Google's data interchange format // Copyright 2022 Google Inc. All rights reserved. // @@ -24,11 +24,12 @@ internal class ParsingPrimitivesTest [TestCase("A\ufffd\ufffdB", 65, 255, 255, 66)] // Overlong form of "space" [TestCase("\ufffd\ufffd", 0xc0, 0xa0)] - public void ReadRawString_NonUtf8(string expectedText, params int[] bytes) + public void ReadRawString_NonUtf8ThrowsInvalidProtocolBufferException(string expectedText, params int[] bytes) { - var context = CreateContext(bytes); - string text = ParsingPrimitives.ReadRawString(ref context.buffer, ref context.state, bytes.Length); - Assert.AreEqual(expectedText, text); + Assert.Throws(() => { + var context = CreateContext(bytes); + ParsingPrimitives.ReadRawString(ref context.buffer, ref context.state, bytes.Length); + }); } private static ParseContext CreateContext(int[] bytes) diff --git a/csharp/src/Google.Protobuf/InvalidProtocolBufferException.cs b/csharp/src/Google.Protobuf/InvalidProtocolBufferException.cs index b023d8a15c..ea39624a59 100644 --- a/csharp/src/Google.Protobuf/InvalidProtocolBufferException.cs +++ b/csharp/src/Google.Protobuf/InvalidProtocolBufferException.cs @@ -76,6 +76,12 @@ namespace Google.Protobuf return new InvalidProtocolBufferException("Invalid base64 data", innerException); } + internal static InvalidProtocolBufferException InvalidUtf8(Exception innerException) + { + return new InvalidProtocolBufferException( + "String is invalid UTF-8.", innerException); + } + internal static InvalidProtocolBufferException InvalidEndTag() { return new InvalidProtocolBufferException( diff --git a/csharp/src/Google.Protobuf/ParsingPrimitives.cs b/csharp/src/Google.Protobuf/ParsingPrimitives.cs index 0c3420492f..1615ec832f 100644 --- a/csharp/src/Google.Protobuf/ParsingPrimitives.cs +++ b/csharp/src/Google.Protobuf/ParsingPrimitives.cs @@ -1,4 +1,4 @@ -#region Copyright notice and license +#region Copyright notice and license // Protocol Buffers - Google's data interchange format // Copyright 2008 Google Inc. All rights reserved. // @@ -15,6 +15,7 @@ using System.IO; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Security; +using System.Text; namespace Google.Protobuf { @@ -24,6 +25,9 @@ namespace Google.Protobuf [SecuritySafeCritical] internal static class ParsingPrimitives { + internal static readonly Encoding Utf8Encoding = + new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true); + private const int StackallocThreshold = 256; /// @@ -576,7 +580,14 @@ namespace Google.Protobuf { fixed (byte* sourceBytes = &MemoryMarshal.GetReference(data)) { - value = WritingPrimitives.Utf8Encoding.GetString(sourceBytes, length); + try + { + value = Utf8Encoding.GetString(sourceBytes, length); + } + catch (DecoderFallbackException e) + { + throw InvalidProtocolBufferException.InvalidUtf8(e); + } } } @@ -618,8 +629,14 @@ namespace Google.Protobuf // Make compiler happy by passing a new span created from pointer. var tempSpan = new Span(pByteSpan, byteSpan.Length); ReadRawBytesIntoSpan(ref buffer, ref state, length, tempSpan); - - return WritingPrimitives.Utf8Encoding.GetString(pByteSpan, length); + try + { + return Utf8Encoding.GetString(pByteSpan, length); + } + catch (DecoderFallbackException e) + { + throw InvalidProtocolBufferException.InvalidUtf8(e); + } } } } @@ -637,7 +654,15 @@ namespace Google.Protobuf // This will be called when reading from a Stream because we don't know the length of the stream, // or there is not enough data in the sequence. If there is not enough data then ReadRawBytes will // throw an exception. - return WritingPrimitives.Utf8Encoding.GetString(ReadRawBytes(ref buffer, ref state, length), 0, length); + byte[] bytes = ReadRawBytes(ref buffer, ref state, length); + try + { + return Utf8Encoding.GetString(bytes, 0, length); + } + catch (DecoderFallbackException e) + { + throw InvalidProtocolBufferException.InvalidUtf8(e); + } } /// diff --git a/csharp/src/Google.Protobuf/Reflection/FeatureSetDescriptor.g.cs b/csharp/src/Google.Protobuf/Reflection/FeatureSetDescriptor.g.cs deleted file mode 100644 index 208ce1fcb6..0000000000 --- a/csharp/src/Google.Protobuf/Reflection/FeatureSetDescriptor.g.cs +++ /dev/null @@ -1,17 +0,0 @@ -#region Copyright notice and license -// Protocol Buffers - Google's data interchange format -// Copyright 2008 Google Inc. All rights reserved. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file or at -// https://developers.google.com/open-source/licenses/bsd -#endregion - -namespace Google.Protobuf.Reflection; - -internal sealed partial class FeatureSetDescriptor -{ - // Canonical serialized form of the edition defaults, generated by embed_edition_defaults. - private const string DefaultsBase64 = - "ChMYhAciACoMCAEQAhgCIAMoATACChMY5wciACoMCAIQARgBIAIoATABChMY6AciDAgBEAEYASACKAEwASoAIOYHKOgH"; -} diff --git a/csharp/src/Google.Protobuf/WritingPrimitives.cs b/csharp/src/Google.Protobuf/WritingPrimitives.cs index 7e77321404..372c03ef4f 100644 --- a/csharp/src/Google.Protobuf/WritingPrimitives.cs +++ b/csharp/src/Google.Protobuf/WritingPrimitives.cs @@ -27,6 +27,8 @@ namespace Google.Protobuf [SecuritySafeCritical] internal static class WritingPrimitives { + // Ideally we would use the same UTF8Encoding as parse, but we should be able to serialize + // invalid UTF-8 that make it in via unvalidated setters without throwing an exception. #if NET5_0_OR_GREATER internal static Encoding Utf8Encoding => Encoding.UTF8; // allows JIT to devirtualize #else @@ -35,7 +37,7 @@ namespace Google.Protobuf // difference.) #endif - #region Writing of values (not including tags) +#region Writing of values (not including tags) /// /// Writes a double field value, without a tag, to the stream.