From 96100bf1e49c5452f21e4446e43f5e7526f1351f Mon Sep 17 00:00:00 2001 From: Lydon Chandra Date: Tue, 26 Sep 2023 00:40:39 -0700 Subject: [PATCH] add C# deterministic serialization (#13160) https://github.com/protocolbuffers/protobuf/issues/12881 @jskeet review please :-) as this is my first contrib to protobuf, any design / performance feedback is very sought after Closes #13160 COPYBARA_INTEGRATE_REVIEW=https://github.com/protocolbuffers/protobuf/pull/13160 from fmg-lydonchandra:feature/12881_cs_serialization_deterministic ab7e01b8049ec98b78eb9fb6e04e8fe30ecd9a33 PiperOrigin-RevId: 568448399 --- .../Collections/MapFieldTest.cs | 53 +++++++++++++ .../GeneratedMessageTest.cs | 75 ++++++++++++++++++- .../src/Google.Protobuf/CodedOutputStream.cs | 29 ++++++- .../Google.Protobuf/Collections/MapField.cs | 42 ++++++++++- 4 files changed, 193 insertions(+), 6 deletions(-) diff --git a/csharp/src/Google.Protobuf.Test/Collections/MapFieldTest.cs b/csharp/src/Google.Protobuf.Test/Collections/MapFieldTest.cs index 356765e994..b464deb76a 100644 --- a/csharp/src/Google.Protobuf.Test/Collections/MapFieldTest.cs +++ b/csharp/src/Google.Protobuf.Test/Collections/MapFieldTest.cs @@ -632,6 +632,59 @@ namespace Google.Protobuf.Collections CollectionAssert.AreEquivalent(((IDictionary)map).Values, ((IReadOnlyDictionary)map).Values); } + [Test] + public void SortIntKeys_RandomOrder() + { + var map = new MapField() { { 1, "val" }, { -1, "val"}, { 0, "val" } }; + var sortedList = map.GetSortedListCopy(map.ToList()).ToList(); + var sortedKeys = sortedList.Select(kvp => kvp.Key); + CollectionAssert.AreEqual(new[] { -1, 0, 1 }, sortedKeys); + } + + [Test] + public void SortIntKeys_Empty() + { + var map = new MapField { }; + var sortedList = map.GetSortedListCopy(map.ToList()).ToList(); + var sortedKeys = sortedList.Select(kvp => kvp.Key); + Assert.IsEmpty(sortedKeys); + } + + [Test] + public void SortStringKeys_RandomOrder() + { + var map = new MapField { { "a", "val" }, { "c", "val" }, { "b", "val" } }; + var sortedList = map.GetSortedListCopy(map.ToList()).ToList(); + var sortedKeys = sortedList.Select(kvp => kvp.Key); + CollectionAssert.AreEqual(new[] { "a", "b", "c" }, sortedKeys); + } + + [Test] + public void SortStringKeys_EnsureOrdinalSort() + { + var map = new MapField + { + { "i", "val" } , { "I", "val" }, { "ı", "val" }, { "İ", "val" } + }; + var sortedList = map.GetSortedListCopy(map.ToList()); + var sortedKeys = sortedList.Select(kvp => kvp.Key); + // Assert Ordinal sort I, i, ı, İ (Non-ordinal sort returns i, I, İ, ı) + // I == 0x49 , i == 0x69 , İ == 0x130 , ı == 0x131 + CollectionAssert.AreEqual(new[] { "I", "i", "İ", "ı" }, sortedKeys); + } + + [Test] + public void SortBoolKeys() + { + var map = new MapField + { + { true, "val" } , { false, "val" } + }; + var sortedList = map.GetSortedListCopy(map.ToList()); + var sortedKeys = sortedList.Select(kvp => kvp.Key); + CollectionAssert.AreEqual(new[] { false, true }, sortedKeys); + } + private static KeyValuePair NewKeyValuePair(TKey key, TValue value) { return new KeyValuePair(key, value); diff --git a/csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs b/csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs index 72235603a5..49cf771347 100644 --- a/csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs +++ b/csharp/src/Google.Protobuf.Test/GeneratedMessageTest.cs @@ -311,7 +311,7 @@ namespace Google.Protobuf output.WriteTag(TestMap.MapInt32Int32FieldNumber, WireFormat.WireType.LengthDelimited); - var key = 10; // Field 1 + var key = 10; // Field 1 var value = 20; // Field 2 var extra = 30; // Field 3 @@ -668,6 +668,79 @@ namespace Google.Protobuf }); } + [Test] + public void MapStringString_DeterministicTrue_ThenBytesIdentical() + { + // Define three strings consisting of different versions of the letter I. + // LATIN CAPITAL LETTER I (U+0049) + string capitalLetterI = "I"; + // LATIN SMALL LETTER I (U+0069) + string smallLetterI = "i"; + // LATIN SMALL LETTER DOTLESS I (U+0131) + string smallLetterDotlessI = "\u0131"; + var testMap1 = new TestMap(); + + testMap1.MapStringString.Add(smallLetterDotlessI, "value_"+smallLetterDotlessI); + testMap1.MapStringString.Add(smallLetterI, "value_"+smallLetterI); + testMap1.MapStringString.Add(capitalLetterI, "content_"+capitalLetterI); + var bytes1 = SerializeTestMap(testMap1, true); + + var testMap2 = new TestMap(); + testMap2.MapStringString.Add(capitalLetterI, "content_"+capitalLetterI); + testMap2.MapStringString.Add(smallLetterI, "value_"+smallLetterI); + testMap2.MapStringString.Add(smallLetterDotlessI, "value_"+smallLetterDotlessI); + + var bytes2 = SerializeTestMap(testMap2, true); + var parsedBytes2 = TestMap.Parser.ParseFrom(bytes2); + var parsedBytes1 = TestMap.Parser.ParseFrom(bytes1); + Assert.IsTrue(bytes1.SequenceEqual(bytes2)); + } + + [Test] + public void MapInt32Bytes_DeterministicTrue_ThenBytesIdentical() + { + var testMap1 = new TestMap(); + testMap1.MapInt32Bytes.Add(1, ByteString.CopyFromUtf8("test1")); + testMap1.MapInt32Bytes.Add(2, ByteString.CopyFromUtf8("test2")); + var bytes1 = SerializeTestMap(testMap1, true); + + var testMap2 = new TestMap(); + testMap2.MapInt32Bytes.Add(2, ByteString.CopyFromUtf8("test2")); + testMap2.MapInt32Bytes.Add(1, ByteString.CopyFromUtf8("test1")); + var bytes2 = SerializeTestMap(testMap2, true); + + Assert.IsTrue(bytes1.SequenceEqual(bytes2)); + } + + [Test] + public void MapInt32Bytes_DeterministicFalse_ThenBytesDifferent() + { + var testMap1 = new TestMap(); + testMap1.MapInt32Bytes.Add(1, ByteString.CopyFromUtf8("test1")); + testMap1.MapInt32Bytes.Add(2, ByteString.CopyFromUtf8("test2")); + var bytes1 = SerializeTestMap(testMap1, false); + + var testMap2 = new TestMap(); + testMap2.MapInt32Bytes.Add(2, ByteString.CopyFromUtf8("test2")); + testMap2.MapInt32Bytes.Add(1, ByteString.CopyFromUtf8("test1")); + var bytes2 = SerializeTestMap(testMap2, false); + + Assert.IsFalse(bytes1.SequenceEqual(bytes2)); + } + + private byte[] SerializeTestMap(TestMap testMap, bool deterministic) + { + using var memoryStream = new MemoryStream(); + var codedOutputStream = new CodedOutputStream(memoryStream); + codedOutputStream.Deterministic = deterministic; + + testMap.WriteTo(codedOutputStream); + codedOutputStream.Flush(); + + memoryStream.Seek(0, SeekOrigin.Begin); + return memoryStream.ToArray(); + } + [Test] public void DiscardUnknownFields_RealDataStillRead() { diff --git a/csharp/src/Google.Protobuf/CodedOutputStream.cs b/csharp/src/Google.Protobuf/CodedOutputStream.cs index 85e48e5ab9..85586d2374 100644 --- a/csharp/src/Google.Protobuf/CodedOutputStream.cs +++ b/csharp/src/Google.Protobuf/CodedOutputStream.cs @@ -137,6 +137,33 @@ namespace Google.Protobuf } } + /// + /// Configures whether or not serialization is deterministic. + /// + /// + /// Deterministic serialization guarantees that for a given binary, equal messages (defined by the + /// equals methods in protos) will always be serialized to the same bytes. This implies: + /// + /// Repeated serialization of a message will return the same bytes. + /// Different processes of the same binary (which may be executing on different machines) + /// will serialize equal messages to the same bytes. + /// + /// Note the deterministic serialization is NOT canonical across languages; it is also unstable + /// across different builds with schema changes due to unknown fields. Users who need canonical + /// serialization, e.g. persistent storage in a canonical form, fingerprinting, etc, should define + /// their own canonicalization specification and implement the serializer using reflection APIs + /// rather than relying on this API. + /// Once set, the serializer will: (Note this is an implementation detail and may subject to + /// change in the future) + /// + /// Sort map entries by keys in lexicographical order or numerical order. Note: For string + /// keys, the order is based on comparing the UTF-16 code unit value of each character in the strings. + /// The order may be different from the deterministic serialization in other languages where + /// maps are sorted on the lexicographical order of the UTF8 encoded keys. + /// + /// + public bool Deterministic { get; set; } + #region Writing of values (not including tags) /// @@ -462,7 +489,7 @@ namespace Google.Protobuf #endregion #region Underlying writing primitives - + /// /// Writes a 32 bit value as a varint. The fast route is taken when /// there's enough buffer space left to whizz through without checking diff --git a/csharp/src/Google.Protobuf/Collections/MapField.cs b/csharp/src/Google.Protobuf/Collections/MapField.cs index f0be958303..722cc92322 100644 --- a/csharp/src/Google.Protobuf/Collections/MapField.cs +++ b/csharp/src/Google.Protobuf/Collections/MapField.cs @@ -327,7 +327,7 @@ namespace Google.Protobuf.Collections /// Returns a hash code for this instance. /// /// - /// A hash code for this instance, suitable for use in hashing algorithms and data structures like a hash table. + /// A hash code for this instance, suitable for use in hashing algorithms and data structures like a hash table. /// public override int GetHashCode() { @@ -432,7 +432,13 @@ namespace Google.Protobuf.Collections WriteContext.Initialize(output, out WriteContext ctx); try { - WriteTo(ref ctx, codec); + IEnumerable> listToWrite = list; + + if (output.Deterministic) + { + listToWrite = GetSortedListCopy(list); + } + WriteTo(ref ctx, codec, listToWrite); } finally { @@ -440,6 +446,23 @@ namespace Google.Protobuf.Collections } } + internal IEnumerable> GetSortedListCopy(IEnumerable> listToSort) + { + // We can't sort the list in place, as that would invalidate the linked list. + // Instead, we create a new list, sort that, and then write it out. + var listToWrite = new List>(listToSort); + listToWrite.Sort((pair1, pair2) => + { + if (typeof(TKey) == typeof(string)) + { + // Use Ordinal, otherwise Comparer.Default uses StringComparer.CurrentCulture + return StringComparer.Ordinal.Compare(pair1.Key.ToString(), pair2.Key.ToString()); + } + return Comparer.Default.Compare(pair1.Key, pair2.Key); + }); + return listToWrite; + } + /// /// Writes the contents of this map to the given write context, using the specified codec /// to encode each entry. @@ -449,7 +472,18 @@ namespace Google.Protobuf.Collections [SecuritySafeCritical] public void WriteTo(ref WriteContext ctx, Codec codec) { - foreach (var entry in list) + IEnumerable> listToWrite = list; + if (ctx.state.CodedOutputStream?.Deterministic ?? false) + { + listToWrite = GetSortedListCopy(list); + } + WriteTo(ref ctx, codec, listToWrite); + } + + [SecuritySafeCritical] + private void WriteTo(ref WriteContext ctx, Codec codec, IEnumerable> listKvp) + { + foreach (var entry in listKvp) { ctx.WriteTag(codec.MapTag); @@ -631,7 +665,7 @@ namespace Google.Protobuf.Collections this.containsCheck = containsCheck; } - public int Count => parent.Count; + public int Count => parent.Count; public bool IsReadOnly => true;