From 65a353fa10ad6243d4ae2de18ec2a824dc709fd2 Mon Sep 17 00:00:00 2001 From: Adam Cozzette Date: Thu, 18 Jan 2024 16:22:44 -0800 Subject: [PATCH] Update JSON parser to reject overlong UTF-8 encodings A fuzz test discovered that our JSON parser will accept "overlong" UTF-8 characters, i.e. encodings that use more bytes than necessary. We should reject these overlong encodings, because they are not considered valid. To fix this problem, I updated the JSON lexer to rely on utf8_range for checking UTF-8 validity. This way, the lexer does not need to do any UTF-8 validation of its own, but just has to interpret the UTF-8 enough to know how many bytes to read for each character. PiperOrigin-RevId: 599657903 --- src/google/protobuf/json/BUILD.bazel | 1 + src/google/protobuf/json/internal/lexer.cc | 38 +++++++++---------- .../protobuf/json/internal/lexer_test.cc | 17 +++++++++ 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/src/google/protobuf/json/BUILD.bazel b/src/google/protobuf/json/BUILD.bazel index 8f1fc61bc9..96a62f5724 100644 --- a/src/google/protobuf/json/BUILD.bazel +++ b/src/google/protobuf/json/BUILD.bazel @@ -137,6 +137,7 @@ cc_library( "//src/google/protobuf:port", "//src/google/protobuf/io", "//src/google/protobuf/stubs", + "//third_party/utf8_range:utf8_validity", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/log:absl_check", "@com_google_absl//absl/numeric:bits", diff --git a/src/google/protobuf/json/internal/lexer.cc b/src/google/protobuf/json/internal/lexer.cc index 0d90d5416c..ff5380dc27 100644 --- a/src/google/protobuf/json/internal/lexer.cc +++ b/src/google/protobuf/json/internal/lexer.cc @@ -29,6 +29,7 @@ #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/string_view.h" +#include "utf8_validity.h" #include "google/protobuf/stubs/status_macros.h" // Must be included last. 
@@ -401,12 +402,14 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
           goto normal_character;
         }
 
-        if (!on_heap.empty()) {
-          return LocationWith<MaybeOwnedString>{
-              MaybeOwnedString(std::move(on_heap)), loc};
-        }
         // NOTE: the 1 below clips off the " from the end of the string.
-        return LocationWith<MaybeOwnedString>{mark.value.UpToUnread(1), loc};
+        MaybeOwnedString result = on_heap.empty()
+                                      ? mark.value.UpToUnread(1)
+                                      : MaybeOwnedString{std::move(on_heap)};
+        if (utf8_range::IsStructurallyValid(result)) {
+          return LocationWith<MaybeOwnedString>{std::move(result), loc};
+        }
+        return Invalid("Invalid UTF-8 string");
       }
       case '\\': {
         if (on_heap.empty()) {
@@ -417,7 +420,7 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
           // destroyed only if we need to handle an escape when on_heap is
           // empty. Because this branch unconditionally pushes to on_heap, this
           // condition can never be reached in any iteration that follows it.
-          // This, at most one move every actually occurs.
+          // Thus, at most one move ever actually occurs.
           std::move(mark).value.Discard();
         }
         RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
@@ -450,15 +453,15 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
               "invalid control character 0x%02x in string", uc));
         }
 
-        // Verify this is valid UTF-8. UTF-8 is a varint encoding satisfying
-        // one of the following (big-endian) patterns:
+        // Process this UTF-8 code point. We do not need to fully validate it
+        // at this stage; we just need to interpret it enough to know how many
+        // bytes to read. UTF-8 is a varint encoding satisfying one of the
+        // following (big-endian) patterns:
         //
         // 0b0xxxxxxx
         // 0b110xxxxx'10xxxxxx
         // 0b1110xxxx'10xxxxxx'10xxxxxx
         // 0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx
-        //
-        // We don't need to decode it; just validate it.
         size_t lookahead = 0;
         switch (absl::countl_one(uc)) {
           case 0:
@@ -479,16 +482,11 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
         if (!on_heap.empty()) {
           on_heap.push_back(c);
         }
-        for (int i = 0; i < lookahead; ++i) {
-          RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
-          uint8_t uc = static_cast<uint8_t>(stream_.PeekChar());
-          if ((uc >> 6) != 2) {
-            return Invalid("invalid UTF-8 in string");
-          }
-          if (!on_heap.empty()) {
-            on_heap.push_back(stream_.PeekChar());
-          }
-          RETURN_IF_ERROR(Advance(1));
+        auto lookahead_bytes = stream_.Take(lookahead);
+        RETURN_IF_ERROR(lookahead_bytes.status());
+        if (!on_heap.empty()) {
+          absl::string_view view = lookahead_bytes->AsView();
+          on_heap.append(view.data(), view.size());
         }
         break;
       }
diff --git a/src/google/protobuf/json/internal/lexer_test.cc b/src/google/protobuf/json/internal/lexer_test.cc
index 3dbad40b08..c84e5dbd7b 100644
--- a/src/google/protobuf/json/internal/lexer_test.cc
+++ b/src/google/protobuf/json/internal/lexer_test.cc
@@ -596,6 +596,23 @@ TEST(LexerTest, RejectNonUtf8String) {
 
 TEST(LexerTest, RejectNonUtf8Prefix) { Bad("\xff{}"); }
 
+TEST(LexerTest, RejectOverlongUtf8) {
+  // This is the NUL character (U+0000) encoded in three bytes instead of one.
+  // Such "overlong" encodings are not considered valid.
+  Bad("\"\340\200\200\"");
+}
+
+TEST(LexerTest, MixtureOfEscapesAndRawMultibyteCharacters) {
+  Do(R"json("šŸ˜\t")json", [](io::ZeroCopyInputStream* stream) {
+    EXPECT_THAT(Value::Parse(stream),
+                IsOkAndHolds(ValueIs<Value::Kind::kStr>("šŸ˜\t")));
+  });
+  Do(R"json("\tšŸ˜")json", [](io::ZeroCopyInputStream* stream) {
+    EXPECT_THAT(Value::Parse(stream),
+                IsOkAndHolds(ValueIs<Value::Kind::kStr>("\tšŸ˜")));
+  });
+}
+
 TEST(LexerTest, SurrogateEscape) {
   absl::string_view json = R"json(
     [ "\ud83d\udc08\u200D\u2b1B\ud83d\uDdA4" ]