From 65a353fa10ad6243d4ae2de18ec2a824dc709fd2 Mon Sep 17 00:00:00 2001 From: Adam Cozzette Date: Thu, 18 Jan 2024 16:22:44 -0800 Subject: [PATCH] Update JSON parser to reject overlong UTF-8 encodings A fuzz test discovered that our JSON parser will accept "overlong" UTF-8 characters, i.e. encodings that use more bytes than necessary. We should reject these overlong encodings, because they are not considered valid. To fix this problem, I updated the JSON lexer to rely on utf8_range for checking UTF-8 validity. This way, the lexer does not need to do any UTF-8 validation of its own, but just has to interpret the UTF-8 enough to know how many bytes to read for each character. PiperOrigin-RevId: 599657903 --- src/google/protobuf/json/BUILD.bazel | 1 + src/google/protobuf/json/internal/lexer.cc | 38 +++++++++---------- .../protobuf/json/internal/lexer_test.cc | 17 +++++++++ 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/src/google/protobuf/json/BUILD.bazel b/src/google/protobuf/json/BUILD.bazel index 8f1fc61bc9..96a62f5724 100644 --- a/src/google/protobuf/json/BUILD.bazel +++ b/src/google/protobuf/json/BUILD.bazel @@ -137,6 +137,7 @@ cc_library( "//src/google/protobuf:port", "//src/google/protobuf/io", "//src/google/protobuf/stubs", + "//third_party/utf8_range:utf8_validity", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/log:absl_check", "@com_google_absl//absl/numeric:bits", diff --git a/src/google/protobuf/json/internal/lexer.cc b/src/google/protobuf/json/internal/lexer.cc index 0d90d5416c..ff5380dc27 100644 --- a/src/google/protobuf/json/internal/lexer.cc +++ b/src/google/protobuf/json/internal/lexer.cc @@ -29,6 +29,7 @@ #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/string_view.h" +#include "utf8_validity.h" #include "google/protobuf/stubs/status_macros.h" // Must be included last. 
@@ -401,12 +402,14 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
           goto normal_character;
         }
 
-        if (!on_heap.empty()) {
-          return LocationWith<MaybeOwnedString>{
-              MaybeOwnedString(std::move(on_heap)), loc};
-        }
         // NOTE: the 1 below clips off the " from the end of the string.
-        return LocationWith<MaybeOwnedString>{mark.value.UpToUnread(1), loc};
+        MaybeOwnedString result = on_heap.empty()
+                                      ? mark.value.UpToUnread(1)
+                                      : MaybeOwnedString{std::move(on_heap)};
+        if (utf8_range::IsStructurallyValid(result)) {
+          return LocationWith<MaybeOwnedString>{std::move(result), loc};
+        }
+        return Invalid("Invalid UTF-8 string");
       }
       case '\\': {
         if (on_heap.empty()) {
@@ -417,7 +420,7 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
           // destroyed only if we need to handle an escape when on_heap is
           // empty. Because this branch unconditionally pushes to on_heap, this
           // condition can never be reached in any iteration that follows it.
-          // This, at most one move every actually occurs.
+          // Thus, at most one move ever actually occurs.
           std::move(mark).value.Discard();
         }
         RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
@@ -450,15 +453,15 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
               "invalid control character 0x%02x in string", uc));
         }
 
-        // Verify this is valid UTF-8. UTF-8 is a varint encoding satisfying
-        // one of the following (big-endian) patterns:
+        // Process this UTF-8 code point. We do not need to fully validate it
+        // at this stage; we just need to interpret it enough to know how many
+        // bytes to read. UTF-8 is a varint encoding satisfying one of the
+        // following (big-endian) patterns:
         //
         // 0b0xxxxxxx
         // 0b110xxxxx'10xxxxxx
         // 0b1110xxxx'10xxxxxx'10xxxxxx
         // 0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx
-        //
-        // We don't need to decode it; just validate it.
         size_t lookahead = 0;
         switch (absl::countl_one(uc)) {
           case 0:
@@ -479,16 +482,11 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
         if (!on_heap.empty()) {
           on_heap.push_back(c);
         }
-        for (int i = 0; i < lookahead; ++i) {
-          RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
-          uint8_t uc = static_cast<uint8_t>(stream_.PeekChar());
-          if ((uc >> 6) != 2) {
-            return Invalid("invalid UTF-8 in string");
-          }
-          if (!on_heap.empty()) {
-            on_heap.push_back(stream_.PeekChar());
-          }
-          RETURN_IF_ERROR(Advance(1));
+        auto lookahead_bytes = stream_.Take(lookahead);
+        RETURN_IF_ERROR(lookahead_bytes.status());
+        if (!on_heap.empty()) {
+          absl::string_view view = lookahead_bytes->AsView();
+          on_heap.append(view.data(), view.size());
         }
         break;
       }
diff --git a/src/google/protobuf/json/internal/lexer_test.cc b/src/google/protobuf/json/internal/lexer_test.cc
index 3dbad40b08..c84e5dbd7b 100644
--- a/src/google/protobuf/json/internal/lexer_test.cc
+++ b/src/google/protobuf/json/internal/lexer_test.cc
@@ -596,6 +596,23 @@ TEST(LexerTest, RejectNonUtf8String) {
 
 TEST(LexerTest, RejectNonUtf8Prefix) { Bad("\xff{}"); }
 
+TEST(LexerTest, RejectOverlongUtf8) {
+  // This is the NUL character (U+0000) encoded in three bytes instead of one.
+  // Such "overlong" encodings are not considered valid.
+  Bad("\"\340\200\200\"");
+}
+
+TEST(LexerTest, MixtureOfEscapesAndRawMultibyteCharacters) {
+  Do(R"json("šŸ˜\t")json", [](io::ZeroCopyInputStream* stream) {
+    EXPECT_THAT(Value::Parse(stream),
+                IsOkAndHolds(ValueIs<Value::Kind::kStr>("šŸ˜\t")));
+  });
+  Do(R"json("\tšŸ˜")json", [](io::ZeroCopyInputStream* stream) {
+    EXPECT_THAT(Value::Parse(stream),
+                IsOkAndHolds(ValueIs<Value::Kind::kStr>("\tšŸ˜")));
+  });
+}
+
 TEST(LexerTest, SurrogateEscape) {
   absl::string_view json = R"json(
     [ "\ud83d\udc08\u200D\u2b1B\ud83d\uDdA4" ]