Update JSON parser to reject overlong UTF-8 encodings

A fuzz test discovered that our JSON parser will accept "overlong" UTF-8
sequences, i.e. encodings that use more bytes than necessary to represent a
code point. We should reject these overlong encodings, because the UTF-8
specification does not consider them valid.
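For reference, U+0000 (NUL) has exactly one valid encoding, the single byte
0x00; the three-byte form below decodes to the same code point but is
overlong. A minimal illustration (not code from this change):

// Two byte sequences that both decode to U+0000; only the first is valid UTF-8.
constexpr char kCanonicalNul[] = "\x00";         // 0b00000000
constexpr char kOverlongNul[] = "\xE0\x80\x80";  // 0b11100000'10000000'10000000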

To fix this problem, I updated the JSON lexer to rely on utf8_range for
checking UTF-8 validity. This way, the lexer does not need to do any UTF-8
validation of its own, but just has to interpret the UTF-8 enough to know how
many bytes to read for each character.
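As a rough sketch of the resulting check (the helper name below is
illustrative, not the lexer's actual interface), the completed string token is
handed to utf8_range once, rather than being validated byte by byte inside the
lexer:

#include "absl/strings/string_view.h"
#include "utf8_validity.h"  // from third_party/utf8_range

// Illustrative helper: rejects any byte sequence that is not structurally
// valid UTF-8, which includes overlong encodings such as "\xE0\x80\x80".
bool IsAcceptableStringPayload(absl::string_view bytes) {
  return utf8_range::IsStructurallyValid(bytes);
}

utf8_range::IsStructurallyValid returns false for the overlong form, while
well-formed multi-byte sequences such as the 😁 exercised in the new lexer
tests pass.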

PiperOrigin-RevId: 599657903
Adam Cozzette authored 11 months ago, committed by Copybara-Service
Branch: pull/15465/head
Parent: 3c8a3d27f7
Commit: 65a353fa10
Changed files:
  1. src/google/protobuf/json/BUILD.bazel (1 line changed)
  2. src/google/protobuf/json/internal/lexer.cc (36 lines changed)
  3. src/google/protobuf/json/internal/lexer_test.cc (17 lines changed)

src/google/protobuf/json/BUILD.bazel:
@@ -137,6 +137,7 @@ cc_library(
         "//src/google/protobuf:port",
         "//src/google/protobuf/io",
         "//src/google/protobuf/stubs",
+        "//third_party/utf8_range:utf8_validity",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log:absl_check",
         "@com_google_absl//absl/numeric:bits",

src/google/protobuf/json/internal/lexer.cc:
@@ -29,6 +29,7 @@
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
+#include "utf8_validity.h"
 #include "google/protobuf/stubs/status_macros.h"
 // Must be included last.
@@ -401,12 +402,14 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
         goto normal_character;
       }
-      if (!on_heap.empty()) {
-        return LocationWith<MaybeOwnedString>{
-            MaybeOwnedString(std::move(on_heap)), loc};
-      }
-      // NOTE: the 1 below clips off the " from the end of the string.
-      return LocationWith<MaybeOwnedString>{mark.value.UpToUnread(1), loc};
+      MaybeOwnedString result = on_heap.empty()
+                                    ? mark.value.UpToUnread(1)
+                                    : MaybeOwnedString{std::move(on_heap)};
+      if (utf8_range::IsStructurallyValid(result)) {
+        return LocationWith<MaybeOwnedString>{std::move(result), loc};
+      }
+      return Invalid("Invalid UTF-8 string");
     }
     case '\\': {
       if (on_heap.empty()) {
@@ -417,7 +420,7 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
         // destroyed only if we need to handle an escape when on_heap is
         // empty. Because this branch unconditionally pushes to on_heap, this
         // condition can never be reached in any iteration that follows it.
-        // This, at most one move every actually occurs.
+        // Thus, at most one move ever actually occurs.
         std::move(mark).value.Discard();
       }
       RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
@@ -450,15 +453,15 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
             "invalid control character 0x%02x in string", uc));
       }
-      // Verify this is valid UTF-8. UTF-8 is a varint encoding satisfying
-      // one of the following (big-endian) patterns:
+      // Process this UTF-8 code point. We do not need to fully validate it
+      // at this stage; we just need to interpret it enough to know how many
+      // bytes to read. UTF-8 is a varint encoding satisfying one of the
+      // following (big-endian) patterns:
       //
       //   0b0xxxxxxx
       //   0b110xxxxx'10xxxxxx
       //   0b1110xxxx'10xxxxxx'10xxxxxx
       //   0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx
       //
-      // We don't need to decode it; just validate it.
       size_t lookahead = 0;
       switch (absl::countl_one(uc)) {
         case 0:
@@ -479,16 +482,11 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
       if (!on_heap.empty()) {
        on_heap.push_back(c);
       }
-      for (int i = 0; i < lookahead; ++i) {
-        RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
-        uint8_t uc = static_cast<uint8_t>(stream_.PeekChar());
-        if ((uc >> 6) != 2) {
-          return Invalid("invalid UTF-8 in string");
-        }
+      auto lookahead_bytes = stream_.Take(lookahead);
+      RETURN_IF_ERROR(lookahead_bytes.status());
       if (!on_heap.empty()) {
-        on_heap.push_back(stream_.PeekChar());
-      }
-      RETURN_IF_ERROR(Advance(1));
+        absl::string_view view = lookahead_bytes->AsView();
+        on_heap.append(view.data(), view.size());
       }
       break;
     }

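The switch on absl::countl_one in the hunk above only determines how many
continuation bytes to read for each lead byte; full validity, including the
overlong check, is left to utf8_range. A standalone sketch of that mapping,
with an illustrative function name that is not part of the lexer:

#include <cstdint>
#include <optional>

#include "absl/numeric/bits.h"

// Maps a UTF-8 lead byte to the number of continuation bytes that follow it.
// Returns std::nullopt for bytes that cannot start a code point (lone
// continuation bytes 0b10xxxxxx and the invalid range 0xF8-0xFF). Overlong
// sequences still pass this check; they are only caught by the structural
// validation step.
std::optional<size_t> ContinuationByteCount(uint8_t lead) {
  switch (absl::countl_one(lead)) {
    case 0: return 0;              // 0b0xxxxxxx: ASCII, nothing to read.
    case 2: return 1;              // 0b110xxxxx'10xxxxxx
    case 3: return 2;              // 0b1110xxxx'10xxxxxx'10xxxxxx
    case 4: return 3;              // 0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx
    default: return std::nullopt;  // Invalid lead byte.
  }
}
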
src/google/protobuf/json/internal/lexer_test.cc:
@@ -596,6 +596,23 @@ TEST(LexerTest, RejectNonUtf8String) {
 TEST(LexerTest, RejectNonUtf8Prefix) { Bad("\xff{}"); }
 
+TEST(LexerTest, RejectOverlongUtf8) {
+  // This is the NUL character (U+0000) encoded in three bytes instead of one.
+  // Such "overlong" encodings are not considered valid.
+  Bad("\"\340\200\200\"");
+}
+
+TEST(LexerTest, MixtureOfEscapesAndRawMultibyteCharacters) {
+  Do(R"json("😁\t")json", [](io::ZeroCopyInputStream* stream) {
+    EXPECT_THAT(Value::Parse(stream),
+                IsOkAndHolds(ValueIs<std::string>("😁\t")));
+  });
+  Do(R"json("\t😁")json", [](io::ZeroCopyInputStream* stream) {
+    EXPECT_THAT(Value::Parse(stream),
+                IsOkAndHolds(ValueIs<std::string>("\t😁")));
+  });
+}
+
 TEST(LexerTest, SurrogateEscape) {
   absl::string_view json = R"json(
     [ "\ud83d\udc08\u200D\u2b1B\ud83d\uDdA4" ]
