Update JSON parser to reject overlong UTF-8 encodings

A fuzz test discovered that our JSON parser will accept "overlong" UTF-8
sequences, i.e. encodings that use more bytes than necessary to represent a
code point. We should reject these overlong encodings, because the UTF-8
specification does not consider them valid.
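For reference, U+0000 (NUL) has exactly one valid encoding, the single byte
0x00; the three-byte form below decodes to the same code point but is
overlong. A minimal illustration (not code from this change):

// Two byte sequences that both decode to U+0000; only the first is valid UTF-8.
constexpr char kCanonicalNul[] = "\x00";         // 0b00000000
constexpr char kOverlongNul[] = "\xE0\x80\x80";  // 0b11100000'10000000'10000000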

To fix this problem, I updated the JSON lexer to rely on utf8_range for
checking UTF-8 validity. This way, the lexer does not need to do any UTF-8
validation of its own, but just has to interpret the UTF-8 enough to know how
many bytes to read for each character.
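As a rough sketch of the resulting check (the helper name below is
illustrative, not the lexer's actual interface), the completed string token is
handed to utf8_range once, rather than being validated byte by byte inside the
lexer:

#include "absl/strings/string_view.h"
#include "utf8_validity.h"  // from third_party/utf8_range

// Illustrative helper: rejects any byte sequence that is not structurally
// valid UTF-8, which includes overlong encodings such as "\xE0\x80\x80".
bool IsAcceptableStringPayload(absl::string_view bytes) {
  return utf8_range::IsStructurallyValid(bytes);
}

utf8_range::IsStructurallyValid returns false for the overlong form, while
well-formed multi-byte sequences such as the 😁 exercised in the new lexer
tests pass.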

PiperOrigin-RevId: 599657903
Adam Cozzette authored 11 months ago, committed by Copybara-Service
Branch: pull/15465/head
Parent: 3c8a3d27f7
Commit: 65a353fa10
Changed files:
  1. src/google/protobuf/json/BUILD.bazel (1 line changed)
  2. src/google/protobuf/json/internal/lexer.cc (36 lines changed)
  3. src/google/protobuf/json/internal/lexer_test.cc (17 lines changed)

src/google/protobuf/json/BUILD.bazel:
@@ -137,6 +137,7 @@ cc_library(
         "//src/google/protobuf:port",
         "//src/google/protobuf/io",
         "//src/google/protobuf/stubs",
+        "//third_party/utf8_range:utf8_validity",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log:absl_check",
         "@com_google_absl//absl/numeric:bits",

src/google/protobuf/json/internal/lexer.cc:
@@ -29,6 +29,7 @@
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
+#include "utf8_validity.h"
 #include "google/protobuf/stubs/status_macros.h"
 // Must be included last.
@@ -401,12 +402,14 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
         goto normal_character;
       }
-      if (!on_heap.empty()) {
-        return LocationWith<MaybeOwnedString>{
-            MaybeOwnedString(std::move(on_heap)), loc};
-      }
-      // NOTE: the 1 below clips off the " from the end of the string.
-      return LocationWith<MaybeOwnedString>{mark.value.UpToUnread(1), loc};
+      MaybeOwnedString result = on_heap.empty()
+                                    ? mark.value.UpToUnread(1)
+                                    : MaybeOwnedString{std::move(on_heap)};
+      if (utf8_range::IsStructurallyValid(result)) {
+        return LocationWith<MaybeOwnedString>{std::move(result), loc};
+      }
+      return Invalid("Invalid UTF-8 string");
     }
     case '\\': {
       if (on_heap.empty()) {
@@ -417,7 +420,7 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
         // destroyed only if we need to handle an escape when on_heap is
         // empty. Because this branch unconditionally pushes to on_heap, this
         // condition can never be reached in any iteration that follows it.
-        // This, at most one move every actually occurs.
+        // Thus, at most one move ever actually occurs.
         std::move(mark).value.Discard();
       }
       RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
@@ -450,15 +453,15 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
             "invalid control character 0x%02x in string", uc));
       }
-      // Verify this is valid UTF-8. UTF-8 is a varint encoding satisfying
-      // one of the following (big-endian) patterns:
+      // Process this UTF-8 code point. We do not need to fully validate it
+      // at this stage; we just need to interpret it enough to know how many
+      // bytes to read. UTF-8 is a varint encoding satisfying one of the
+      // following (big-endian) patterns:
       //
       //   0b0xxxxxxx
       //   0b110xxxxx'10xxxxxx
       //   0b1110xxxx'10xxxxxx'10xxxxxx
       //   0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx
       //
-      // We don't need to decode it; just validate it.
       size_t lookahead = 0;
       switch (absl::countl_one(uc)) {
         case 0:
@@ -479,16 +482,11 @@ absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
       if (!on_heap.empty()) {
        on_heap.push_back(c);
       }
-      for (int i = 0; i < lookahead; ++i) {
-        RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
-        uint8_t uc = static_cast<uint8_t>(stream_.PeekChar());
-        if ((uc >> 6) != 2) {
-          return Invalid("invalid UTF-8 in string");
-        }
+      auto lookahead_bytes = stream_.Take(lookahead);
+      RETURN_IF_ERROR(lookahead_bytes.status());
       if (!on_heap.empty()) {
-        on_heap.push_back(stream_.PeekChar());
-      }
-      RETURN_IF_ERROR(Advance(1));
+        absl::string_view view = lookahead_bytes->AsView();
+        on_heap.append(view.data(), view.size());
       }
       break;
     }

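The switch on absl::countl_one in the hunk above only determines how many
continuation bytes to read for each lead byte; full validity, including the
overlong check, is left to utf8_range. A standalone sketch of that mapping,
with an illustrative function name that is not part of the lexer:

#include <cstdint>
#include <optional>

#include "absl/numeric/bits.h"

// Maps a UTF-8 lead byte to the number of continuation bytes that follow it.
// Returns std::nullopt for bytes that cannot start a code point (lone
// continuation bytes 0b10xxxxxx and the invalid range 0xF8-0xFF). Overlong
// sequences still pass this check; they are only caught by the structural
// validation step.
std::optional<size_t> ContinuationByteCount(uint8_t lead) {
  switch (absl::countl_one(lead)) {
    case 0: return 0;              // 0b0xxxxxxx: ASCII, nothing to read.
    case 2: return 1;              // 0b110xxxxx'10xxxxxx
    case 3: return 2;              // 0b1110xxxx'10xxxxxx'10xxxxxx
    case 4: return 3;              // 0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx
    default: return std::nullopt;  // Invalid lead byte.
  }
}
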
src/google/protobuf/json/internal/lexer_test.cc:
@@ -596,6 +596,23 @@ TEST(LexerTest, RejectNonUtf8String) {
 TEST(LexerTest, RejectNonUtf8Prefix) { Bad("\xff{}"); }
 
+TEST(LexerTest, RejectOverlongUtf8) {
+  // This is the NUL character (U+0000) encoded in three bytes instead of one.
+  // Such "overlong" encodings are not considered valid.
+  Bad("\"\340\200\200\"");
+}
+
+TEST(LexerTest, MixtureOfEscapesAndRawMultibyteCharacters) {
+  Do(R"json("😁\t")json", [](io::ZeroCopyInputStream* stream) {
+    EXPECT_THAT(Value::Parse(stream),
+                IsOkAndHolds(ValueIs<std::string>("😁\t")));
+  });
+  Do(R"json("\t😁")json", [](io::ZeroCopyInputStream* stream) {
+    EXPECT_THAT(Value::Parse(stream),
+                IsOkAndHolds(ValueIs<std::string>("\t😁")));
+  });
+}
+
 TEST(LexerTest, SurrogateEscape) {
   absl::string_view json = R"json(
     [ "\ud83d\udc08\u200D\u2b1B\ud83d\uDdA4" ]
