diff --git a/src/core/lib/json/json_reader.cc b/src/core/lib/json/json_reader.cc
index a47b448aa07..afa574dbbbf 100644
--- a/src/core/lib/json/json_reader.cc
+++ b/src/core/lib/json/json_reader.cc
@@ -128,6 +128,7 @@ class JsonReader {
   std::vector errors_;
   bool truncated_errors_ = false;
   uint8_t utf8_bytes_remaining_ = 0;
+  uint8_t utf8_first_byte_ = 0;
 
   Json root_value_;
   std::vector stack_;
@@ -137,29 +138,59 @@ class JsonReader {
 };
 
+/// Appends \a c (expected to hold a single byte value) to string_ after
+/// checking that it may legally appear at the current position of a UTF-8
+/// sequence. Returns false, leaving string_ untouched, if it may not.
 bool JsonReader::StringAddChar(uint32_t c) {
-  switch (utf8_bytes_remaining_) {
-    case 0:
-      if ((c & 0x80) == 0) {
-        utf8_bytes_remaining_ = 0;
-      } else if ((c & 0xe0) == 0xc0) {
-        utf8_bytes_remaining_ = 1;
-      } else if ((c & 0xf0) == 0xe0) {
-        utf8_bytes_remaining_ = 2;
-      } else if ((c & 0xf8) == 0xf0) {
-        utf8_bytes_remaining_ = 3;
-      } else {
-        return false;
-      }
-      break;
-    case 1:
-    case 2:
-    case 3:
-      if ((c & 0xc0) != 0x80) return false;
-      --utf8_bytes_remaining_;
-      break;
-    default:
-      abort();
+  if (utf8_bytes_remaining_ == 0) {
+    if ((c & 0x80) == 0) {
+      utf8_bytes_remaining_ = 0;
+    } else if ((c & 0xe0) == 0xc0 && c >= 0xc2) {
+      /// For the UTF-8 characters with length of 2 bytes, the range of the
+      /// first byte is [0xc2, 0xdf]. Reference: Table 3-7 in
+      /// https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
+      utf8_bytes_remaining_ = 1;
+    } else if ((c & 0xf0) == 0xe0) {
+      utf8_bytes_remaining_ = 2;
+    } else if ((c & 0xf8) == 0xf0 && c <= 0xf4) {
+      /// For the UTF-8 characters with length of 4 bytes, the range of the
+      /// first byte is [0xf0, 0xf4]. Reference: Table 3-7 in
+      /// https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
+      utf8_bytes_remaining_ = 3;
+    } else {
+      return false;
+    }
+    utf8_first_byte_ = c;
+  } else if (utf8_bytes_remaining_ == 1) {
+    if ((c & 0xc0) != 0x80) {
+      return false;
+    }
+    --utf8_bytes_remaining_;
+  } else if (utf8_bytes_remaining_ == 2) {
+    /// For UTF-8 characters starting with 0xe0, their length is 3 bytes, and
+    /// the range of the second byte is [0xa0, 0xbf]. For UTF-8 characters
+    /// starting with 0xed, their length is 3 bytes, and the range of the second
+    /// byte is [0x80, 0x9f]. Reference: Table 3-7 in
+    /// https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
+    if (((c & 0xc0) != 0x80) || (utf8_first_byte_ == 0xe0 && c < 0xa0) ||
+        (utf8_first_byte_ == 0xed && c > 0x9f)) {
+      return false;
+    }
+    --utf8_bytes_remaining_;
+  } else if (utf8_bytes_remaining_ == 3) {
+    /// For UTF-8 characters starting with 0xf0, their length is 4 bytes, and
+    /// the range of the second byte is [0x90, 0xbf]. For UTF-8 characters
+    /// starting with 0xf4, their length is 4 bytes, and the range of the second
+    /// byte is [0x80, 0x8f]. Reference: Table 3-7 in
+    /// https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
+    if (((c & 0xc0) != 0x80) || (utf8_first_byte_ == 0xf0 && c < 0x90) ||
+        (utf8_first_byte_ == 0xf4 && c > 0x8f)) {
+      return false;
+    }
+    --utf8_bytes_remaining_;
+  } else {
+    abort();
   }
+
   string_.push_back(static_cast(c));
   return true;
 }
diff --git a/test/core/json/corpus/testcase-5115340413861888 b/test/core/json/corpus/testcase-5115340413861888
new file mode 100644
index 00000000000..c05a31b0296
--- /dev/null
+++ b/test/core/json/corpus/testcase-5115340413861888
@@ -0,0 +1 @@
+"À¼"
\ No newline at end of file
diff --git a/test/core/json/json_test.cc b/test/core/json/json_test.cc
index 232caca9ba2..9be8d40aca1 100644
--- a/test/core/json/json_test.cc
+++ b/test/core/json/json_test.cc
@@ -23,6 +23,9 @@
 #include
 #include
 
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+
 #include
 #include
 #include
@@ -96,6 +99,15 @@ TEST(Json, Utf16) {
                  "\" \\\\\\u0010\\n\\r\"");
 }
 
+MATCHER(ContainsInvalidUtf8,
+        absl::StrCat(negation ? "Does not contain" : "Contains",
+                     " invalid UTF-8 characters.")) {
+  grpc_error_handle error = GRPC_ERROR_NONE;
+  const Json json = Json::Parse(arg, &error);
+  return (error.code() == absl::StatusCode::kUnknown) &&
+         (absl::StrContains(error.message(), "JSON parsing failed"));
+}
+
 TEST(Json, Utf8) {
   RunSuccessTest("\"ßâñć௵⇒\"", "ßâñć௵⇒",
                  "\"\\u00df\\u00e2\\u00f1\\u0107\\u0bf5\\u21d2\"");
@@ -109,6 +121,29 @@ TEST(Json, Utf8) {
   RunSuccessTest("{\"\\ud834\\udd1e\":0}",
                  Json::Object{{"\xf0\x9d\x84\x9e", 0}},
                  "{\"\\ud834\\udd1e\":0}");
+
+  /// For UTF-8 characters with length of 1 byte, the range of it is [0x00,
+  /// 0x7f].
+  EXPECT_THAT("\"\xa0\"", ContainsInvalidUtf8());
+
+  /// For UTF-8 characters with length of 2 bytes, the range of the first byte
+  /// is [0xc2, 0xdf], and the range of the second byte is [0x80, 0xbf].
+  EXPECT_THAT("\"\xc0\xbc\"", ContainsInvalidUtf8());
+  EXPECT_THAT("\"\xbc\xc0\"", ContainsInvalidUtf8());
+
+  /// Corner cases for UTF-8 characters with length of 3 bytes.
+  /// If the first byte is 0xe0, the range of second byte is [0xa0, 0xbf].
+  EXPECT_THAT("\"\xe0\x80\x80\"", ContainsInvalidUtf8());
+  /// If the first byte is 0xed, the range of second byte is [0x80, 0x9f].
+  EXPECT_THAT("\"\xed\xa0\x80\"", ContainsInvalidUtf8());
+
+  /// Corner cases for UTF-8 characters with length of 4 bytes.
+  /// If the first byte is 0xf0, the range of second byte is [0x90, 0xbf].
+  EXPECT_THAT("\"\xf0\x80\x80\x80\"", ContainsInvalidUtf8());
+  /// If the first byte is 0xf4, the range of second byte is [0x80, 0x8f].
+  EXPECT_THAT("\"\xf4\x90\x80\x80\"", ContainsInvalidUtf8());
+  /// The first byte of a 4-byte character cannot be greater than 0xf4.
+  EXPECT_THAT("\"\xf5\x80\x80\x80\"", ContainsInvalidUtf8());
 }
 
 TEST(Json, NestedEmptyContainers) {