Detect invalid UTF-8 characters (#30307)

* Detect invalid UTF-8 characters

* Fix checking bug

* Add unit test

* Add more comments and descriptions

* Add corpus

* Small fix to improve readability

* Fix sanity check
pull/30378/head
Cheng-Yu Chung 2 years ago committed by GitHub
parent 375e649011
commit 0536a75167
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 69
      src/core/lib/json/json_reader.cc
  2. 1
      test/core/json/corpus/testcase-5115340413861888
  3. 32
      test/core/json/json_test.cc

@ -128,6 +128,7 @@ class JsonReader {
std::vector<grpc_error_handle> errors_;
bool truncated_errors_ = false;
uint8_t utf8_bytes_remaining_ = 0;
uint8_t utf8_first_byte_ = 0;
Json root_value_;
std::vector<Json*> stack_;
@ -137,29 +138,53 @@ class JsonReader {
};
/// Appends one byte of string content to string_, validating on the fly that
/// the byte stream is well-formed UTF-8 as defined by Table 3-7 in
/// https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
///
/// utf8_bytes_remaining_ is the number of continuation bytes still expected
/// for the character currently being read; utf8_first_byte_ remembers that
/// character's lead byte so the narrower ranges that apply to the second byte
/// of some sequences can be enforced.
///
/// Returns false, without appending, if c is not allowed at this position.
/// Returns true after appending c otherwise.
bool JsonReader::StringAddChar(uint32_t c) {
  if (utf8_bytes_remaining_ == 0) {
    // Start of a new character: classify the lead byte.
    if ((c & 0x80) == 0) {
      // Single-byte character, range [0x00, 0x7f].
      utf8_bytes_remaining_ = 0;
    } else if ((c & 0xe0) == 0xc0 && c >= 0xc2) {
      /// For the UTF-8 characters with length of 2 bytes, the range of the
      /// first byte is [0xc2, 0xdf]; 0xc0 and 0xc1 would only produce overlong
      /// encodings.
      utf8_bytes_remaining_ = 1;
    } else if ((c & 0xf0) == 0xe0) {
      utf8_bytes_remaining_ = 2;
    } else if ((c & 0xf8) == 0xf0 && c <= 0xf4) {
      /// For the UTF-8 characters with length of 4 bytes, the range of the
      /// first byte is [0xf0, 0xf4]; 0xf5..0xf7 match the bit pattern but
      /// would encode code points above U+10FFFF.
      utf8_bytes_remaining_ = 3;
    } else {
      // A stray continuation byte or a byte that never occurs in UTF-8.
      return false;
    }
    utf8_first_byte_ = static_cast<uint8_t>(c);
  } else if (utf8_bytes_remaining_ == 1) {
    // Final continuation byte of any multi-byte character: [0x80, 0xbf].
    if ((c & 0xc0) != 0x80) {
      return false;
    }
    --utf8_bytes_remaining_;
  } else if (utf8_bytes_remaining_ == 2) {
    /// Either the second byte of a 3-byte character or the third byte of a
    /// 4-byte character. For characters starting with 0xe0 the second byte
    /// must be in [0xa0, 0xbf] (no overlong forms); for characters starting
    /// with 0xed it must be in [0x80, 0x9f] (no UTF-16 surrogates). The lead
    /// byte of a 4-byte character is never 0xe0 or 0xed, so only the generic
    /// continuation check applies to its third byte.
    if (((c & 0xc0) != 0x80) || (utf8_first_byte_ == 0xe0 && c < 0xa0) ||
        (utf8_first_byte_ == 0xed && c > 0x9f)) {
      return false;
    }
    --utf8_bytes_remaining_;
  } else if (utf8_bytes_remaining_ == 3) {
    /// Second byte of a 4-byte character. For characters starting with 0xf0
    /// the second byte must be in [0x90, 0xbf] (no overlong forms); for
    /// characters starting with 0xf4 it must be in [0x80, 0x8f] (nothing above
    /// U+10FFFF).
    if (((c & 0xc0) != 0x80) || (utf8_first_byte_ == 0xf0 && c < 0x90) ||
        (utf8_first_byte_ == 0xf4 && c > 0x8f)) {
      return false;
    }
    --utf8_bytes_remaining_;
  } else {
    // utf8_bytes_remaining_ is only ever set to 0..3 above.
    abort();
  }
  string_.push_back(static_cast<uint8_t>(c));
  return true;
}

@ -23,6 +23,8 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/strings/match.h"
#include <grpc/support/alloc.h>
#include <grpc/support/log.h>
#include <grpc/support/string_util.h>
@ -96,6 +98,15 @@ TEST(Json, Utf16) {
"\" \\\\\\u0010\\n\\r\"");
}
/// gMock matcher over a JSON text: matches when Json::Parse() reports a status
/// with code kUnknown whose message contains "JSON parsing failed".
/// NOTE(review): this does not tell invalid UTF-8 apart from other parse
/// failures, so the input under test should be malformed only in its encoding.
///
/// The description follows the gMock convention: `negation` is false when
/// describing what the matcher accepts and true when describing its negation.
MATCHER(ContainsInvalidUtf8,
        absl::StrCat(negation ? "Does not contain" : "Contains",
                     " invalid UTF-8 characters.")) {
  grpc_error_handle error = GRPC_ERROR_NONE;
  // Only the resulting status is inspected; the parsed value is discarded.
  const Json json = Json::Parse(arg, &error);
  return (error.code() == absl::StatusCode::kUnknown) &&
         (absl::StrContains(error.message(), "JSON parsing failed"));
}
TEST(Json, Utf8) {
RunSuccessTest("\"ßâñć௵⇒\"", "ßâñć௵⇒",
"\"\\u00df\\u00e2\\u00f1\\u0107\\u0bf5\\u21d2\"");
@ -109,6 +120,27 @@ TEST(Json, Utf8) {
RunSuccessTest("{\"\\ud834\\udd1e\":0}",
Json::Object{{"\xf0\x9d\x84\x9e", 0}},
"{\"\\ud834\\udd1e\":0}");
/// For UTF-8 characters with length of 1 byte, the range of it is [0x00,
/// 0x7f].
EXPECT_THAT("\"\xa0\"", ContainsInvalidUtf8());
/// For UTF-8 characters with length of 2 bytes, the range of the first byte
/// is [0xc2, 0xdf], and the range of the second byte is [0x80, 0xbf].
EXPECT_THAT("\"\xc0\xbc\"", ContainsInvalidUtf8());
EXPECT_THAT("\"\xbc\xc0\"", ContainsInvalidUtf8());
/// Corner cases for UTF-8 characters with length of 3 bytes.
/// If the first byte is 0xe0, the range of second byte is [0xa0, 0xbf].
EXPECT_THAT("\"\xe0\x80\x80\"", ContainsInvalidUtf8());
/// If the first byte is 0xed, the range of second byte is [0x80, 0x9f].
EXPECT_THAT("\"\xed\xa0\x80\"", ContainsInvalidUtf8());
/// Corner cases for UTF-8 characters with length of 4 bytes.
/// If the first byte is 0xf0, the range of second byte is [0x90, 0xbf].
EXPECT_THAT("\"\xf0\x80\x80\x80\"", ContainsInvalidUtf8());
/// If the first byte is 0xf4, the range of second byte is [0x80, 0x8f].
EXPECT_THAT("\"\xf4\x90\x80\x80\"", ContainsInvalidUtf8());
}
TEST(Json, NestedEmptyContainers) {

Loading…
Cancel
Save