Detect invalid UTF-8 characters (#30307)

* Detect invalid UTF-8 characters
* Fix checking bug
* Add unit test
* Add more comments and descriptions
* Add corpus
* Small fix to improve readability
* Fix sanity check
2 years ago · 0536a75167
parent 375e649011
commit 0536a75167
3 changed files with 80 additions and 22 deletions
--- a/src/core/lib/json/json_reader.cc
+++ b/src/core/lib/json/json_reader.cc
@ -128,6 +128,7 @@ class JsonReader {
  std::vector<grpc_error_handle> errors_;
  bool truncated_errors_ = false;
  uint8_t utf8_bytes_remaining_ = 0;
+  uint8_t utf8_first_byte_ = 0;

  Json root_value_;
  std::vector<Json*> stack_;
@ -137,29 +138,53 @@ class JsonReader {
 };

 bool JsonReader::StringAddChar(uint32_t c) {
-  switch (utf8_bytes_remaining_) {
-    case 0:
-      if ((c & 0x80) == 0) {
-        utf8_bytes_remaining_ = 0;
-      } else if ((c & 0xe0) == 0xc0) {
-        utf8_bytes_remaining_ = 1;
-      } else if ((c & 0xf0) == 0xe0) {
-        utf8_bytes_remaining_ = 2;
-      } else if ((c & 0xf8) == 0xf0) {
-        utf8_bytes_remaining_ = 3;
-      } else {
-        return false;
-      }
-      break;
-    case 1:
-    case 2:
-    case 3:
-      if ((c & 0xc0) != 0x80) return false;
-      --utf8_bytes_remaining_;
-      break;
-    default:
-      abort();
+  /// Appends the byte `c` to string_ while checking, one byte at a time, that
+  /// the bytes seen so far can still form well-formed UTF-8 (Table 3-7 in
+  /// https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf). Returns false,
+  /// without appending, as soon as `c` cannot be part of a well-formed
+  /// sequence. utf8_bytes_remaining_ counts the continuation bytes still
+  /// expected; utf8_first_byte_ remembers the lead byte, because the valid
+  /// range of the second byte depends on it.
+  if (utf8_bytes_remaining_ == 0) {
+    if ((c & 0x80) == 0) {
+      /// Single-byte character: no continuation bytes follow.
+      utf8_bytes_remaining_ = 0;
+    } else if ((c & 0xe0) == 0xc0 && c >= 0xc2) {
+      /// For the UTF-8 characters with length of 2 bytes, the range of the
+      /// first byte is [0xc2, 0xdf]. Reference: Table 3-7 in
+      /// https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
+      utf8_bytes_remaining_ = 1;
+    } else if ((c & 0xf0) == 0xe0) {
+      utf8_bytes_remaining_ = 2;
+    } else if ((c & 0xf8) == 0xf0 && c <= 0xf4) {
+      /// For the UTF-8 characters with length of 4 bytes, the range of the
+      /// first byte is [0xf0, 0xf4]. The bytes 0xf5..0xf7 match the same bit
+      /// pattern but are never valid lead bytes. Reference: Table 3-7 in
+      /// https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
+      utf8_bytes_remaining_ = 3;
+    } else {
+      return false;
+    }
+    utf8_first_byte_ = static_cast<uint8_t>(c);
+  } else if (utf8_bytes_remaining_ == 1) {
+    if ((c & 0xc0) != 0x80) {
+      return false;
+    }
+    --utf8_bytes_remaining_;
+  } else if (utf8_bytes_remaining_ == 2) {
+    /// For UTF-8 characters starting with 0xe0, their length is 3 bytes, and
+    /// the range of the second byte is [0xa0, 0xbf]. For UTF-8 characters
+    /// starting with 0xed, their length is 3 bytes, and the range of the second
+    /// byte is [0x80, 0x9f]. Reference: Table 3-7 in
+    /// https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
+    /// This branch also handles the third byte of a 4-byte character; the
+    /// lead byte is then 0xf0 or above, so only the generic continuation-byte
+    /// check applies.
+    if (((c & 0xc0) != 0x80) || (utf8_first_byte_ == 0xe0 && c < 0xa0) ||
+        (utf8_first_byte_ == 0xed && c > 0x9f)) {
+      return false;
+    }
+    --utf8_bytes_remaining_;
+  } else if (utf8_bytes_remaining_ == 3) {
+    /// For UTF-8 characters starting with 0xf0, their length is 4 bytes, and
+    /// the range of the second byte is [0x90, 0xbf]. For UTF-8 characters
+    /// starting with 0xf4, their length is 4 bytes, and the range of the second
+    /// byte is [0x80, 0x8f]. Reference: Table 3-7 in
+    /// https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
+    if (((c & 0xc0) != 0x80) || (utf8_first_byte_ == 0xf0 && c < 0x90) ||
+        (utf8_first_byte_ == 0xf4 && c > 0x8f)) {
+      return false;
+    }
+    --utf8_bytes_remaining_;
+  } else {
+    /// utf8_bytes_remaining_ is only ever set to 0..3 above.
+    abort();
  }
+
  string_.push_back(static_cast<uint8_t>(c));
  return true;
 }
--- a/test/core/json/corpus/testcase-5115340413861888
+++ b/test/core/json/corpus/testcase-5115340413861888
@ -0,0 +1 @@
+"<EFBFBD><EFBFBD>"
--- a/test/core/json/json_test.cc
+++ b/test/core/json/json_test.cc
@ -23,6 +23,8 @@
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>

+#include "absl/strings/match.h"
+
 #include <grpc/support/alloc.h>
 #include <grpc/support/log.h>
 #include <grpc/support/string_util.h>
@ -96,6 +98,15 @@ TEST(Json, Utf16) {
                 "\" \\\\\\u0010\\n\\r\"");
 }

+/// gMock matcher that passes when parsing `arg` as JSON fails with status
+/// code kUnknown and a message containing "JSON parsing failed". It does not
+/// inspect the reason for the failure, so any such parse error satisfies it.
+/// The description follows the gMock convention: `negation` is true when the
+/// negated matcher is being described.
+MATCHER(ContainsInvalidUtf8,
+        absl::StrCat(negation ? "Does not contain" : "Contains",
+                     " invalid UTF-8 characters.")) {
+  grpc_error_handle error = GRPC_ERROR_NONE;
+  const Json json = Json::Parse(arg, &error);
+  return (error.code() == absl::StatusCode::kUnknown) &&
+         (absl::StrContains(error.message(), "JSON parsing failed"));
+}
+
 TEST(Json, Utf8) {
  RunSuccessTest("\"ßâñć௵⇒\"", "ßâñć௵⇒",
                 "\"\\u00df\\u00e2\\u00f1\\u0107\\u0bf5\\u21d2\"");
@ -109,6 +120,27 @@ TEST(Json, Utf8) {
  RunSuccessTest("{\"\\ud834\\udd1e\":0}",
                 Json::Object{{"\xf0\x9d\x84\x9e", 0}},
                 "{\"\\ud834\\udd1e\":0}");
+
+  /// A 1-byte UTF-8 character must lie in [0x00, 0x7f], so the lone byte 0xa0
+  /// is not a valid character on its own.
+  EXPECT_THAT("\"\xa0\"", ContainsInvalidUtf8());
+
+  /// A 2-byte UTF-8 character has a first byte in [0xc2, 0xdf] and a second
+  /// byte in [0x80, 0xbf]; neither 0xc0 nor 0xbc is a valid first byte.
+  EXPECT_THAT("\"\xc0\xbc\"", ContainsInvalidUtf8());
+  EXPECT_THAT("\"\xbc\xc0\"", ContainsInvalidUtf8());
+
+  /// Corner cases for 3-byte UTF-8 characters.
+  /// After a first byte of 0xe0, the second byte must lie in [0xa0, 0xbf].
+  EXPECT_THAT("\"\xe0\x80\x80\"", ContainsInvalidUtf8());
+  /// After a first byte of 0xed, the second byte must lie in [0x80, 0x9f].
+  EXPECT_THAT("\"\xed\xa0\x80\"", ContainsInvalidUtf8());
+
+  /// Corner cases for 4-byte UTF-8 characters.
+  /// After a first byte of 0xf0, the second byte must lie in [0x90, 0xbf].
+  EXPECT_THAT("\"\xf0\x80\x80\x80\"", ContainsInvalidUtf8());
+  /// After a first byte of 0xf4, the second byte must lie in [0x80, 0x8f].
+  EXPECT_THAT("\"\xf4\x90\x80\x80\"", ContainsInvalidUtf8());
 }

 TEST(Json, NestedEmptyContainers) {