// Protocol Buffers - Google's data interchange format // Copyright 2023 Google LLC. All rights reserved. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file or at // https://developers.google.com/open-source/licenses/bsd #include "upb/io/tokenizer.h" #include #include "absl/strings/escaping.h" #include "absl/strings/str_format.h" #include "upb/io/chunked_input_stream.h" #include "upb/io/string.h" #include "upb/lex/unicode.h" #include "upb/mem/arena.hpp" // Must be last. #include "upb/port/def.inc" namespace google { namespace protobuf { namespace io { namespace { #ifndef arraysize #define arraysize(a) (sizeof(a) / sizeof(a[0])) #endif static bool StringEquals(const char* a, const char* b) { return strcmp(a, b) == 0; } // =================================================================== // Data-Driven Test Infrastructure // TODO: This is copied from coded_stream_unittest. This is // temporary until these features are integrated into gTest itself. // TEST_1D and TEST_2D are macros I'd eventually like to see added to // gTest. These macros can be used to declare tests which should be // run multiple times, once for each item in some input array. TEST_1D // tests all cases in a single input array. TEST_2D tests all // combinations of cases from two arrays. The arrays must be statically // defined such that the arraysize() macro works on them. Example: // // int kCases[] = {1, 2, 3, 4} // TEST_1D(MyFixture, MyTest, kCases) { // EXPECT_GT(kCases_case, 0); // } // // This test iterates through the numbers 1, 2, 3, and 4 and tests that // they are all grater than zero. In case of failure, the exact case // which failed will be printed. The case type must be printable using // ostream::operator<<. #define TEST_1D(FIXTURE, NAME, CASES) \ class FIXTURE##_##NAME##_DD : public FIXTURE { \ protected: \ template \ void DoSingleCase(const CaseType& CASES##_case); \ }; \ \ TEST_F(FIXTURE##_##NAME##_DD, NAME) { \ for (size_t i = 0; i < arraysize(CASES); i++) { \ SCOPED_TRACE(testing::Message() \ << #CASES " case #" << i << ": " << CASES[i]); \ DoSingleCase(CASES[i]); \ } \ } \ \ template \ void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case) #define TEST_2D(FIXTURE, NAME, CASES1, CASES2) \ class FIXTURE##_##NAME##_DD : public FIXTURE { \ protected: \ template \ void DoSingleCase(const CaseType1& CASES1##_case, \ const CaseType2& CASES2##_case); \ }; \ \ TEST_F(FIXTURE##_##NAME##_DD, NAME) { \ for (size_t i = 0; i < arraysize(CASES1); i++) { \ for (size_t j = 0; j < arraysize(CASES2); j++) { \ SCOPED_TRACE(testing::Message() \ << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \ << #CASES2 " case #" << j << ": " << CASES2[j]); \ DoSingleCase(CASES1[i], CASES2[j]); \ } \ } \ } \ \ template \ void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \ const CaseType2& CASES2##_case) // ------------------------------------------------------------------- // In C, a size of zero from ZCIS_Next() means EOF so we can't play the same // trick here that happens in the C++ version. Use ChunkedInputStream instead. upb_ZeroCopyInputStream* TestInputStream(const void* data, size_t size, size_t block_size, upb_Arena* arena) { return upb_ChunkedInputStream_New(data, size, block_size, arena); } // ------------------------------------------------------------------- // We test each operation over a variety of block sizes to insure that // we test cases where reads cross buffer boundaries as well as cases // where they don't. This is sort of a brute-force approach to this, // but it's easy to write and easy to understand. const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024}; class TokenizerTest : public testing::Test { protected: // For easy testing. uint64_t ParseInteger(const std::string& text) { uint64_t result; EXPECT_TRUE(upb_Parse_Integer(text.data(), UINT64_MAX, &result)) << "'" << text << "'"; return result; } }; // =================================================================== // These tests causes gcc 3.3.5 (and earlier?) to give the cryptic error: // "sorry, unimplemented: `method_call_expr' not supported by dump_expr" #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3) // In each test case, the entire input text should parse as a single token // of the given type. struct SimpleTokenCase { std::string input; upb_TokenType type; }; inline std::ostream& operator<<(std::ostream& out, const SimpleTokenCase& test_case) { return out << absl::CEscape(test_case.input); } SimpleTokenCase kSimpleTokenCases[] = { // Test identifiers. {"hello", kUpb_TokenType_Identifier}, // Test integers. {"123", kUpb_TokenType_Integer}, {"0xab6", kUpb_TokenType_Integer}, {"0XAB6", kUpb_TokenType_Integer}, {"0X1234567", kUpb_TokenType_Integer}, {"0x89abcdef", kUpb_TokenType_Integer}, {"0x89ABCDEF", kUpb_TokenType_Integer}, {"01234567", kUpb_TokenType_Integer}, // Test floats. {"123.45", kUpb_TokenType_Float}, {"1.", kUpb_TokenType_Float}, {"1e3", kUpb_TokenType_Float}, {"1E3", kUpb_TokenType_Float}, {"1e-3", kUpb_TokenType_Float}, {"1e+3", kUpb_TokenType_Float}, {"1.e3", kUpb_TokenType_Float}, {"1.2e3", kUpb_TokenType_Float}, {".1", kUpb_TokenType_Float}, {".1e3", kUpb_TokenType_Float}, {".1e-3", kUpb_TokenType_Float}, {".1e+3", kUpb_TokenType_Float}, // Test strings. {"'hello'", kUpb_TokenType_String}, {"\"foo\"", kUpb_TokenType_String}, {"'a\"b'", kUpb_TokenType_String}, {"\"a'b\"", kUpb_TokenType_String}, {"'a\\'b'", kUpb_TokenType_String}, {"\"a\\\"b\"", kUpb_TokenType_String}, {"'\\xf'", kUpb_TokenType_String}, {"'\\0'", kUpb_TokenType_String}, // Test symbols. {"+", kUpb_TokenType_Symbol}, {".", kUpb_TokenType_Symbol}, }; TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) { upb::Arena arena; // Set up the tokenizer. auto input = TestInputStream(kSimpleTokenCases_case.input.data(), kSimpleTokenCases_case.input.size(), kBlockSizes_case, arena.ptr()); auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr()); // Before Next() is called, the initial token should always be TYPE_START. EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start); EXPECT_EQ(upb_Tokenizer_Line(t), 0); EXPECT_EQ(upb_Tokenizer_Column(t), 0); EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0); EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "")); // Parse the token. EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr)); // Check that it has the right type. EXPECT_EQ(upb_Tokenizer_Type(t), kSimpleTokenCases_case.type); // Check that it contains the complete input text. EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), kSimpleTokenCases_case.input.data())); // Check that it is located at the beginning of the input EXPECT_EQ(upb_Tokenizer_Line(t), 0); EXPECT_EQ(upb_Tokenizer_Column(t), 0); EXPECT_EQ(upb_Tokenizer_EndColumn(t), kSimpleTokenCases_case.input.size()); upb_Status status; upb_Status_Clear(&status); // There should be no more input and no errors.. EXPECT_FALSE(upb_Tokenizer_Next(t, &status)); EXPECT_TRUE(upb_Status_IsOk(&status)); // After Next() returns false, the token should have type TYPE_END. EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_End); EXPECT_EQ(upb_Tokenizer_Line(t), 0); EXPECT_EQ(upb_Tokenizer_Column(t), kSimpleTokenCases_case.input.size()); EXPECT_EQ(upb_Tokenizer_EndColumn(t), kSimpleTokenCases_case.input.size()); EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "")); } TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) { // Test the "allow_f_after_float" option. // Set up the tokenizer. upb::Arena arena; const char* text = "1f 2.5f 6e3f 7F"; auto input = TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr()); const int options = kUpb_TokenizerOption_AllowFAfterFloat; auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr()); // Advance through tokens and check that they are parsed as expected. EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr)); EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float); EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "1f")); EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr)); EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float); EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "2.5f")); EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr)); EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float); EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "6e3f")); EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr)); EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float); EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "7F")); upb_Status status; upb_Status_Clear(&status); // There should be no more input and no errors.. EXPECT_FALSE(upb_Tokenizer_Next(t, &status)); EXPECT_TRUE(upb_Status_IsOk(&status)); } SimpleTokenCase kWhitespaceTokenCases[] = { {" ", kUpb_TokenType_Whitespace}, {" ", kUpb_TokenType_Whitespace}, {"\t", kUpb_TokenType_Whitespace}, {"\v", kUpb_TokenType_Whitespace}, {"\t ", kUpb_TokenType_Whitespace}, {"\v\t", kUpb_TokenType_Whitespace}, {" \t\r", kUpb_TokenType_Whitespace}, // Newlines: {"\n", kUpb_TokenType_Newline}, }; TEST_2D(TokenizerTest, Whitespace, kWhitespaceTokenCases, kBlockSizes) { upb::Arena arena; { auto input = TestInputStream(kWhitespaceTokenCases_case.input.data(), kWhitespaceTokenCases_case.input.size(), kBlockSizes_case, arena.ptr()); auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr()); EXPECT_FALSE(upb_Tokenizer_Next(t, nullptr)); } { auto input = TestInputStream(kWhitespaceTokenCases_case.input.data(), kWhitespaceTokenCases_case.input.size(), kBlockSizes_case, arena.ptr()); const int options = kUpb_TokenizerOption_ReportNewlines; auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr()); EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr)); EXPECT_EQ(upb_Tokenizer_Type(t), kWhitespaceTokenCases_case.type); EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), kWhitespaceTokenCases_case.input.data())); EXPECT_FALSE(upb_Tokenizer_Next(t, nullptr)); } } #endif // ------------------------------------------------------------------- struct TokenFields { upb_TokenType type; std::string text; size_t line; size_t column; size_t end_column; }; // In each case, the input is parsed to produce a list of tokens. The // last token in "output" must have type TYPE_END. struct MultiTokenCase { std::string input; std::vector output; }; inline std::ostream& operator<<(std::ostream& out, const MultiTokenCase& test_case) { return out << absl::CEscape(test_case.input); } MultiTokenCase kMultiTokenCases[] = { // Test empty input. {"", { {kUpb_TokenType_End, "", 0, 0, 0}, }}, // Test all token types at the same time. {"foo 1 1.2 + 'bar'", { {kUpb_TokenType_Identifier, "foo", 0, 0, 3}, {kUpb_TokenType_Integer, "1", 0, 4, 5}, {kUpb_TokenType_Float, "1.2", 0, 6, 9}, {kUpb_TokenType_Symbol, "+", 0, 10, 11}, {kUpb_TokenType_String, "'bar'", 0, 12, 17}, {kUpb_TokenType_End, "", 0, 17, 17}, }}, // Test that consecutive symbols are parsed as separate tokens. {"!@+%", { {kUpb_TokenType_Symbol, "!", 0, 0, 1}, {kUpb_TokenType_Symbol, "@", 0, 1, 2}, {kUpb_TokenType_Symbol, "+", 0, 2, 3}, {kUpb_TokenType_Symbol, "%", 0, 3, 4}, {kUpb_TokenType_End, "", 0, 4, 4}, }}, // Test that newlines affect line numbers correctly. {"foo bar\nrab oof", { {kUpb_TokenType_Identifier, "foo", 0, 0, 3}, {kUpb_TokenType_Identifier, "bar", 0, 4, 7}, {kUpb_TokenType_Identifier, "rab", 1, 0, 3}, {kUpb_TokenType_Identifier, "oof", 1, 4, 7}, {kUpb_TokenType_End, "", 1, 7, 7}, }}, // Test that tabs affect column numbers correctly. {"foo\tbar \tbaz", { {kUpb_TokenType_Identifier, "foo", 0, 0, 3}, {kUpb_TokenType_Identifier, "bar", 0, 8, 11}, {kUpb_TokenType_Identifier, "baz", 0, 16, 19}, {kUpb_TokenType_End, "", 0, 19, 19}, }}, // Test that tabs in string literals affect column numbers correctly. {"\"foo\tbar\" baz", { {kUpb_TokenType_String, "\"foo\tbar\"", 0, 0, 12}, {kUpb_TokenType_Identifier, "baz", 0, 13, 16}, {kUpb_TokenType_End, "", 0, 16, 16}, }}, // Test that line comments are ignored. {"foo // This is a comment\n" "bar // This is another comment", { {kUpb_TokenType_Identifier, "foo", 0, 0, 3}, {kUpb_TokenType_Identifier, "bar", 1, 0, 3}, {kUpb_TokenType_End, "", 1, 30, 30}, }}, // Test that block comments are ignored. {"foo /* This is a block comment */ bar", { {kUpb_TokenType_Identifier, "foo", 0, 0, 3}, {kUpb_TokenType_Identifier, "bar", 0, 34, 37}, {kUpb_TokenType_End, "", 0, 37, 37}, }}, // Test that sh-style comments are not ignored by default. {"foo # bar\n" "baz", { {kUpb_TokenType_Identifier, "foo", 0, 0, 3}, {kUpb_TokenType_Symbol, "#", 0, 4, 5}, {kUpb_TokenType_Identifier, "bar", 0, 6, 9}, {kUpb_TokenType_Identifier, "baz", 1, 0, 3}, {kUpb_TokenType_End, "", 1, 3, 3}, }}, // Test all whitespace chars {"foo\n\t\r\v\fbar", { {kUpb_TokenType_Identifier, "foo", 0, 0, 3}, {kUpb_TokenType_Identifier, "bar", 1, 11, 14}, {kUpb_TokenType_End, "", 1, 14, 14}, }}, }; TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) { // Set up the tokenizer. upb::Arena arena; auto input = TestInputStream(kMultiTokenCases_case.input.data(), kMultiTokenCases_case.input.size(), kBlockSizes_case, arena.ptr()); auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr()); // Before Next() is called, the initial token should always be TYPE_START. EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start); EXPECT_EQ(upb_Tokenizer_Line(t), 0); EXPECT_EQ(upb_Tokenizer_Column(t), 0); EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0); EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "")); // Loop through all expected tokens. TokenFields token_fields; upb_Status status; upb_Status_Clear(&status); int i = 0; do { token_fields = kMultiTokenCases_case.output[i++]; SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << absl::CEscape(token_fields.text)); // Next() should only return false when it hits the end token. if (token_fields.type == kUpb_TokenType_End) { EXPECT_FALSE(upb_Tokenizer_Next(t, &status)); EXPECT_TRUE(upb_Status_IsOk(&status)); } else { EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr)); } // Check that the token matches the expected one. EXPECT_EQ(upb_Tokenizer_Type(t), token_fields.type); EXPECT_EQ(upb_Tokenizer_Line(t), token_fields.line); EXPECT_EQ(upb_Tokenizer_Column(t), token_fields.column); EXPECT_EQ(upb_Tokenizer_EndColumn(t), token_fields.end_column); EXPECT_EQ(upb_Tokenizer_TextSize(t), token_fields.text.size()); EXPECT_TRUE( StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data())); } while (token_fields.type != kUpb_TokenType_End); } MultiTokenCase kMultiWhitespaceTokenCases[] = { // Test all token types at the same time. {"foo 1 \t1.2 \n +\v'bar'", { {kUpb_TokenType_Identifier, "foo", 0, 0, 3}, {kUpb_TokenType_Whitespace, " ", 0, 3, 4}, {kUpb_TokenType_Integer, "1", 0, 4, 5}, {kUpb_TokenType_Whitespace, " \t", 0, 5, 8}, {kUpb_TokenType_Float, "1.2", 0, 8, 11}, {kUpb_TokenType_Whitespace, " ", 0, 11, 13}, {kUpb_TokenType_Newline, "\n", 0, 13, 0}, {kUpb_TokenType_Whitespace, " ", 1, 0, 3}, {kUpb_TokenType_Symbol, "+", 1, 3, 4}, {kUpb_TokenType_Whitespace, "\v", 1, 4, 5}, {kUpb_TokenType_String, "'bar'", 1, 5, 10}, {kUpb_TokenType_End, "", 1, 10, 10}, }}, }; TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases, kBlockSizes) { // Set up the tokenizer. upb::Arena arena; auto input = TestInputStream(kMultiWhitespaceTokenCases_case.input.data(), kMultiWhitespaceTokenCases_case.input.size(), kBlockSizes_case, arena.ptr()); const int options = kUpb_TokenizerOption_ReportNewlines; auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr()); // Before Next() is called, the initial token should always be TYPE_START. EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start); EXPECT_EQ(upb_Tokenizer_Line(t), 0); EXPECT_EQ(upb_Tokenizer_Column(t), 0); EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0); EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "")); // Loop through all expected tokens. TokenFields token_fields; upb_Status status; upb_Status_Clear(&status); int i = 0; do { token_fields = kMultiWhitespaceTokenCases_case.output[i++]; SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token_fields.text); // Next() should only return false when it hits the end token. if (token_fields.type == kUpb_TokenType_End) { EXPECT_FALSE(upb_Tokenizer_Next(t, &status)); EXPECT_TRUE(upb_Status_IsOk(&status)); } else { EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr)); } // Check that the token matches the expected one. EXPECT_EQ(upb_Tokenizer_Type(t), token_fields.type); EXPECT_EQ(upb_Tokenizer_Line(t), token_fields.line); EXPECT_EQ(upb_Tokenizer_Column(t), token_fields.column); EXPECT_EQ(upb_Tokenizer_EndColumn(t), token_fields.end_column); EXPECT_TRUE( StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data())); } while (token_fields.type != kUpb_TokenType_End); } // This test causes gcc 3.3.5 (and earlier?) to give the cryptic error: // "sorry, unimplemented: `method_call_expr' not supported by dump_expr" #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3) TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) { // Test the "comment_style" option. const char* text = "foo # bar\n" "baz // qux\n" "corge /* grault */\n" "garply"; const char* const kTokens[] = {"foo", // "# bar" is ignored "baz", "/", "/", "qux", "corge", "/", "*", "grault", "*", "/", "garply"}; // Set up the tokenizer. upb::Arena arena; auto input = TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr()); const int options = kUpb_TokenizerOption_CommentStyleShell; auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr()); // Advance through tokens and check that they are parsed as expected. for (size_t i = 0; i < arraysize(kTokens); i++) { EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr)); EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), kTokens[i])); } // There should be no more input and no errors. upb_Status status; upb_Status_Clear(&status); EXPECT_FALSE(upb_Tokenizer_Next(t, &status)); EXPECT_TRUE(upb_Status_IsOk(&status)); } #endif // ------------------------------------------------------------------- #if 0 // TODO: Extended comments are currently unimplemented. // In each case, the input is expected to have two tokens named "prev" and // "next" with comments in between. struct DocCommentCase { std::string input; const char* prev_trailing_comments; const char* detached_comments[10]; const char* next_leading_comments; }; inline std::ostream& operator<<(std::ostream& out, const DocCommentCase& test_case) { return out << absl::CEscape(test_case.input); } DocCommentCase kDocCommentCases[] = { {"prev next", "", {}, ""}, {"prev /* ignored */ next", "", {}, ""}, {"prev // trailing comment\n" "next", " trailing comment\n", {}, ""}, {"prev\n" "// leading comment\n" "// line 2\n" "next", "", {}, " leading comment\n" " line 2\n"}, {"prev\n" "// trailing comment\n" "// line 2\n" "\n" "next", " trailing comment\n" " line 2\n", {}, ""}, {"prev // trailing comment\n" "// leading comment\n" "// line 2\n" "next", " trailing comment\n", {}, " leading comment\n" " line 2\n"}, {"prev /* trailing block comment */\n" "/* leading block comment\n" " * line 2\n" " * line 3 */" "next", " trailing block comment ", {}, " leading block comment\n" " line 2\n" " line 3 "}, {"prev\n" "/* trailing block comment\n" " * line 2\n" " * line 3\n" " */\n" "/* leading block comment\n" " * line 2\n" " * line 3 */" "next", " trailing block comment\n" " line 2\n" " line 3\n", {}, " leading block comment\n" " line 2\n" " line 3 "}, {"prev\n" "// trailing comment\n" "\n" "// detached comment\n" "// line 2\n" "\n" "// second detached comment\n" "/* third detached comment\n" " * line 2 */\n" "// leading comment\n" "next", " trailing comment\n", {" detached comment\n" " line 2\n", " second detached comment\n", " third detached comment\n" " line 2 "}, " leading comment\n"}, {"prev /**/\n" "\n" "// detached comment\n" "\n" "// leading comment\n" "next", "", {" detached comment\n"}, " leading comment\n"}, {"prev /**/\n" "// leading comment\n" "next", "", {}, " leading comment\n"}, }; TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) { // Set up the tokenizer. TestInputStream input(kDocCommentCases_case.input.data(), kDocCommentCases_case.input.size(), kBlockSizes_case); TestErrorCollector error_collector; Tokenizer tokenizer(&input, &error_collector); // Set up a second tokenizer where we'll pass all NULLs to NextWithComments(). TestInputStream input2(kDocCommentCases_case.input.data(), kDocCommentCases_case.input.size(), kBlockSizes_case); Tokenizer tokenizer2(&input2, &error_collector); tokenizer.Next(); tokenizer2.Next(); EXPECT_EQ("prev", tokenizer.current().text); EXPECT_EQ("prev", tokenizer2.current().text); std::string prev_trailing_comments; std::vector detached_comments; std::string next_leading_comments; tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments, &next_leading_comments); tokenizer2.NextWithComments(nullptr, nullptr, nullptr); EXPECT_EQ("next", tokenizer.current().text); EXPECT_EQ("next", tokenizer2.current().text); EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments, prev_trailing_comments); for (int i = 0; i < detached_comments.size(); i++) { EXPECT_LT(i, arraysize(kDocCommentCases)); EXPECT_TRUE(kDocCommentCases_case.detached_comments[i] != nullptr); EXPECT_EQ(kDocCommentCases_case.detached_comments[i], detached_comments[i]); } // Verify that we matched all the detached comments. EXPECT_EQ(nullptr, kDocCommentCases_case.detached_comments[detached_comments.size()]); EXPECT_EQ(kDocCommentCases_case.next_leading_comments, next_leading_comments); } #endif // 0 // ------------------------------------------------------------------- // Test parse helpers. // TODO: Add a fuzz test for this. TEST_F(TokenizerTest, ParseInteger) { EXPECT_EQ(0, ParseInteger("0")); EXPECT_EQ(123, ParseInteger("123")); EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12")); EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12")); EXPECT_EQ(UINT64_MAX, ParseInteger("0xFFFFFFFFFFFFFFFF")); EXPECT_EQ(01234567, ParseInteger("01234567")); EXPECT_EQ(0X123, ParseInteger("0X123")); // Test invalid integers that may still be tokenized as integers. EXPECT_EQ(0, ParseInteger("0x")); uint64_t i; // Test invalid integers that will never be tokenized as integers. EXPECT_FALSE(upb_Parse_Integer("zxy", UINT64_MAX, &i)); EXPECT_FALSE(upb_Parse_Integer("1.2", UINT64_MAX, &i)); EXPECT_FALSE(upb_Parse_Integer("08", UINT64_MAX, &i)); EXPECT_FALSE(upb_Parse_Integer("0xg", UINT64_MAX, &i)); EXPECT_FALSE(upb_Parse_Integer("-1", UINT64_MAX, &i)); // Test overflows. EXPECT_TRUE(upb_Parse_Integer("0", 0, &i)); EXPECT_FALSE(upb_Parse_Integer("1", 0, &i)); EXPECT_TRUE(upb_Parse_Integer("1", 1, &i)); EXPECT_TRUE(upb_Parse_Integer("12345", 12345, &i)); EXPECT_FALSE(upb_Parse_Integer("12346", 12345, &i)); EXPECT_TRUE(upb_Parse_Integer("0xFFFFFFFFFFFFFFFF", UINT64_MAX, &i)); EXPECT_FALSE(upb_Parse_Integer("0x10000000000000000", UINT64_MAX, &i)); // Test near the limits of signed parsing (values in INT64_MAX +/- 1600) for (int64_t offset = -1600; offset <= 1600; ++offset) { // We make sure to perform an unsigned addition so that we avoid signed // overflow, which would be undefined behavior. uint64_t i = 0x7FFFFFFFFFFFFFFFu + static_cast(offset); char decimal[32]; snprintf(decimal, 32, "%llu", static_cast(i)); if (offset > 0) { uint64_t parsed = -1; EXPECT_FALSE(upb_Parse_Integer(decimal, INT64_MAX, &parsed)) << decimal << "=>" << parsed; } else { uint64_t parsed = -1; EXPECT_TRUE(upb_Parse_Integer(decimal, INT64_MAX, &parsed)) << decimal << "=>" << parsed; EXPECT_EQ(parsed, i); } char octal[32]; snprintf(octal, 32, "0%llo", static_cast(i)); if (offset > 0) { uint64_t parsed = -1; EXPECT_FALSE(upb_Parse_Integer(octal, INT64_MAX, &parsed)) << octal << "=>" << parsed; } else { uint64_t parsed = -1; EXPECT_TRUE(upb_Parse_Integer(octal, INT64_MAX, &parsed)) << octal << "=>" << parsed; EXPECT_EQ(parsed, i); } char hex[32]; snprintf(hex, 32, "0x%llx", static_cast(i)); if (offset > 0) { uint64_t parsed = -1; EXPECT_FALSE(upb_Parse_Integer(hex, INT64_MAX, &parsed)) << hex << "=>" << parsed; } else { uint64_t parsed = -1; EXPECT_TRUE(upb_Parse_Integer(hex, INT64_MAX, &parsed)) << hex; EXPECT_EQ(parsed, i); } // EXPECT_NE(offset, -237); } // Test near the limits of unsigned parsing (values in UINT64_MAX +/- 1600) // By definition, values greater than UINT64_MAX cannot be held in a uint64_t // variable, so printing them is a little tricky; fortunately all but the // last four digits are known, so we can hard-code them in the printf string, // and we only need to format the last 4. for (int64_t offset = -1600; offset <= 1600; ++offset) { { uint64_t i = 18446744073709551615u + offset; char decimal[32]; snprintf(decimal, 32, "1844674407370955%04llu", static_cast(1615 + offset)); if (offset > 0) { uint64_t parsed = -1; EXPECT_FALSE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed)) << decimal << "=>" << parsed; } else { uint64_t parsed = -1; EXPECT_TRUE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed)) << decimal; EXPECT_EQ(parsed, i); } } { uint64_t i = 01777777777777777777777u + offset; if (offset > 0) { char octal[32]; snprintf(octal, 32, "0200000000000000000%04llo", static_cast(offset - 1)); uint64_t parsed = -1; EXPECT_FALSE(upb_Parse_Integer(octal, UINT64_MAX, &parsed)) << octal << "=>" << parsed; } else { char octal[32]; snprintf(octal, 32, "0%llo", static_cast(i)); uint64_t parsed = -1; EXPECT_TRUE(upb_Parse_Integer(octal, UINT64_MAX, &parsed)) << octal; EXPECT_EQ(parsed, i); } } { uint64_t ui = 0xffffffffffffffffu + offset; char hex[32]; if (offset > 0) { snprintf(hex, 32, "0x1000000000000%04llx", static_cast(offset - 1)); uint64_t parsed = -1; EXPECT_FALSE(upb_Parse_Integer(hex, UINT64_MAX, &parsed)) << hex << "=>" << parsed; } else { snprintf(hex, 32, "0x%llx", static_cast(ui)); uint64_t parsed = -1; EXPECT_TRUE(upb_Parse_Integer(hex, UINT64_MAX, &parsed)) << hex; EXPECT_EQ(parsed, ui); } } } } TEST_F(TokenizerTest, ParseFloat) { EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.")); EXPECT_DOUBLE_EQ(1e3, upb_Parse_Float("1e3")); EXPECT_DOUBLE_EQ(1e3, upb_Parse_Float("1E3")); EXPECT_DOUBLE_EQ(1.5e3, upb_Parse_Float("1.5e3")); EXPECT_DOUBLE_EQ(.1, upb_Parse_Float(".1")); EXPECT_DOUBLE_EQ(.25, upb_Parse_Float(".25")); EXPECT_DOUBLE_EQ(.1e3, upb_Parse_Float(".1e3")); EXPECT_DOUBLE_EQ(.25e3, upb_Parse_Float(".25e3")); EXPECT_DOUBLE_EQ(.1e+3, upb_Parse_Float(".1e+3")); EXPECT_DOUBLE_EQ(.1e-3, upb_Parse_Float(".1e-3")); EXPECT_DOUBLE_EQ(5, upb_Parse_Float("5")); EXPECT_DOUBLE_EQ(6e-12, upb_Parse_Float("6e-12")); EXPECT_DOUBLE_EQ(1.2, upb_Parse_Float("1.2")); EXPECT_DOUBLE_EQ(1.e2, upb_Parse_Float("1.e2")); // Test invalid integers that may still be tokenized as integers. EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1e")); EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1e-")); EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.e")); // Test 'f' suffix. EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1f")); EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.0f")); EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1F")); // These should parse successfully even though they are out of range. // Overflows become infinity and underflows become zero. EXPECT_EQ(0.0, upb_Parse_Float("1e-9999999999999999999999999999")); EXPECT_EQ(HUGE_VAL, upb_Parse_Float("1e+9999999999999999999999999999")); #if GTEST_HAS_DEATH_TEST // death tests do not work on Windows yet // Test invalid integers that will never be tokenized as integers. EXPECT_DEBUG_DEATH( upb_Parse_Float("zxy"), "passed text that could not have been tokenized as a float"); EXPECT_DEBUG_DEATH( upb_Parse_Float("1-e0"), "passed text that could not have been tokenized as a float"); EXPECT_DEBUG_DEATH( upb_Parse_Float("-1.0"), "passed text that could not have been tokenized as a float"); #endif // GTEST_HAS_DEATH_TEST } TEST_F(TokenizerTest, ParseString) { const std::string inputs[] = { "'hello'", "\"blah\\nblah2\"", "'\\1x\\1\\123\\739\\52\\334n\\3'", "'\\x20\\x4'", // Test invalid strings that may still be tokenized as strings. "\"\\a\\l\\v\\t", // \l is invalid "'", "'\\", // Experiment with Unicode escapes. // Here are one-, two- and three-byte Unicode characters. "'\\u0024\\u00a2\\u20ac\\U00024b62XX'", "'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", // Same, encoded using UTF16. // Here's some broken UTF16: a head surrogate with no tail surrogate. // We just output this as if it were UTF8; it's not a defined code point, // but it has a defined encoding. "'\\ud852XX'", // Malformed escape: Demons may fly out of the nose. "'\\u0'", // Beyond the range of valid UTF-32 code units. "'\\U00110000\\U00200000\\UFFFFFFFF'", }; const std::string outputs[] = { "hello", "blah\nblah2", "\1x\1\123\739\52\334n\3", "\x20\x4", "\a?\v\t", "", "\\", "$¢€𤭢XX", "$¢€𤭢XX", "\xed\xa1\x92XX", "u0", "\\U00110000\\U00200000\\Uffffffff", }; upb::Arena arena; for (size_t i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) { auto sv = upb_Parse_String(inputs[i].data(), arena.ptr()); EXPECT_TRUE(StringEquals(sv.data, outputs[i].data())); } // Test invalid strings that will never be tokenized as strings. #if GTEST_HAS_DEATH_TEST // death tests do not work on Windows yet EXPECT_DEBUG_DEATH( upb_Parse_String("", arena.ptr()), "passed text that could not have been tokenized as a string"); #endif // GTEST_HAS_DEATH_TEST } TEST_F(TokenizerTest, ParseStringAppend) { upb::Arena arena; upb_String output; upb_String_Init(&output, arena.ptr()); upb_String_Assign(&output, "stuff+", 6); auto sv = upb_Parse_String("'hello'", arena.ptr()); EXPECT_TRUE(StringEquals(sv.data, "hello")); upb_String_Append(&output, sv.data, sv.size); EXPECT_TRUE(StringEquals(upb_String_Data(&output), "stuff+hello")); } // ------------------------------------------------------------------- // Each case parses some input text, ignoring the tokens produced, and // checks that the error output matches what is expected. struct ErrorCase { std::string input; const char* errors; }; inline std::ostream& operator<<(std::ostream& out, const ErrorCase& test_case) { return out << absl::CEscape(test_case.input); } ErrorCase kErrorCases[] = { // String errors. {"'\\l'", "0:2: Invalid escape sequence in string literal."}, {"'\\X'", "0:2: Invalid escape sequence in string literal."}, {"'\\x'", "0:3: Expected hex digits for escape sequence."}, {"'foo", "0:4: Unexpected end of string."}, {"'bar\nfoo", "0:4: String literals cannot cross line boundaries."}, {"'\\u01'", "0:5: Expected four hex digits for \\u escape sequence."}, {"'\\uXYZ'", "0:3: Expected four hex digits for \\u escape sequence."}, // Integer errors. {"123foo", "0:3: Need space between number and identifier."}, // Hex/octal errors. {"0x foo", "0:2: \"0x\" must be followed by hex digits."}, {"0541823", "0:4: Numbers starting with leading zero must be in octal."}, {"0x123z", "0:5: Need space between number and identifier."}, {"0x123.4", "0:5: Hex and octal numbers must be integers."}, {"0123.4", "0:4: Hex and octal numbers must be integers."}, // Float errors. {"1e foo", "0:2: \"e\" must be followed by exponent."}, {"1e- foo", "0:3: \"e\" must be followed by exponent."}, {"1.2.3", "0:3: Already saw decimal point or exponent; can't have another one."}, {"1e2.3", "0:3: Already saw decimal point or exponent; can't have another one."}, {"a.1", "0:1: Need space between identifier and decimal point."}, // allow_f_after_float not enabled, so this should be an error. {"1.0f", "0:3: Need space between number and identifier."}, // Block comment errors. {"/*", "0:2: End-of-file inside block comment.\n0:0: Comment started here."}, {"/*/*/ foo", "0:3: \"/*\" inside block comment. Block comments cannot be nested."}, // Control characters. Multiple consecutive control characters should only // produce one error. {"\b foo", "0:0: Invalid control characters encountered in text."}, {"\b\b foo", "0:0: Invalid control characters encountered in text."}, // Check that control characters at end of input don't result in an // infinite loop. {"\b", "0:0: Invalid control characters encountered in text."}, // Check recovery from '\0'. We have to explicitly specify the length of // these strings because otherwise the string constructor will just call // strlen() which will see the first '\0' and think that is the end of the // string. {std::string("\0foo", 4), "0:0: Invalid control characters encountered in text."}, {std::string("\0\0foo", 5), "0:0: Invalid control characters encountered in text."}, // Check error from high order bits set {"\300", "0:0: Interpreting non ascii codepoint 192."}, }; TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) { // Set up the tokenizer. upb::Arena arena; auto input = TestInputStream(kErrorCases_case.input.data(), kErrorCases_case.input.size(), kBlockSizes_case, arena.ptr()); auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr()); upb_Status status; upb_Status_Clear(&status); while (upb_Tokenizer_Next(t, &status)) ; // just keep looping EXPECT_TRUE( StringEquals(upb_Status_ErrorMessage(&status), kErrorCases_case.errors)); } // ------------------------------------------------------------------- TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) { const std::string text = "foo bar"; upb::Arena arena; auto input = TestInputStream(text.data(), text.size(), kBlockSizes_case, arena.ptr()); // Create a tokenizer, read one token, then destroy it. auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr()); upb_Tokenizer_Next(t, nullptr); upb_Tokenizer_Fini(t); // Only "foo" should have been read. EXPECT_EQ(strlen("foo"), upb_ZeroCopyInputStream_ByteCount(input)); } static const char* kParseBenchmark[] = { "\"partner-google-mobile-modes-print\"", "\"partner-google-mobile-modes-products\"", "\"partner-google-mobile-modes-realtime\"", "\"partner-google-mobile-modes-video\"", "\"partner-google-modes-news\"", "\"partner-google-modes-places\"", "\"partner-google-news\"", "\"partner-google-print\"", "\"partner-google-products\"", "\"partner-google-realtime\"", "\"partner-google-video\"", "\"true\"", "\"BigImagesHover__js_list\"", "\"XFEExternJsVersionParameters\"", "\"Available versions of the big images hover javascript\"", "\"Version: {\n\"", "\" script_name: \"extern_js/dummy_file_compiled_post20070813.js\"\n\"", "\" version_number: 0\n\"", "\"}\"", "\"BigImagesHover__js_selection\"", "\"XFEExternJsVersionParameters\"", "\"Versioning info for the big images hover javascript.\"", "\"current_version: 0\"", "\"BigImagesHover__js_suppressed\"", "\"Indicates if the client-side javascript associated with big images.\"", "\"true\"", "\"BrowserAnyOf\"", "\"IsChrome5OrAbove\"", "\"IsFirefox3OrAbove\"", "IsIE8OrAboveBinary", "\"Abe \"Sausage King\" Froman\"", "\"Frank \"Meatball\" Febbraro\"", }; TEST(Benchmark, ParseStringAppendAccumulate) { upb::Arena arena; size_t outsize = 0; int benchmark_len = arraysize(kParseBenchmark); for (int i = 0; i < benchmark_len; i++) { auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr()); outsize += sv.size; } EXPECT_NE(0, outsize); } TEST(Benchmark, ParseStringAppend) { upb::Arena arena; upb_String output; upb_String_Init(&output, arena.ptr()); int benchmark_len = arraysize(kParseBenchmark); for (int i = 0; i < benchmark_len; i++) { auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr()); upb_String_Append(&output, sv.data, sv.size); } EXPECT_NE(0, upb_String_Size(&output)); } // These tests validate the Tokenizer's handling of Unicode escapes. // Encode a single code point as UTF8. static std::string StandardUTF8(uint32_t code_point) { char buffer[4]; int count = upb_Unicode_ToUTF8(code_point, &buffer[0]); EXPECT_NE(count, 0) << "Failed to encode point " << std::hex << code_point; return std::string(reinterpret_cast(buffer), count); } static std::string DisplayHex(const std::string& data) { std::string output; for (size_t i = 0; i < data.size(); ++i) { absl::StrAppendFormat(&output, "%02x ", data[i]); } return output; } static void ExpectFormat(const std::string& expectation, const std::string& formatted) { upb::Arena arena; auto sv = upb_Parse_String(formatted.data(), arena.ptr()); EXPECT_EQ(strcmp(sv.data, expectation.data()), 0) << ": Incorrectly parsed " << formatted << ":\nGot " << DisplayHex(sv.data) << "\nExpected " << DisplayHex(expectation); } TEST(TokenizerHandlesUnicode, BMPCodes) { for (uint32_t code_point = 0; code_point < 0x10000; ++code_point) { // The UTF8 encoding of surrogates as single entities is not defined. if (upb_Unicode_IsHigh(code_point)) continue; if (upb_Unicode_IsLow(code_point)) continue; const std::string expectation = StandardUTF8(code_point); // Points in the BMP pages can be encoded using either \u with four hex // digits, or \U with eight hex digits. ExpectFormat(expectation, absl::StrFormat("'\\u%04x'", code_point)); ExpectFormat(expectation, absl::StrFormat("'\\u%04X'", code_point)); ExpectFormat(expectation, absl::StrFormat("'\\U%08x'", code_point)); ExpectFormat(expectation, absl::StrFormat("'\\U%08X'", code_point)); } } TEST(TokenizerHandlesUnicode, NonBMPCodes) { for (uint32_t code_point = 0x10000; code_point < 0x110000; ++code_point) { const std::string expectation = StandardUTF8(code_point); // Points in the non-BMP pages can be encoded using either \U with eight hex // digits, or using UTF-16 surrogate pairs. ExpectFormat(expectation, absl::StrFormat("'\\U%08x'", code_point)); ExpectFormat(expectation, absl::StrFormat("'\\U%08X'", code_point)); ExpectFormat(expectation, absl::StrFormat("'\\u%04x\\u%04x'", upb_Unicode_ToHigh(code_point), upb_Unicode_ToLow(code_point))); } } } // namespace } // namespace io } // namespace protobuf } // namespace google