// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google LLC nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "upb/io/tokenizer.h"

#include "gtest/gtest.h"
#include "absl/strings/escaping.h"
#include "absl/strings/str_format.h"
#include "upb/io/chunked_input_stream.h"
#include "upb/io/string.h"
#include "upb/lex/unicode.h"
#include "upb/upb.hpp"

// Must be last.
#include "upb/port/def.inc"

namespace proto2 {
namespace io {
namespace {

#ifndef arraysize
#define arraysize(a) (sizeof(a) / sizeof(a[0]))
#endif

static bool StringEquals(const char* a, const char* b) {
  return strcmp(a, b) == 0;
}

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton): This is copied from coded_stream_unittest. This is
//   temporary until these features are integrated into gUnit itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gUnit. These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array. TEST_1D
// tests all cases in a single input array. TEST_2D tests all
// combinations of cases from two arrays. The arrays must be statically
// defined such that the arraysize() macro works on them. Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero. In case of failure, the exact case
// which failed will be printed. The case type must be printable using
// ostream::operator<<.
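//
// TEST_2D works the same way but iterates over the cross product of the
// two case arrays. A hypothetical usage (fixture and array names here are
// illustrative only, not defined in this file):
//
// int kSizes[] = {1, 2};
// const char* kNames[] = {"a", "b"};
// TEST_2D(MyFixture, MyCombinedTest, kSizes, kNames) {
//   EXPECT_GT(kSizes_case, 0);
//   EXPECT_NE(kNames_case, nullptr);
// }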

#define TEST_1D(FIXTURE, NAME, CASES)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                  \
   protected:                                                     \
    template <typename CaseType>                                  \
    void DoSingleCase(const CaseType& CASES##_case);              \
  };                                                              \
                                                                  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                           \
    for (size_t i = 0; i < arraysize(CASES); i++) {               \
      SCOPED_TRACE(testing::Message()                             \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]);                                     \
    }                                                             \
  }                                                               \
                                                                  \
  template <typename CaseType>                                    \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                              \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                       \
    void DoSingleCase(const CaseType1& CASES1##_case,                       \
                      const CaseType2& CASES2##_case);                      \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    for (size_t i = 0; i < arraysize(CASES1); i++) {                        \
      for (size_t j = 0; j < arraysize(CASES2); j++) {                      \
        SCOPED_TRACE(testing::Message()                                     \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);       \
        DoSingleCase(CASES1[i], CASES2[j]);                                 \
      }                                                                     \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType1, typename CaseType2>                         \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case,  \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// In C, a size of zero from ZCIS_Next() means EOF so we can't play the same
// trick here that happens in the C++ version. Use ChunkedInputStream instead.
upb_ZeroCopyInputStream* TestInputStream(const void* data, size_t size,
                                         size_t block_size, upb_Arena* arena) {
  return upb_ChunkedInputStream_New(data, size, block_size, arena);
}

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't. This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64_t ParseInteger(const std::string& text) {
    uint64_t result;
    EXPECT_TRUE(upb_Parse_Integer(text.data(), UINT64_MAX, &result))
        << "'" << text << "'";
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  std::string input;
  upb_TokenType type;
};

inline std::ostream& operator<<(std::ostream& out,
                                const SimpleTokenCase& test_case) {
  return out << absl::CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
    // Test identifiers.
    {"hello", kUpb_TokenType_Identifier},

    // Test integers.
    {"123", kUpb_TokenType_Integer},
    {"0xab6", kUpb_TokenType_Integer},
    {"0XAB6", kUpb_TokenType_Integer},
    {"0X1234567", kUpb_TokenType_Integer},
    {"0x89abcdef", kUpb_TokenType_Integer},
    {"0x89ABCDEF", kUpb_TokenType_Integer},
    {"01234567", kUpb_TokenType_Integer},

    // Test floats.
    {"123.45", kUpb_TokenType_Float},
    {"1.", kUpb_TokenType_Float},
    {"1e3", kUpb_TokenType_Float},
    {"1E3", kUpb_TokenType_Float},
    {"1e-3", kUpb_TokenType_Float},
    {"1e+3", kUpb_TokenType_Float},
    {"1.e3", kUpb_TokenType_Float},
    {"1.2e3", kUpb_TokenType_Float},
    {".1", kUpb_TokenType_Float},
    {".1e3", kUpb_TokenType_Float},
    {".1e-3", kUpb_TokenType_Float},
    {".1e+3", kUpb_TokenType_Float},

    // Test strings.
    {"'hello'", kUpb_TokenType_String},
    {"\"foo\"", kUpb_TokenType_String},
    {"'a\"b'", kUpb_TokenType_String},
    {"\"a'b\"", kUpb_TokenType_String},
    {"'a\\'b'", kUpb_TokenType_String},
    {"\"a\\\"b\"", kUpb_TokenType_String},
    {"'\\xf'", kUpb_TokenType_String},
    {"'\\0'", kUpb_TokenType_String},

    // Test symbols.
    {"+", kUpb_TokenType_Symbol},
    {".", kUpb_TokenType_Symbol},
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  upb::Arena arena;

  // Set up the tokenizer.
  auto input = TestInputStream(kSimpleTokenCases_case.input.data(),
                               kSimpleTokenCases_case.input.size(),
                               kBlockSizes_case, arena.ptr());
  auto t = upb_Tokenizer_New(NULL, 0, input, 0, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  // Parse the token.
  EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
  // Check that it has the right type.
  EXPECT_EQ(upb_Tokenizer_Type(t), kSimpleTokenCases_case.type);
  // Check that it contains the complete input text.
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t),
                           kSimpleTokenCases_case.input.data()));

  // Check that it is located at the beginning of the input.
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), kSimpleTokenCases_case.input.size());

  upb_Status status;
  upb_Status_Clear(&status);

  // There should be no more input and no errors.
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_End);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), kSimpleTokenCases_case.input.size());
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), kSimpleTokenCases_case.input.size());
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  upb::Arena arena;
  const char* text = "1f 2.5f 6e3f 7F";
  auto input =
      TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_AllowFAfterFloat;
  auto t = upb_Tokenizer_New(NULL, 0, input, options, arena.ptr());

  // Advance through tokens and check that they are parsed as expected.

  EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "1f"));

  EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "2.5f"));

  EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "6e3f"));

  EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "7F"));

  upb_Status status;
  upb_Status_Clear(&status);

  // There should be no more input and no errors.
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));
}

SimpleTokenCase kWhitespaceTokenCases[] = {
    {" ", kUpb_TokenType_Whitespace},
    {"    ", kUpb_TokenType_Whitespace},
    {"\t", kUpb_TokenType_Whitespace},
    {"\v", kUpb_TokenType_Whitespace},
    {"\t ", kUpb_TokenType_Whitespace},
    {"\v\t", kUpb_TokenType_Whitespace},
    {"   \t\r", kUpb_TokenType_Whitespace},
    // Newlines:
    {"\n", kUpb_TokenType_Newline},
};

TEST_2D(TokenizerTest, Whitespace, kWhitespaceTokenCases, kBlockSizes) {
  upb::Arena arena;
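  // With the default options, whitespace (and newlines) never reach the
  // caller, so on whitespace-only input the very first Next() already hits
  // end-of-input and returns false.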
  {
    auto input = TestInputStream(kWhitespaceTokenCases_case.input.data(),
                                 kWhitespaceTokenCases_case.input.size(),
                                 kBlockSizes_case, arena.ptr());
    auto t = upb_Tokenizer_New(NULL, 0, input, 0, arena.ptr());

    EXPECT_FALSE(upb_Tokenizer_Next(t, NULL));
  }
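  // With kUpb_TokenizerOption_ReportNewlines, the same input is returned as
  // a single whitespace or newline token before end-of-input.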
  {
    auto input = TestInputStream(kWhitespaceTokenCases_case.input.data(),
                                 kWhitespaceTokenCases_case.input.size(),
                                 kBlockSizes_case, arena.ptr());
    const int options = kUpb_TokenizerOption_ReportNewlines;
    auto t = upb_Tokenizer_New(NULL, 0, input, options, arena.ptr());

    EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));

    EXPECT_EQ(upb_Tokenizer_Type(t), kWhitespaceTokenCases_case.type);
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t),
                             kWhitespaceTokenCases_case.input.data()));
    EXPECT_FALSE(upb_Tokenizer_Next(t, NULL));
  }
}

#endif

// -------------------------------------------------------------------

struct TokenFields {
  upb_TokenType type;
  std::string text;
  size_t line;
  size_t column;
  size_t end_column;
};

// In each case, the input is parsed to produce a list of tokens. The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  std::string input;
  std::vector<TokenFields> output;
};

inline std::ostream& operator<<(std::ostream& out,
                                const MultiTokenCase& test_case) {
  return out << absl::CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
    // Test empty input.
    {"",
     {
         {kUpb_TokenType_End, "", 0, 0, 0},
     }},

    // Test all token types at the same time.
    {"foo 1 1.2 + 'bar'",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Integer, "1", 0, 4, 5},
         {kUpb_TokenType_Float, "1.2", 0, 6, 9},
         {kUpb_TokenType_Symbol, "+", 0, 10, 11},
         {kUpb_TokenType_String, "'bar'", 0, 12, 17},
         {kUpb_TokenType_End, "", 0, 17, 17},
     }},

    // Test that consecutive symbols are parsed as separate tokens.
    {"!@+%",
     {
         {kUpb_TokenType_Symbol, "!", 0, 0, 1},
         {kUpb_TokenType_Symbol, "@", 0, 1, 2},
         {kUpb_TokenType_Symbol, "+", 0, 2, 3},
         {kUpb_TokenType_Symbol, "%", 0, 3, 4},
         {kUpb_TokenType_End, "", 0, 4, 4},
     }},

    // Test that newlines affect line numbers correctly.
    {"foo bar\nrab oof",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 4, 7},
         {kUpb_TokenType_Identifier, "rab", 1, 0, 3},
         {kUpb_TokenType_Identifier, "oof", 1, 4, 7},
         {kUpb_TokenType_End, "", 1, 7, 7},
     }},

    // Test that tabs affect column numbers correctly.
    {"foo\tbar \tbaz",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 8, 11},
         {kUpb_TokenType_Identifier, "baz", 0, 16, 19},
         {kUpb_TokenType_End, "", 0, 19, 19},
     }},

    // Test that tabs in string literals affect column numbers correctly.
    {"\"foo\tbar\" baz",
     {
         {kUpb_TokenType_String, "\"foo\tbar\"", 0, 0, 12},
         {kUpb_TokenType_Identifier, "baz", 0, 13, 16},
         {kUpb_TokenType_End, "", 0, 16, 16},
     }},

    // Test that line comments are ignored.
    {"foo // This is a comment\n"
     "bar // This is another comment",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 1, 0, 3},
         {kUpb_TokenType_End, "", 1, 30, 30},
     }},

    // Test that block comments are ignored.
    {"foo /* This is a block comment */ bar",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 34, 37},
         {kUpb_TokenType_End, "", 0, 37, 37},
     }},

    // Test that sh-style comments are not ignored by default.
    {"foo # bar\n"
     "baz",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Symbol, "#", 0, 4, 5},
         {kUpb_TokenType_Identifier, "bar", 0, 6, 9},
         {kUpb_TokenType_Identifier, "baz", 1, 0, 3},
         {kUpb_TokenType_End, "", 1, 3, 3},
     }},

    // Test all whitespace chars.
    {"foo\n\t\r\v\fbar",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 1, 11, 14},
         {kUpb_TokenType_End, "", 1, 14, 14},
     }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  upb::Arena arena;
  auto input = TestInputStream(kMultiTokenCases_case.input.data(),
                               kMultiTokenCases_case.input.size(),
                               kBlockSizes_case, arena.ptr());
  auto t = upb_Tokenizer_New(NULL, 0, input, 0, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  // Loop through all expected tokens.
  TokenFields token_fields;
  upb_Status status;
  upb_Status_Clear(&status);
  int i = 0;
  do {
    token_fields = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message()
                 << "Token #" << i << ": " << absl::CEscape(token_fields.text));

    // Next() should only return false when it hits the end token.
    if (token_fields.type == kUpb_TokenType_End) {
      EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
      EXPECT_TRUE(upb_Status_IsOk(&status));
    } else {
      EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(upb_Tokenizer_Type(t), token_fields.type);
    EXPECT_EQ(upb_Tokenizer_Line(t), token_fields.line);
    EXPECT_EQ(upb_Tokenizer_Column(t), token_fields.column);
    EXPECT_EQ(upb_Tokenizer_EndColumn(t), token_fields.end_column);
    EXPECT_EQ(upb_Tokenizer_TextSize(t), token_fields.text.size());
    EXPECT_TRUE(
        StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data()));
  } while (token_fields.type != kUpb_TokenType_End);
}

// Note: the multi-character whitespace runs below are reconstructed from the
// expected column values (e.g. the run at columns 11..13 must be two spaces).
MultiTokenCase kMultiWhitespaceTokenCases[] = {
    // Test all token types at the same time.
    {"foo 1 \t1.2  \n   +\v'bar'",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Whitespace, " ", 0, 3, 4},
         {kUpb_TokenType_Integer, "1", 0, 4, 5},
         {kUpb_TokenType_Whitespace, " \t", 0, 5, 8},
         {kUpb_TokenType_Float, "1.2", 0, 8, 11},
         {kUpb_TokenType_Whitespace, "  ", 0, 11, 13},
         {kUpb_TokenType_Newline, "\n", 0, 13, 0},
         {kUpb_TokenType_Whitespace, "   ", 1, 0, 3},
         {kUpb_TokenType_Symbol, "+", 1, 3, 4},
         {kUpb_TokenType_Whitespace, "\v", 1, 4, 5},
         {kUpb_TokenType_String, "'bar'", 1, 5, 10},
         {kUpb_TokenType_End, "", 1, 10, 10},
     }},

};

TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases,
        kBlockSizes) {
  // Set up the tokenizer.
  upb::Arena arena;
  auto input = TestInputStream(kMultiWhitespaceTokenCases_case.input.data(),
                               kMultiWhitespaceTokenCases_case.input.size(),
                               kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_ReportNewlines;
  auto t = upb_Tokenizer_New(NULL, 0, input, options, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  // Loop through all expected tokens.
  TokenFields token_fields;
  upb_Status status;
  upb_Status_Clear(&status);
  int i = 0;
  do {
    token_fields = kMultiWhitespaceTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message()
                 << "Token #" << i << ": " << token_fields.text);

    // Next() should only return false when it hits the end token.
    if (token_fields.type == kUpb_TokenType_End) {
      EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
      EXPECT_TRUE(upb_Status_IsOk(&status));
    } else {
      EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(upb_Tokenizer_Type(t), token_fields.type);
    EXPECT_EQ(upb_Tokenizer_Line(t), token_fields.line);
    EXPECT_EQ(upb_Tokenizer_Column(t), token_fields.column);
    EXPECT_EQ(upb_Tokenizer_EndColumn(t), token_fields.end_column);
    EXPECT_TRUE(
        StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data()));
  } while (token_fields.type != kUpb_TokenType_End);
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text =
      "foo # bar\n"
      "baz // qux\n"
      "corge /* grault */\n"
      "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux", "corge", "/",
                                 "*", "grault", "*", "/", "garply"};

  // Set up the tokenizer.
  upb::Arena arena;
  auto input =
      TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_CommentStyleShell;
  auto t = upb_Tokenizer_New(NULL, 0, input, options, arena.ptr());

  // Advance through tokens and check that they are parsed as expected.
  for (size_t i = 0; i < arraysize(kTokens); i++) {
    EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), kTokens[i]));
  }

  // There should be no more input and no errors.
  upb_Status status;
  upb_Status_Clear(&status);
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));
}

#endif

// -------------------------------------------------------------------

#if 0  // TODO(salo): Extended comments are currently unimplemented.

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  std::string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};

inline std::ostream& operator<<(std::ostream& out,
                                const DocCommentCase& test_case) {
  return out << absl::CEscape(test_case.input);
}

DocCommentCase kDocCommentCases[] = {
    {"prev next",

     "",
     {},
     ""},

    {"prev /* ignored */ next",

     "",
     {},
     ""},

    {"prev // trailing comment\n"
     "next",

     " trailing comment\n",
     {},
     ""},

    {"prev\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     "",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev\n"
     "// trailing comment\n"
     "// line 2\n"
     "\n"
     "next",

     " trailing comment\n"
     " line 2\n",
     {},
     ""},

    {"prev // trailing comment\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     " trailing comment\n",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev /* trailing block comment */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment ",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "/* trailing block comment\n"
     " * line 2\n"
     " * line 3\n"
     " */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment\n"
     " line 2\n"
     " line 3\n",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "// trailing comment\n"
     "\n"
     "// detached comment\n"
     "// line 2\n"
     "\n"
     "// second detached comment\n"
     "/* third detached comment\n"
     " * line 2 */\n"
     "// leading comment\n"
     "next",

     " trailing comment\n",
     {" detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "},
     " leading comment\n"},

    {"prev /**/\n"
     "\n"
     "// detached comment\n"
     "\n"
     "// leading comment\n"
     "next",

     "",
     {" detached comment\n"},
     " leading comment\n"},

    {"prev /**/\n"
     "// leading comment\n"
     "next",

     "",
     {},
     " leading comment\n"},
};

TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to
  // NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(), kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  std::string prev_trailing_comments;
  std::vector<std::string> detached_comments;
  std::string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(NULL, NULL, NULL);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    EXPECT_LT(i, arraysize(kDocCommentCases));
    EXPECT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
              detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(NULL,
            kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
            next_leading_comments);
}

#endif  // 0

// -------------------------------------------------------------------

// Test parse helpers.
// TODO(b/225783758): Add a fuzz test for this.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(UINT64_MAX, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64_t i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(upb_Parse_Integer("zxy", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("1.2", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("08", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("0xg", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("-1", UINT64_MAX, &i));

  // Test overflows.
  EXPECT_TRUE(upb_Parse_Integer("0", 0, &i));
  EXPECT_FALSE(upb_Parse_Integer("1", 0, &i));
  EXPECT_TRUE(upb_Parse_Integer("1", 1, &i));
  EXPECT_TRUE(upb_Parse_Integer("12345", 12345, &i));
  EXPECT_FALSE(upb_Parse_Integer("12346", 12345, &i));
  EXPECT_TRUE(upb_Parse_Integer("0xFFFFFFFFFFFFFFFF", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("0x10000000000000000", UINT64_MAX, &i));

  // Test near the limits of signed parsing (values within 1600 of INT64_MAX).
  for (int64_t offset = -1600; offset <= 1600; ++offset) {
    // We make sure to perform an unsigned addition so that we avoid signed
    // overflow, which would be undefined behavior.
    uint64_t i = 0x7FFFFFFFFFFFFFFFu + static_cast<uint64_t>(offset);
    char decimal[32];
    snprintf(decimal, 32, "%llu", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(upb_Parse_Integer(decimal, INT64_MAX, &parsed))
          << decimal << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(upb_Parse_Integer(decimal, INT64_MAX, &parsed))
          << decimal << "=>" << parsed;
      EXPECT_EQ(parsed, i);
    }
    char octal[32];
    snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(upb_Parse_Integer(octal, INT64_MAX, &parsed))
          << octal << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(upb_Parse_Integer(octal, INT64_MAX, &parsed))
          << octal << "=>" << parsed;
      EXPECT_EQ(parsed, i);
    }
    char hex[32];
    snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(upb_Parse_Integer(hex, INT64_MAX, &parsed))
          << hex << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(upb_Parse_Integer(hex, INT64_MAX, &parsed)) << hex;
      EXPECT_EQ(parsed, i);
    }
    // EXPECT_NE(offset, -237);
  }

  // Test near the limits of unsigned parsing (values within 1600 of
  // UINT64_MAX). By definition, values greater than UINT64_MAX cannot be
  // held in a uint64_t variable, so printing them is a little tricky;
  // fortunately all but the last four digits are known, so we can hard-code
  // them in the printf string, and we only need to format the last 4.
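  // (Concretely: UINT64_MAX is 18446744073709551615, so the decimal prefix
  // "1844674407370955" is constant and the formatted suffix is 1615 + offset,
  // which stays within four digits for every |offset| <= 1600.)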
  for (int64_t offset = -1600; offset <= 1600; ++offset) {
    {
      uint64_t i = 18446744073709551615u + offset;
      char decimal[32];
      snprintf(decimal, 32, "1844674407370955%04llu",
               static_cast<unsigned long long>(1615 + offset));
      if (offset > 0) {
        uint64_t parsed = -1;
        EXPECT_FALSE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed))
            << decimal << "=>" << parsed;
      } else {
        uint64_t parsed = -1;
        EXPECT_TRUE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed))
            << decimal;
        EXPECT_EQ(parsed, i);
      }
    }
    {
      uint64_t i = 01777777777777777777777u + offset;
      if (offset > 0) {
        char octal[32];
        snprintf(octal, 32, "0200000000000000000%04llo",
                 static_cast<unsigned long long>(offset - 1));
        uint64_t parsed = -1;
        EXPECT_FALSE(upb_Parse_Integer(octal, UINT64_MAX, &parsed))
            << octal << "=>" << parsed;
      } else {
        char octal[32];
        snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
        uint64_t parsed = -1;
        EXPECT_TRUE(upb_Parse_Integer(octal, UINT64_MAX, &parsed)) << octal;
        EXPECT_EQ(parsed, i);
      }
    }
    {
      uint64_t ui = 0xffffffffffffffffu + offset;
      char hex[32];
      if (offset > 0) {
        snprintf(hex, 32, "0x1000000000000%04llx",
                 static_cast<unsigned long long>(offset - 1));
        uint64_t parsed = -1;
        EXPECT_FALSE(upb_Parse_Integer(hex, UINT64_MAX, &parsed))
            << hex << "=>" << parsed;
      } else {
        snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(ui));
        uint64_t parsed = -1;
        EXPECT_TRUE(upb_Parse_Integer(hex, UINT64_MAX, &parsed)) << hex;
        EXPECT_EQ(parsed, ui);
      }
    }
  }
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1."));
  EXPECT_DOUBLE_EQ(1e3, upb_Parse_Float("1e3"));
  EXPECT_DOUBLE_EQ(1e3, upb_Parse_Float("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, upb_Parse_Float("1.5e3"));
  EXPECT_DOUBLE_EQ(.1, upb_Parse_Float(".1"));
  EXPECT_DOUBLE_EQ(.25, upb_Parse_Float(".25"));
  EXPECT_DOUBLE_EQ(.1e3, upb_Parse_Float(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, upb_Parse_Float(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, upb_Parse_Float(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, upb_Parse_Float(".1e-3"));
  EXPECT_DOUBLE_EQ(5, upb_Parse_Float("5"));
  EXPECT_DOUBLE_EQ(6e-12, upb_Parse_Float("6e-12"));
  EXPECT_DOUBLE_EQ(1.2, upb_Parse_Float("1.2"));
  EXPECT_DOUBLE_EQ(1.e2, upb_Parse_Float("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1e"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1e-"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1f"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.0f"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(0.0, upb_Parse_Float("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, upb_Parse_Float("1e+9999999999999999999999999999"));

#if GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(
      upb_Parse_Float("zxy"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      upb_Parse_Float("1-e0"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      upb_Parse_Float("-1.0"),
      "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  const std::string inputs[] = {
      "'hello'",
      "\"blah\\nblah2\"",
      "'\\1x\\1\\123\\739\\52\\334n\\3'",
      "'\\x20\\x4'",

      // Test invalid strings that may still be tokenized as strings.
      "\"\\a\\l\\v\\t",  // \l is invalid
      "'",
      "'\\",

      // Experiment with Unicode escapes.
      // Here are one-, two-, three- and four-byte Unicode characters.
      "'\\u0024\\u00a2\\u20ac\\U00024b62XX'",
      "'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'",  // Same, encoded using UTF16.

      // Here's some broken UTF16: a head surrogate with no tail surrogate.
      // We just output this as if it were UTF8; it's not a defined code
      // point, but it has a defined encoding.
      "'\\ud852XX'",

      // Malformed escape: Demons may fly out of the nose.
      "'\\u0'",

      // Beyond the range of valid UTF-32 code units.
      "'\\U00110000\\U00200000\\UFFFFFFFF'",
  };

  const std::string outputs[] = {
      "hello",
      "blah\nblah2",
      "\1x\1\123\739\52\334n\3",
      "\x20\x4",

      "\a?\v\t",
      "",
      "\\",

      "$¢€𤭢XX",
      "$¢€𤭢XX",

      "\xed\xa1\x92XX",

      "u0",

      "\\U00110000\\U00200000\\Uffffffff",
  };

  upb::Arena arena;

  for (size_t i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
    auto sv = upb_Parse_String(inputs[i].data(), arena.ptr());
    EXPECT_TRUE(StringEquals(sv.data, outputs[i].data()));
  }

  // Test invalid strings that will never be tokenized as strings.
#if GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(
      upb_Parse_String("", arena.ptr()),
      "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  upb::Arena arena;
  upb_String output;
  upb_String_Init(&output, arena.ptr());

  upb_String_Assign(&output, "stuff+", 6);
  auto sv = upb_Parse_String("'hello'", arena.ptr());
  EXPECT_TRUE(StringEquals(sv.data, "hello"));
  upb_String_Append(&output, sv.data, sv.size);
  EXPECT_TRUE(StringEquals(upb_String_Data(&output), "stuff+hello"));
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  std::string input;
  const char* errors;
};

inline std::ostream& operator<<(std::ostream& out,
                                const ErrorCase& test_case) {
  return out << absl::CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
    // String errors.
    {"'\\l'", "0:2: Invalid escape sequence in string literal."},
    {"'\\X'", "0:2: Invalid escape sequence in string literal."},
    {"'\\x'", "0:3: Expected hex digits for escape sequence."},
    {"'foo", "0:4: Unexpected end of string."},
    {"'bar\nfoo", "0:4: String literals cannot cross line boundaries."},
    {"'\\u01'", "0:5: Expected four hex digits for \\u escape sequence."},
    {"'\\uXYZ'", "0:3: Expected four hex digits for \\u escape sequence."},

    // Integer errors.
    {"123foo", "0:3: Need space between number and identifier."},

    // Hex/octal errors.
    {"0x foo", "0:2: \"0x\" must be followed by hex digits."},
    {"0541823", "0:4: Numbers starting with leading zero must be in octal."},
    {"0x123z", "0:5: Need space between number and identifier."},
    {"0x123.4", "0:5: Hex and octal numbers must be integers."},
    {"0123.4", "0:4: Hex and octal numbers must be integers."},

    // Float errors.
    {"1e foo", "0:2: \"e\" must be followed by exponent."},
    {"1e- foo", "0:3: \"e\" must be followed by exponent."},
    {"1.2.3",
     "0:3: Already saw decimal point or exponent; can't have another one."},
    {"1e2.3",
     "0:3: Already saw decimal point or exponent; can't have another one."},
    {"a.1", "0:1: Need space between identifier and decimal point."},
    // allow_f_after_float not enabled, so this should be an error.
    {"1.0f", "0:3: Need space between number and identifier."},

    // Block comment errors.
    {"/*",
     "0:2: End-of-file inside block comment.\n0:0: Comment started here."},
    {"/*/*/ foo",
     "0:3: \"/*\" inside block comment. Block comments cannot be nested."},

    // Control characters. Multiple consecutive control characters should
    // only produce one error.
    {"\b foo", "0:0: Invalid control characters encountered in text."},
    {"\b\b foo", "0:0: Invalid control characters encountered in text."},

    // Check that control characters at end of input don't result in an
    // infinite loop.
    {"\b", "0:0: Invalid control characters encountered in text."},

    // Check recovery from '\0'. We have to explicitly specify the length of
    // these strings because otherwise the string constructor will just call
    // strlen() which will see the first '\0' and think that is the end of
    // the string.
    {std::string("\0foo", 4),
     "0:0: Invalid control characters encountered in text."},
    {std::string("\0\0foo", 5),
     "0:0: Invalid control characters encountered in text."},

    // Check error from high order bits set.
    {"\300", "0:0: Interpreting non ascii codepoint 192."},
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  upb::Arena arena;
  auto input = TestInputStream(kErrorCases_case.input.data(),
                               kErrorCases_case.input.size(), kBlockSizes_case,
                               arena.ptr());
  auto t = upb_Tokenizer_New(NULL, 0, input, 0, arena.ptr());

  upb_Status status;
  upb_Status_Clear(&status);

  while (upb_Tokenizer_Next(t, &status))
    ;  // just keep looping
  EXPECT_TRUE(
      StringEquals(upb_Status_ErrorMessage(&status), kErrorCases_case.errors));
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  const std::string text = "foo bar";
  upb::Arena arena;
  auto input =
      TestInputStream(text.data(), text.size(), kBlockSizes_case, arena.ptr());

  // Create a tokenizer, read one token, then destroy it.
  auto t = upb_Tokenizer_New(NULL, 0, input, 0, arena.ptr());
  upb_Tokenizer_Next(t, NULL);
  upb_Tokenizer_Fini(t);

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), upb_ZeroCopyInputStream_ByteCount(input));
}

static const char* kParseBenchmark[] = {
    "\"partner-google-mobile-modes-print\"",
    "\"partner-google-mobile-modes-products\"",
    "\"partner-google-mobile-modes-realtime\"",
    "\"partner-google-mobile-modes-video\"",
    "\"partner-google-modes-news\"",
    "\"partner-google-modes-places\"",
    "\"partner-google-news\"",
    "\"partner-google-print\"",
    "\"partner-google-products\"",
    "\"partner-google-realtime\"",
    "\"partner-google-video\"",
    "\"true\"",
    "\"BigImagesHover__js_list\"",
    "\"XFEExternJsVersionParameters\"",
    "\"Available versions of the big images hover javascript\"",
    "\"Version: {\n\"",
    "\" script_name: \"extern_js/dummy_file_compiled_post20070813.js\"\n\"",
    "\" version_number: 0\n\"",
    "\"}\"",
    "\"BigImagesHover__js_selection\"",
    "\"XFEExternJsVersionParameters\"",
    "\"Versioning info for the big images hover javascript.\"",
    "\"current_version: 0\"",
    "\"BigImagesHover__js_suppressed\"",
    "\"Indicates if the client-side javascript associated with big images.\"",
    "\"true\"",
    "\"BrowserAnyOf\"",
    "\"IsChrome5OrAbove\"",
    "\"IsFirefox3OrAbove\"",
    "IsIE8OrAboveBinary",
    "\"Abe \"Sausage King\" Froman\"",
    "\"Frank \"Meatball\" Febbraro\"",
};
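
// The two "Benchmark" tests below simply run the string parser over the
// corpus above; they do no timing of their own (presumably they are run
// under an external profiler or timing harness).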

TEST(Benchmark, ParseStringAppendAccumulate) {
  upb::Arena arena;
  size_t outsize = 0;
  int benchmark_len = arraysize(kParseBenchmark);
  for (int i = 0; i < benchmark_len; i++) {
    auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr());
    outsize += sv.size;
  }
  EXPECT_NE(0, outsize);
}

TEST(Benchmark, ParseStringAppend) {
  upb::Arena arena;
  upb_String output;
  upb_String_Init(&output, arena.ptr());
  int benchmark_len = arraysize(kParseBenchmark);
  for (int i = 0; i < benchmark_len; i++) {
    auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr());
    upb_String_Append(&output, sv.data, sv.size);
  }
  EXPECT_NE(0, upb_String_Size(&output));
}

// These tests validate the Tokenizer's handling of Unicode escapes.

// Encode a single code point as UTF8.
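// (UTF-8 encodes any valid code point in at most four bytes, which is why
// the fixed buffer[4] below is large enough.)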
static std::string StandardUTF8(uint32_t code_point) {
  char buffer[4];
  int count = upb_Unicode_ToUTF8(code_point, &buffer[0]);

  EXPECT_NE(count, 0) << "Failed to encode point " << std::hex << code_point;
  return std::string(reinterpret_cast<const char*>(buffer), count);
}

static std::string DisplayHex(const std::string& data) {
  std::string output;
  for (size_t i = 0; i < data.size(); ++i) {
    // Cast so bytes >= 0x80 don't sign-extend when formatted.
    absl::StrAppendFormat(&output, "%02x ", static_cast<uint8_t>(data[i]));
  }
  return output;
}

static void ExpectFormat(const std::string& expectation,
                         const std::string& formatted) {
  upb::Arena arena;
  auto sv = upb_Parse_String(formatted.data(), arena.ptr());
  EXPECT_EQ(strcmp(sv.data, expectation.data()), 0)
      << ": Incorrectly parsed " << formatted << ":\nGot "
      << DisplayHex(sv.data) << "\nExpected " << DisplayHex(expectation);
}

TEST(TokenizerHandlesUnicode, BMPCodes) {
  for (uint32_t code_point = 0; code_point < 0x10000; ++code_point) {
    // The UTF8 encoding of surrogates as single entities is not defined.
    if (upb_Unicode_IsHigh(code_point)) continue;
    if (upb_Unicode_IsLow(code_point)) continue;

    const std::string expectation = StandardUTF8(code_point);

    // Points in the BMP pages can be encoded using either \u with four hex
    // digits, or \U with eight hex digits.
    ExpectFormat(expectation, absl::StrFormat("'\\u%04x'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\u%04X'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\U%08x'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\U%08X'", code_point));
  }
}

TEST(TokenizerHandlesUnicode, NonBMPCodes) {
  for (uint32_t code_point = 0x10000; code_point < 0x110000; ++code_point) {
    const std::string expectation = StandardUTF8(code_point);

    // Points in the non-BMP pages can be encoded using either \U with eight
    // hex digits, or using UTF-16 surrogate pairs.
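    // (For example, U+24B62 maps to the surrogate pair D852/DF62, the same
    // "\ud852\udf62" form exercised in the ParseString test above.)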
    ExpectFormat(expectation, absl::StrFormat("'\\U%08x'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\U%08X'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\u%04x\\u%04x'",
                                              upb_Unicode_ToHigh(code_point),
                                              upb_Unicode_ToLow(code_point)));
  }
}

}  // namespace
}  // namespace io
}  // namespace proto2