|
|
|
/*
|
|
|
|
* Copyright (c) 2009-2022, Google LLC
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions are met:
|
|
|
|
* * Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* * Neither the name of Google LLC nor the
|
|
|
|
* names of its contributors may be used to endorse or promote products
|
|
|
|
* derived from this software without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
|
|
|
|
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
|
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
|
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
|
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "upb/io/tokenizer.h"
|
|
|
|
|
|
|
|
#include "gtest/gtest.h"
|
|
|
|
#include "absl/strings/escaping.h"
|
|
|
|
#include "absl/strings/str_format.h"
|
|
|
|
#include "upb/internal/unicode.h"
|
|
|
|
#include "upb/io/chunked_input_stream.h"
|
|
|
|
#include "upb/io/string.h"
|
|
|
|
#include "upb/upb.hpp"
|
|
|
|
|
|
|
|
// Must be last.
|
|
|
|
#include "upb/port_def.inc"
|
|
|
|
|
|
|
|
namespace proto2 {
|
|
|
|
namespace io {
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
// Number of elements in the statically sized array `a`.  Defined only when
// the including environment has not already supplied an `arraysize` macro.
// The argument is parenthesized before indexing so that an expression
// argument keeps its intended grouping.
#ifndef arraysize
#define arraysize(a) (sizeof(a) / sizeof((a)[0]))
#endif
|
|
|
|
|
|
|
|
// Returns true when the NUL-terminated strings `a` and `b` compare equal
// under strcmp(), false otherwise.
static bool StringEquals(const char* a, const char* b) {
  const int difference = strcmp(a, b);
  return difference == 0;
}
|
|
|
|
|
|
|
|
// ===================================================================
|
|
|
|
// Data-Driven Test Infrastructure
|
|
|
|
|
|
|
|
// TODO(kenton): This is copied from coded_stream_unittest. This is
|
|
|
|
// temporary until these features are integrated into gUnit itself.
|
|
|
|
|
|
|
|
// TEST_1D and TEST_2D are macros I'd eventually like to see added to
|
|
|
|
// gUnit. These macros can be used to declare tests which should be
|
|
|
|
// run multiple times, once for each item in some input array. TEST_1D
|
|
|
|
// tests all cases in a single input array. TEST_2D tests all
|
|
|
|
// combinations of cases from two arrays. The arrays must be statically
|
|
|
|
// defined such that the arraysize() macro works on them. Example:
|
|
|
|
//
|
|
|
|
// int kCases[] = {1, 2, 3, 4}
|
|
|
|
// TEST_1D(MyFixture, MyTest, kCases) {
|
|
|
|
// EXPECT_GT(kCases_case, 0);
|
|
|
|
// }
|
|
|
|
//
|
|
|
|
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
|
|
|
|
// they are all greater than zero. In case of failure, the exact case
|
|
|
|
// which failed will be printed. The case type must be printable using
|
|
|
|
// ostream::operator<<.
|
|
|
|
|
|
|
|
// TEST_1D(FIXTURE, NAME, CASES) { body }
//
// Declares a subclass of FIXTURE with a templated DoSingleCase(), registers
// one TEST_F that calls DoSingleCase() for every element of CASES (each call
// wrapped in a SCOPED_TRACE naming the case), and ends with the out-of-line
// header of DoSingleCase() so that the braces following the macro invocation
// become its body.  Inside the body the current element is `CASES_case`.
//
// The loop index is a size_t because arraysize(), as defined in this file,
// yields an unsigned value; an int index would mix signedness in the
// comparison.
#define TEST_1D(FIXTURE, NAME, CASES) \
  class FIXTURE##_##NAME##_DD : public FIXTURE { \
   protected: \
    template <typename CaseType> \
    void DoSingleCase(const CaseType& CASES##_case); \
  }; \
  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
    for (size_t i = 0; i < arraysize(CASES); i++) { \
      SCOPED_TRACE(testing::Message() \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]); \
    } \
  } \
  \
  template <typename CaseType> \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)
|
|
|
|
|
|
|
|
// TEST_2D(FIXTURE, NAME, CASES1, CASES2) { body }
//
// Two-array variant of the data-driven test macro: registers one TEST_F that
// calls DoSingleCase() for every (CASES1[i], CASES2[j]) combination, each
// call wrapped in a SCOPED_TRACE naming both cases.  Inside the body the
// current elements are `CASES1_case` and `CASES2_case`.
//
// Both loop indices are size_t because arraysize(), as defined in this file,
// yields an unsigned value; int indices would mix signedness in the
// comparisons.
#define TEST_2D(FIXTURE, NAME, CASES1, CASES2) \
  class FIXTURE##_##NAME##_DD : public FIXTURE { \
   protected: \
    template <typename CaseType1, typename CaseType2> \
    void DoSingleCase(const CaseType1& CASES1##_case, \
                      const CaseType2& CASES2##_case); \
  }; \
  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
    for (size_t i = 0; i < arraysize(CASES1); i++) { \
      for (size_t j = 0; j < arraysize(CASES2); j++) { \
        SCOPED_TRACE(testing::Message() \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]); \
        DoSingleCase(CASES1[i], CASES2[j]); \
      } \
    } \
  } \
  \
  template <typename CaseType1, typename CaseType2> \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)
|
|
|
|
|
|
|
|
// -------------------------------------------------------------------
|
|
|
|
|
|
|
|
// In C, a size of zero from ZCIS_Next() means EOF so we can't play the same
|
|
|
|
// trick here that happens in the C++ version. Use ChunkedInputStream instead.
|
|
|
|
// Builds the input stream used by the tests by forwarding `data`, `size`,
// `block_size` and `arena` unchanged to upb_ChunkedInputStream_New() and
// returning whatever it produces.
upb_ZeroCopyInputStream* TestInputStream(const void* data, size_t size,
                                         size_t block_size, upb_Arena* arena) {
  upb_ZeroCopyInputStream* const stream =
      upb_ChunkedInputStream_New(data, size, block_size, arena);
  return stream;
}
|
|
|
|
|
|
|
|
// -------------------------------------------------------------------
|
|
|
|
|
|
|
|
// We test each operation over a variety of block sizes to ensure that
|
|
|
|
// we test cases where reads cross buffer boundaries as well as cases
|
|
|
|
// where they don't. This is sort of a brute-force approach to this,
|
|
|
|
// but it's easy to write and easy to understand.
|
|
|
|
// Passed as the `block_size` argument of TestInputStream() by the
// data-driven tests in this file.
constexpr int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};
|
|
|
|
|
|
|
|
// Fixture used by the tokenizer tests in this file.
class TokenizerTest : public testing::Test {
 protected:
  // For easy testing: parses `text` with upb_Parse_Integer() using UINT64_MAX
  // as the limit argument, records a test failure (quoting `text`) when the
  // call returns false, and returns the value stored in the out-parameter.
  uint64_t ParseInteger(const std::string& text) {
    // Initialized so that a rejected parse does not hand an indeterminate
    // value back to the caller's comparison.
    uint64_t result = 0;
    EXPECT_TRUE(upb_Parse_Integer(text.c_str(), UINT64_MAX, &result))
        << "'" << text << "'";
    return result;
  }
};
|
|
|
|
|
|
|
|
// ===================================================================
|
|
|
|
|
|
|
|
// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
|
|
|
|
// "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
|
|
|
|
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
|
|
|
|
|
|
|
|
// In each test case, the entire input text should parse as a single token
|
|
|
|
// of the given type.
|
|
|
|
struct SimpleTokenCase {
|
|
|
|
std::string input;
|
|
|
|
upb_TokenType type;
|
|
|
|
};
|
|
|
|
|
|
|
|
inline std::ostream& operator<<(std::ostream& out,
|
|
|
|
const SimpleTokenCase& test_case) {
|
|
|
|
return out << absl::CEscape(test_case.input);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Inputs that must each tokenize as exactly one token of the listed type.
SimpleTokenCase kSimpleTokenCases[] = {
    // Identifiers.
    {"hello", kUpb_TokenType_Identifier},

    // Integers: decimal, hexadecimal (either prefix case, either digit case)
    // and a leading-zero literal.
    {"123", kUpb_TokenType_Integer},
    {"0xab6", kUpb_TokenType_Integer},
    {"0XAB6", kUpb_TokenType_Integer},
    {"0X1234567", kUpb_TokenType_Integer},
    {"0x89abcdef", kUpb_TokenType_Integer},
    {"0x89ABCDEF", kUpb_TokenType_Integer},
    {"01234567", kUpb_TokenType_Integer},

    // Floats: with and without fraction digits, exponent, and exponent sign.
    {"123.45", kUpb_TokenType_Float},
    {"1.", kUpb_TokenType_Float},
    {"1e3", kUpb_TokenType_Float},
    {"1E3", kUpb_TokenType_Float},
    {"1e-3", kUpb_TokenType_Float},
    {"1e+3", kUpb_TokenType_Float},
    {"1.e3", kUpb_TokenType_Float},
    {"1.2e3", kUpb_TokenType_Float},
    {".1", kUpb_TokenType_Float},
    {".1e3", kUpb_TokenType_Float},
    {".1e-3", kUpb_TokenType_Float},
    {".1e+3", kUpb_TokenType_Float},

    // Strings: both quote styles, embedded and escaped quotes, escapes.
    {"'hello'", kUpb_TokenType_String},
    {"\"foo\"", kUpb_TokenType_String},
    {"'a\"b'", kUpb_TokenType_String},
    {"\"a'b\"", kUpb_TokenType_String},
    {"'a\\'b'", kUpb_TokenType_String},
    {"\"a\\\"b\"", kUpb_TokenType_String},
    {"'\\xf'", kUpb_TokenType_String},
    {"'\\0'", kUpb_TokenType_String},

    // Symbols.
    {"+", kUpb_TokenType_Symbol},
    {".", kUpb_TokenType_Symbol},
};
|
|
|
|
|
|
|
|
// Checks that each entry of kSimpleTokenCases is read as one token of the
// expected type spanning the whole input, for every block size.
TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  const std::string& source = kSimpleTokenCases_case.input;
  upb::Arena arena;

  // Build the tokenizer over the case's input.
  auto input = TestInputStream(source.data(), source.size(), kBlockSizes_case,
                               arena.ptr());
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

  // Until Next() has been called the current token is the start token, with
  // zeroed position and empty text.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  // Read the single token: right type, text equal to the entire input.
  EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
  EXPECT_EQ(upb_Tokenizer_Type(t), kSimpleTokenCases_case.type);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), source.data()));

  // The token starts at the beginning of the input and ends at its length.
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), source.size());

  // A further Next() must report end of input without setting an error.
  upb_Status status;
  upb_Status_Clear(&status);
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));

  // Once Next() has returned false the current token is the end token,
  // positioned just past the input and carrying empty text.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_End);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), source.size());
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), source.size());
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));
}
|
|
|
|
|
|
|
|
// Exercises the "allow_f_after_float" option
// (kUpb_TokenizerOption_AllowFAfterFloat): every number in the input carries
// an f/F suffix and is expected to come back as a single Float token.
TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Build the tokenizer.
  upb::Arena arena;
  const char* text = "1f 2.5f 6e3f 7F";
  auto input =
      TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_AllowFAfterFloat;
  auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

  // Token texts expected from successive Next() calls, in input order.
  const char* const kExpectedFloats[] = {"1f", "2.5f", "6e3f", "7F"};
  for (const char* expected : kExpectedFloats) {
    EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), expected));
  }

  // A further Next() must report end of input without setting an error.
  upb_Status status;
  upb_Status_Clear(&status);
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));
}
|
|
|
|
|
|
|
|
// Inputs consisting only of whitespace.  The Whitespace test expects no
// token at all with default options, and exactly one token of the listed
// type when kUpb_TokenizerOption_ReportNewlines is set.
SimpleTokenCase kWhitespaceTokenCases[] = {
    {" ", kUpb_TokenType_Whitespace},
    // A run of more than one space.  This entry previously repeated the
    // single-space case above, which left a multi-space run untested.
    {"  ", kUpb_TokenType_Whitespace},
    {"\t", kUpb_TokenType_Whitespace},
    {"\v", kUpb_TokenType_Whitespace},
    {"\t ", kUpb_TokenType_Whitespace},
    {"\v\t", kUpb_TokenType_Whitespace},
    {" \t\r", kUpb_TokenType_Whitespace},
    // Newlines:
    {"\n", kUpb_TokenType_Newline},
};
|
|
|
|
|
|
|
|
// Runs each whitespace-only input through the tokenizer twice: once with no
// options and once with kUpb_TokenizerOption_ReportNewlines.
TEST_2D(TokenizerTest, Whitespace, kWhitespaceTokenCases, kBlockSizes) {
  const std::string& source = kWhitespaceTokenCases_case.input;
  upb::Arena arena;

  // No options: the very first Next() is expected to return false.
  {
    auto input = TestInputStream(source.data(), source.size(),
                                 kBlockSizes_case, arena.ptr());
    auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

    EXPECT_FALSE(upb_Tokenizer_Next(t, nullptr));
  }

  // ReportNewlines: exactly one token of the expected type whose text is the
  // whole input, after which Next() returns false.
  {
    auto input = TestInputStream(source.data(), source.size(),
                                 kBlockSizes_case, arena.ptr());
    const int options = kUpb_TokenizerOption_ReportNewlines;
    auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

    EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));

    EXPECT_EQ(upb_Tokenizer_Type(t), kWhitespaceTokenCases_case.type);
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), source.data()));
    EXPECT_FALSE(upb_Tokenizer_Next(t, nullptr));
  }
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// -------------------------------------------------------------------
|
|
|
|
|
|
|
|
struct TokenFields {
|
|
|
|
upb_TokenType type;
|
|
|
|
std::string text;
|
|
|
|
size_t line;
|
|
|
|
size_t column;
|
|
|
|
size_t end_column;
|
|
|
|
};
|
|
|
|
|
|
|
|
// In each case, the input is parsed to produce a list of tokens. The
|
|
|
|
// last token in "output" must have type TYPE_END.
|
|
|
|
struct MultiTokenCase {
|
|
|
|
std::string input;
|
|
|
|
std::vector<TokenFields> output;
|
|
|
|
};
|
|
|
|
|
|
|
|
inline std::ostream& operator<<(std::ostream& out,
|
|
|
|
const MultiTokenCase& test_case) {
|
|
|
|
return out << absl::CEscape(test_case.input);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Inputs paired with the full token sequence expected from them.  Each token
// is listed as {type, text, line, column, end_column}.
MultiTokenCase kMultiTokenCases[] = {
    // Empty input yields only the end token.
    {"",
     {
         {kUpb_TokenType_End, "", 0, 0, 0},
     }},

    // One token of every type in a single input.
    {"foo 1 1.2 + 'bar'",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Integer, "1", 0, 4, 5},
         {kUpb_TokenType_Float, "1.2", 0, 6, 9},
         {kUpb_TokenType_Symbol, "+", 0, 10, 11},
         {kUpb_TokenType_String, "'bar'", 0, 12, 17},
         {kUpb_TokenType_End, "", 0, 17, 17},
     }},

    // Adjacent symbols come back as separate tokens.
    {"!@+%",
     {
         {kUpb_TokenType_Symbol, "!", 0, 0, 1},
         {kUpb_TokenType_Symbol, "@", 0, 1, 2},
         {kUpb_TokenType_Symbol, "+", 0, 2, 3},
         {kUpb_TokenType_Symbol, "%", 0, 3, 4},
         {kUpb_TokenType_End, "", 0, 4, 4},
     }},

    // A newline advances the line number and resets the column.
    {"foo bar\nrab oof",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 4, 7},
         {kUpb_TokenType_Identifier, "rab", 1, 0, 3},
         {kUpb_TokenType_Identifier, "oof", 1, 4, 7},
         {kUpb_TokenType_End, "", 1, 7, 7},
     }},

    // Tabs between tokens affect the reported columns.
    {"foo\tbar \tbaz",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 8, 11},
         {kUpb_TokenType_Identifier, "baz", 0, 16, 19},
         {kUpb_TokenType_End, "", 0, 19, 19},
     }},

    // Tabs inside a string literal affect the reported columns too.
    {"\"foo\tbar\" baz",
     {
         {kUpb_TokenType_String, "\"foo\tbar\"", 0, 0, 12},
         {kUpb_TokenType_Identifier, "baz", 0, 13, 16},
         {kUpb_TokenType_End, "", 0, 16, 16},
     }},

    // Line comments produce no tokens.
    {"foo // This is a comment\n"
     "bar // This is another comment",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 1, 0, 3},
         {kUpb_TokenType_End, "", 1, 30, 30},
     }},

    // Block comments produce no tokens.
    {"foo /* This is a block comment */ bar",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 34, 37},
         {kUpb_TokenType_End, "", 0, 37, 37},
     }},

    // By default a '#' does not start a comment: it is an ordinary symbol.
    {"foo # bar\n"
     "baz",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Symbol, "#", 0, 4, 5},
         {kUpb_TokenType_Identifier, "bar", 0, 6, 9},
         {kUpb_TokenType_Identifier, "baz", 1, 0, 3},
         {kUpb_TokenType_End, "", 1, 3, 3},
     }},

    // Every whitespace character between two identifiers.
    {"foo\n\t\r\v\fbar",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 1, 11, 14},
         {kUpb_TokenType_End, "", 1, 14, 14},
     }},
};
|
|
|
|
|
|
|
|
// Tokenizes each kMultiTokenCases input with default options and compares
// every token produced against the case's expected list, for every block
// size.
TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  upb::Arena arena;
  auto input = TestInputStream(kMultiTokenCases_case.input.data(),
                               kMultiTokenCases_case.input.size(),
                               kBlockSizes_case, arena.ptr());
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  upb_Status status;
  upb_Status_Clear(&status);

  // Walk the expected tokens up to and including the end token.  The loop is
  // bounded by the list's size so that a case that forgets its End entry
  // fails the check below instead of indexing past the vector.
  const auto& expected = kMultiTokenCases_case.output;
  bool reached_end = false;
  for (size_t i = 0; i < expected.size() && !reached_end; ++i) {
    const TokenFields& token_fields = expected[i];

    // Token numbers in the trace are 1-based.
    SCOPED_TRACE(testing::Message()
                 << "Token #" << (i + 1) << ": "
                 << absl::CEscape(token_fields.text));

    // Next() should only return false when it hits the end token.
    reached_end = (token_fields.type == kUpb_TokenType_End);
    if (reached_end) {
      EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
      EXPECT_TRUE(upb_Status_IsOk(&status));
    } else {
      EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(upb_Tokenizer_Type(t), token_fields.type);
    EXPECT_EQ(upb_Tokenizer_Line(t), token_fields.line);
    EXPECT_EQ(upb_Tokenizer_Column(t), token_fields.column);
    EXPECT_EQ(upb_Tokenizer_EndColumn(t), token_fields.end_column);
    EXPECT_EQ(upb_Tokenizer_TextSize(t), token_fields.text.size());
    EXPECT_TRUE(
        StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data()));
  }

  EXPECT_TRUE(reached_end)
      << "expected token list must finish with kUpb_TokenType_End";
}
|
|
|
|
|
|
|
|
// Inputs paired with the token sequence expected when whitespace and
// newlines are reported (kUpb_TokenizerOption_ReportNewlines).  Each token
// is listed as {type, text, line, column, end_column}.
MultiTokenCase kMultiWhitespaceTokenCases[] = {
    // Test all token types at the same time.
    //
    // The whitespace runs are sized to match the expected columns: two
    // spaces after "1.2" (columns 11 to 13) and three spaces at the start of
    // the second line (columns 0 to 3).  With single spaces the input text
    // and token texts contradict those column values.
    {"foo 1 \t1.2  \n   +\v'bar'",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Whitespace, " ", 0, 3, 4},
         {kUpb_TokenType_Integer, "1", 0, 4, 5},
         {kUpb_TokenType_Whitespace, " \t", 0, 5, 8},
         {kUpb_TokenType_Float, "1.2", 0, 8, 11},
         {kUpb_TokenType_Whitespace, "  ", 0, 11, 13},
         {kUpb_TokenType_Newline, "\n", 0, 13, 0},
         {kUpb_TokenType_Whitespace, "   ", 1, 0, 3},
         {kUpb_TokenType_Symbol, "+", 1, 3, 4},
         {kUpb_TokenType_Whitespace, "\v", 1, 4, 5},
         {kUpb_TokenType_String, "'bar'", 1, 5, 10},
         {kUpb_TokenType_End, "", 1, 10, 10},
     }},

};
|
|
|
|
|
|
|
|
// Tokenizes each kMultiWhitespaceTokenCases input with
// kUpb_TokenizerOption_ReportNewlines and compares every token produced
// against the case's expected list, for every block size.
TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases,
        kBlockSizes) {
  // Set up the tokenizer.
  upb::Arena arena;
  auto input = TestInputStream(kMultiWhitespaceTokenCases_case.input.data(),
                               kMultiWhitespaceTokenCases_case.input.size(),
                               kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_ReportNewlines;
  auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  upb_Status status;
  upb_Status_Clear(&status);

  // Walk the expected tokens up to and including the end token.  The loop is
  // bounded by the list's size so that a case that forgets its End entry
  // fails the check below instead of indexing past the vector.
  const auto& expected = kMultiWhitespaceTokenCases_case.output;
  bool reached_end = false;
  for (size_t i = 0; i < expected.size() && !reached_end; ++i) {
    const TokenFields& token_fields = expected[i];

    // Token numbers in the trace are 1-based.  The text is C-escaped because
    // the expected tokens here include raw tabs, newlines and vertical tabs,
    // which would otherwise garble the failure output.
    SCOPED_TRACE(testing::Message()
                 << "Token #" << (i + 1) << ": "
                 << absl::CEscape(token_fields.text));

    // Next() should only return false when it hits the end token.
    reached_end = (token_fields.type == kUpb_TokenType_End);
    if (reached_end) {
      EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
      EXPECT_TRUE(upb_Status_IsOk(&status));
    } else {
      EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(upb_Tokenizer_Type(t), token_fields.type);
    EXPECT_EQ(upb_Tokenizer_Line(t), token_fields.line);
    EXPECT_EQ(upb_Tokenizer_Column(t), token_fields.column);
    EXPECT_EQ(upb_Tokenizer_EndColumn(t), token_fields.end_column);
    EXPECT_TRUE(
        StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data()));
  }

  EXPECT_TRUE(reached_end)
      << "expected token list must finish with kUpb_TokenType_End";
}
|
|
|
|
|
|
|
|
// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
|
|
|
|
// "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
|
|
|
|
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
|
|
|
|
|
|
|
|
// Exercises the "comment_style" option: with
// kUpb_TokenizerOption_CommentStyleShell the test expects text after '#' to
// be skipped while "//" and "/* */" come back as ordinary symbol tokens.
TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  const char* text =
      "foo # bar\n"
      "baz // qux\n"
      "corge /* grault */\n"
      "garply";
  // Token texts expected from successive Next() calls, in input order.
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/",      "/", "qux", "corge", "/",
                                 "*",   "grault", "*", "/",   "garply"};

  // Set up the tokenizer.
  upb::Arena arena;
  auto input =
      TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_CommentStyleShell;
  auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

  // Advance through tokens and check that they are parsed as expected.
  // Iterating the array directly avoids comparing a signed index against the
  // unsigned arraysize() result.
  for (const char* expected : kTokens) {
    EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), expected));
  }

  // There should be no more input and no errors.
  upb_Status status;
  upb_Status_Clear(&status);
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// -------------------------------------------------------------------
|
|
|
|
|
|
|
|
#if 0 // TODO(salo): Extended comments are currently unimplemented.
|
|
|
|
|
|
|
|
// In each case, the input is expected to have two tokens named "prev" and
|
|
|
|
// "next" with comments in between.
|
|
|
|
struct DocCommentCase {
|
|
|
|
std::string input;
|
|
|
|
|
|
|
|
const char* prev_trailing_comments;
|
|
|
|
const char* detached_comments[10];
|
|
|
|
const char* next_leading_comments;
|
|
|
|
};
|
|
|
|
|
|
|
|
inline std::ostream& operator<<(std::ostream& out,
|
|
|
|
const DocCommentCase& test_case) {
|
|
|
|
return out << absl::CEscape(test_case.input);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Each entry is {input, expected prev_trailing_comments,
// {expected detached_comments...}, expected next_leading_comments}.
DocCommentCase kDocCommentCases[] = {
    // No comments at all.
    {"prev next",

     "",
     {},
     ""},

    // A block comment on the same line between the two tokens.
    {"prev /* ignored */ next",

     "",
     {},
     ""},

    // Line comment on the same line as "prev".
    {"prev // trailing comment\n"
     "next",

     " trailing comment\n",
     {},
     ""},

    // Line comments directly above "next".
    {"prev\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     "",
     {},
     " leading comment\n"
     " line 2\n"},

    // Line comments below "prev", separated from "next" by a blank line.
    {"prev\n"
     "// trailing comment\n"
     "// line 2\n"
     "\n"
     "next",

     " trailing comment\n"
     " line 2\n",
     {},
     ""},

    // A trailing line comment followed by leading line comments.
    {"prev // trailing comment\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     " trailing comment\n",
     {},
     " leading comment\n"
     " line 2\n"},

    // Trailing and leading block comments.
    {"prev /* trailing block comment */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment ",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    // Multi-line trailing block comment followed by a leading block comment.
    {"prev\n"
     "/* trailing block comment\n"
     " * line 2\n"
     " * line 3\n"
     " */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment\n"
     " line 2\n"
     " line 3\n",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    // Trailing, three detached, and leading comments together.
    {"prev\n"
     "// trailing comment\n"
     "\n"
     "// detached comment\n"
     "// line 2\n"
     "\n"
     "// second detached comment\n"
     "/* third detached comment\n"
     " * line 2 */\n"
     "// leading comment\n"
     "next",

     " trailing comment\n",
     {" detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "},
     " leading comment\n"},

    // Empty block comment after "prev", then detached and leading comments.
    {"prev /**/\n"
     "\n"
     "// detached comment\n"
     "\n"
     "// leading comment\n"
     "next",

     "",
     {" detached comment\n"},
     " leading comment\n"},

    // Empty block comment after "prev", then a leading comment.
    {"prev /**/\n"
     "// leading comment\n"
     "next",

     "",
     {},
     " leading comment\n"},
};
|
|
|
|
|
|
|
|
// Reads "prev" and "next" from each kDocCommentCases input and checks the
// comments collected between them by NextWithComments().  A second tokenizer
// runs over the same input with all-null comment outputs to check that
// tokenizing still works when no comments are requested.
//
// NOTE(review): this test sits inside the `#if 0` region and still uses
// TestInputStream/Tokenizer/TestErrorCollector as classes; confirm the
// intended API when extended comments are implemented.
TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(), kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  std::string prev_trailing_comments;
  std::vector<std::string> detached_comments;
  std::string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(nullptr, nullptr, nullptr);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  // The expected list is a fixed-size array terminated by a null slot, so
  // the tokenizer must return strictly fewer comments than the array has
  // slots.  Checking this once, fatally, keeps both the loop and the
  // terminator check below inside the array.  (The bound is the per-case
  // detached_comments array, not the number of test cases.)
  ASSERT_LT(detached_comments.size(),
            arraysize(kDocCommentCases_case.detached_comments));

  for (size_t i = 0; i < detached_comments.size(); i++) {
    EXPECT_TRUE(kDocCommentCases_case.detached_comments[i] != nullptr);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i], detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_TRUE(
      kDocCommentCases_case.detached_comments[detached_comments.size()] ==
      nullptr);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments, next_leading_comments);
}
|
|
|
|
|
|
|
|
#endif // 0
|
|
|
|
|
|
|
|
// -------------------------------------------------------------------
|
|
|
|
|
|
|
|
// Test parse helpers.
|
|
|
|
// TODO(b/225783758): Add a fuzz test for this.
|
|
|
|
TEST_F(TokenizerTest, ParseInteger) {
|
|
|
|
EXPECT_EQ(0, ParseInteger("0"));
|
|
|
|
EXPECT_EQ(123, ParseInteger("123"));
|
|
|
|
EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
|
|
|
|
EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
|
|
|
|
EXPECT_EQ(UINT64_MAX, ParseInteger("0xFFFFFFFFFFFFFFFF"));
|
|
|
|
EXPECT_EQ(01234567, ParseInteger("01234567"));
|
|
|
|
EXPECT_EQ(0X123, ParseInteger("0X123"));
|
|
|
|
|
|
|
|
// Test invalid integers that may still be tokenized as integers.
|
|
|
|
EXPECT_EQ(0, ParseInteger("0x"));
|
|
|
|
|
|
|
|
uint64_t i;
|
|
|
|
|
|
|
|
// Test invalid integers that will never be tokenized as integers.
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer("zxy", UINT64_MAX, &i));
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer("1.2", UINT64_MAX, &i));
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer("08", UINT64_MAX, &i));
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer("0xg", UINT64_MAX, &i));
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer("-1", UINT64_MAX, &i));
|
|
|
|
|
|
|
|
// Test overflows.
|
|
|
|
EXPECT_TRUE(upb_Parse_Integer("0", 0, &i));
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer("1", 0, &i));
|
|
|
|
EXPECT_TRUE(upb_Parse_Integer("1", 1, &i));
|
|
|
|
EXPECT_TRUE(upb_Parse_Integer("12345", 12345, &i));
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer("12346", 12345, &i));
|
|
|
|
EXPECT_TRUE(upb_Parse_Integer("0xFFFFFFFFFFFFFFFF", UINT64_MAX, &i));
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer("0x10000000000000000", UINT64_MAX, &i));
|
|
|
|
|
|
|
|
// Test near the limits of signed parsing (values in INT64_MAX +/- 1600)
|
|
|
|
for (int64_t offset = -1600; offset <= 1600; ++offset) {
|
|
|
|
// We make sure to perform an unsigned addition so that we avoid signed
|
|
|
|
// overflow, which would be undefined behavior.
|
|
|
|
uint64_t i = 0x7FFFFFFFFFFFFFFFu + static_cast<uint64_t>(offset);
|
|
|
|
char decimal[32];
|
|
|
|
snprintf(decimal, 32, "%llu", static_cast<unsigned long long>(i));
|
|
|
|
if (offset > 0) {
|
|
|
|
uint64_t parsed = -1;
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer(decimal, INT64_MAX, &parsed))
|
|
|
|
<< decimal << "=>" << parsed;
|
|
|
|
} else {
|
|
|
|
uint64_t parsed = -1;
|
|
|
|
EXPECT_TRUE(upb_Parse_Integer(decimal, INT64_MAX, &parsed))
|
|
|
|
<< decimal << "=>" << parsed;
|
|
|
|
EXPECT_EQ(parsed, i);
|
|
|
|
}
|
|
|
|
char octal[32];
|
|
|
|
snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
|
|
|
|
if (offset > 0) {
|
|
|
|
uint64_t parsed = -1;
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer(octal, INT64_MAX, &parsed))
|
|
|
|
<< octal << "=>" << parsed;
|
|
|
|
} else {
|
|
|
|
uint64_t parsed = -1;
|
|
|
|
EXPECT_TRUE(upb_Parse_Integer(octal, INT64_MAX, &parsed))
|
|
|
|
<< octal << "=>" << parsed;
|
|
|
|
EXPECT_EQ(parsed, i);
|
|
|
|
}
|
|
|
|
char hex[32];
|
|
|
|
snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(i));
|
|
|
|
if (offset > 0) {
|
|
|
|
uint64_t parsed = -1;
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer(hex, INT64_MAX, &parsed))
|
|
|
|
<< hex << "=>" << parsed;
|
|
|
|
} else {
|
|
|
|
uint64_t parsed = -1;
|
|
|
|
EXPECT_TRUE(upb_Parse_Integer(hex, INT64_MAX, &parsed)) << hex;
|
|
|
|
EXPECT_EQ(parsed, i);
|
|
|
|
}
|
|
|
|
// EXPECT_NE(offset, -237);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test near the limits of unsigned parsing (values in UINT64_MAX +/- 1600)
|
|
|
|
// By definition, values greater than UINT64_MAX cannot be held in a uint64_t
|
|
|
|
// variable, so printing them is a little tricky; fortunately all but the
|
|
|
|
// last four digits are known, so we can hard-code them in the printf string,
|
|
|
|
// and we only need to format the last 4.
|
|
|
|
for (int64_t offset = -1600; offset <= 1600; ++offset) {
|
|
|
|
{
|
|
|
|
uint64_t i = 18446744073709551615u + offset;
|
|
|
|
char decimal[32];
|
|
|
|
snprintf(decimal, 32, "1844674407370955%04llu",
|
|
|
|
static_cast<unsigned long long>(1615 + offset));
|
|
|
|
if (offset > 0) {
|
|
|
|
uint64_t parsed = -1;
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed))
|
|
|
|
<< decimal << "=>" << parsed;
|
|
|
|
} else {
|
|
|
|
uint64_t parsed = -1;
|
|
|
|
EXPECT_TRUE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed)) << decimal;
|
|
|
|
EXPECT_EQ(parsed, i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
{
|
|
|
|
uint64_t i = 01777777777777777777777u + offset;
|
|
|
|
if (offset > 0) {
|
|
|
|
char octal[32];
|
|
|
|
snprintf(octal, 32, "0200000000000000000%04llo",
|
|
|
|
static_cast<unsigned long long>(offset - 1));
|
|
|
|
uint64_t parsed = -1;
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer(octal, UINT64_MAX, &parsed))
|
|
|
|
<< octal << "=>" << parsed;
|
|
|
|
} else {
|
|
|
|
char octal[32];
|
|
|
|
snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
|
|
|
|
uint64_t parsed = -1;
|
|
|
|
EXPECT_TRUE(upb_Parse_Integer(octal, UINT64_MAX, &parsed)) << octal;
|
|
|
|
EXPECT_EQ(parsed, i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
{
|
|
|
|
uint64_t ui = 0xffffffffffffffffu + offset;
|
|
|
|
char hex[32];
|
|
|
|
if (offset > 0) {
|
|
|
|
snprintf(hex, 32, "0x1000000000000%04llx",
|
|
|
|
static_cast<unsigned long long>(offset - 1));
|
|
|
|
uint64_t parsed = -1;
|
|
|
|
EXPECT_FALSE(upb_Parse_Integer(hex, UINT64_MAX, &parsed))
|
|
|
|
<< hex << "=>" << parsed;
|
|
|
|
} else {
|
|
|
|
snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(ui));
|
|
|
|
uint64_t parsed = -1;
|
|
|
|
EXPECT_TRUE(upb_Parse_Integer(hex, UINT64_MAX, &parsed)) << hex;
|
|
|
|
EXPECT_EQ(parsed, ui);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Checks upb_Parse_Float() against well-formed floats, malformed-but-tolerated
// floats, the 'f' suffix, out-of-range magnitudes, and (in debug builds) text
// that must trigger a fatal diagnostic.
TEST_F(TokenizerTest, ParseFloat) {
  struct FloatCase {
    double want;       // value upb_Parse_Float() should return
    const char* text;  // text handed to upb_Parse_Float()
  };

  const FloatCase kCases[] = {
      // Ordinary decimal and exponent forms.
      {1.0, "1."},
      {1e3, "1e3"},
      {1e3, "1E3"},
      {1.5e3, "1.5e3"},
      {.1, ".1"},
      {.25, ".25"},
      {.1e3, ".1e3"},
      {.25e3, ".25e3"},
      {.1e+3, ".1e+3"},
      {.1e-3, ".1e-3"},
      {5.0, "5"},
      {6e-12, "6e-12"},
      {1.2, "1.2"},
      {1.e2, "1.e2"},

      // Incomplete exponents: malformed, yet the leading value is expected
      // back.
      {1.0, "1e"},
      {1.0, "1e-"},
      {1.0, "1.e"},

      // A trailing 'f' / 'F' suffix is accepted.
      {1.0, "1f"},
      {1.0, "1.0f"},
      {1.0, "1F"},
  };
  for (const FloatCase& c : kCases) {
    EXPECT_DOUBLE_EQ(c.want, upb_Parse_Float(c.text)) << c.text;
  }

  // Out-of-range magnitudes still parse: an underflow collapses to zero and an
  // overflow saturates to infinity.
  EXPECT_EQ(0.0, upb_Parse_Float("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, upb_Parse_Float("1e+9999999999999999999999999999"));

#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Text that could never have come out of the tokenizer as a float.
  const char* const kNeverFloats[] = {"zxy", "1-e0", "-1.0"};
  for (const char* bad_text : kNeverFloats) {
    EXPECT_DEBUG_DEATH(
        upb_Parse_Float(bad_text),
        "passed text that could not have been tokenized as a float");
  }
#endif  // GTEST_HAS_DEATH_TEST
}
|
|
|
|
|
|
|
|
// Feeds a series of string-literal tokens to upb_Parse_String() and checks
// the unescaped result. `inputs[i]` is expected to produce `outputs[i]`; a
// static_assert keeps the two tables the same length.
TEST_F(TokenizerTest, ParseString) {
  const std::string inputs[] = {
      "'hello'",
      "\"blah\\nblah2\"",
      "'\\1x\\1\\123\\739\\52\\334n\\3'",
      "'\\x20\\x4'",

      // Invalid strings that may still be tokenized as strings.
      "\"\\a\\l\\v\\t",  // \l is invalid
      "'",
      "'\\",

      // Unicode escapes: one-, two-, three- and four-byte UTF-8 characters.
      "'\\u0024\\u00a2\\u20ac\\U00024b62XX'",
      "'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'",  // Same, encoded using UTF16.

      // Broken UTF16: a head surrogate with no tail surrogate. It is emitted
      // as if it were UTF8; not a defined code point, but it has a defined
      // encoding.
      "'\\ud852XX'",

      // Malformed \u escape (too few hex digits).
      "'\\u0'",

      // Beyond the range of valid UTF-32 code units.
      "'\\U00110000\\U00200000\\UFFFFFFFF'",
  };

  const std::string outputs[] = {
      "hello",
      "blah\nblah2",
      "\1x\1\123\739\52\334n\3",
      "\x20\x4",

      "\a?\v\t",
      "",
      "\\",

      "$¢€𤭢XX",
      "$¢€𤭢XX",

      "\xed\xa1\x92XX",

      "u0",

      "\\U00110000\\U00200000\\Uffffffff",
  };

  constexpr size_t kNumCases = sizeof(inputs) / sizeof(inputs[0]);
  static_assert(kNumCases == sizeof(outputs) / sizeof(outputs[0]),
                "every input needs exactly one expected output");

  upb::Arena arena;

  for (size_t i = 0; i < kNumCases; i++) {
    auto sv = upb_Parse_String(inputs[i].data(), arena.ptr());
    // Name the offending input so a failure can be located in the table.
    EXPECT_TRUE(StringEquals(sv.data, outputs[i].data()))
        << "case " << i << ", input: " << absl::CEscape(inputs[i]);
  }

  // Test invalid strings that will never be tokenized as strings.
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(
      upb_Parse_String("", arena.ptr()),
      "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}
|
|
|
|
|
|
|
|
// Verifies that a parsed string literal can be appended to a upb_String that
// already holds content.
TEST_F(TokenizerTest, ParseStringAppend) {
  upb::Arena arena;

  // Start with some pre-existing content in the destination.
  upb_String output;
  upb_String_Init(&output, arena.ptr());
  static const char kPrefix[] = "stuff+";
  upb_String_Assign(&output, kPrefix, sizeof(kPrefix) - 1);

  // Parse a literal and confirm its unescaped value on its own.
  const auto parsed = upb_Parse_String("'hello'", arena.ptr());
  EXPECT_TRUE(StringEquals(parsed.data, "hello"));

  // Appending must keep the prefix and add the parsed text after it.
  upb_String_Append(&output, parsed.data, parsed.size);
  EXPECT_TRUE(StringEquals(upb_String_Data(&output), "stuff+hello"));
}
|
|
|
|
|
|
|
|
// -------------------------------------------------------------------
|
|
|
|
|
|
|
|
// Each case parses some input text, ignoring the tokens produced, and
|
|
|
|
// checks that the error output matches what is expected.
|
|
|
|
struct ErrorCase {
|
|
|
|
std::string input;
|
|
|
|
const char* errors;
|
|
|
|
};
|
|
|
|
|
|
|
|
inline std::ostream& operator<<(std::ostream& out, const ErrorCase& test_case) {
|
|
|
|
return out << absl::CEscape(test_case.input);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Table of {input, expected error message} pairs consumed by the Errors test.
// Each message starts with a "<number>:<number>: " prefix.
ErrorCase kErrorCases[] = {
    // ---- String literals ----
    {"'\\l'", "0:2: Invalid escape sequence in string literal."},
    {"'\\X'", "0:2: Invalid escape sequence in string literal."},
    {"'\\x'", "0:3: Expected hex digits for escape sequence."},
    {"'foo", "0:4: Unexpected end of string."},
    {"'bar\nfoo", "0:4: String literals cannot cross line boundaries."},
    {"'\\u01'", "0:5: Expected four hex digits for \\u escape sequence."},
    {"'\\uXYZ'", "0:3: Expected four hex digits for \\u escape sequence."},

    // ---- Decimal integers ----
    {"123foo", "0:3: Need space between number and identifier."},

    // ---- Hex / octal integers ----
    {"0x foo", "0:2: \"0x\" must be followed by hex digits."},
    {"0541823", "0:4: Numbers starting with leading zero must be in octal."},
    {"0x123z", "0:5: Need space between number and identifier."},
    {"0x123.4", "0:5: Hex and octal numbers must be integers."},
    {"0123.4", "0:4: Hex and octal numbers must be integers."},

    // ---- Floats ----
    {"1e foo", "0:2: \"e\" must be followed by exponent."},
    {"1e- foo", "0:3: \"e\" must be followed by exponent."},
    {"1.2.3",
     "0:3: Already saw decimal point or exponent; can't have another one."},
    {"1e2.3",
     "0:3: Already saw decimal point or exponent; can't have another one."},
    {"a.1", "0:1: Need space between identifier and decimal point."},
    // allow_f_after_float is not enabled here, so the suffix is an error.
    {"1.0f", "0:3: Need space between number and identifier."},

    // ---- Block comments ----
    {"/*",
     "0:2: End-of-file inside block comment.\n0:0: Comment started here."},
    {"/*/*/ foo",
     "0:3: \"/*\" inside block comment. Block comments cannot be nested."},

    // ---- Control characters ----
    // A run of consecutive control characters yields a single error.
    {"\b foo", "0:0: Invalid control characters encountered in text."},
    {"\b\b foo", "0:0: Invalid control characters encountered in text."},

    // A control character at the very end of the input must not make the
    // tokenizer loop forever.
    {"\b", "0:0: Invalid control characters encountered in text."},

    // Recovery from '\0'. The length is passed explicitly; otherwise the
    // std::string constructor would use strlen() and stop at the first '\0'.
    {std::string("\0foo", 4),
     "0:0: Invalid control characters encountered in text."},
    {std::string("\0\0foo", 5),
     "0:0: Invalid control characters encountered in text."},

    // ---- Bytes with the high bit set ----
    {"\300", "0:0: Interpreting non ascii codepoint 192."},
};
|
|
|
|
|
|
|
|
// For every (error case, block size) combination: tokenize the case's input
// to exhaustion and compare the status message with the expected error text.
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  const auto& error_case = kErrorCases_case;

  // Wire the input text into a fresh tokenizer.
  upb::Arena arena;
  auto input =
      TestInputStream(error_case.input.data(), error_case.input.size(),
                      kBlockSizes_case, arena.ptr());
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

  upb_Status status;
  upb_Status_Clear(&status);

  // Drain every token; only the resulting status matters here.
  while (upb_Tokenizer_Next(t, &status)) {
  }

  const auto actual_message = upb_Status_ErrorMessage(&status);
  EXPECT_TRUE(StringEquals(actual_message, error_case.errors));
}
|
|
|
|
|
|
|
|
// -------------------------------------------------------------------
|
|
|
|
|
|
|
|
// After reading a single token and finalizing the tokenizer, the input stream
// should report that only that token's bytes were consumed.
TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  const std::string text = "foo bar";
  upb::Arena arena;
  auto input =
      TestInputStream(text.data(), text.size(), kBlockSizes_case, arena.ptr());

  // Tokenizer lifetime: create, pull exactly one token, tear down.
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());
  upb_Tokenizer_Next(t, nullptr);
  upb_Tokenizer_Fini(t);

  // The stream's byte count must cover "foo" and nothing beyond it.
  const size_t expected_consumed = strlen("foo");
  EXPECT_EQ(expected_consumed, upb_ZeroCopyInputStream_ByteCount(input));
}
|
|
|
|
|
|
|
|
// String-literal tokens passed one at a time to upb_Parse_String() by the
// Benchmark tests.
static const char* kParseBenchmark[] = {
    // Short identifier-like literals.
    "\"partner-google-mobile-modes-print\"",
    "\"partner-google-mobile-modes-products\"",
    "\"partner-google-mobile-modes-realtime\"",
    "\"partner-google-mobile-modes-video\"",
    "\"partner-google-modes-news\"",
    "\"partner-google-modes-places\"",
    "\"partner-google-news\"",
    "\"partner-google-print\"",
    "\"partner-google-products\"",
    "\"partner-google-realtime\"",
    "\"partner-google-video\"",
    "\"true\"",

    // Literals containing spaces, raw newlines and inner quotes.
    "\"BigImagesHover__js_list\"",
    "\"XFEExternJsVersionParameters\"",
    "\"Available versions of the big images hover javascript\"",
    "\"Version: {\n\"",
    "\" script_name: \"extern_js/dummy_file_compiled_post20070813.js\"\n\"",
    "\" version_number: 0\n\"",
    "\"}\"",
    "\"BigImagesHover__js_selection\"",
    "\"XFEExternJsVersionParameters\"",
    "\"Versioning info for the big images hover javascript.\"",
    "\"current_version: 0\"",
    "\"BigImagesHover__js_suppressed\"",
    "\"Indicates if the client-side javascript associated with big images.\"",
    "\"true\"",

    "\"BrowserAnyOf\"",
    "\"IsChrome5OrAbove\"",
    "\"IsFirefox3OrAbove\"",
    // NOTE(review): unlike every other entry this one carries no surrounding
    // quotes -- confirm that is intentional.
    "IsIE8OrAboveBinary",
    "\"Abe \"Sausage King\" Froman\"",
    "\"Frank \"Meatball\" Febbraro\"",
};
|
|
|
|
|
|
|
|
// Parses every benchmark literal and sums the sizes of the results; the total
// must be non-zero.
TEST(Benchmark, ParseStringAppendAccumulate) {
  upb::Arena arena;
  size_t outsize = 0;
  // Range-for avoids the int index that was compared against an array length.
  for (const char* text : kParseBenchmark) {
    const auto sv = upb_Parse_String(text, arena.ptr());
    outsize += sv.size;
  }
  // Unsigned literal: `outsize` is a size_t, so comparing it with a plain `0`
  // mixes signed and unsigned operands inside the assertion.
  EXPECT_NE(0u, outsize);
}
|
|
|
|
|
|
|
|
// Parses every benchmark literal and appends each result to one upb_String;
// the accumulated string must end up non-empty.
TEST(Benchmark, ParseStringAppend) {
  upb::Arena arena;

  upb_String output;
  upb_String_Init(&output, arena.ptr());

  for (const char* text : kParseBenchmark) {
    const auto parsed = upb_Parse_String(text, arena.ptr());
    upb_String_Append(&output, parsed.data, parsed.size);
  }

  EXPECT_NE(0, upb_String_Size(&output));
}
|
|
|
|
|
|
|
|
// These tests validate the Tokenizer's handling of Unicode escapes.
|
|
|
|
|
|
|
|
// Encode a single code point as UTF8.
|
|
|
|
static std::string StandardUTF8(uint32_t code_point) {
|
|
|
|
char buffer[4];
|
|
|
|
int count = upb_Unicode_ToUTF8(code_point, &buffer[0]);
|
|
|
|
|
|
|
|
EXPECT_NE(count, 0) << "Failed to encode point " << std::hex << code_point;
|
|
|
|
return std::string(reinterpret_cast<const char*>(buffer), count);
|
|
|
|
}
|
|
|
|
|
|
|
|
static std::string DisplayHex(const std::string& data) {
|
|
|
|
std::string output;
|
|
|
|
for (int i = 0; i < data.size(); ++i) {
|
|
|
|
absl::StrAppendFormat(&output, "%02x ", data[i]);
|
|
|
|
}
|
|
|
|
return output;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ExpectFormat(const std::string& expectation,
|
|
|
|
const std::string& formatted) {
|
|
|
|
upb::Arena arena;
|
|
|
|
auto sv = upb_Parse_String(formatted.data(), arena.ptr());
|
|
|
|
EXPECT_EQ(strcmp(sv.data, expectation.data()), 0)
|
|
|
|
<< ": Incorrectly parsed " << formatted << ":\nGot "
|
|
|
|
<< DisplayHex(sv.data) << "\nExpected " << DisplayHex(expectation);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Every non-surrogate code point below 0x10000 must parse to its UTF-8
// encoding from each supported escape spelling.
TEST(TokenizerHandlesUnicode, BMPCodes) {
  for (uint32_t cp = 0; cp < 0x10000; ++cp) {
    // A surrogate on its own has no defined UTF-8 encoding, so skip both
    // halves of the surrogate range.
    const bool is_surrogate = upb_Unicode_IsHigh(cp) || upb_Unicode_IsLow(cp);
    if (is_surrogate) continue;

    const std::string want = StandardUTF8(cp);

    // Four-digit \u form, lower- and upper-case hex.
    ExpectFormat(want, absl::StrFormat("'\\u%04x'", cp));
    ExpectFormat(want, absl::StrFormat("'\\u%04X'", cp));
    // Eight-digit \U form, lower- and upper-case hex.
    ExpectFormat(want, absl::StrFormat("'\\U%08x'", cp));
    ExpectFormat(want, absl::StrFormat("'\\U%08X'", cp));
  }
}
|
|
|
|
|
|
|
|
// Every code point in [0x10000, 0x110000) must parse to its UTF-8 encoding,
// whether written as an eight-digit \U escape or as a pair of \u surrogates.
TEST(TokenizerHandlesUnicode, NonBMPCodes) {
  for (uint32_t cp = 0x10000; cp < 0x110000; ++cp) {
    const std::string want = StandardUTF8(cp);

    // Eight-digit \U form, lower- and upper-case hex.
    ExpectFormat(want, absl::StrFormat("'\\U%08x'", cp));
    ExpectFormat(want, absl::StrFormat("'\\U%08X'", cp));

    // UTF-16 surrogate pair spelled as two consecutive \u escapes.
    const auto high = upb_Unicode_ToHigh(cp);
    const auto low = upb_Unicode_ToLow(cp);
    ExpectFormat(want, absl::StrFormat("'\\u%04x\\u%04x'", high, low));
  }
}
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
} // namespace io
|
|
|
|
} // namespace proto2
|