add upb_Status to the tokenizer

PiperOrigin-RevId: 469721241
pull/13171/head
Eric Salo 3 years ago committed by Copybara-Service
parent c67021f84a
commit 0013c936ef
  1. upb/io/tokenizer.c (196 lines changed)
  2. upb/io/tokenizer.h (28 lines changed)
  3. upb/io/tokenizer_test.cc (233 lines changed)
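
In short, this commit deletes the callback-based upb_ErrorCollector interface and instead surfaces tokenizer errors through a upb_Status out-parameter on upb_Tokenizer_Next(). A minimal sketch of the new calling convention follows; the scan() helper and the include set are illustrative assumptions, but every upb_* call it makes appears in the diff below:

#include <stdio.h>

#include "upb/io/tokenizer.h"  // assumed to provide upb_Status and upb_Arena

static void scan(upb_ZeroCopyInputStream* input, upb_Arena* arena) {
  upb_Tokenizer* t = upb_Tokenizer_New(NULL, 0, input, 0, arena);
  upb_Status status;
  upb_Status_Clear(&status);
  // Next() returns true while tokens remain; inspect them via the accessors.
  while (upb_Tokenizer_Next(t, &status)) {
    printf("token type %d\n", (int)upb_Tokenizer_Type(t));
  }
  // On false, status stays clear for plain EOF and is set on a lexing error.
  if (!upb_Status_IsOk(&status)) {
    printf("lex error: %s\n", upb_Status_ErrorMessage(&status));
  }
  upb_Tokenizer_Fini(t);
}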

@@ -164,9 +164,8 @@ struct upb_Tokenizer {
int token_end_column;
upb_ZeroCopyInputStream* input;
upb_ErrorCollector* error_collector;
upb_Arena* arena;
upb_Status* status;
char current_char; // == buffer_[buffer_pos_], updated by NextChar().
const char* buffer; // Current buffer returned from input_.
@@ -195,17 +194,25 @@ struct upb_Tokenizer {
// position within the current buffer where recording started.
upb_String* record_target;
int record_start;
int options;
jmp_buf err;
};
// -------------------------------------------------------------------
// Internal helpers.
// Convenience methods to report an error at the current line and column.
UPB_NORETURN static void ReportError(upb_Tokenizer* t, const char* msg) {
upb_Status_SetErrorFormat(t->status, "%d:%d: %s", t->line, t->column, msg);
UPB_LONGJMP(t->err, 1);
}
// Convenience method to add an error at the current line and column.
static void AddError(upb_Tokenizer* t, const char* message) {
t->error_collector->AddError(t->line, t->column, message,
t->error_collector->context);
UPB_NORETURN UPB_PRINTF(2, 3) static void ReportErrorFormat(upb_Tokenizer* t,
const char* fmt,
...) {
va_list args;
va_start(args, fmt);
char msg[128];
vsnprintf(msg, sizeof(msg), fmt, args);
ReportError(t, msg);
}
// Read a new buffer from the input.
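
A note on the mechanism: ReportError() is UPB_NORETURN because errors no longer return through callbacks; upb_Tokenizer_Next() arms t->err with UPB_SETJMP (visible in the upb_Tokenizer_Next hunk below) and ReportError() fills the status and longjmps straight back to it. Below is a standalone sketch of that control flow, assuming UPB_SETJMP/UPB_LONGJMP are thin wrappers over setjmp/longjmp; the Lexer/fail/next_token names are hypothetical, for illustration only:

#include <setjmp.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
  jmp_buf err;      // armed once per next_token() call, like t->err above
  const char* msg;  // stands in for upb_Status
} Lexer;

// Analogous to ReportError(): record the message, then unwind.
static void fail(Lexer* lx, const char* msg) {
  lx->msg = msg;
  longjmp(lx->err, 1);  // never returns, like UPB_LONGJMP(t->err, 1)
}

static bool next_token(Lexer* lx) {
  if (setjmp(lx->err)) return false;  // fail() lands here with msg recorded
  fail(lx, "unexpected end of string");  // deep scan code can bail anywhere
  return true;  // unreachable in this sketch
}

int main(void) {
  Lexer lx;
  if (!next_token(&lx)) printf("error: %s\n", lx.msg);
  return 0;
}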
@@ -339,12 +346,12 @@ static void ConsumeZeroOrMore(upb_Tokenizer* t, bool (*f)(char)) {
static void ConsumeOneOrMore(upb_Tokenizer* t, bool (*f)(char),
const char* err_msg) {
if (!f(t->current_char)) {
AddError(t, err_msg);
} else {
do {
NextChar(t);
} while (f(t->current_char));
ReportError(t, err_msg);
}
do {
NextChar(t);
} while (f(t->current_char));
}
// -----------------------------------------------------------------
@@ -358,13 +365,10 @@ static void ConsumeString(upb_Tokenizer* t, char delimiter) {
while (true) {
switch (t->current_char) {
case '\0':
AddError(t, "Unexpected end of string.");
return;
ReportError(t, "Unexpected end of string.");
case '\n': {
AddError(t, "String literals cannot cross line boundaries.");
return;
}
case '\n':
ReportError(t, "String literals cannot cross line boundaries.");
case '\\': {
// An escape sequence.
@@ -377,7 +381,7 @@ static void ConsumeString(upb_Tokenizer* t, char delimiter) {
// to do so explicitly here.
} else if (TryConsume(t, 'x')) {
if (!TryConsumeOne(t, upb_Tokenizer_IsHexDigit)) {
AddError(t, "Expected hex digits for escape sequence.");
ReportError(t, "Expected hex digits for escape sequence.");
}
// Possibly followed by another hex digit, but again we don't care.
} else if (TryConsume(t, 'u')) {
@@ -385,7 +389,7 @@ static void ConsumeString(upb_Tokenizer* t, char delimiter) {
!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
!TryConsumeOne(t, upb_Tokenizer_IsHexDigit)) {
AddError(t, "Expected four hex digits for \\u escape sequence.");
ReportError(t, "Expected four hex digits for \\u escape sequence.");
}
} else if (TryConsume(t, 'U')) {
// We expect 8 hex digits; but only the range up to 0x10ffff is
@@ -397,12 +401,12 @@ static void ConsumeString(upb_Tokenizer* t, char delimiter) {
!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
!TryConsumeOne(t, upb_Tokenizer_IsHexDigit)) {
AddError(t,
"Expected eight hex digits up to 10ffff for \\U escape "
"sequence");
ReportError(t,
"Expected eight hex digits up to 10ffff for \\U escape "
"sequence");
}
} else {
AddError(t, "Invalid escape sequence in string literal.");
ReportError(t, "Invalid escape sequence in string literal.");
}
break;
}
@@ -436,8 +440,7 @@ static upb_TokenType ConsumeNumber(upb_Tokenizer* t, bool started_with_zero,
// An octal number (had a leading zero).
ConsumeZeroOrMore(t, upb_Tokenizer_IsOctalDigit);
if (LookingAt(t, upb_Tokenizer_IsDigit)) {
AddError(t, "Numbers starting with leading zero must be in octal.");
ConsumeZeroOrMore(t, upb_Tokenizer_IsDigit);
ReportError(t, "Numbers starting with leading zero must be in octal.");
}
} else {
@@ -467,13 +470,15 @@ static upb_TokenType ConsumeNumber(upb_Tokenizer* t, bool started_with_zero,
}
if (LookingAt(t, upb_Tokenizer_IsLetter)) {
AddError(t, "Need space between number and identifier.");
} else if (t->current_char == '.') {
ReportError(t, "Need space between number and identifier.");
}
if (t->current_char == '.') {
if (is_float) {
AddError(
ReportError(
t, "Already saw decimal point or exponent; can't have another one.");
} else {
AddError(t, "Hex and octal numbers must be integers.");
ReportError(t, "Hex and octal numbers must be integers.");
}
}
@@ -528,15 +533,12 @@ static void ConsumeBlockComment(upb_Tokenizer* t, upb_String* content) {
} else if (TryConsume(t, '/') && t->current_char == '*') {
// Note: We didn't consume the '*' because if there is a '/' after it
// we want to interpret that as the end of the comment.
AddError(
ReportError(
t, "\"/*\" inside block comment. Block comments cannot be nested.");
} else if (t->current_char == '\0') {
AddError(t, "End-of-file inside block comment.");
t->error_collector->AddError(start_line, start_column,
" Comment started here.",
t->error_collector->context);
if (content != NULL) StopRecording(t);
break;
ReportErrorFormat(
t, "End-of-file inside block comment.\n%d:%d: Comment started here.",
start_line, start_column);
}
}
}
@@ -621,19 +623,20 @@ upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t) {
return t->token_type;
}
bool upb_Tokenizer_Next(upb_Tokenizer* t) {
bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status) {
t->status = status;
t->previous_type = t->token_type;
t->previous_line = t->token_line;
t->previous_column = t->token_column;
t->previous_end_column = t->token_end_column;
if (UPB_SETJMP(t->err)) return false;
while (!t->read_error) {
StartToken(t);
bool report_token = TryConsumeWhitespace(t) || TryConsumeNewline(t);
EndToken(t);
if (report_token) {
return true;
}
if (report_token) return true;
switch (TryConsumeCommentStart(t)) {
case kUpb_CommentType_Line:
@@ -652,70 +655,54 @@ bool upb_Tokenizer_Next(upb_Tokenizer* t) {
if (t->read_error) break;
if (LookingAt(t, upb_Tokenizer_IsUnprintable) || t->current_char == '\0') {
AddError(t, "Invalid control characters encountered in text.");
NextChar(t);
// Skip more unprintable characters, too. But, remember that '\0' is
// also what current_char_ is set to after EOF / read error. We have
// to be careful not to go into an infinite loop of trying to consume
// it, so make sure to check read_error_ explicitly before consuming
// '\0'.
while (TryConsumeOne(t, upb_Tokenizer_IsUnprintable) ||
(!t->read_error && TryConsume(t, '\0'))) {
// Ignore.
}
ReportError(t, "Invalid control characters encountered in text.");
}
} else {
// Reading some sort of token.
StartToken(t);
if (TryConsumeOne(t, upb_Tokenizer_IsLetter)) {
ConsumeZeroOrMore(t, upb_Tokenizer_IsAlphanumeric);
t->token_type = kUpb_TokenType_Identifier;
} else if (TryConsume(t, '0')) {
t->token_type = ConsumeNumber(t, true, false);
} else if (TryConsume(t, '.')) {
// This could be the beginning of a floating-point number, or it could
// just be a '.' symbol.
if (TryConsumeOne(t, upb_Tokenizer_IsDigit)) {
// It's a floating-point number.
if (t->previous_type == kUpb_TokenType_Identifier &&
t->token_line == t->previous_line &&
t->token_column == t->previous_end_column) {
// We don't accept syntax like "blah.123".
t->error_collector->AddError(
t->line, t->column - 2,
"Need space between identifier and decimal point.",
t->error_collector->context);
}
t->token_type = ConsumeNumber(t, false, true);
} else {
t->token_type = kUpb_TokenType_Symbol;
// Reading some sort of token.
StartToken(t);
if (TryConsumeOne(t, upb_Tokenizer_IsLetter)) {
ConsumeZeroOrMore(t, upb_Tokenizer_IsAlphanumeric);
t->token_type = kUpb_TokenType_Identifier;
} else if (TryConsume(t, '0')) {
t->token_type = ConsumeNumber(t, true, false);
} else if (TryConsume(t, '.')) {
// This could be the beginning of a floating-point number, or it could
// just be a '.' symbol.
if (TryConsumeOne(t, upb_Tokenizer_IsDigit)) {
// It's a floating-point number.
if (t->previous_type == kUpb_TokenType_Identifier &&
t->token_line == t->previous_line &&
t->token_column == t->previous_end_column) {
// We don't accept syntax like "blah.123".
t->column -= 2;
ReportError(t, "Need space between identifier and decimal point.");
}
} else if (TryConsumeOne(t, upb_Tokenizer_IsDigit)) {
t->token_type = ConsumeNumber(t, false, false);
} else if (TryConsume(t, '\"')) {
ConsumeString(t, '\"');
t->token_type = kUpb_TokenType_String;
} else if (TryConsume(t, '\'')) {
ConsumeString(t, '\'');
t->token_type = kUpb_TokenType_String;
t->token_type = ConsumeNumber(t, false, true);
} else {
// Check if the high order bit is set.
if (t->current_char & 0x80) {
char temp[80];
snprintf(temp, sizeof temp, "Interpreting non ascii codepoint %d.",
(uint8_t)t->current_char);
t->error_collector->AddError(t->line, t->column, temp,
t->error_collector->context);
}
NextChar(t);
t->token_type = kUpb_TokenType_Symbol;
}
EndToken(t);
return true;
} else if (TryConsumeOne(t, upb_Tokenizer_IsDigit)) {
t->token_type = ConsumeNumber(t, false, false);
} else if (TryConsume(t, '\"')) {
ConsumeString(t, '\"');
t->token_type = kUpb_TokenType_String;
} else if (TryConsume(t, '\'')) {
ConsumeString(t, '\'');
t->token_type = kUpb_TokenType_String;
} else {
// Check if the high order bit is set.
if (t->current_char & 0x80) {
ReportErrorFormat(t, "Interpreting non ascii codepoint %d.",
(uint8_t)t->current_char);
}
NextChar(t);
t->token_type = kUpb_TokenType_Symbol;
}
EndToken(t);
return true;
}
// EOF
@@ -724,6 +711,7 @@ bool upb_Tokenizer_Next(upb_Tokenizer* t) {
t->token_line = t->line;
t->token_column = t->column;
t->token_end_column = t->column;
upb_Status_Clear(status);
return false;
}
@@ -977,14 +965,12 @@ bool upb_Tokenizer_IsIdentifier(const char* text, int size) {
}
upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
upb_ZeroCopyInputStream* input,
upb_ErrorCollector* error_collector,
int options, upb_Arena* arena) {
upb_ZeroCopyInputStream* input, int options,
upb_Arena* arena) {
upb_Tokenizer* t = upb_Arena_Malloc(arena, sizeof(upb_Tokenizer));
if (!t) return NULL;
t->input = input;
t->error_collector = error_collector;
t->arena = arena;
t->buffer = data;
t->buffer_size = size;

@@ -99,35 +99,19 @@ typedef enum {
kUpb_TokenizerOption_CommentStyleShell = 1 << 3,
} upb_Tokenizer_Option;
// Abstract interface for an object which collects the errors that occur
// during parsing. A typical implementation might simply print the errors
// to stdout.
typedef struct {
// Indicates that there was an error in the input at the given line and
// column numbers. The numbers are zero-based, so you may want to add
// 1 to each before printing them.
void (*AddError)(int line, int column, const char* message, void* context);
// Indicates that there was a warning in the input at the given line and
// column numbers. The numbers are zero-based, so you may want to add
// 1 to each before printing them.
void (*AddWarning)(int line, int column, const char* message, void* context);
// Opaque pointer, passed an as argument to the above functions.
void* context;
} upb_ErrorCollector;
typedef struct upb_Tokenizer upb_Tokenizer;
// Can be passed a flat array and/or a ZCIS as input.
// The array will be read first (if non-NULL), then the stream (if non-NULL).
upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
upb_ZeroCopyInputStream* input,
upb_ErrorCollector* error_collector,
int options, upb_Arena* arena);
upb_ZeroCopyInputStream* input, int options,
upb_Arena* arena);
void upb_Tokenizer_Fini(upb_Tokenizer* t);
bool upb_Tokenizer_Next(upb_Tokenizer* t);
// Advance the tokenizer to the next input token. Returns true on success.
// Returns false on EOF (clearing *status) or on error (setting *status).
bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status);
// Accessors for inspecting current/previous parse tokens,
// which are opaque to the tokenizer (to reduce copying).

@@ -124,38 +124,6 @@ upb_ZeroCopyInputStream* TestInputStream(const void* data, size_t size,
// -------------------------------------------------------------------
typedef struct {
upb_ErrorCollector base;
upb_String text;
} TestErrorCollector;
static void TestErrorCollector_AddError(int line, int column,
const char* message, void* context) {
TestErrorCollector* t = (TestErrorCollector*)context;
char temp[800];
int len = snprintf(temp, sizeof(temp), "%d:%d: %s\n", line, column, message);
upb_String_Append(&t->text, temp, len);
}
static void TestErrorCollector_AddWarning(int line, int column,
const char* message, void* context) {}
static upb_ErrorCollector* TestErrorCollector_New(upb_Arena* arena) {
TestErrorCollector* out =
(TestErrorCollector*)upb_Arena_Malloc(arena, sizeof(*out));
out->base.AddError = TestErrorCollector_AddError;
out->base.AddWarning = TestErrorCollector_AddWarning;
out->base.context = out;
upb_String_Init(&out->text, arena);
return (upb_ErrorCollector*)out;
}
// -------------------------------------------------------------------
// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't. This is sort of a brute-force approach to this,
@@ -240,8 +208,7 @@ TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
auto input = TestInputStream(kSimpleTokenCases_case.input.data(),
kSimpleTokenCases_case.input.size(),
kBlockSizes_case, arena.ptr());
auto error_collector = TestErrorCollector_New(arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, error_collector, 0, arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, 0, arena.ptr());
// Before Next() is called, the initial token should always be TYPE_START.
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
@@ -251,7 +218,7 @@ TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));
// Parse the token.
EXPECT_TRUE(upb_Tokenizer_Next(t));
EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
// Check that it has the right type.
EXPECT_EQ(upb_Tokenizer_Type(t), kSimpleTokenCases_case.type);
// Check that it contains the complete input text.
@@ -263,8 +230,12 @@ TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
EXPECT_EQ(upb_Tokenizer_Column(t), 0);
EXPECT_EQ(upb_Tokenizer_EndColumn(t), kSimpleTokenCases_case.input.size());
// There should be no more input.
EXPECT_FALSE(upb_Tokenizer_Next(t));
upb_Status status;
upb_Status_Clear(&status);
// There should be no more input and no errors.
EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
EXPECT_TRUE(upb_Status_IsOk(&status));
// After Next() returns false, the token should have type TYPE_END.
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_End);
@@ -272,9 +243,6 @@ TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
EXPECT_EQ(upb_Tokenizer_Column(t), kSimpleTokenCases_case.input.size());
EXPECT_EQ(upb_Tokenizer_EndColumn(t), kSimpleTokenCases_case.input.size());
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));
// There should be no errors.
EXPECT_TRUE(upb_String_Empty(&((TestErrorCollector*)error_collector)->text));
}
TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
@@ -285,33 +253,33 @@ TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
const char* text = "1f 2.5f 6e3f 7F";
auto input =
TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
auto error_collector = TestErrorCollector_New(arena.ptr());
const int options = kUpb_TokenizerOption_AllowFAfterFloat;
auto t =
upb_Tokenizer_New(NULL, 0, input, error_collector, options, arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, options, arena.ptr());
// Advance through tokens and check that they are parsed as expected.
EXPECT_TRUE(upb_Tokenizer_Next(t));
EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "1f"));
EXPECT_TRUE(upb_Tokenizer_Next(t));
EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "2.5f"));
EXPECT_TRUE(upb_Tokenizer_Next(t));
EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "6e3f"));
EXPECT_TRUE(upb_Tokenizer_Next(t));
EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "7F"));
// There should be no more input.
EXPECT_FALSE(upb_Tokenizer_Next(t));
// There should be no errors.
EXPECT_TRUE(upb_String_Empty(&((TestErrorCollector*)error_collector)->text));
upb_Status status;
upb_Status_Clear(&status);
// There should be no more input and no errors.
EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
EXPECT_TRUE(upb_Status_IsOk(&status));
}
SimpleTokenCase kWhitespaceTokenCases[] = {
@@ -332,26 +300,23 @@ TEST_2D(TokenizerTest, Whitespace, kWhitespaceTokenCases, kBlockSizes) {
auto input = TestInputStream(kWhitespaceTokenCases_case.input.data(),
kWhitespaceTokenCases_case.input.size(),
kBlockSizes_case, arena.ptr());
auto error_collector = TestErrorCollector_New(arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, error_collector, 0, arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, 0, arena.ptr());
EXPECT_FALSE(upb_Tokenizer_Next(t));
EXPECT_FALSE(upb_Tokenizer_Next(t, NULL));
}
{
auto input = TestInputStream(kWhitespaceTokenCases_case.input.data(),
kWhitespaceTokenCases_case.input.size(),
kBlockSizes_case, arena.ptr());
auto error_collector = TestErrorCollector_New(arena.ptr());
const int options = kUpb_TokenizerOption_ReportNewlines;
auto t = upb_Tokenizer_New(NULL, 0, input, error_collector, options,
arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, options, arena.ptr());
EXPECT_TRUE(upb_Tokenizer_Next(t));
EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
EXPECT_EQ(upb_Tokenizer_Type(t), kWhitespaceTokenCases_case.type);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t),
kWhitespaceTokenCases_case.input.data()));
EXPECT_FALSE(upb_Tokenizer_Next(t));
EXPECT_FALSE(upb_Tokenizer_Next(t, NULL));
}
}
@@ -476,8 +441,7 @@ TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
auto input = TestInputStream(kMultiTokenCases_case.input.data(),
kMultiTokenCases_case.input.size(),
kBlockSizes_case, arena.ptr());
auto error_collector = TestErrorCollector_New(arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, error_collector, 0, arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, 0, arena.ptr());
// Before Next() is called, the initial token should always be TYPE_START.
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
@@ -488,6 +452,8 @@ TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
// Loop through all expected tokens.
TokenFields token_fields;
upb_Status status;
upb_Status_Clear(&status);
int i = 0;
do {
token_fields = kMultiTokenCases_case.output[i++];
@@ -497,9 +463,10 @@ TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
// Next() should only return false when it hits the end token.
if (token_fields.type == kUpb_TokenType_End) {
EXPECT_FALSE(upb_Tokenizer_Next(t));
EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
EXPECT_TRUE(upb_Status_IsOk(&status));
} else {
EXPECT_TRUE(upb_Tokenizer_Next(t));
EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
}
// Check that the token matches the expected one.
@@ -511,9 +478,6 @@ TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
EXPECT_TRUE(
StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data()));
} while (token_fields.type != kUpb_TokenType_End);
// There should be no errors.
EXPECT_TRUE(upb_String_Empty(&((TestErrorCollector*)error_collector)->text));
}
MultiTokenCase kMultiWhitespaceTokenCases[] = {
@@ -543,10 +507,8 @@ TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases,
auto input = TestInputStream(kMultiWhitespaceTokenCases_case.input.data(),
kMultiWhitespaceTokenCases_case.input.size(),
kBlockSizes_case, arena.ptr());
auto error_collector = TestErrorCollector_New(arena.ptr());
const int options = kUpb_TokenizerOption_ReportNewlines;
auto t =
upb_Tokenizer_New(NULL, 0, input, error_collector, options, arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, options, arena.ptr());
// Before Next() is called, the initial token should always be TYPE_START.
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
@@ -557,6 +519,8 @@ TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases,
// Loop through all expected tokens.
TokenFields token_fields;
upb_Status status;
upb_Status_Clear(&status);
int i = 0;
do {
token_fields = kMultiWhitespaceTokenCases_case.output[i++];
@@ -565,10 +529,11 @@ TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases,
<< "Token #" << i << ": " << token_fields.text);
// Next() should only return false when it hits the end token.
if (token_fields.type != kUpb_TokenType_End) {
EXPECT_TRUE(upb_Tokenizer_Next(t));
if (token_fields.type == kUpb_TokenType_End) {
EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
EXPECT_TRUE(upb_Status_IsOk(&status));
} else {
EXPECT_FALSE(upb_Tokenizer_Next(t));
EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
}
// Check that the token matches the expected one.
@@ -579,9 +544,6 @@ TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases,
EXPECT_TRUE(
StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data()));
} while (token_fields.type != kUpb_TokenType_End);
// There should be no errors.
EXPECT_TRUE(upb_String_Empty(&((TestErrorCollector*)error_collector)->text));
}
// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
@@ -604,21 +566,20 @@ TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
upb::Arena arena;
auto input =
TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
auto error_collector = TestErrorCollector_New(arena.ptr());
const int options = kUpb_TokenizerOption_CommentStyleShell;
auto t =
upb_Tokenizer_New(NULL, 0, input, error_collector, options, arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, options, arena.ptr());
// Advance through tokens and check that they are parsed as expected.
for (int i = 0; i < arraysize(kTokens); i++) {
EXPECT_TRUE(upb_Tokenizer_Next(t));
EXPECT_TRUE(upb_Tokenizer_Next(t, NULL));
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), kTokens[i]));
}
// There should be no more input.
EXPECT_FALSE(upb_Tokenizer_Next(t));
// There should be no errors.
EXPECT_TRUE(upb_String_Empty(&((TestErrorCollector*)error_collector)->text));
// There should be no more input and no errors.
upb_Status status;
upb_Status_Clear(&status);
EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
EXPECT_TRUE(upb_Status_IsOk(&status));
}
#endif
@@ -1074,10 +1035,6 @@ TEST_F(TokenizerTest, ParseStringAppend) {
// checks that the error output matches what is expected.
struct ErrorCase {
std::string input;
bool recoverable; // True if the tokenizer should be able to recover and
// parse more tokens after seeing this error. Cases
// for which this is true must end with "foo" as
// the last token, which the test will check for.
const char* errors;
};
@@ -1087,69 +1044,61 @@ inline std::ostream& operator<<(std::ostream& out, const ErrorCase& test_case) {
ErrorCase kErrorCases[] = {
// String errors.
{"'\\l' foo", true, "0:2: Invalid escape sequence in string literal.\n"},
{"'\\X' foo", true, "0:2: Invalid escape sequence in string literal.\n"},
{"'\\x' foo", true, "0:3: Expected hex digits for escape sequence.\n"},
{"'foo", false, "0:4: Unexpected end of string.\n"},
{"'bar\nfoo", true, "0:4: String literals cannot cross line boundaries.\n"},
{"'\\u01' foo", true,
"0:5: Expected four hex digits for \\u escape sequence.\n"},
{"'\\u01' foo", true,
"0:5: Expected four hex digits for \\u escape sequence.\n"},
{"'\\uXYZ' foo", true,
"0:3: Expected four hex digits for \\u escape sequence.\n"},
{"'\\l'", "0:2: Invalid escape sequence in string literal."},
{"'\\X'", "0:2: Invalid escape sequence in string literal."},
{"'\\x'", "0:3: Expected hex digits for escape sequence."},
{"'foo", "0:4: Unexpected end of string."},
{"'bar\nfoo", "0:4: String literals cannot cross line boundaries."},
{"'\\u01'", "0:5: Expected four hex digits for \\u escape sequence."},
{"'\\uXYZ'", "0:3: Expected four hex digits for \\u escape sequence."},
// Integer errors.
{"123foo", true, "0:3: Need space between number and identifier.\n"},
{"123foo", "0:3: Need space between number and identifier."},
// Hex/octal errors.
{"0x foo", true, "0:2: \"0x\" must be followed by hex digits.\n"},
{"0541823 foo", true,
"0:4: Numbers starting with leading zero must be in octal.\n"},
{"0x123z foo", true, "0:5: Need space between number and identifier.\n"},
{"0x123.4 foo", true, "0:5: Hex and octal numbers must be integers.\n"},
{"0123.4 foo", true, "0:4: Hex and octal numbers must be integers.\n"},
{"0x foo", "0:2: \"0x\" must be followed by hex digits."},
{"0541823", "0:4: Numbers starting with leading zero must be in octal."},
{"0x123z", "0:5: Need space between number and identifier."},
{"0x123.4", "0:5: Hex and octal numbers must be integers."},
{"0123.4", "0:4: Hex and octal numbers must be integers."},
// Float errors.
{"1e foo", true, "0:2: \"e\" must be followed by exponent.\n"},
{"1e- foo", true, "0:3: \"e\" must be followed by exponent.\n"},
{"1.2.3 foo", true,
"0:3: Already saw decimal point or exponent; can't have another one.\n"},
{"1e2.3 foo", true,
"0:3: Already saw decimal point or exponent; can't have another one.\n"},
{"a.1 foo", true,
"0:1: Need space between identifier and decimal point.\n"},
{"1e foo", "0:2: \"e\" must be followed by exponent."},
{"1e- foo", "0:3: \"e\" must be followed by exponent."},
{"1.2.3",
"0:3: Already saw decimal point or exponent; can't have another one."},
{"1e2.3",
"0:3: Already saw decimal point or exponent; can't have another one."},
{"a.1", "0:1: Need space between identifier and decimal point."},
// allow_f_after_float not enabled, so this should be an error.
{"1.0f foo", true, "0:3: Need space between number and identifier.\n"},
{"1.0f", "0:3: Need space between number and identifier."},
// Block comment errors.
{"/*", false,
"0:2: End-of-file inside block comment.\n"
"0:0: Comment started here.\n"},
{"/*/*/ foo", true,
"0:3: \"/*\" inside block comment. Block comments cannot be nested.\n"},
{"/*",
"0:2: End-of-file inside block comment.\n0:0: Comment started here."},
{"/*/*/ foo",
"0:3: \"/*\" inside block comment. Block comments cannot be nested."},
// Control characters. Multiple consecutive control characters should only
// produce one error.
{"\b foo", true, "0:0: Invalid control characters encountered in text.\n"},
{"\b\b foo", true,
"0:0: Invalid control characters encountered in text.\n"},
{"\b foo", "0:0: Invalid control characters encountered in text."},
{"\b\b foo", "0:0: Invalid control characters encountered in text."},
// Check that control characters at end of input don't result in an
// infinite loop.
{"\b", false, "0:0: Invalid control characters encountered in text.\n"},
{"\b", "0:0: Invalid control characters encountered in text."},
// Check recovery from '\0'. We have to explicitly specify the length of
// these strings because otherwise the string constructor will just call
// strlen() which will see the first '\0' and think that is the end of the
// string.
{std::string("\0foo", 4), true,
"0:0: Invalid control characters encountered in text.\n"},
{std::string("\0\0foo", 5), true,
"0:0: Invalid control characters encountered in text.\n"},
{std::string("\0foo", 4),
"0:0: Invalid control characters encountered in text."},
{std::string("\0\0foo", 5),
"0:0: Invalid control characters encountered in text."},
// Check error from high order bits set
{"\300foo", true, "0:0: Interpreting non ascii codepoint 192.\n"},
{"\300", "0:0: Interpreting non ascii codepoint 192."},
};
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
@@ -1158,24 +1107,15 @@ TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
auto input = TestInputStream(kErrorCases_case.input.data(),
kErrorCases_case.input.size(), kBlockSizes_case,
arena.ptr());
auto error_collector = TestErrorCollector_New(arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, error_collector, 0, arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, 0, arena.ptr());
// Ignore all input, except remember if the last token was "foo".
bool last_was_foo = false;
while (upb_Tokenizer_Next(t)) {
last_was_foo = StringEquals(upb_Tokenizer_TextData(t), "foo");
}
upb_Status status;
upb_Status_Clear(&status);
// Check that the errors match what was expected.
EXPECT_TRUE(StringEquals(
upb_String_Data(&((TestErrorCollector*)error_collector)->text),
kErrorCases_case.errors));
// If the error was recoverable, make sure we saw "foo" after it.
if (kErrorCases_case.recoverable) {
EXPECT_TRUE(last_was_foo);
}
while (upb_Tokenizer_Next(t, &status))
; // just keep looping
EXPECT_TRUE(
StringEquals(upb_Status_ErrorMessage(&status), kErrorCases_case.errors));
}
// -------------------------------------------------------------------
@@ -1187,9 +1127,8 @@ TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
TestInputStream(text.data(), text.size(), kBlockSizes_case, arena.ptr());
// Create a tokenizer, read one token, then destroy it.
auto error_collector = TestErrorCollector_New(arena.ptr());
auto t = upb_Tokenizer_New(NULL, 0, input, error_collector, 0, arena.ptr());
upb_Tokenizer_Next(t);
auto t = upb_Tokenizer_New(NULL, 0, input, 0, arena.ptr());
upb_Tokenizer_Next(t, NULL);
upb_Tokenizer_Fini(t);
// Only "foo" should have been read.
