clean up tokenizer options and defaults

- Disallow multiline strings.
- Disallow a letter immediately following a number without intervening whitespace.
- Replace distinct bool option flags with a single options int.

PiperOrigin-RevId: 467829817
pull/13171/head
Eric Salo 3 years ago committed by Copybara-Service
parent 6861966501
commit 922a858e5c
  1. 67
      upb/io/tokenizer.c
  2. 18
      upb/io/tokenizer.h

@ -213,30 +213,9 @@ struct upb_Tokenizer {
upb_String* record_target;
int record_start;
// Options.
bool allow_f_after_float;
bool allow_multiline_strings;
bool require_space_after_number;
bool report_whitespace;
bool report_newlines;
bool comment_style_cpp;
bool comment_style_sh;
int options;
};
static void upb_Tokenizer_Options(upb_Tokenizer* t, int options) {
t->allow_f_after_float =
(options & kUpb_TokenizerOption_AllowFAfterFloat) != 0;
t->allow_multiline_strings =
(options & kUpb_TokenizerOption_AllowMultilineStrings) != 0;
t->require_space_after_number =
(options & kUpb_TokenizerOption_AllowFieldImmediatelyAfterNumber) == 0;
t->report_whitespace = (options & (kUpb_TokenizerOption_ReportWhitespace |
kUpb_TokenizerOption_ReportNewlines)) != 0;
t->report_newlines = (options & kUpb_TokenizerOption_ReportNewlines) != 0;
t->comment_style_sh = (options & kUpb_TokenizerOption_CommentStyleShell) != 0;
t->comment_style_cpp = !t->comment_style_sh;
}
// -------------------------------------------------------------------
// Internal helpers.
@ -400,12 +379,8 @@ static void ConsumeString(upb_Tokenizer* t, char delimiter) {
return;
case '\n': {
if (!t->allow_multiline_strings) {
AddError(t, "String literals cannot cross line boundaries.");
return;
}
NextChar(t);
break;
AddError(t, "String literals cannot cross line boundaries.");
return;
}
case '\\': {
@ -503,12 +478,12 @@ static upb_TokenType ConsumeNumber(upb_Tokenizer* t, bool started_with_zero,
"\"e\" must be followed by exponent.");
}
if (t->allow_f_after_float && (TryConsume(t, 'f') || TryConsume(t, 'F'))) {
is_float = true;
if (t->options & kUpb_TokenizerOption_AllowFAfterFloat) {
if (TryConsume(t, 'f') || TryConsume(t, 'F')) is_float = true;
}
}
if (LookingAt(t, upb_Tokenizer_IsLetter) && t->require_space_after_number) {
if (LookingAt(t, upb_Tokenizer_IsLetter)) {
AddError(t, "Need space between number and identifier.");
} else if (t->current_char == '.') {
if (is_float) {
@ -586,7 +561,10 @@ static void ConsumeBlockComment(upb_Tokenizer* t, upb_String* content) {
// If we're at the start of a new comment, consume it and return what kind
// of comment it is.
static upb_CommentType TryConsumeCommentStart(upb_Tokenizer* t) {
if (t->comment_style_cpp && TryConsume(t, '/')) {
const bool style_sh = t->options & kUpb_TokenizerOption_CommentStyleShell;
const bool style_cpp = !style_sh;
if (style_cpp && TryConsume(t, '/')) {
if (TryConsume(t, '/')) {
return kUpb_CommentType_Line;
} else if (TryConsume(t, '*')) {
@ -600,7 +578,7 @@ static upb_CommentType TryConsumeCommentStart(upb_Tokenizer* t) {
t->current.end_column = t->column;
return kUpb_CommentType_SlashNot;
}
} else if (t->comment_style_sh && TryConsume(t, '#')) {
} else if (style_sh && TryConsume(t, '#')) {
return kUpb_CommentType_Line;
} else {
return kUpb_CommentType_None;
@ -610,7 +588,7 @@ static upb_CommentType TryConsumeCommentStart(upb_Tokenizer* t) {
// If we're looking at a TYPE_WHITESPACE token and
// kUpb_TokenizerOption_ReportWhitespace is set, consume it and return true.
static bool TryConsumeWhitespace(upb_Tokenizer* t) {
if (t->report_newlines) {
if (t->options & kUpb_TokenizerOption_ReportNewlines) {
if (TryConsumeOne(t, upb_Tokenizer_IsWhitespaceNoNewline)) {
ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespaceNoNewline);
t->current.type = kUpb_TokenType_Whitespace;
@ -621,7 +599,7 @@ static bool TryConsumeWhitespace(upb_Tokenizer* t) {
if (TryConsumeOne(t, upb_Tokenizer_IsWhitespace)) {
ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespace);
t->current.type = kUpb_TokenType_Whitespace;
return t->report_whitespace;
return (t->options & kUpb_TokenizerOption_ReportWhitespace) != 0;
}
return false;
}
@ -629,12 +607,11 @@ static bool TryConsumeWhitespace(upb_Tokenizer* t) {
// If we're looking at a TYPE_NEWLINE token and
// kUpb_TokenizerOption_ReportNewlines is set, consume it and return true.
static bool TryConsumeNewline(upb_Tokenizer* t) {
if (!t->report_whitespace || !t->report_newlines) {
return false;
}
if (TryConsume(t, '\n')) {
t->current.type = kUpb_TokenType_Newline;
return true;
if (t->options & kUpb_TokenizerOption_ReportNewlines) {
if (TryConsume(t, '\n')) {
t->current.type = kUpb_TokenType_Newline;
return true;
}
}
return false;
}
@ -1055,13 +1032,17 @@ upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
t->record_target = NULL;
t->record_start = -1;
// ReportNewlines implies ReportWhitespace.
if (options & kUpb_TokenizerOption_ReportNewlines) {
options |= kUpb_TokenizerOption_ReportWhitespace;
}
t->options = options;
t->previous_type = kUpb_TokenType_Start;
t->previous_line = 0;
t->previous_column = 0;
t->previous_end_column = 0;
upb_Tokenizer_Options(t, options);
upb_Token_Init(&t->current, arena);
if (size) {

@ -87,24 +87,16 @@ typedef enum {
// interpreted as floats. For all other purposes, the 'f' is ignored.
kUpb_TokenizerOption_AllowFAfterFloat = 1 << 0,
// If set, allow string literals to span multiple lines.
// Do not use this; for Google-internal cleanup only.
kUpb_TokenizerOption_AllowMultilineStrings = 1 << 1,
// If set, allow a field name to appear immediately after a number without
// requiring any intervening whitespace as a delimiter.
// Do not use this; for Google-internal cleanup only.
kUpb_TokenizerOption_AllowFieldImmediatelyAfterNumber = 1 << 2,
// If set, whitespace tokens are reported by Next().
kUpb_TokenizerOption_ReportWhitespace = 1 << 3,
kUpb_TokenizerOption_ReportWhitespace = 1 << 1,
// If set, newline tokens are reported by Next(). Implies ReportWhitespace.
kUpb_TokenizerOption_ReportNewlines = 1 << 4,
// If set, newline tokens are reported by Next().
// This is a superset of ReportWhitespace.
kUpb_TokenizerOption_ReportNewlines = 1 << 2,
// By default the tokenizer expects C-style (// and /* */) comments.
// If set, it expects shell-style (#) comments instead.
kUpb_TokenizerOption_CommentStyleShell = 1 << 5,
kUpb_TokenizerOption_CommentStyleShell = 1 << 3,
} upb_Tokenizer_Option;
// Abstract interface for an object which collects the errors that occur

Loading…
Cancel
Save