Protocol Buffers - Google's data interchange format (grpc依赖)
https://developers.google.com/protocol-buffers/
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
133 lines
5.1 KiB
133 lines
5.1 KiB
// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd

// Class for parsing tokenized text from a ZeroCopyInputStream.
|
|
|
#ifndef UPB_IO_TOKENIZER_H_ |
|
#define UPB_IO_TOKENIZER_H_ |
|
|
|
#include "upb/base/status.h" |
|
#include "upb/base/string_view.h" |
|
#include "upb/io/zero_copy_input_stream.h" |
|
#include "upb/mem/arena.h" |
|
|
|
// Must be included last. |
|
#include "upb/port/def.inc" |
|
|
|
#ifdef __cplusplus |
|
extern "C" { |
|
#endif |
|
|
|
// Classifies a token produced by the tokenizer (see upb_Tokenizer_Type()).
// Enumerators are implicitly numbered from 0 in declaration order.
typedef enum {
  kUpb_TokenType_Start,  // Next() has not yet been called.
  kUpb_TokenType_End,    // End of input reached. "text" is empty.

  // A sequence of letters, digits, and underscores, not starting with a digit.
  // It is an error for a number to be followed by an identifier with no space
  // in between.
  kUpb_TokenType_Identifier,

  // A sequence of digits representing an integer. Normally the digits are
  // decimal, but a prefix of "0x" indicates a hex number and a leading zero
  // indicates octal, just like with C numeric literals. A leading negative
  // sign is NOT included in the token; it's up to the parser to interpret the
  // unary minus operator on its own.
  kUpb_TokenType_Integer,

  // A floating point literal, with a fractional part and/or an exponent.
  // Always in decimal. Again, never negative.
  kUpb_TokenType_Float,

  // A quoted sequence of escaped characters.
  // Either single or double quotes can be used, but they must match.
  // A string literal cannot cross a line break.
  kUpb_TokenType_String,

  // Any other printable character, like '!' or '+'.
  // Symbols are always a single character, so "!+$%" is four tokens.
  kUpb_TokenType_Symbol,

  // A sequence of whitespace.
  // This token type is only produced if whitespace reporting is enabled
  // (kUpb_TokenizerOption_ReportWhitespace).
  // It is not reported for whitespace within comments or strings.
  kUpb_TokenType_Whitespace,

  // A newline ('\n'). This token type is only produced if both whitespace
  // reporting and newline reporting are enabled
  // (kUpb_TokenizerOption_ReportNewlines is documented as a superset of
  // kUpb_TokenizerOption_ReportWhitespace).
  // It is not reported for newlines in comments or strings.
  kUpb_TokenType_Newline,
} upb_TokenType;
|
|
|
// Tokenizer behavior flags. Each enumerator occupies a distinct bit, so
// several options can be combined with bitwise OR.
typedef enum {
  // Set to allow floats to be suffixed with the letter 'f'. Tokens which would
  // otherwise be integers but which have the 'f' suffix will be forced to be
  // interpreted as floats. For all other purposes, the 'f' is ignored.
  kUpb_TokenizerOption_AllowFAfterFloat = 1 << 0,

  // If set, whitespace tokens are reported by Next().
  kUpb_TokenizerOption_ReportWhitespace = 1 << 1,

  // If set, newline tokens are reported by Next().
  // This is a superset of ReportWhitespace.
  kUpb_TokenizerOption_ReportNewlines = 1 << 2,

  // By default the tokenizer expects C-style (/* */) comments.
  // If set, it expects shell-style (#) comments instead.
  kUpb_TokenizerOption_CommentStyleShell = 1 << 3,
} upb_Tokenizer_Option;
|
|
|
// Opaque tokenizer handle; the struct layout is not exposed by this header,
// so callers only ever hold a pointer to it.
typedef struct upb_Tokenizer upb_Tokenizer;
|
|
|
// Can be passed a flat array and/or a ZCIS as input. |
|
// The array will be read first (if non-NULL), then the stream (if non-NULL). |
|
upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size, |
|
upb_ZeroCopyInputStream* input, int options, |
|
upb_Arena* arena); |
|
|
|
// Finalizes a tokenizer.
// NOTE(review): presumably must be called once the caller is done with a
// tokenizer obtained from upb_Tokenizer_New() - confirm against the
// implementation.
void upb_Tokenizer_Fini(upb_Tokenizer* t);
|
|
|
// Advance the tokenizer to the next input token. Returns True on success. |
|
// Returns False and (clears *status on EOF, sets *status on error). |
|
bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status); |
|
|
|
// Accessors for inspecting current/previous parse tokens, |
|
// which are opaque to the tokenizer (to reduce copying). |
|
|
|
upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t); |
|
int upb_Tokenizer_Column(const upb_Tokenizer* t); |
|
int upb_Tokenizer_EndColumn(const upb_Tokenizer* t); |
|
int upb_Tokenizer_Line(const upb_Tokenizer* t); |
|
int upb_Tokenizer_TextSize(const upb_Tokenizer* t); |
|
const char* upb_Tokenizer_TextData(const upb_Tokenizer* t); |
|
|
|
// External helper: validate an identifier.
//
// The text to check is passed as a pointer plus a length (data, size).
bool upb_Tokenizer_IsIdentifier(const char* data, int size);
|
|
|
// Parses a kUpb_TokenType_Integer token. Returns false if the result would be
// greater than max_value. Otherwise, returns true and sets *output to the
// result. If the text is not from a token of type kUpb_TokenType_Integer
// originally parsed by a tokenizer, the result is undefined (possibly an
// assert failure).
bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output);
|
|
|
// Parses a kUpb_TokenType_Float token. This never fails, so long as the text
// actually comes from a kUpb_TokenType_Float token parsed by a tokenizer. If
// it doesn't, the result is undefined (possibly an assert failure).
double upb_Parse_Float(const char* text);
|
|
|
// Parses a TYPE_STRING token. This never fails, so long as the text actually |
|
// comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the |
|
// result is undefined (possibly an assert failure). |
|
upb_StringView upb_Parse_String(const char* text, upb_Arena* arena); |
|
|
|
#ifdef __cplusplus |
|
} /* extern "C" */ |
|
#endif |
|
|
|
#include "upb/port/undef.inc" |
|
|
|
#endif // UPB_IO_TOKENIZER_H_
|
|
|