first stab at a Tokenizer api

These functions are not yet part of the upb build but this is a good chunk of work so let's snapshot it now.

PiperOrigin-RevId: 467733791
pull/13171/head
Eric Salo 3 years ago committed by Copybara-Service
parent 0c6531378d
commit 6861966501
  1. 65
      upb/io/BUILD
  2. 135
      upb/io/string.h
  3. 127
      upb/io/string_test.cc
  4. 98
      upb/io/strtod.c
  5. 46
      upb/io/strtod.h
  6. 1081
      upb/io/tokenizer.c
  7. 187
      upb/io/tokenizer.h
  8. 1347
      upb/io/tokenizer_test.cc

@ -1,16 +1,49 @@
# Header-only C string helper (string.h).
cc_library(
name = "string",
hdrs = ["string.h"],
deps = [
"//:upb",
"//:port",
],
)
# strtod() wrapper library (strtod.c / strtod.h).
cc_library(
name = "strtod",
srcs = ["strtod.c"],
hdrs = ["strtod.h"],
deps = [
"//:port",
],
)
# Tokenizer library; depends on the :string, :strtod and :zero_copy_stream
# targets defined in this package.
cc_library(
name = "tokenizer",
srcs = ["tokenizer.c"],
hdrs = ["tokenizer.h"],
deps = [
":string",
":strtod",
":zero_copy_stream",
"//:upb",
"//:port",
"//:unicode_internal",
],
)
# Header-only zero-copy input/output stream interfaces.
# Visibility is restricted to the //upb/io package.
cc_library(
name = "zero_copy_stream",
hdrs = [
"zero_copy_input_stream.h",
"zero_copy_output_stream.h",
],
visibility = ["//upb/io:__pkg__"],
deps = [
"//:upb",
"//:port",
],
)
# ====================================================================
cc_library(
name = "chunked_stream",
testonly = 1,
@ -22,7 +55,6 @@ cc_library(
"chunked_input_stream.h",
"chunked_output_stream.h",
],
visibility = ["//upb/io:__pkg__"],
deps = [
":zero_copy_stream",
"//:upb",
@ -30,6 +62,35 @@ cc_library(
],
)
# Unit tests for the :string library (string_test.cc).
cc_test(
name = "string_test",
size = "small",
srcs = ["string_test.cc"],
deps = [
":string",
"//:upb",
"@com_google_googletest//:gtest_main",
],
)
# Unit tests for the :tokenizer library (tokenizer_test.cc).
cc_test(
name = "tokenizer_test",
size = "small",
srcs = ["tokenizer_test.cc"],
deps = [
":chunked_stream",
":string",
":tokenizer",
":zero_copy_stream",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
"//:upb",
"//:port",
"//:unicode_internal",
"@com_google_googletest//:gtest_main",
],
)
cc_test(
name = "zero_copy_stream_test",
size = "small",

@ -0,0 +1,135 @@
/*
* Copyright (c) 2009-2021, Google LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Google LLC nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// An attempt to provide some of the C++ string functionality in C.
// Function names generally match those of corresponding C++ string methods.
// All buffers are copied so operations are relatively expensive.
// Internal character strings are always NULL-terminated.
// All bool functions return true on success, false on failure.
#ifndef UPB_IO_STRING_H_
#define UPB_IO_STRING_H_
#include <string.h>
#include "upb/arena.h"
// Must be last.
#include "upb/port_def.inc"
#ifdef __cplusplus
extern "C" {
#endif
// A growable, NUL-terminated character buffer whose storage comes from a
// upb_Arena.
// Do not directly access the fields of this struct - use the accessors only.
// TODO(salo): Add a small (16 bytes, maybe?) internal buffer so we can avoid
// hitting the arena for short strings.
typedef struct {
// Number of characters currently stored, not counting the terminating NUL.
size_t size_;
// Number of bytes allocated for data_.
size_t capacity_;
// Character buffer; data_[size_] is always '\0'.
char* data_;
// Arena that data_ was allocated from and is reallocated in when growing.
upb_Arena* arena_;
} upb_String;
// Initializes an already-allocated upb_String object as an empty string whose
// buffer is allocated from arena |a|.
// Returns true on success, false if the arena allocation fails.
UPB_INLINE bool upb_String_Init(upb_String* s, upb_Arena* a) {
  static const int kDefaultCapacity = 16;

  // Populate every field before checking the allocation so that a failed
  // init never leaves arena_ (or any other member) holding garbage.
  s->size_ = 0;
  s->capacity_ = kDefaultCapacity;
  s->arena_ = a;
  s->data_ = (char*)upb_Arena_Malloc(a, kDefaultCapacity);
  if (!s->data_) return false;

  s->data_[0] = '\0';
  return true;
}
// Resets the string to empty. The existing buffer and its capacity are kept.
UPB_INLINE void upb_String_Clear(upb_String* s) {
  *s->data_ = '\0';
  s->size_ = 0;
}
// Returns a pointer to the string's NUL-terminated character buffer.
UPB_INLINE char* upb_String_Data(const upb_String* s) {
  return s->data_;
}

// Returns the number of characters stored, excluding the terminating NUL.
UPB_INLINE size_t upb_String_Size(const upb_String* s) {
  return s->size_;
}

// Returns true if the string holds no characters.
UPB_INLINE bool upb_String_Empty(const upb_String* s) {
  return 0 == s->size_;
}
// Removes up to |len| characters starting at index |pos|.
// If |pos| is at or past the end of the string, nothing happens. If fewer than
// |len| characters follow |pos|, everything from |pos| to the end is removed
// (so a very large |len| means "erase to the end").
UPB_INLINE void upb_String_Erase(upb_String* s, size_t pos, size_t len) {
  if (pos >= s->size_) return;

  // Clamp against the remaining length rather than testing pos + len, which
  // can wrap around for large |len| and skip the clamp entirely.
  const size_t remaining = s->size_ - pos;
  if (len > remaining) len = remaining;
  if (len == 0) return;

  // Slide the tail (including the terminating NUL, hence the +1) down over
  // the erased range.
  char* des = s->data_ + pos;
  memmove(des, des + len, remaining - len + 1);
  s->size_ -= len;
}
// Ensures the buffer can hold at least |size| characters plus the terminating
// NUL, growing it through the string's arena if necessary.
// Returns true on success. On failure returns false and leaves the string
// unchanged.
UPB_INLINE bool upb_String_Reserve(upb_String* s, size_t size) {
  if (s->capacity_ > size) return true;

  const size_t new_cap = size + 1;
  if (new_cap == 0) return false;  // size + 1 wrapped around.

  // Keep the old pointer until the reallocation is known to have succeeded so
  // that a failure does not discard the still-valid buffer.
  char* grown =
      (char*)upb_Arena_Realloc(s->arena_, s->data_, s->capacity_, new_cap);
  if (!grown) return false;

  s->data_ = grown;
  s->capacity_ = new_cap;
  return true;
}
// Appends |size| bytes starting at |data| to the end of the string, growing the
// buffer if needed, and re-terminates it with NUL.
// Returns true on success, false if the buffer could not be grown or the
// resulting length is not representable; on failure the string is unchanged.
UPB_INLINE bool upb_String_Append(upb_String* s, const char* data,
                                  size_t size) {
  const size_t kMaxSize = (size_t)-1;
  const size_t new_size = s->size_ + size;
  // Reject lengths that wrapped around, or that leave no room for the NUL.
  if (new_size < s->size_ || new_size == kMaxSize) return false;

  if (s->capacity_ <= new_size) {
    // Grow geometrically so repeated appends are amortized; fall back to an
    // exact fit when doubling would overflow size_t.
    const size_t new_cap =
        (new_size <= (kMaxSize - 2) / 2) ? 2 * new_size + 1 : new_size;
    if (!upb_String_Reserve(s, new_cap)) return false;
  }

  // memcpy() must not be handed a NULL source, even for a zero-length copy.
  if (size != 0) memcpy(s->data_ + s->size_, data, size);
  s->size_ = new_size;
  s->data_[new_size] = '\0';
  return true;
}
// Replaces the string's contents with the |size| bytes starting at |data|.
// Returns true on success, false if the append step fails.
UPB_INLINE bool upb_String_Assign(upb_String* s, const char* data,
                                  size_t size) {
  // Reset to the empty string, then let the append do the copy.
  s->size_ = 0;
  s->data_[0] = '\0';
  const bool ok = upb_String_Append(s, data, size);
  return ok;
}
// Replaces the contents of |des| with a copy of the contents of |src|.
// Returns true on success, false if |des| could not be grown.
UPB_INLINE bool upb_String_Copy(upb_String* des, const upb_String* src) {
  // Copying a string onto itself is a no-op. Without this guard the assign
  // would clear the source before reading it, truncating the string.
  if (des == src) return true;
  return upb_String_Assign(des, src->data_, src->size_);
}
// Appends the single character |ch| to the end of the string.
// Returns true on success, false if the append fails.
UPB_INLINE bool upb_String_PushBack(upb_String* s, char ch) {
  const char one[1] = {ch};
  return upb_String_Append(s, one, sizeof(one));
}
#ifdef __cplusplus
} /* extern "C" */
#endif
#include "upb/port_undef.inc"
#endif /* UPB_IO_STRING_H_ */

@ -0,0 +1,127 @@
/*
* Copyright (c) 2009-2022, Google LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Google LLC nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "upb/io/string.h"
#include <string.h>
#include "gtest/gtest.h"
#include "upb/upb.hpp"
// Exercises upb_String_Assign() and upb_String_Append(), including appends
// that push the string past its initial 16-byte buffer.
TEST(StringTest, Append) {
  upb::Arena arena;
  upb_String foo;
  // Everything below dereferences the buffer set up by Init, so abort the
  // test instead of continuing if initialization fails.
  ASSERT_TRUE(upb_String_Init(&foo, arena.ptr()));
  EXPECT_EQ(upb_String_Size(&foo), 0);

  // Only the first 3 bytes of the source are assigned.
  EXPECT_TRUE(upb_String_Assign(&foo, "foobar", 3));
  EXPECT_EQ(upb_String_Size(&foo), 3);
  EXPECT_STREQ(upb_String_Data(&foo), "foo");

  EXPECT_TRUE(upb_String_Append(&foo, "bar", 3));
  EXPECT_EQ(upb_String_Size(&foo), 6);
  EXPECT_STREQ(upb_String_Data(&foo), "foobar");

  EXPECT_TRUE(upb_String_Append(&foo, "baz", 3));
  EXPECT_EQ(upb_String_Size(&foo), 9);
  EXPECT_STREQ(upb_String_Data(&foo), "foobarbaz");

  EXPECT_TRUE(upb_String_Append(&foo, "bat", 3));
  EXPECT_EQ(upb_String_Size(&foo), 12);
  EXPECT_STREQ(upb_String_Data(&foo), "foobarbazbat");

  EXPECT_TRUE(upb_String_Append(&foo, "feefiefoefoo", 12));
  EXPECT_EQ(upb_String_Size(&foo), 24);
  EXPECT_STREQ(upb_String_Data(&foo), "foobarbazbatfeefiefoefoo");

  // Assigning replaces the previous, longer contents.
  const char* password = "fiddlesnarf";
  EXPECT_TRUE(upb_String_Assign(&foo, password, strlen(password)));
  EXPECT_EQ(upb_String_Size(&foo), strlen(password));
  EXPECT_STREQ(upb_String_Data(&foo), password);
}
// Exercises upb_String_Reserve() followed by a large append, then
// upb_String_PushBack() one character at a time.
TEST(StringTest, PushBack) {
  upb::Arena arena;
  upb_String foo;
  // Abort on init failure: later calls would dereference a NULL buffer.
  ASSERT_TRUE(upb_String_Init(&foo, arena.ptr()));
  EXPECT_EQ(upb_String_Size(&foo), 0);

  const std::string big =
      "asfashfxauwhfwu4fuwafxasnfwxnxwunxuwxufhwfaiwj4w9jvwxssldfjlasviorwnvwij"
      "grsdjrfiasrjrasijgraisjvrvoiasjspjfsjgfasjgiasjidjsrvjsrjrasjfrijwjajsrF"
      "JWJGF4WWJSAVSLJArSJGFrAISJGASrlafjgrivarijrraisrgjiawrijg3874f87f7hqfhpf"
      "f8929hr32p8475902387459023475297328-22-3776-26";
  EXPECT_TRUE(upb_String_Reserve(&foo, big.size() + 1));
  EXPECT_TRUE(upb_String_Append(&foo, big.data(), big.size()));
  EXPECT_EQ(upb_String_Size(&foo), big.size());
  EXPECT_STREQ(upb_String_Data(&foo), big.c_str());

  upb_String bar;
  ASSERT_TRUE(upb_String_Init(&bar, arena.ptr()));
  EXPECT_EQ(upb_String_Size(&bar), 0);
  EXPECT_TRUE(upb_String_PushBack(&bar, 'x'));
  EXPECT_TRUE(upb_String_PushBack(&bar, 'y'));
  EXPECT_TRUE(upb_String_PushBack(&bar, 'z'));
  EXPECT_TRUE(upb_String_PushBack(&bar, 'z'));
  EXPECT_TRUE(upb_String_PushBack(&bar, 'y'));
  EXPECT_EQ(upb_String_Size(&bar), 5);
  EXPECT_STREQ(upb_String_Data(&bar), "xyzzy");
}
// Exercises upb_String_Erase() in the middle of the string, with a length
// that runs past the end, and down to the empty string.
TEST(StringTest, Erase) {
  upb::Arena arena;
  upb_String foo;
  // Abort on init failure: later calls would dereference a NULL buffer.
  ASSERT_TRUE(upb_String_Init(&foo, arena.ptr()));

  const char* sent = "This is an example sentence.";
  EXPECT_TRUE(upb_String_Assign(&foo, sent, strlen(sent)));
  EXPECT_EQ(upb_String_Size(&foo), 28);

  upb_String_Erase(&foo, 10, 8);
  EXPECT_EQ(upb_String_Size(&foo), 20);
  EXPECT_STREQ(upb_String_Data(&foo), "This is an sentence.");

  upb_String_Erase(&foo, 9, 1);
  EXPECT_EQ(upb_String_Size(&foo), 19);
  EXPECT_STREQ(upb_String_Data(&foo), "This is a sentence.");

  upb_String_Erase(&foo, 5, 5);
  EXPECT_EQ(upb_String_Size(&foo), 14);
  EXPECT_STREQ(upb_String_Data(&foo), "This sentence.");

  // A length that runs past the end erases through the end of the string.
  upb_String_Erase(&foo, 4, 99);
  EXPECT_EQ(upb_String_Size(&foo), 4);
  EXPECT_STREQ(upb_String_Data(&foo), "This");

  upb_String_Erase(&foo, 0, 4);
  EXPECT_EQ(upb_String_Size(&foo), 0);
  EXPECT_STREQ(upb_String_Data(&foo), "");
}

@ -0,0 +1,98 @@
/*
* Copyright (c) 2009-2022, Google LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Google LLC nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "upb/io/strtod.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Must be last.
#include "upb/port_def.inc"
// Determine the locale-specific radix character by calling snprintf() to print
// the number 1.5, then stripping off the digits. As far as I can tell, this
// is the only portable, thread-safe way to get the C library to divulge the
// locale's radix character. No, localeconv() is NOT thread-safe.
//
// Writes the NUL-terminated radix into |data| (which holds |capacity| bytes)
// and returns its length in bytes. If the formatted output is not of the
// expected "1<radix>5" shape, or the radix does not fit, falls back to "."
// (or to an empty string when |capacity| is too small even for that).
static int GetLocaleRadix(char *data, size_t capacity) {
  char temp[16];
  const int size = snprintf(temp, sizeof(temp), "%.1f", 1.5);

  // These checks run in all build modes: the copy below must stay bounded
  // even where assertions are compiled out.
  if (size >= 3 && (size_t)size < sizeof(temp) && temp[0] == '1' &&
      temp[size - 1] == '5') {
    const size_t radix_len = (size_t)size - 2;
    if (radix_len < capacity) {
      memcpy(data, temp + 1, radix_len);
      data[radix_len] = '\0';
      return (int)radix_len;
    }
  }

  if (capacity >= 2) {
    data[0] = '.';
    data[1] = '\0';
    return 1;
  }
  if (capacity == 1) data[0] = '\0';
  return 0;
}

// Populates a string identical to *input except that the character pointed to
// by pos (which should be '.') is replaced with the |radix_len|-byte string
// |radix|. |output| must have room for strlen(input) + radix_len bytes.
static void LocalizeRadix(const char *input, const char *pos,
                          const char *radix, size_t radix_len, char *output) {
  const size_t prefix_len = (size_t)(pos - input);
  memcpy(output, input, prefix_len);
  memcpy(output + prefix_len, radix, radix_len);
  strcpy(output + prefix_len + radix_len, pos + 1);
}

// Parses a double from |str| like strtod(), but also accepts '.' as the radix
// when the current locale uses a different radix character. If |endptr| is
// non-NULL, *endptr receives a pointer to the first unparsed character of
// |str|.
double NoLocaleStrtod(const char *str, char **endptr) {
  // We cannot simply set the locale to "C" temporarily with setlocale()
  // as this is not thread-safe. Instead, we try to parse in the current
  // locale first. If parsing stops at a '.' character, then this is a
  // pretty good hint that we're actually in some other locale in which
  // '.' is not the radix character.
  char *temp_endptr;
  double result = strtod(str, &temp_endptr);
  if (endptr != NULL) *endptr = temp_endptr;
  if (*temp_endptr != '.') return result;

  // Parsing halted on a '.'. Perhaps we're in a different locale? Let's
  // try to replace the '.' with a locale-specific radix character and
  // try again.
  char radix[8];
  const size_t radix_len = (size_t)GetLocaleRadix(radix, sizeof(radix));

  // The localized copy swaps the one-byte '.' for the radix, so it needs
  // strlen(str) - 1 + radix_len characters plus the terminating NUL.
  const size_t localized_size = strlen(str) + radix_len;

  // Short inputs (the common case) use the stack; longer ones go to the heap
  // so the copy can never run past a fixed-size buffer.
  char stack_buf[80];
  char *heap_buf = NULL;
  char *localized = stack_buf;
  if (localized_size > sizeof(stack_buf)) {
    heap_buf = (char *)malloc(localized_size);
    if (heap_buf == NULL) return result;  // Keep the first parse's answer.
    localized = heap_buf;
  }
  LocalizeRadix(str, temp_endptr, radix, radix_len, localized);

  char *localized_endptr;
  result = strtod(localized, &localized_endptr);
  if ((localized_endptr - localized) > (temp_endptr - str)) {
    // This attempt got further, so replacing the decimal must have helped.
    // Update endptr to point at the right location.
    if (endptr != NULL) {
      // size_diff is non-zero if the localized radix has multiple bytes.
      const int size_diff = (int)radix_len - 1;
      *endptr = (char *)str + ((localized_endptr - localized) - size_diff);
    }
  }

  free(heap_buf);  // No-op when the stack buffer was used.
  return result;
}

@ -0,0 +1,46 @@
/*
* Copyright (c) 2009-2022, Google LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Google LLC nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef UPB_IO_STRTOD_H_
#define UPB_IO_STRTOD_H_
// Must be last.
#include "upb/port_def.inc"
#ifdef __cplusplus
extern "C" {
#endif
// Parses a double from |str| like strtod(), but also accepts '.' as the radix
// character when the current locale uses a different one. If |endptr| is
// non-NULL, *endptr is set to point at the first character of |str| that was
// not consumed by the parse.
double NoLocaleStrtod(const char *str, char **endptr);
#ifdef __cplusplus
} /* extern "C" */
#endif
#include "upb/port_undef.inc"
#endif /* UPB_IO_STRTOD_H_ */

File diff suppressed because it is too large Load Diff

@ -0,0 +1,187 @@
/*
* Copyright (c) 2009-2022, Google LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Google LLC nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Class for parsing tokenized text from a ZeroCopyInputStream.
#ifndef UPB_IO_TOKENIZER_H_
#define UPB_IO_TOKENIZER_H_
#include "upb/io/string.h"
#include "upb/io/zero_copy_input_stream.h"
#include "upb/upb.h"
// Must be included last.
#include "upb/port_def.inc"
#ifdef __cplusplus
extern "C" {
#endif
// The kinds of token reported by the tokenizer.
typedef enum {
kUpb_TokenType_Start, // upb_Tokenizer_Next() has not yet been called.
kUpb_TokenType_End, // End of input reached. The token text is empty.
// A sequence of letters, digits, and underscores, not starting with a digit.
// It is an error for a number to be followed by an identifier with no space
// in between.
kUpb_TokenType_Identifier,
// A sequence of digits representing an integer. Normally the digits are
// decimal, but a prefix of "0x" indicates a hex number and a leading zero
// indicates octal, just like with C numeric literals. A leading negative
// sign is NOT included in the token; it's up to the parser to interpret the
// unary minus operator on its own.
kUpb_TokenType_Integer,
// A floating point literal, with a fractional part and/or an exponent.
// Always in decimal. Again, never negative.
kUpb_TokenType_Float,
// A quoted sequence of escaped characters.
// Either single or double quotes can be used, but they must match.
// A string literal cannot cross a line break.
kUpb_TokenType_String,
// Any other printable character, like '!' or '+'.
// Symbols are always a single character, so "!+$%" is four tokens.
kUpb_TokenType_Symbol,
// A sequence of whitespace.
// This token type is only produced if kUpb_TokenizerOption_ReportWhitespace
// is set. It is not reported for whitespace within comments or strings.
kUpb_TokenType_Whitespace,
// A newline ('\n'). This token type is only produced if
// kUpb_TokenizerOption_ReportNewlines is set (which implies
// kUpb_TokenizerOption_ReportWhitespace).
// It is not reported for newlines in comments or strings.
kUpb_TokenType_Newline,
} upb_TokenType;
// Tokenizer behavior flags. Each value is a distinct bit.
// NOTE(review): presumably these are OR-ed together into the |options|
// argument of upb_Tokenizer_New() - confirm against tokenizer.c.
typedef enum {
// Set to allow floats to be suffixed with the letter 'f'. Tokens which would
// otherwise be integers but which have the 'f' suffix will be forced to be
// interpreted as floats. For all other purposes, the 'f' is ignored.
kUpb_TokenizerOption_AllowFAfterFloat = 1 << 0,
// If set, allow string literals to span multiple lines.
// Do not use this; for Google-internal cleanup only.
kUpb_TokenizerOption_AllowMultilineStrings = 1 << 1,
// If set, allow a field name to appear immediately after a number without
// requiring any intervening whitespace as a delimiter.
// Do not use this; for Google-internal cleanup only.
kUpb_TokenizerOption_AllowFieldImmediatelyAfterNumber = 1 << 2,
// If set, whitespace tokens are reported by upb_Tokenizer_Next().
kUpb_TokenizerOption_ReportWhitespace = 1 << 3,
// If set, newline tokens are reported by upb_Tokenizer_Next().
// Implies ReportWhitespace.
kUpb_TokenizerOption_ReportNewlines = 1 << 4,
// By default the tokenizer expects C-style (/* */) comments.
// If set, it expects shell-style (#) comments instead.
kUpb_TokenizerOption_CommentStyleShell = 1 << 5,
} upb_Tokenizer_Option;
// Abstract interface for an object which collects the errors that occur
// during parsing. A typical implementation might simply print the errors
// to stdout.
typedef struct {
// Indicates that there was an error in the input at the given line and
// column numbers. The numbers are zero-based, so you may want to add
// 1 to each before printing them.
void (*AddError)(int line, int column, const char* message, void* context);
// Indicates that there was a warning in the input at the given line and
// column numbers. The numbers are zero-based, so you may want to add
// 1 to each before printing them.
void (*AddWarning)(int line, int column, const char* message, void* context);
// Opaque pointer, passed as the |context| argument to the above functions.
void* context;
} upb_ErrorCollector;
// Opaque tokenizer state; the struct is not defined in this header.
typedef struct upb_Tokenizer upb_Tokenizer;
// Creates a tokenizer.
// Can be passed a flat array and/or a ZCIS as input.
// The array will be read first (if non-NULL), then the stream (if non-NULL).
// NOTE(review): |options| is presumably a bitwise OR of
// kUpb_TokenizerOption_* values and the tokenizer is presumably allocated
// from |arena| - confirm against tokenizer.c.
upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
upb_ZeroCopyInputStream* input,
upb_ErrorCollector* error_collector,
int options, upb_Arena* arena);
// Finalizes a tokenizer created by upb_Tokenizer_New().
// NOTE(review): exact cleanup semantics are defined in tokenizer.c.
void upb_Tokenizer_Fini(upb_Tokenizer* t);
// Advances to the next token.
// NOTE(review): the meaning of the bool result is defined in tokenizer.c;
// presumably false means no token was read - confirm.
bool upb_Tokenizer_Next(upb_Tokenizer* t);
// Accessors for inspecting current/previous parse tokens,
// which are opaque to the tokenizer (to reduce copying).
// The "Current" accessors describe the current token: its type, start and end
// columns, line, and text (data pointer plus size).
// NOTE(review): line/column values are presumably zero-based, matching
// upb_ErrorCollector - confirm against tokenizer.c.
upb_TokenType upb_Tokenizer_CurrentType(const upb_Tokenizer* t);
int upb_Tokenizer_CurrentColumn(const upb_Tokenizer* t);
int upb_Tokenizer_CurrentEndColumn(const upb_Tokenizer* t);
int upb_Tokenizer_CurrentLine(const upb_Tokenizer* t);
int upb_Tokenizer_CurrentTextSize(const upb_Tokenizer* t);
const char* upb_Tokenizer_CurrentTextData(const upb_Tokenizer* t);
// The "Previous" accessors describe the previous token. No text accessors
// are declared for it.
upb_TokenType upb_Tokenizer_PreviousType(const upb_Tokenizer* t);
int upb_Tokenizer_PreviousColumn(const upb_Tokenizer* t);
int upb_Tokenizer_PreviousEndColumn(const upb_Tokenizer* t);
int upb_Tokenizer_PreviousLine(const upb_Tokenizer* t);
// Parses a kUpb_TokenType_Integer token. Returns false if the result would be
// greater than max_value. Otherwise, returns true and sets *output to the
// result. If the text is not from a token of type kUpb_TokenType_Integer
// originally parsed by a tokenizer, the result is undefined (possibly an
// assert failure).
bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output);
// Parses a kUpb_TokenType_Float token. This never fails, so long as the text
// actually comes from a kUpb_TokenType_Float token parsed by a tokenizer. If
// it doesn't, the result is undefined (possibly an assert failure).
double upb_Parse_Float(const char* text);
// Identical to upb_Parse_String() (below), but appends to output instead of
// replacing its contents.
void upb_Parse_StringAppend(const char* text, upb_String* output);
// Parses a kUpb_TokenType_String token. This never fails, so long as the text
// actually comes from a kUpb_TokenType_String token parsed by a tokenizer. If
// it doesn't, the result is undefined (possibly an assert failure).
// Any previous contents of |output| are discarded.
UPB_INLINE void upb_Parse_String(const char* text, upb_String* output) {
// Clear first so that the append below yields only the newly parsed text.
upb_String_Clear(output);
upb_Parse_StringAppend(text, output);
}
// External helper: validate an identifier.
// Takes the candidate text as a pointer plus a length in |size|.
// NOTE(review): presumably returns true when the text matches the
// kUpb_TokenType_Identifier syntax - confirm against tokenizer.c.
bool upb_Tokenizer_IsIdentifier(const char* text, int size);
#ifdef __cplusplus
} /* extern "C" */
#endif
#include "upb/port_undef.inc"
#endif // UPB_IO_TOKENIZER_H_

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save