diff --git a/upb/io/tokenizer.c b/upb/io/tokenizer.c
index 41ba7d9245..e3acd4dad7 100644
--- a/upb/io/tokenizer.c
+++ b/upb/io/tokenizer.c
@@ -30,6 +30,7 @@
 #include
 
 #include "upb/internal/unicode.h"
+#include "upb/io/string.h"
 #include "upb/io/strtod.h"
 
 // Must be included last.
@@ -871,23 +872,29 @@ static const char* FetchUnicodePoint(const char* ptr, uint32_t* code_point) {
 }
 
 // The text string must begin and end with single or double quote characters.
-void upb_Parse_StringAppend(const char* text, upb_String* output) {
+upb_StringView upb_Parse_String(const char* text, upb_Arena* arena) {
   const size_t size = strlen(text);
 
-  // Reminder: text[0] is always a quote character. (If text is
-  // empty, it's invalid, so we'll just return).
+  upb_String output;
+  upb_String_Init(&output, arena);
+
+  // Reminder: text[0] is always a quote character.
+  // (If text is empty, it's invalid, so we'll just return).
   if (size == 0) {
     fprintf(stderr,
             "Tokenizer::ParseStringAppend() passed text that could not"
             " have been tokenized as a string: %s",
             text);
     UPB_ASSERT(0);
-    return;
+    // UPB_ASSERT may not abort in every build; bail out with the (empty)
+    // result so the code below never runs on a token without a quote.
+    return upb_StringView_FromDataAndSize(upb_String_Data(&output),
+                                          upb_String_Size(&output));
   }
 
   // Reserve room for new string.
-  const size_t new_len = size + upb_String_Size(output);
-  upb_String_Reserve(output, new_len);
+  const size_t new_len = size + upb_String_Size(&output);
+  upb_String_Reserve(&output, new_len);
 
   // Loop through the string copying characters to "output" and
   // interpreting escape sequences. Note that any invalid escape
@@ -909,7 +916,7 @@ void upb_Parse_StringAppend(const char* text, upb_String* output) {
           ++ptr;
           code = code * 8 + DigitValue(*ptr);
         }
-        upb_String_PushBack(output, (char)code);
+        upb_String_PushBack(&output, (char)code);
 
       } else if (*ptr == 'x') {
         // A hex escape. May zero, one, or two digits. (The zero case
@@ -923,29 +930,32 @@ void upb_Parse_StringAppend(const char* text, upb_String* output) {
           ++ptr;
           code = code * 16 + DigitValue(*ptr);
         }
-        upb_String_PushBack(output, (char)code);
+        upb_String_PushBack(&output, (char)code);
 
       } else if (*ptr == 'u' || *ptr == 'U') {
         uint32_t unicode;
         const char* end = FetchUnicodePoint(ptr, &unicode);
         if (end == ptr) {
           // Failure: Just dump out what we saw, don't try to parse it.
-          upb_String_PushBack(output, *ptr);
+          upb_String_PushBack(&output, *ptr);
         } else {
-          AppendUTF8(unicode, output);
+          AppendUTF8(unicode, &output);
           ptr = end - 1;  // Because we're about to ++ptr.
         }
       } else {
         // Some other escape code.
-        upb_String_PushBack(output, TranslateEscape(*ptr));
+        upb_String_PushBack(&output, TranslateEscape(*ptr));
       }
 
     } else if (*ptr == text[0] && ptr[1] == '\0') {
       // Ignore final quote matching the starting quote.
     } else {
-      upb_String_PushBack(output, *ptr);
+      upb_String_PushBack(&output, *ptr);
     }
   }
+
+  return upb_StringView_FromDataAndSize(upb_String_Data(&output),
+                                        upb_String_Size(&output));
 }
 
 static bool AllInClass(bool (*f)(char), const char* text, int size) {
@@ -955,11 +965,11 @@ static bool AllInClass(bool (*f)(char), const char* text, int size) {
   return true;
 }
 
-bool upb_Tokenizer_IsIdentifier(const char* text, int size) {
+bool upb_Tokenizer_IsIdentifier(const char* data, int size) {
   // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
   if (size == 0) return false;
-  if (!upb_Tokenizer_IsLetter(text[0])) return false;
-  if (!AllInClass(upb_Tokenizer_IsAlphanumeric, text + 1, size - 1))
+  if (!upb_Tokenizer_IsLetter(data[0])) return false;
+  if (!AllInClass(upb_Tokenizer_IsAlphanumeric, data + 1, size - 1))
     return false;
   return true;
 }
diff --git a/upb/io/tokenizer.h b/upb/io/tokenizer.h
index d0d4f4d2b4..ed6ca5f434 100644
--- a/upb/io/tokenizer.h
+++ b/upb/io/tokenizer.h
@@ -30,8 +30,8 @@
 #ifndef UPB_IO_TOKENIZER_H_
 #define UPB_IO_TOKENIZER_H_
 
-#include "upb/io/string.h"
 #include "upb/io/zero_copy_input_stream.h"
+#include "upb/string_view.h"
 #include "upb/upb.h"
 
 // Must be included last.
@@ -123,6 +123,9 @@ int upb_Tokenizer_Line(const upb_Tokenizer* t);
 int upb_Tokenizer_TextSize(const upb_Tokenizer* t);
 const char* upb_Tokenizer_TextData(const upb_Tokenizer* t);
 
+// External helper: validate an identifier.
+bool upb_Tokenizer_IsIdentifier(const char* data, int size);
+
 // Parses a TYPE_INTEGER token. Returns false if the result would be
 // greater than max_value. Otherwise, returns true and sets *output to the
 // result. If the text is not from a Token of type TYPE_INTEGER originally
@@ -135,19 +138,11 @@ bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output);
 // result is undefined (possibly an assert failure).
 double upb_Parse_Float(const char* text);
 
-// Identical to ParseString (below), but appends to output.
-void upb_Parse_StringAppend(const char* text, upb_String* output);
-
 // Parses a TYPE_STRING token. This never fails, so long as the text actually
 // comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the
 // result is undefined (possibly an assert failure).
-UPB_INLINE void upb_Parse_String(const char* text, upb_String* output) {
-  upb_String_Clear(output);
-  upb_Parse_StringAppend(text, output);
-}
-
-// External helper: validate an identifier.
-bool upb_Tokenizer_IsIdentifier(const char* text, int size);
+// The returned view points into a upb_String buffer created with |arena|.
+upb_StringView upb_Parse_String(const char* text, upb_Arena* arena);
 
 #ifdef __cplusplus
 }  /* extern "C" */
diff --git a/upb/io/tokenizer_test.cc b/upb/io/tokenizer_test.cc
index aa40b10f54..a4ed94337c 100644
--- a/upb/io/tokenizer_test.cc
+++ b/upb/io/tokenizer_test.cc
@@ -32,6 +32,7 @@
 #include "absl/strings/str_format.h"
 #include "upb/internal/unicode.h"
 #include "upb/io/chunked_input_stream.h"
+#include "upb/io/string.h"
 #include "upb/upb.hpp"
 
 // Must be last.
@@ -998,35 +999,30 @@ TEST_F(TokenizerTest, ParseString) {
   };
 
   upb::Arena arena;
-  upb_String result;
-  upb_String_Init(&result, arena.ptr());
 
   for (int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
-    upb_Parse_String(inputs[i].data(), &result);
-    EXPECT_TRUE(StringEquals(upb_String_Data(&result), outputs[i].data()));
+    auto sv = upb_Parse_String(inputs[i].data(), arena.ptr());
+    EXPECT_TRUE(StringEquals(sv.data, outputs[i].data()));
   }
 
   // Test invalid strings that will never be tokenized as strings.
 #ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
   EXPECT_DEBUG_DEATH(
-      upb_Parse_String("", &result),
+      upb_Parse_String("", arena.ptr()),
       "passed text that could not have been tokenized as a string");
 #endif  // GTEST_HAS_DEATH_TEST
 }
 
 TEST_F(TokenizerTest, ParseStringAppend) {
-  // Check that ParseString and ParseStringAppend differ.
   upb::Arena arena;
   upb_String output;
   upb_String_Init(&output, arena.ptr());
 
   upb_String_Assign(&output, "stuff+", 6);
-
-  upb_Parse_StringAppend("'hello'", &output);
+  auto sv = upb_Parse_String("'hello'", arena.ptr());
+  EXPECT_TRUE(StringEquals(sv.data, "hello"));
+  upb_String_Append(&output, sv.data, sv.size);
   EXPECT_TRUE(StringEquals(upb_String_Data(&output), "stuff+hello"));
-
-  upb_Parse_String("'hello'", &output);
-  EXPECT_TRUE(StringEquals(upb_String_Data(&output), "hello"));
 }
 
 // -------------------------------------------------------------------
@@ -1172,14 +1168,11 @@ static const char* kParseBenchmark[] = {
 
 TEST(Benchmark, ParseStringAppendAccumulate) {
   upb::Arena arena;
-  upb_String output;
-  upb_String_Init(&output, arena.ptr());
   size_t outsize = 0;
   int benchmark_len = arraysize(kParseBenchmark);
   for (int i = 0; i < benchmark_len; i++) {
-    upb_Parse_StringAppend(kParseBenchmark[i], &output);
-    outsize += upb_String_Size(&output);
-    upb_String_Clear(&output);
+    auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr());
+    outsize += sv.size;
   }
   EXPECT_NE(0, outsize);
 }
@@ -1190,7 +1183,8 @@ TEST(Benchmark, ParseStringAppend) {
   upb_String_Init(&output, arena.ptr());
   int benchmark_len = arraysize(kParseBenchmark);
   for (int i = 0; i < benchmark_len; i++) {
-    upb_Parse_StringAppend(kParseBenchmark[i], &output);
+    auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr());
+    upb_String_Append(&output, sv.data, sv.size);
   }
   EXPECT_NE(0, upb_String_Size(&output));
 }
@@ -1217,12 +1211,10 @@ static std::string DisplayHex(const std::string& data) {
 static void ExpectFormat(const std::string& expectation,
                          const std::string& formatted) {
   upb::Arena arena;
-  upb_String output;
-  upb_String_Init(&output, arena.ptr());
-  upb_Parse_String(formatted.data(), &output);
-  EXPECT_EQ(strcmp(upb_String_Data(&output), expectation.data()), 0)
+  auto sv = upb_Parse_String(formatted.data(), arena.ptr());
+  EXPECT_EQ(strcmp(sv.data, expectation.data()), 0)
       << ": Incorrectly parsed " << formatted << ":\nGot "
-      << DisplayHex(output.data_) << "\nExpected " << DisplayHex(expectation);
+      << DisplayHex(sv.data) << "\nExpected " << DisplayHex(expectation);
 }
 
 TEST(TokenizerHandlesUnicode, BMPCodes) {