remove upb_String from the public tokenizer API

upb_String is a hack which exists because the original C++ tokenizer got to assume the existence of C++ strings, so at least for now the C tokenizer needs a rough equivalent. But this should be a purely internal implementation detail, not part of the visible surface.

PiperOrigin-RevId: 469814074
3 years ago · f3316e2d7d
parent 0013c936ef
commit f3316e2d7d
3 changed files with 40 additions and 48 deletions
--- a/upb/io/tokenizer.c
+++ b/upb/io/tokenizer.c
@ -30,6 +30,7 @@
 #include <stdio.h>

 #include "upb/internal/unicode.h"
+#include "upb/io/string.h"
 #include "upb/io/strtod.h"

 // Must be included last.
@ -871,23 +872,25 @@ static const char* FetchUnicodePoint(const char* ptr, uint32_t* code_point) {
 }

 // The text string must begin and end with single or double quote characters.
-void upb_Parse_StringAppend(const char* text, upb_String* output) {
+upb_StringView upb_Parse_String(const char* text, upb_Arena* arena) {
  const size_t size = strlen(text);

-  // Reminder: text[0] is always a quote character.  (If text is
-  // empty, it's invalid, so we'll just return).
+  upb_String output;
+  upb_String_Init(&output, arena);
+
+  // Reminder: text[0] is always a quote character.
+  // (If text is empty, it's invalid, so we'll just return).
  if (size == 0) {
    fprintf(stderr,
            "Tokenizer::ParseStringAppend() passed text that could not"
            " have been tokenized as a string: %s",
            text);
    UPB_ASSERT(0);
-    return;
  }

  // Reserve room for new string.
-  const size_t new_len = size + upb_String_Size(output);
-  upb_String_Reserve(output, new_len);
+  const size_t new_len = size + upb_String_Size(&output);
+  upb_String_Reserve(&output, new_len);

  // Loop through the string copying characters to "output" and
  // interpreting escape sequences.  Note that any invalid escape
@ -909,7 +912,7 @@ void upb_Parse_StringAppend(const char* text, upb_String* output) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
-        upb_String_PushBack(output, (char)code);
+        upb_String_PushBack(&output, (char)code);

      } else if (*ptr == 'x') {
        // A hex escape.  May zero, one, or two digits.  (The zero case
@ -923,29 +926,32 @@ void upb_Parse_StringAppend(const char* text, upb_String* output) {
          ++ptr;
          code = code * 16 + DigitValue(*ptr);
        }
-        upb_String_PushBack(output, (char)code);
+        upb_String_PushBack(&output, (char)code);

      } else if (*ptr == 'u' || *ptr == 'U') {
        uint32_t unicode;
        const char* end = FetchUnicodePoint(ptr, &unicode);
        if (end == ptr) {
          // Failure: Just dump out what we saw, don't try to parse it.
-          upb_String_PushBack(output, *ptr);
+          upb_String_PushBack(&output, *ptr);
        } else {
-          AppendUTF8(unicode, output);
+          AppendUTF8(unicode, &output);
          ptr = end - 1;  // Because we're about to ++ptr.
        }
      } else {
        // Some other escape code.
-        upb_String_PushBack(output, TranslateEscape(*ptr));
+        upb_String_PushBack(&output, TranslateEscape(*ptr));
      }

    } else if (*ptr == text[0] && ptr[1] == '\0') {
      // Ignore final quote matching the starting quote.
    } else {
-      upb_String_PushBack(output, *ptr);
+      upb_String_PushBack(&output, *ptr);
    }
  }
+
+  return upb_StringView_FromDataAndSize(upb_String_Data(&output),
+                                        upb_String_Size(&output));
 }

 static bool AllInClass(bool (*f)(char), const char* text, int size) {
@ -955,11 +961,11 @@ static bool AllInClass(bool (*f)(char), const char* text, int size) {
  return true;
 }

-bool upb_Tokenizer_IsIdentifier(const char* text, int size) {
+bool upb_Tokenizer_IsIdentifier(const char* data, int size) {
  // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
  if (size == 0) return false;
-  if (!upb_Tokenizer_IsLetter(text[0])) return false;
-  if (!AllInClass(upb_Tokenizer_IsAlphanumeric, text + 1, size - 1))
+  if (!upb_Tokenizer_IsLetter(data[0])) return false;
+  if (!AllInClass(upb_Tokenizer_IsAlphanumeric, data + 1, size - 1))
    return false;
  return true;
 }
--- a/upb/io/tokenizer.h
+++ b/upb/io/tokenizer.h
@ -30,8 +30,8 @@
 #ifndef UPB_IO_TOKENIZER_H_
 #define UPB_IO_TOKENIZER_H_

-#include "upb/io/string.h"
 #include "upb/io/zero_copy_input_stream.h"
+#include "upb/string_view.h"
 #include "upb/upb.h"

 // Must be included last.
@ -123,6 +123,9 @@ int upb_Tokenizer_Line(const upb_Tokenizer* t);
 int upb_Tokenizer_TextSize(const upb_Tokenizer* t);
 const char* upb_Tokenizer_TextData(const upb_Tokenizer* t);

+// External helper: validate an identifier.
+bool upb_Tokenizer_IsIdentifier(const char* data, int size);
+
 // Parses a TYPE_INTEGER token. Returns false if the result would be
 // greater than max_value. Otherwise, returns true and sets *output to the
 // result. If the text is not from a Token of type TYPE_INTEGER originally
@ -135,19 +138,10 @@ bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output);
 // result is undefined (possibly an assert failure).
 double upb_Parse_Float(const char* text);

-// Identical to ParseString (below), but appends to output.
-void upb_Parse_StringAppend(const char* text, upb_String* output);
-
 // Parses a TYPE_STRING token. This never fails, so long as the text actually
 // comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the
 // result is undefined (possibly an assert failure).
-UPB_INLINE void upb_Parse_String(const char* text, upb_String* output) {
-  upb_String_Clear(output);
-  upb_Parse_StringAppend(text, output);
-}
-
-// External helper: validate an identifier.
-bool upb_Tokenizer_IsIdentifier(const char* text, int size);
+upb_StringView upb_Parse_String(const char* text, upb_Arena* arena);

 #ifdef __cplusplus
 } /* extern "C" */
--- a/upb/io/tokenizer_test.cc
+++ b/upb/io/tokenizer_test.cc
@ -32,6 +32,7 @@
 #include "absl/strings/str_format.h"
 #include "upb/internal/unicode.h"
 #include "upb/io/chunked_input_stream.h"
+#include "upb/io/string.h"
 #include "upb/upb.hpp"

 // Must be last.
@ -998,35 +999,30 @@ TEST_F(TokenizerTest, ParseString) {
  };

  upb::Arena arena;
-  upb_String result;
-  upb_String_Init(&result, arena.ptr());

  for (int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
-    upb_Parse_String(inputs[i].data(), &result);
-    EXPECT_TRUE(StringEquals(upb_String_Data(&result), outputs[i].data()));
+    auto sv = upb_Parse_String(inputs[i].data(), arena.ptr());
+    EXPECT_TRUE(StringEquals(sv.data, outputs[i].data()));
  }

  // Test invalid strings that will never be tokenized as strings.
 #ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(
-      upb_Parse_String("", &result),
+      upb_Parse_String("", arena.ptr()),
      "passed text that could not have been tokenized as a string");
 #endif  // GTEST_HAS_DEATH_TEST
 }

 TEST_F(TokenizerTest, ParseStringAppend) {
-  // Check that ParseString and ParseStringAppend differ.
  upb::Arena arena;
  upb_String output;
  upb_String_Init(&output, arena.ptr());

  upb_String_Assign(&output, "stuff+", 6);
-
-  upb_Parse_StringAppend("'hello'", &output);
+  auto sv = upb_Parse_String("'hello'", arena.ptr());
+  EXPECT_TRUE(StringEquals(sv.data, "hello"));
+  upb_String_Append(&output, sv.data, sv.size);
  EXPECT_TRUE(StringEquals(upb_String_Data(&output), "stuff+hello"));
-
-  upb_Parse_String("'hello'", &output);
-  EXPECT_TRUE(StringEquals(upb_String_Data(&output), "hello"));
 }

 // -------------------------------------------------------------------
@ -1172,14 +1168,11 @@ static const char* kParseBenchmark[] = {

 TEST(Benchmark, ParseStringAppendAccumulate) {
  upb::Arena arena;
-  upb_String output;
-  upb_String_Init(&output, arena.ptr());
  size_t outsize = 0;
  int benchmark_len = arraysize(kParseBenchmark);
  for (int i = 0; i < benchmark_len; i++) {
-    upb_Parse_StringAppend(kParseBenchmark[i], &output);
-    outsize += upb_String_Size(&output);
-    upb_String_Clear(&output);
+    auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr());
+    outsize += sv.size;
  }
  EXPECT_NE(0, outsize);
 }
@ -1190,7 +1183,8 @@ TEST(Benchmark, ParseStringAppend) {
  upb_String_Init(&output, arena.ptr());
  int benchmark_len = arraysize(kParseBenchmark);
  for (int i = 0; i < benchmark_len; i++) {
-    upb_Parse_StringAppend(kParseBenchmark[i], &output);
+    auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr());
+    upb_String_Append(&output, sv.data, sv.size);
  }
  EXPECT_NE(0, upb_String_Size(&output));
 }
@ -1217,12 +1211,10 @@ static std::string DisplayHex(const std::string& data) {
 static void ExpectFormat(const std::string& expectation,
                         const std::string& formatted) {
  upb::Arena arena;
-  upb_String output;
-  upb_String_Init(&output, arena.ptr());
-  upb_Parse_String(formatted.data(), &output);
-  EXPECT_EQ(strcmp(upb_String_Data(&output), expectation.data()), 0)
+  auto sv = upb_Parse_String(formatted.data(), arena.ptr());
+  EXPECT_EQ(strcmp(sv.data, expectation.data()), 0)
      << ": Incorrectly parsed " << formatted << ":\nGot      "
-      << DisplayHex(output.data_) << "\nExpected " << DisplayHex(expectation);
+      << DisplayHex(sv.data) << "\nExpected " << DisplayHex(expectation);
 }

 TEST(TokenizerHandlesUnicode, BMPCodes) {