// Protocol Buffers - Google's data interchange format // Copyright 2023 Google LLC. All rights reserved. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file or at // https://developers.google.com/open-source/licenses/bsd #ifndef UPB_TEXT_ENCODE_INTERNAL_H_ #define UPB_TEXT_ENCODE_INTERNAL_H_ #include #include #include "upb/base/descriptor_constants.h" #include "upb/base/string_view.h" #include "upb/message/array.h" #include "upb/message/internal/map_sorter.h" #include "upb/message/message.h" #include "upb/port/vsnprintf_compat.h" #include "upb/text/options.h" #include "upb/wire/eps_copy_input_stream.h" #include "utf8_range.h" // Must be last. #include "upb/port/def.inc" typedef struct { char *buf, *ptr, *end; size_t overflow; int indent_depth; int options; const struct upb_DefPool* ext_pool; _upb_mapsorter sorter; } txtenc; UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_PutBytes)(txtenc* e, const void* data, size_t len) { size_t have = e->end - e->ptr; if (UPB_LIKELY(have >= len)) { memcpy(e->ptr, data, len); e->ptr += len; } else { if (have) { memcpy(e->ptr, data, have); e->ptr += have; } e->overflow += (len - have); } } UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_PutStr)(txtenc* e, const char* str) { UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, str, strlen(str)); } UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Printf)(txtenc* e, const char* fmt, ...) { size_t n; size_t have = e->end - e->ptr; va_list args; va_start(args, fmt); n = _upb_vsnprintf(e->ptr, have, fmt, args); va_end(args); if (UPB_LIKELY(have > n)) { e->ptr += n; } else { e->ptr = UPB_PTRADD(e->ptr, have); e->overflow += (n - have); } } UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Indent)(txtenc* e) { if ((e->options & UPB_TXTENC_SINGLELINE) == 0) { int i = e->indent_depth; while (i-- > 0) { UPB_PRIVATE(_upb_TextEncode_PutStr)(e, " "); } } } UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_EndField)(txtenc* e) { if (e->options & UPB_TXTENC_SINGLELINE) { UPB_PRIVATE(_upb_TextEncode_PutStr)(e, " "); } else { UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\n"); } } UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Escaped)(txtenc* e, unsigned char ch) { switch (ch) { case '\n': UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\n"); break; case '\r': UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\r"); break; case '\t': UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\t"); break; case '\"': UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\\""); break; case '\'': UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\'"); break; case '\\': UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\\\"); break; default: UPB_PRIVATE(_upb_TextEncode_Printf)(e, "\\%03o", ch); break; } } // Returns true if `ch` needs to be escaped in TextFormat, independent of any // UTF-8 validity issues. UPB_INLINE bool UPB_PRIVATE(_upb_DefinitelyNeedsEscape)(unsigned char ch) { if (ch < 32) return true; switch (ch) { case '\"': case '\'': case '\\': case 127: return true; } return false; } UPB_INLINE bool UPB_PRIVATE(_upb_AsciiIsPrint)(unsigned char ch) { return ch >= 32 && ch < 127; } // Returns true if this is a high byte that requires UTF-8 validation. If the // UTF-8 validation fails, we must escape the byte. UPB_INLINE bool UPB_PRIVATE(_upb_NeedsUtf8Validation)(unsigned char ch) { return ch > 127; } // Returns the number of bytes in the prefix of `val` that do not need escaping. // This is like utf8_range::SpanStructurallyValid(), except that it also // terminates at any ASCII char that needs to be escaped in TextFormat (any char // that has `DefinitelyNeedsEscape(ch) == true`). // // If we could get a variant of utf8_range::SpanStructurallyValid() that could // terminate on any of these chars, that might be more efficient, but it would // be much more complicated to modify that heavily SIMD code. UPB_INLINE size_t UPB_PRIVATE(_SkipPassthroughBytes)(const char* ptr, size_t size) { for (size_t i = 0; i < size; i++) { unsigned char uc = ptr[i]; if (UPB_PRIVATE(_upb_DefinitelyNeedsEscape)(uc)) return i; if (UPB_PRIVATE(_upb_NeedsUtf8Validation)(uc)) { // Find the end of this region of consecutive high bytes, so that we only // give high bytes to the UTF-8 checker. This avoids needing to perform // a second scan of the ASCII characters looking for characters that // need escaping. // // We assume that high bytes are less frequent than plain, printable ASCII // bytes, so we accept the double-scan of high bytes. size_t end = i + 1; for (; end < size; end++) { if (!UPB_PRIVATE(_upb_NeedsUtf8Validation)(ptr[end])) break; } size_t n = end - i; size_t ok = utf8_range_ValidPrefix(ptr + i, n); if (ok != n) return i + ok; i += ok - 1; } } return size; } UPB_INLINE void UPB_PRIVATE(_upb_HardenedPrintString)(txtenc* e, const char* ptr, size_t len) { // Print as UTF-8, while guarding against any invalid UTF-8 in the string // field. // // If in the future we have a guaranteed invariant that invalid UTF-8 will // never be present, we could avoid the UTF-8 check here. UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\""); const char* end = ptr + len; while (ptr < end) { size_t n = UPB_PRIVATE(_SkipPassthroughBytes)(ptr, end - ptr); if (n != 0) { UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, ptr, n); ptr += n; if (ptr == end) break; } // If repeated calls to CEscape() and PrintString() are expensive, we could // consider batching them, at the cost of some complexity. UPB_PRIVATE(_upb_TextEncode_Escaped)(e, *ptr); ptr++; } UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\""); } UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Bytes)(txtenc* e, upb_StringView data) { const char* ptr = data.data; const char* end = ptr + data.size; UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\""); for (; ptr < end; ptr++) { unsigned char uc = *ptr; if (UPB_PRIVATE(_upb_AsciiIsPrint)(uc)) { UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, ptr, 1); } else { UPB_PRIVATE(_upb_TextEncode_Escaped)(e, uc); } } UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\""); } UPB_INLINE size_t UPB_PRIVATE(_upb_TextEncode_Nullz)(txtenc* e, size_t size) { size_t ret = e->ptr - e->buf + e->overflow; if (size > 0) { if (e->ptr == e->end) e->ptr--; *e->ptr = '\0'; } return ret; } const char* UPB_PRIVATE(_upb_TextEncode_Unknown)(txtenc* e, const char* ptr, upb_EpsCopyInputStream* stream, int groupnum); void UPB_PRIVATE(_upb_TextEncode_ParseUnknown)(txtenc* e, const upb_Message* msg); // Must not be called for ctype = kUpb_CType_Enum, as they require different // handling depending on whether or not we're doing reflection-based encoding. void UPB_PRIVATE(_upb_TextEncode_Scalar)(txtenc* e, upb_MessageValue val, upb_CType ctype); #include "upb/port/undef.inc" #endif // UPB_TEXT_ENCODE_INTERNAL_H_