|
|
|
// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC.  All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd
|
|
|
|
|
|
|
|
#ifndef UPB_TEXT_ENCODE_INTERNAL_H_
|
|
|
|
#define UPB_TEXT_ENCODE_INTERNAL_H_
|
|
|
|
|
|
|
|
#include <stdarg.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
#include "upb/base/descriptor_constants.h"
|
|
|
|
#include "upb/base/string_view.h"
|
|
|
|
#include "upb/message/array.h"
|
|
|
|
#include "upb/message/internal/map_sorter.h"
|
|
|
|
#include "upb/message/message.h"
|
|
|
|
#include "upb/port/vsnprintf_compat.h"
|
|
|
|
#include "upb/text/options.h"
|
|
|
|
#include "upb/wire/eps_copy_input_stream.h"
|
|
|
|
#include "utf8_range.h"
|
|
|
|
|
|
|
|
// Must be last.
|
|
|
|
#include "upb/port/def.inc"
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
char *buf, *ptr, *end;
|
|
|
|
size_t overflow;
|
|
|
|
int indent_depth;
|
|
|
|
int options;
|
|
|
|
const struct upb_DefPool* ext_pool;
|
|
|
|
_upb_mapsorter sorter;
|
|
|
|
} txtenc;
|
|
|
|
|
|
|
|
// Appends `len` raw bytes from `data` to the encoder output.
//
// Whatever fits in the remaining buffer space is copied; the number of bytes
// that did not fit is added to `e->overflow`.
UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_PutBytes)(txtenc* e,
                                                      const void* data,
                                                      size_t len) {
  const size_t room = e->end - e->ptr;

  // Common case: the whole payload fits.
  if (UPB_LIKELY(room >= len)) {
    memcpy(e->ptr, data, len);
    e->ptr += len;
    return;
  }

  // Truncated write: fill what is left of the buffer (if anything) and record
  // the shortfall.
  if (room != 0) {
    memcpy(e->ptr, data, room);
    e->ptr += room;
  }
  e->overflow += len - room;
}
|
|
|
|
|
|
|
|
// Appends the NUL-terminated string `str` (without its terminator) to the
// encoder output.
UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_PutStr)(txtenc* e,
                                                    const char* str) {
  const size_t n = strlen(str);
  UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, str, n);
}
|
|
|
|
|
|
|
|
// Formats `fmt` and its variadic arguments printf-style and appends the result
// to the encoder output.
//
// Formatting is done directly into the remaining buffer space. If the
// formatted text does not fit, the cursor is moved to the end of the buffer
// and the shortfall is added to `e->overflow`.
//
// If the underlying formatter reports an error (negative return value), the
// encoder state is left untouched. Previously the negative value was stored in
// a size_t, which wrapped to a huge length and corrupted both `ptr` and
// `overflow`.
UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Printf)(txtenc* e, const char* fmt,
                                                    ...) {
  size_t have = e->end - e->ptr;
  va_list args;

  va_start(args, fmt);
  int written = _upb_vsnprintf(e->ptr, have, fmt, args);
  va_end(args);

  // Keep the result signed until we know it is a valid length.
  if (written < 0) return;

  size_t n = (size_t)written;
  if (UPB_LIKELY(have > n)) {
    e->ptr += n;
  } else {
    // UPB_PTRADD is used instead of plain `+` for the case where `have` is 0.
    e->ptr = UPB_PTRADD(e->ptr, have);
    e->overflow += (n - have);
  }
}
|
|
|
|
|
|
|
|
// Emits the indentation for the current nesting depth. Does nothing when the
// UPB_TXTENC_SINGLELINE option is set.
UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Indent)(txtenc* e) {
  if (e->options & UPB_TXTENC_SINGLELINE) return;

  // One indent unit per nesting level; a non-positive depth emits nothing.
  for (int level = 0; level < e->indent_depth; level++) {
    UPB_PRIVATE(_upb_TextEncode_PutStr)(e, " ");
  }
}
|
|
|
|
|
|
|
|
// Emits the separator that terminates a field: a space in single-line mode,
// a newline otherwise.
UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_EndField)(txtenc* e) {
  const char* terminator = (e->options & UPB_TXTENC_SINGLELINE) ? " " : "\n";
  UPB_PRIVATE(_upb_TextEncode_PutStr)(e, terminator);
}
|
|
|
|
|
|
|
|
// Emits the escaped form of the byte `ch`.
//
// Newline, carriage return, tab, both quote characters and the backslash get
// their two-character backslash escapes; every other byte is written as a
// backslash followed by three octal digits.
UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Escaped)(txtenc* e,
                                                     unsigned char ch) {
  const char* named;

  switch (ch) {
    case '\n':
      named = "\\n";
      break;
    case '\r':
      named = "\\r";
      break;
    case '\t':
      named = "\\t";
      break;
    case '\"':
      named = "\\\"";
      break;
    case '\'':
      named = "\\'";
      break;
    case '\\':
      named = "\\\\";
      break;
    default:
      // No short escape exists for this byte: fall back to octal.
      UPB_PRIVATE(_upb_TextEncode_Printf)(e, "\\%03o", ch);
      return;
  }

  UPB_PRIVATE(_upb_TextEncode_PutStr)(e, named);
}
|
|
|
|
|
|
|
|
// Returns true if `ch` needs to be escaped in TextFormat, independent of any
|
|
|
|
// UTF-8 validity issues.
|
|
|
|
UPB_INLINE bool UPB_PRIVATE(_upb_DefinitelyNeedsEscape)(unsigned char ch) {
|
|
|
|
if (ch < 32) return true;
|
|
|
|
switch (ch) {
|
|
|
|
case '\"':
|
|
|
|
case '\'':
|
|
|
|
case '\\':
|
|
|
|
case 127:
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
UPB_INLINE bool UPB_PRIVATE(_upb_AsciiIsPrint)(unsigned char ch) {
|
|
|
|
return ch >= 32 && ch < 127;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns true if this is a high byte that requires UTF-8 validation. If the
|
|
|
|
// UTF-8 validation fails, we must escape the byte.
|
|
|
|
UPB_INLINE bool UPB_PRIVATE(_upb_NeedsUtf8Validation)(unsigned char ch) {
|
|
|
|
return ch > 127;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the number of bytes in the prefix of `val` that do not need escaping.
|
|
|
|
// This is like utf8_range::SpanStructurallyValid(), except that it also
|
|
|
|
// terminates at any ASCII char that needs to be escaped in TextFormat (any char
|
|
|
|
// that has `DefinitelyNeedsEscape(ch) == true`).
|
|
|
|
//
|
|
|
|
// If we could get a variant of utf8_range::SpanStructurallyValid() that could
|
|
|
|
// terminate on any of these chars, that might be more efficient, but it would
|
|
|
|
// be much more complicated to modify that heavily SIMD code.
|
|
|
|
UPB_INLINE size_t UPB_PRIVATE(_SkipPassthroughBytes)(const char* ptr,
|
|
|
|
size_t size) {
|
|
|
|
for (size_t i = 0; i < size; i++) {
|
|
|
|
unsigned char uc = ptr[i];
|
|
|
|
if (UPB_PRIVATE(_upb_DefinitelyNeedsEscape)(uc)) return i;
|
|
|
|
if (UPB_PRIVATE(_upb_NeedsUtf8Validation)(uc)) {
|
|
|
|
// Find the end of this region of consecutive high bytes, so that we only
|
|
|
|
// give high bytes to the UTF-8 checker. This avoids needing to perform
|
|
|
|
// a second scan of the ASCII characters looking for characters that
|
|
|
|
// need escaping.
|
|
|
|
//
|
|
|
|
// We assume that high bytes are less frequent than plain, printable ASCII
|
|
|
|
// bytes, so we accept the double-scan of high bytes.
|
|
|
|
size_t end = i + 1;
|
|
|
|
for (; end < size; end++) {
|
|
|
|
if (!UPB_PRIVATE(_upb_NeedsUtf8Validation)(ptr[end])) break;
|
|
|
|
}
|
|
|
|
size_t n = end - i;
|
|
|
|
size_t ok = utf8_range_ValidPrefix(ptr + i, n);
|
|
|
|
if (ok != n) return i + ok;
|
|
|
|
i += ok - 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return size;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Emits `ptr[0..len)` as a double-quoted TextFormat string.
//
// The content is printed as UTF-8 while guarding against invalid UTF-8 in the
// string field: bytes that cannot be passed through are escaped one at a time.
//
// If a future invariant guarantees that invalid UTF-8 is never present, the
// UTF-8 check could be dropped.
UPB_INLINE void UPB_PRIVATE(_upb_HardenedPrintString)(txtenc* e,
                                                      const char* ptr,
                                                      size_t len) {
  size_t pos = 0;

  UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");

  while (pos < len) {
    // Copy the longest run that needs no escaping in one go.
    const size_t clean = UPB_PRIVATE(_SkipPassthroughBytes)(ptr + pos,
                                                            len - pos);
    if (clean != 0) {
      UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, ptr + pos, clean);
      pos += clean;
      if (pos == len) break;
    }

    // The byte at `pos` stopped the scan, so it must be escaped. If repeated
    // escape/print calls turn out to be expensive they could be batched, at
    // the cost of some complexity.
    UPB_PRIVATE(_upb_TextEncode_Escaped)(e, ptr[pos]);
    pos++;
  }

  UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
}
|
|
|
|
|
|
|
|
UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Bytes)(txtenc* e,
|
|
|
|
upb_StringView data) {
|
|
|
|
const char* ptr = data.data;
|
|
|
|
const char* end = ptr + data.size;
|
|
|
|
UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
|
|
|
|
for (; ptr < end; ptr++) {
|
|
|
|
unsigned char uc = *ptr;
|
|
|
|
if (UPB_PRIVATE(_upb_AsciiIsPrint)(uc)) {
|
|
|
|
UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, ptr, 1);
|
|
|
|
} else {
|
|
|
|
UPB_PRIVATE(_upb_TextEncode_Escaped)(e, uc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
|
|
|
|
}
|
|
|
|
|
|
|
|
// NUL-terminates the output and returns the total number of bytes produced:
// the bytes stored in the buffer plus the bytes counted in `e->overflow`.
//
// When `size` is 0 nothing is written. Otherwise, if the buffer is completely
// full, the cursor is moved back one byte so that the terminator replaces the
// last stored byte.
UPB_INLINE size_t UPB_PRIVATE(_upb_TextEncode_Nullz)(txtenc* e, size_t size) {
  // Computed before the cursor is possibly moved back below.
  const size_t total = e->ptr - e->buf + e->overflow;

  if (size == 0) return total;

  if (e->ptr == e->end) e->ptr--;
  *e->ptr = '\0';
  return total;
}
|
|
|
|
|
|
|
|
const char* UPB_PRIVATE(_upb_TextEncode_Unknown)(txtenc* e, const char* ptr,
|
|
|
|
upb_EpsCopyInputStream* stream,
|
|
|
|
int groupnum);
|
|
|
|
|
|
|
|
void UPB_PRIVATE(_upb_TextEncode_ParseUnknown)(txtenc* e,
|
|
|
|
const upb_Message* msg);
|
|
|
|
|
|
|
|
// Must not be called for ctype = kUpb_CType_Enum, as they require different
|
|
|
|
// handling depending on whether or not we're doing reflection-based encoding.
|
|
|
|
void UPB_PRIVATE(_upb_TextEncode_Scalar)(txtenc* e, upb_MessageValue val,
|
|
|
|
upb_CType ctype);
|
|
|
|
|
|
|
|
#include "upb/port/undef.inc"
|
|
|
|
|
|
|
|
#endif // UPB_TEXT_ENCODE_INTERNAL_H_
|