split out some unicode logic from the json decoder

Also fixed a bug in the json decoder which caused it to break on a code point value of exactly 0x10ffff

PiperOrigin-RevId: 459856813
pull/13171/head
Eric Salo 3 years ago committed by Copybara-Service
parent 201a79071a
commit 410143b265
  1. 16
      BUILD
  2. 57
      upb/internal/unicode.c
  3. 77
      upb/internal/unicode.h
  4. 46
      upb/json_decode.c

16
BUILD

@ -108,6 +108,7 @@ cc_library(
"upb/decode.c",
"upb/encode.c",
"upb/internal/table.h",
"upb/internal/unicode.h",
"upb/msg.c",
"upb/msg_internal.h",
"upb/status.c",
@ -138,6 +139,7 @@ cc_library(
":fastdecode",
":port",
":table_internal",
":unicode_internal",
],
)
@ -448,6 +450,7 @@ cc_library(
":encode_internal",
":port",
":reflection",
":unicode_internal",
":upb",
],
)
@ -825,6 +828,19 @@ cc_library(
deps = [":port"],
)
# Internal Unicode helpers built from upb/internal/unicode.{c,h}.
# Visible only to this repository's subpackages; depends only on ":port".
cc_library(
name = "unicode_internal",
srcs = [
"upb/internal/unicode.c",
],
hdrs = [
"upb/internal/unicode.h",
],
copts = UPB_DEFAULT_COPTS,
visibility = ["//:__subpackages__"],
deps = [":port"],
)
# Amalgamation #################################################################
# begin:github_only

@ -0,0 +1,57 @@
/*
* Copyright (c) 2009-2021, Google LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Google LLC nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "upb/internal/unicode.h"
// Must be last.
#include "upb/port_def.inc"
// Encodes the codepoint |cp| as UTF-8 into |out| and returns the number of
// bytes written (1-4). Returns 0, writing nothing, when |cp| is above
// 0x10ffff. Surrogate values are not rejected, and no terminator is appended.
int upb_Unicode_ToUTF8(uint32_t cp, char* out) {
  int len;
  uint32_t lead;

  // Pick the sequence length and the marker bits of the leading byte.
  if (cp < 0x80) {
    // ASCII is stored verbatim.
    out[0] = cp;
    return 1;
  } else if (cp < 0x800) {
    len = 2;
    lead = 0xc0;
  } else if (cp < 0x10000) {
    len = 3;
    lead = 0xe0;
  } else if (cp < 0x110000) {
    len = 4;
    lead = 0xf0;
  } else {
    return 0;
  }

  // Fill the continuation bytes from the tail, six payload bits at a time.
  for (int i = len - 1; i > 0; i--) {
    out[i] = (cp & 0x3f) | 0x80;
    cp >>= 6;
  }

  // Whatever payload remains belongs in the leading byte.
  out[0] = cp | lead;
  return len;
}

@ -0,0 +1,77 @@
/*
* Copyright (c) 2009-2021, Google LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Google LLC nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef UPB_INTERNAL_UNICODE_H_
#define UPB_INTERNAL_UNICODE_H_
// Must be last.
#include "upb/port_def.inc"
#ifdef __cplusplus
extern "C" {
#endif
// Returns true iff |cp| is a high (leading) surrogate, i.e. lies in
// [0xd800, 0xdbff]. Every value in that range shares the same bits above the
// low ten, so a single masked comparison covers the whole range.
UPB_INLINE bool upb_Unicode_IsHigh(uint32_t cp) {
  return (cp & 0xfffffc00) == 0xd800;
}
// Returns true iff |cp| is a low (trailing) surrogate, i.e. lies in
// [0xdc00, 0xdfff]. Every value in that range shares the same bits above the
// low ten, so a single masked comparison covers the whole range.
UPB_INLINE bool upb_Unicode_IsLow(uint32_t cp) {
  return (cp & 0xfffffc00) == 0xdc00;
}
// Computes the high (leading) 16-bit surrogate for a supplementary codepoint.
// The input is not validated.
UPB_INLINE uint16_t upb_Unicode_ToHigh(uint32_t cp) {
  // 0xd800 is the first high surrogate; subtracting (0x10000 >> 10) folds the
  // removal of the supplementary-plane offset into one constant (0xd7c0).
  const uint32_t bias = 0xd800 - (0x10000 >> 10);
  return (uint16_t)(bias + (cp >> 10));
}
// Computes the low (trailing) 16-bit surrogate for a supplementary codepoint.
// The input is not validated.
UPB_INLINE uint16_t upb_Unicode_ToLow(uint32_t cp) {
  // The low surrogate carries the bottom ten bits of the codepoint.
  const uint32_t payload = cp & 0x3ff;
  return (uint16_t)(0xdc00 | payload);
}
// Combines a high/low surrogate pair into the 32-bit codepoint it encodes.
// The inputs are not validated; only their low ten bits are used.
UPB_INLINE uint32_t upb_Unicode_FromPair(uint32_t high, uint32_t low) {
  const uint32_t upper = (high & 0x3ff) << 10;
  const uint32_t lower = low & 0x3ff;
  // The two payloads occupy disjoint bit ranges, so OR-ing them is exact; the
  // result is then shifted into the supplementary planes.
  return 0x10000 + (upper | lower);
}
// Outputs a codepoint as UTF8 into |out|, which may receive up to 4 bytes.
// No terminator is written.
// Returns the number of bytes written (1-4 on success, 0 on error).
// The only error reported is a codepoint above 0x10ffff; in that case nothing
// is written to |out|.
// Does not otherwise sanity-check the input. Specifically does not check for
// surrogates.
int upb_Unicode_ToUTF8(uint32_t cp, char* out);
#ifdef __cplusplus
} /* extern "C" */
#endif
#include "upb/port_undef.inc"
#endif /* UPB_INTERNAL_UNICODE_H_ */

@ -32,14 +32,14 @@
#include <inttypes.h>
#include <limits.h>
#include <math.h>
#include <setjmp.h>
#include <stdlib.h>
#include <string.h>
#include "upb/encode.h"
#include "upb/internal/unicode.h"
#include "upb/reflection.h"
/* Special header, must be included last. */
// Must be last.
#include "upb/port_def.inc"
typedef struct {
@ -377,44 +377,20 @@ static uint32_t jsondec_codepoint(jsondec* d) {
/* Parses a \uXXXX unicode escape (possibly a surrogate pair). */
static size_t jsondec_unicode(jsondec* d, char* out) {
uint32_t cp = jsondec_codepoint(d);
if (cp >= 0xd800 && cp <= 0xdbff) {
if (upb_Unicode_IsHigh(cp)) {
/* Surrogate pair: two 16-bit codepoints become a 32-bit codepoint. */
uint32_t high = cp;
uint32_t low;
jsondec_parselit(d, "\\u");
low = jsondec_codepoint(d);
if (low < 0xdc00 || low > 0xdfff) {
jsondec_err(d, "Invalid low surrogate");
}
cp = (high & 0x3ff) << 10;
cp |= (low & 0x3ff);
cp += 0x10000;
} else if (cp >= 0xdc00 && cp <= 0xdfff) {
uint32_t low = jsondec_codepoint(d);
if (!upb_Unicode_IsLow(low)) jsondec_err(d, "Invalid low surrogate");
cp = upb_Unicode_FromPair(cp, low);
} else if (upb_Unicode_IsLow(cp)) {
jsondec_err(d, "Unpaired low surrogate");
}
/* Write to UTF-8 */
if (cp <= 0x7f) {
out[0] = cp;
return 1;
} else if (cp <= 0x07FF) {
out[0] = ((cp >> 6) & 0x1F) | 0xC0;
out[1] = ((cp >> 0) & 0x3F) | 0x80;
return 2;
} else if (cp <= 0xFFFF) {
out[0] = ((cp >> 12) & 0x0F) | 0xE0;
out[1] = ((cp >> 6) & 0x3F) | 0x80;
out[2] = ((cp >> 0) & 0x3F) | 0x80;
return 3;
} else if (cp < 0x10FFFF) {
out[0] = ((cp >> 18) & 0x07) | 0xF0;
out[1] = ((cp >> 12) & 0x3f) | 0x80;
out[2] = ((cp >> 6) & 0x3f) | 0x80;
out[3] = ((cp >> 0) & 0x3f) | 0x80;
return 4;
} else {
jsondec_err(d, "Invalid codepoint");
}
int bytes = upb_Unicode_ToUTF8(cp, out);
if (bytes == 0) jsondec_err(d, "Invalid codepoint");
return bytes;
}
static void jsondec_resize(jsondec* d, char** buf, char** end, char** buf_end) {
@ -460,7 +436,7 @@ static upb_StringView jsondec_string(jsondec* d) {
if (*d->ptr == 'u') {
d->ptr++;
if (buf_end - end < 4) {
/* Allow space for maximum-sized code point (4 bytes). */
/* Allow space for maximum-sized codepoint (4 bytes). */
jsondec_resize(d, &buf, &end, &buf_end);
}
end += jsondec_unicode(d, end);

Loading…
Cancel
Save