Protocol Buffers - Google's data interchange format (grpc依赖) https://developers.google.com/protocol-buffers/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

134 lines
3.9 KiB

#include <stdio.h>
/*
* UTF-8 to UTF-16
* Table from https://woboq.com/blog/utf-8-processing-using-simd.html
*
* +-------------------------------------+-------------------+
* | UTF-8 | UTF-16LE (HI LO) |
* +-------------------------------------+-------------------+
* | 0aaaaaaa | 00000000 0aaaaaaa |
* +-------------------------------------+-------------------+
* | 110bbbbb 10aaaaaa | 00000bbb bbaaaaaa |
* +-------------------------------------+-------------------+
* | 1110cccc 10bbbbbb 10aaaaaa | ccccbbbb bbaaaaaa |
* +-------------------------------------+-------------------+
* | 11110ddd 10ddcccc 10bbbbbb 10aaaaaa | 110110uu uuccccbb |
* + uuuu = ddddd - 1 | 110111bb bbaaaaaa |
* +-------------------------------------+-------------------+
*/
/*
 * Decode a UTF-8 byte string into UTF-16 code units in host byte order.
 *
 * Parameters:
 * - buf8, len8: input UTF-8 string and its length in bytes
 * - buf16:      buffer receiving the decoded UTF-16 code units
 * - *len16:     on entry - capacity of buf16 in bytes
 *               on exit  - length in bytes of the valid UTF-16 output
 *                          (kept up to date even when an error is returned)
 * Returns:
 * -  0: success
 * - >0: 1-based position of the first byte of the offending UTF-8 sequence
 *       (stray continuation byte, truncated sequence, bad trailing byte,
 *       overlong encoding, encoded surrogate, or code point > U+10FFFF)
 * - -1: UTF-16 buffer overflow
 */
int utf8_to16_naive(const unsigned char *buf8, size_t len8,
                    unsigned short *buf16, size_t *len16)
{
    int err_pos = 1;
    size_t len16_left = *len16;

    *len16 = 0;

    while (len8) {
        unsigned char b0, b1, b2, b3;
        unsigned int u;
        size_t clen;    /* total byte length of the current sequence */

        /* Every character needs at least one 16-bit unit of output */
        if (len16_left < 2)
            return -1;

        /* 1st byte */
        b0 = buf8[0];

        if ((b0 & 0x80) == 0) {
            /* 0aaaaaaa -> 00000000 0aaaaaaa */
            *buf16++ = b0;
            ++buf8;
            --len8;
            ++err_pos;
            *len16 += 2;
            len16_left -= 2;
            continue;
        }

        /* 10xxxxxx can never start a sequence */
        if (b0 < 0xC0)
            return err_pos;

        /* 110xxxxx -> 2 bytes, 1110xxxx -> 3 bytes, 1111xxxx -> 4 bytes */
        clen = (b0 < 0xE0) ? 2 : (b0 < 0xF0) ? 3 : 4;

        /* Input ends in the middle of the sequence */
        if (len8 < clen)
            return err_pos;

        /* Trailing bytes must be within 0x80 ~ 0xBF (10xxxxxx) */
        b1 = buf8[1];
        if ((b1 & 0xC0) != 0x80)
            return err_pos;
        b1 &= 0x3F;

        if (clen == 2) {
            /* 110bbbbb 10aaaaaa -> 00000bbb bbaaaaaa */
            u = ((unsigned int)(b0 & 0x1F) << 6) | b1;
            /* Overlong encoding of an ASCII character */
            if (u <= 0x7F)
                return err_pos;
            *buf16++ = (unsigned short)u;
        } else {
            b2 = buf8[2];
            if ((b2 & 0xC0) != 0x80)
                return err_pos;
            b2 &= 0x3F;

            if (clen == 3) {
                /* 1110cccc 10bbbbbb 10aaaaaa -> ccccbbbb bbaaaaaa */
                u = ((unsigned int)(b0 & 0x0F) << 12) |
                    ((unsigned int)b1 << 6) | b2;
                /* Overlong encoding, or a UTF-16 surrogate code point */
                if (u <= 0x7FF || (u >= 0xD800 && u <= 0xDFFF))
                    return err_pos;
                *buf16++ = (unsigned short)u;
            } else {
                /* clen == 4: the result is a surrogate pair (two units) */
                if (len16_left < 4)
                    return -1;      /* Output buffer full */

                b3 = buf8[3];
                if ((b3 & 0xC0) != 0x80)
                    return err_pos;
                b3 &= 0x3F;

                u = ((unsigned int)(b0 & 0x07) << 18) |
                    ((unsigned int)b1 << 12) |
                    ((unsigned int)b2 << 6) | b3;

                /*
                 * Lead bytes above 0xF4 are never valid. 0xF8..0xFF must be
                 * rejected explicitly because the 0x07 mask above drops
                 * their extra high bit, which would otherwise let them
                 * alias a valid 0xF0..0xF4 sequence. The remaining tests
                 * catch overlong encodings and code points > U+10FFFF.
                 */
                if (b0 > 0xF4 || u <= 0xFFFF || u > 0x10FFFF)
                    return err_pos;

                /* Split into high (110110..) and low (110111..) surrogates */
                u -= 0x10000;
                *buf16++ = (unsigned short)(((u >> 10) & 0x3FF) | 0xD800);
                *buf16++ = (unsigned short)((u & 0x3FF) | 0xDC00);

                /* Account for the extra unit; the common tail adds one more */
                *len16 += 2;
                len16_left -= 2;
            }
        }

        buf8 += clen;
        len8 -= clen;
        err_pos += (int)clen;
        *len16 += 2;
        len16_left -= 2;
    }

    return 0;
}