Split varint decoders into separate .h file.

This makes it easier to benchmark and test the
multiple possible implementations of varint decoding.
pull/13171/head
Joshua Haberman 14 years ago
parent da95bf34ae
commit 0c6786c6fa
  1. 120
      src/upb_varint_decoder.h
  2. 216
      tests/test_varint.c
  3. 27
      tests/upb_test.h

@ -0,0 +1,120 @@
/*
* upb - a minimalist implementation of protocol buffers.
*
* A number of routines for varint decoding (we keep them all around to have
* multiple approaches available for benchmarking). All of these functions
* require the buffer to have at least 10 bytes available; if we don't know
* for sure that there are 10 bytes, then there is only one viable option
* (branching on every byte).
*
* Copyright (c) 2011 Joshua Haberman. See LICENSE for details.
*/
#ifndef UPB_VARINT_DECODER_H_
#define UPB_VARINT_DECODER_H_
#include "upb.h"
#include <stdint.h>
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
// All decoding functions return this struct by value.
typedef struct {
const char *p; // NULL if the varint was unterminated.
uint64_t val;
} upb_decoderet;
// A basic branch-based decoder, uses 32-bit values to get good performance
// on 32-bit architectures (but performs well on 64-bits also).
INLINE upb_decoderet upb_decode_varint_branch32(const char *p) {
upb_decoderet r = {NULL, 0};
uint32_t low, high = 0;
uint32_t b;
b = *(p++); low = (b & 0x7f) ; if(!(b & 0x80)) goto done;
b = *(p++); low |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done;
b = *(p++); low |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done;
b = *(p++); low |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done;
b = *(p++); low |= (b & 0x7f) << 28;
high = (b & 0x7f) >> 4; if(!(b & 0x80)) goto done;
b = *(p++); high |= (b & 0x7f) << 3; if(!(b & 0x80)) goto done;
b = *(p++); high |= (b & 0x7f) << 10; if(!(b & 0x80)) goto done;
b = *(p++); high |= (b & 0x7f) << 17; if(!(b & 0x80)) goto done;
b = *(p++); high |= (b & 0x7f) << 24; if(!(b & 0x80)) goto done;
b = *(p++); high |= (b & 0x7f) << 31; if(!(b & 0x80)) goto done;
return r;
done:
r.val = ((uint64_t)high << 32) | low;
r.p = p;
return r;
}
// Like the previous, but uses 64-bit values.
INLINE upb_decoderet upb_decode_varint_branch64(const char *p) {
uint64_t val;
uint64_t b;
upb_decoderet r = {(void*)0, 0};
b = *(p++); val = (b & 0x7f) ; if(!(b & 0x80)) goto done;
b = *(p++); val |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done;
b = *(p++); val |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done;
b = *(p++); val |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done;
b = *(p++); val |= (b & 0x7f) << 28; if(!(b & 0x80)) goto done;
b = *(p++); val |= (b & 0x7f) << 35; if(!(b & 0x80)) goto done;
b = *(p++); val |= (b & 0x7f) << 42; if(!(b & 0x80)) goto done;
b = *(p++); val |= (b & 0x7f) << 49; if(!(b & 0x80)) goto done;
b = *(p++); val |= (b & 0x7f) << 56; if(!(b & 0x80)) goto done;
b = *(p++); val |= (b & 0x7f) << 63; if(!(b & 0x80)) goto done;
return r;
done:
r.val = val;
r.p = p;
return r;
}
#ifdef __SSE__
#include <xmmintrin.h>
// Avoids branches (this can very likely be improved). Requires SSE.
INLINE upb_decoderet upb_decode_varint_nobranch(const char *p) {
upb_decoderet r = {(void*)0, 0};
__m128i val128 = _mm_loadu_si128((void*)p);
unsigned int continuation_bits = _mm_movemask_epi8(val128);
unsigned int bsr_val = ~continuation_bits;
int varint_length = __builtin_ffs(bsr_val);
if (varint_length > 10) return r;
uint16_t twob;
memcpy(&twob, p, 2);
twob &= 0x7f7f;
twob = ((twob & 0xff00) >> 1) | (twob & 0xff);
uint64_t eightb;
memcpy(&eightb, p + 2, 8);
eightb &= 0x7f7f7f7f7f7f7f7f;
eightb = ((eightb & 0xff00ff00ff00ff00) >> 1) | (eightb & 0x00ff00ff00ff00ff);
eightb = ((eightb & 0xffff0000ffff0000) >> 2) | (eightb & 0x0000ffff0000ffff);
eightb = ((eightb & 0xffffffff00000000) >> 4) | (eightb & 0x00000000ffffffff);
uint64_t all_bits = twob | (eightb << 14);
int varint_bits = varint_length * 7;
uint64_t mask = varint_bits == 70 ? (uint64_t)-1 : (1ULL << (varint_bits)) - 1;
r.val = all_bits & mask;
r.p = p + varint_length;
return r;
}
#endif
// For now, always use the branch32 decoder.
#define upb_decode_varint_fast upb_decode_varint_branch32
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* UPB_VARINT_DECODER_H_ */

@ -0,0 +1,216 @@
/*
* upb - a minimalist implementation of protocol buffers.
*
* Copyright (c) 2011 Joshua Haberman. See LICENSE for details.
*/
#include "upb_varint_decoder.h"
#include "upb_test.h"
static void test_varint_decoder(upb_decoderet (*decoder)(const char*)) {
#define TEST(bytes, expected_val) {\
const char buf[] = bytes "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" ; \
upb_decoderet r = decoder(buf); \
ASSERT(r.val == expected_val); \
ASSERT(r.p == buf + sizeof(buf) - 16); /* - 1 for NULL */ \
}
TEST("\x00", 0ULL);
TEST("\x01", 1ULL);
TEST("\x81\x14", 0xa01ULL);
TEST("\x81\x03", 0x181ULL);
TEST("\x81\x83\x07", 0x1c181ULL);
TEST("\x81\x83\x87\x0f", 0x1e1c181ULL);
TEST("\x81\x83\x87\x8f\x1f", 0x1f1e1c181ULL);
TEST("\x81\x83\x87\x8f\x9f\x3f", 0x1f9f1e1c181ULL);
TEST("\x81\x83\x87\x8f\x9f\xbf\x7f", 0x1fdf9f1e1c181ULL);
TEST("\x81\x83\x87\x8f\x9f\xbf\xff\x01", 0x3fdf9f1e1c181ULL);
TEST("\x81\x83\x87\x8f\x9f\xbf\xff\x81\x03", 0x303fdf9f1e1c181ULL);
TEST("\x81\x83\x87\x8f\x9f\xbf\xff\x81\x83\x07", 0x8303fdf9f1e1c181ULL);
#undef TEST
char twelvebyte[16] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01};
const char *twelvebyte_buf = twelvebyte;
// A varint that terminates before hitting the end of the provided buffer,
// but in too many bytes (11 instead of 10).
upb_decoderet r = upb_decode_varint_fast(twelvebyte_buf);
ASSERT(r.p == NULL);
}
#define TEST_VARINT_DECODER(decoder) \
/* Create non-inline versions for convenient inspection of assembly language \
* output. */ \
upb_decoderet _upb_decode_varint_ ## decoder(const char *p) { \
return upb_decode_varint_ ## decoder(p); \
} \
void test_ ## decoder() { \
test_varint_decoder(&_upb_decode_varint_ ## decoder); \
} \
TEST_VARINT_DECODER(branch32);
TEST_VARINT_DECODER(branch64);
#ifdef __SSE__
TEST_VARINT_DECODER(nobranch);
#endif
int main() {
test_branch32();
test_branch64();
#ifdef __SSE__
test_nobranch();
#endif
}
#if 0
static void test_get_v_uint32_t()
{
#define TEST(name, bytes, val) {\
upb_status status = UPB_STATUS_INIT; \
const uint8_t name[] = bytes; \
const uint8_t *name ## _buf = name; \
uint32_t name ## _val = 0; \
name ## _buf = upb_get_v_uint32_t(name, name + sizeof(name), &name ## _val, &status); \
ASSERT(upb_ok(&status)); \
ASSERT(name ## _val == val); \
ASSERT(name ## _buf == name + sizeof(name) - 1); /* - 1 for NULL */ \
/* Test NEED_MORE_DATA. */ \
if(sizeof(name) > 2) { \
name ## _buf = upb_get_v_uint32_t(name, name + sizeof(name) - 2, &name ## _val, &status); \
ASSERT(status.code == UPB_STATUS_NEED_MORE_DATA); \
} \
}
TEST(zero, "\x00", 0UL);
TEST(one, "\x01", 1UL);
TEST(twob, "\x81\x03", 0x181UL);
TEST(threeb, "\x81\x83\x07", 0x1c181UL);
TEST(fourb, "\x81\x83\x87\x0f", 0x1e1c181UL);
/* get_v_uint32_t truncates, so all the rest return the same thing. */
TEST(fiveb, "\x81\x83\x87\x8f\x1f", 0xf1e1c181UL);
TEST(sixb, "\x81\x83\x87\x8f\x9f\x3f", 0xf1e1c181UL);
TEST(sevenb, "\x81\x83\x87\x8f\x9f\xbf\x7f", 0xf1e1c181UL);
TEST(eightb, "\x81\x83\x87\x8f\x9f\xbf\xff\x01", 0xf1e1c181UL);
TEST(nineb, "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x03", 0xf1e1c181UL);
TEST(tenb, "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x83\x07", 0xf1e1c181UL);
#undef TEST
uint8_t twelvebyte[] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01};
uint32_t twelvebyte_val = 0;
upb_status status = UPB_STATUS_INIT;
/* A varint that terminates before hitting the end of the provided buffer,
* but in too many bytes (11 instead of 10). */
upb_get_v_uint32_t(twelvebyte, twelvebyte + 12, &twelvebyte_val, &status);
ASSERT(status.code == UPB_ERROR_UNTERMINATED_VARINT);
/* A varint that terminates simultaneously with the end of the provided
* buffer, but in too many bytes (11 instead of 10). */
upb_reset(&status);
upb_get_v_uint32_t(twelvebyte, twelvebyte + 11, &twelvebyte_val, &status);
ASSERT(status.code == UPB_ERROR_UNTERMINATED_VARINT);
/* A varint whose buffer ends on exactly the byte where the varint must
* terminate, but the final byte does not terminate. The absolutely most
* correct return code here is UPB_ERROR_UNTERMINATED_VARINT, because we know
* by this point that the varint does not properly terminate. But we also
* allow a return value of UPB_STATUS_NEED_MORE_DATA here, because it does not
* compromise overall correctness -- clients who supply more data later will
* then receive a UPB_ERROR_UNTERMINATED_VARINT error; clients who have no
* more data to supply will (rightly) conclude that their protobuf is corrupt.
*/
upb_reset(&status);
upb_get_v_uint32_t(twelvebyte, twelvebyte + 10, &twelvebyte_val, &status);
ASSERT(status.code == UPB_ERROR_UNTERMINATED_VARINT ||
status.code == UPB_STATUS_NEED_MORE_DATA);
upb_reset(&status);
upb_get_v_uint32_t(twelvebyte, twelvebyte + 9, &twelvebyte_val, &status);
ASSERT(status.code == UPB_STATUS_NEED_MORE_DATA);
}
static void test_skip_v_uint64_t()
{
#define TEST(name, bytes) {\
upb_status status = UPB_STATUS_INIT; \
const uint8_t name[] = bytes; \
const uint8_t *name ## _buf = name; \
name ## _buf = upb_skip_v_uint64_t(name ## _buf, name + sizeof(name), &status); \
ASSERT(upb_ok(&status)); \
ASSERT(name ## _buf == name + sizeof(name) - 1); /* - 1 for NULL */ \
/* Test NEED_MORE_DATA. */ \
if(sizeof(name) > 2) { \
name ## _buf = upb_skip_v_uint64_t(name, name + sizeof(name) - 2, &status); \
ASSERT(status.code == UPB_STATUS_NEED_MORE_DATA); \
} \
}
TEST(zero, "\x00");
TEST(one, "\x01");
TEST(twob, "\x81\x03");
TEST(threeb, "\x81\x83\x07");
TEST(fourb, "\x81\x83\x87\x0f");
TEST(fiveb, "\x81\x83\x87\x8f\x1f");
TEST(sixb, "\x81\x83\x87\x8f\x9f\x3f");
TEST(sevenb, "\x81\x83\x87\x8f\x9f\xbf\x7f");
TEST(eightb, "\x81\x83\x87\x8f\x9f\xbf\xff\x01");
TEST(nineb, "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x03");
TEST(tenb, "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x83\x07");
#undef TEST
uint8_t twelvebyte[] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01};
upb_status status = UPB_STATUS_INIT;
/* A varint that terminates before hitting the end of the provided buffer,
* but in too many bytes (11 instead of 10). */
upb_skip_v_uint64_t(twelvebyte, twelvebyte + 12, &status);
ASSERT(status.code == UPB_ERROR_UNTERMINATED_VARINT);
/* A varint that terminates simultaneously with the end of the provided
* buffer, but in too many bytes (11 instead of 10). */
upb_reset(&status);
upb_skip_v_uint64_t(twelvebyte, twelvebyte + 11, &status);
ASSERT(status.code == UPB_ERROR_UNTERMINATED_VARINT);
/* A varint whose buffer ends on exactly the byte where the varint must
* terminate, but the final byte does not terminate. The absolutely most
* correct return code here is UPB_ERROR_UNTERMINATED_VARINT, because we know
* by this point that the varint does not properly terminate. But we also
* allow a return value of UPB_STATUS_NEED_MORE_DATA here, because it does not
* compromise overall correctness -- clients who supply more data later will
* then receive a UPB_ERROR_UNTERMINATED_VARINT error; clients who have no
* more data to supply will (rightly) conclude that their protobuf is corrupt.
*/
upb_reset(&status);
upb_skip_v_uint64_t(twelvebyte, twelvebyte + 10, &status);
ASSERT(status.code == UPB_ERROR_UNTERMINATED_VARINT ||
status.code == UPB_STATUS_NEED_MORE_DATA);
upb_reset(&status);
upb_skip_v_uint64_t(twelvebyte, twelvebyte + 9, &status);
ASSERT(status.code == UPB_STATUS_NEED_MORE_DATA);
}
static void test_get_f_uint32_t()
{
#define TEST(name, bytes, val) {\
upb_status status = UPB_STATUS_INIT; \
const uint8_t name[] = bytes; \
const uint8_t *name ## _buf = name; \
uint32_t name ## _val = 0; \
name ## _buf = upb_get_f_uint32_t(name ## _buf, name + sizeof(name), &name ## _val, &status); \
ASSERT(upb_ok(&status)); \
ASSERT(name ## _val == val); \
ASSERT(name ## _buf == name + sizeof(name) - 1); /* - 1 for NULL */ \
}
TEST(zero, "\x00\x00\x00\x00", 0x0UL);
TEST(one, "\x01\x00\x00\x00", 0x1UL);
uint8_t threeb[] = {0x00, 0x00, 0x00};
uint32_t threeb_val;
upb_status status = UPB_STATUS_INIT;
upb_get_f_uint32_t(threeb, threeb + sizeof(threeb), &threeb_val, &status);
ASSERT(status.code == UPB_STATUS_NEED_MORE_DATA);
#undef TEST
}
#endif

@ -0,0 +1,27 @@
/*
* upb - a minimalist implementation of protocol buffers.
*
* Copyright (c) 2011 Joshua Haberman. See LICENSE for details.
*/
#ifndef UPB_TEST_H_
#define UPB_TEST_H_
#ifdef __cplusplus
extern "C" {
#endif
int num_assertions = 0;
#define ASSERT(expr) do { \
++num_assertions; \
if (!(expr)) { \
fprintf(stderr, "Assertion failed: %s:%d\n", __FILE__, __LINE__); \
abort(); \
} \
} while(0)
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* UPB_DECODER_H_ */
Loading…
Cancel
Save