protobuf/upb/pb/varint.h

/*
 * upb - a minimalist implementation of protocol buffers.
 *
 * Copyright (c) 2011 Google Inc.  See LICENSE for details.
 * Author: Josh Haberman <jhaberman@gmail.com>
 *
 * A number of routines for varint manipulation (we keep them all around to
 * have multiple approaches available for benchmarking).
 */

#ifndef UPB_VARINT_DECODER_H_
#define UPB_VARINT_DECODER_H_

#include <stdint.h>
#include <string.h>
#include "upb/upb.h"

#ifdef __cplusplus
extern "C" {
#endif

// The maximum number of bytes that it takes to encode a 64-bit varint: each
// encoded byte carries 7 payload bits, and ceil(64 / 7) == 10.
// Note that with a better encoding this could be 9 (TODO: write up a
// wiki document about this).
#define UPB_PB_VARINT_MAX_LEN 10

/* Zig-zag encoding/decoding **************************************************/

// Undoes 32-bit zig-zag encoding: even inputs map to n / 2, odd inputs map to
// -(n + 1) / 2 (so 0, 1, 2, 3 decode to 0, -1, 1, -2).
INLINE int32_t upb_zzdec_32(uint32_t n) {
  uint32_t magnitude = n >> 1;
  int32_t sign_mask = -(int32_t)(n & 1);  // 0 for even n, all ones for odd n.
  return magnitude ^ sign_mask;
}
// Undoes 64-bit zig-zag encoding: even inputs map to n / 2, odd inputs map to
// -(n + 1) / 2 (so 0, 1, 2, 3 decode to 0, -1, 1, -2).
INLINE int64_t upb_zzdec_64(uint64_t n) {
  uint64_t magnitude = n >> 1;
  int64_t sign_mask = -(int64_t)(n & 1);  // 0 for even n, all ones for odd n.
  return magnitude ^ sign_mask;
}
// Zig-zag encodes a signed 32-bit value so that small magnitudes of either
// sign become small unsigned numbers (0, -1, 1, -2 encode to 0, 1, 2, 3).
//
// The arithmetic is done on unsigned values: left-shifting a negative (or
// overflowing) signed integer is undefined behavior in C, and right-shifting
// a negative one is implementation-defined.
INLINE uint32_t upb_zzenc_32(int32_t n) {
  uint32_t sign_mask = (n < 0) ? UINT32_MAX : 0;  // All ones iff n is negative.
  return ((uint32_t)n << 1) ^ sign_mask;
}
// Zig-zag encodes a signed 64-bit value so that small magnitudes of either
// sign become small unsigned numbers (0, -1, 1, -2 encode to 0, 1, 2, 3).
//
// The arithmetic is done on unsigned values: left-shifting a negative (or
// overflowing) signed integer is undefined behavior in C, and right-shifting
// a negative one is implementation-defined.
INLINE uint64_t upb_zzenc_64(int64_t n) {
  uint64_t sign_mask = (n < 0) ? UINT64_MAX : 0;  // All ones iff n is negative.
  return ((uint64_t)n << 1) ^ sign_mask;
}

/* Decoding *******************************************************************/

// Result type shared by the varint decoders: all decoding functions return
// this struct by value.
typedef struct {
  // On success, one past the last varint byte that was consumed.
  // NULL if the varint was unterminated.
  const char *p;
  // The decoded value.  The branch-based decoders leave this as 0 when they
  // report failure via p == NULL.
  uint64_t val;
} upb_decoderet;

// Branch-per-byte varint decoder that accumulates the result in two 32-bit
// halves, so that only the final combine needs 64-bit arithmetic (intended to
// perform well on 32-bit architectures, while still doing fine on 64-bit).
//
// Reads up to 10 bytes starting at p.  On success returns the decoded value
// and a pointer one past the last byte consumed; if all 10 bytes have the
// continuation bit set, returns {NULL, 0}.
INLINE upb_decoderet upb_vdecode_branch32(const char *p) {
  upb_decoderet ret = {NULL, 0};
  uint32_t lo = 0;
  uint32_t hi = 0;
  uint32_t byte;
  int shift;

  // Bytes 1-4 land entirely in the low word (bits 0-27).
  for (shift = 0; shift < 28; shift += 7) {
    byte = *(p++);
    lo |= (byte & 0x7f) << shift;
    if (!(byte & 0x80)) goto finish;
  }

  // Byte 5 straddles the halves: its low 4 payload bits fill bits 28-31 of
  // the low word, its upper 3 payload bits start the high word.
  byte = *(p++);
  lo |= (byte & 0x7f) << 28;
  hi = (byte & 0x7f) >> 4;
  if (!(byte & 0x80)) goto finish;

  // Bytes 6-10 land in the high word at shifts 3, 10, 17, 24 and 31.
  for (shift = 3; shift <= 31; shift += 7) {
    byte = *(p++);
    hi |= (byte & 0x7f) << shift;
    if (!(byte & 0x80)) goto finish;
  }

  // Ten continuation bytes in a row: the varint is unterminated.
  return ret;

finish:
  ret.val = ((uint64_t)hi << 32) | lo;
  ret.p = p;
  return ret;
}

// Branch-per-byte varint decoder that accumulates directly into a single
// 64-bit value.
//
// Reads up to 10 bytes starting at p.  On success returns the decoded value
// and a pointer one past the last byte consumed; if all 10 bytes have the
// continuation bit set, returns {NULL, 0}.
INLINE upb_decoderet upb_vdecode_branch64(const char *p) {
  upb_decoderet ret = {NULL, 0};
  uint64_t acc = 0;
  int shift;

  // Ten iterations: payload shifts 0, 7, 14, ..., 63.
  for (shift = 0; shift <= 63; shift += 7) {
    uint64_t byte = *(p++);
    acc |= (byte & 0x7f) << shift;
    if (!(byte & 0x80)) {
      // Continuation bit clear: this was the final byte.
      ret.val = acc;
      ret.p = p;
      return ret;
    }
  }

  // Ten continuation bytes in a row: the varint is unterminated.
  return ret;
}

// Decodes a varint of at most 8 bytes without branching (except for error).
// Only declared in this header.  The upb_vdecode_check2_* wrappers call it
// with r.p already advanced past the first two bytes of the varint and r.val
// holding the 14 payload bits decoded from those two bytes.
upb_decoderet upb_vdecode_max8_wright(upb_decoderet r);

// Another implementation of the previous, with the same calling convention.
upb_decoderet upb_vdecode_max8_massimino(upb_decoderet r);

// Template that defines upb_vdecode_check2_<name>(): a decoder that handles
// one- and two-byte varints inline with branches, and hands anything longer
// to decode_max8_function.  The helper receives a upb_decoderet whose p points
// past the first two bytes and whose val holds the 14 payload bits decoded so
// far, and its result is returned unchanged.
//
// The input is read through a const-qualified byte pointer (the buffer is
// never written), and the instantiations below carry no trailing semicolon
// because the macro already expands to a complete function definition.
#define UPB_VARINT_DECODER_CHECK2(name, decode_max8_function)                \
INLINE upb_decoderet upb_vdecode_check2_ ## name(const char *_p) {           \
  const uint8_t *p = (const uint8_t*)_p;                                     \
  upb_decoderet r;                                                           \
  if ((p[0] & 0x80) == 0) {                                                  \
    r.p = _p + 1;                                                            \
    r.val = p[0] & 0x7f;                                                     \
    return r;                                                                \
  }                                                                          \
  r.p = _p + 2;                                                              \
  r.val = (p[0] & 0x7f) | ((p[1] & 0x7f) << 7);                              \
  if ((p[1] & 0x80) == 0) return r;                                          \
  return decode_max8_function(r);                                            \
}

UPB_VARINT_DECODER_CHECK2(wright, upb_vdecode_max8_wright)
UPB_VARINT_DECODER_CHECK2(massimino, upb_vdecode_max8_massimino)
#undef UPB_VARINT_DECODER_CHECK2

// Canonical full-varint decoder, delegating to whichever implementation is
// currently favored as best-performing for the target.
INLINE upb_decoderet upb_vdecode_fast(const char *p) {
  // An 8-byte long is taken to mean a 64-bit target, which gets the
  // two-byte-check + massimino decoder; everything else gets branch32.
  return (sizeof(long) == 8) ? upb_vdecode_check2_massimino(p)
                             : upb_vdecode_branch32(p);
}

INLINE upb_decoderet upb_vdecode_max8_fast(upb_decoderet r) {
  return upb_vdecode_max8_massimino(r);
}


/* Encoding *******************************************************************/

// Returns the number of bytes (1-8) needed to hold val, i.e. the 0-based index
// of its highest set bit divided by 8, plus one.  Zero is reported as
// occupying one byte.
INLINE int upb_value_size(uint64_t val) {
  int high_bit;  // 0-based index of the most significant set bit.

  // Handle zero before touching the builtin: the result of
  // __builtin_clzll(0) is undefined, so it must not be evaluated for it.
  if (val == 0) return 1;

#ifdef __GNUC__
  high_bit = 63 - __builtin_clzll(val);
#else
  high_bit = 0;
  while (val >>= 1) high_bit++;
#endif
  return high_bit / 8 + 1;
}

// Writes val to buf as a varint: 7 payload bits per byte, least-significant
// group first, with the top bit of every byte except the last set as a
// continuation flag.  buf must have room for at least UPB_PB_VARINT_MAX_LEN
// bytes.  Returns the number of bytes written (always at least 1).
//
// TODO: benchmark and optimize if necessary.
INLINE size_t upb_vencode64(uint64_t val, char *buf) {
  size_t written = 0;
  // do/while so that zero still emits its single 0x00 byte.
  do {
    uint8_t group = val & 0x7f;
    val >>= 7;
    if (val) group |= 0x80;  // More groups follow.
    buf[written++] = group;
  } while (val);
  return written;
}

// Encodes a 32-bit varint, *not* sign-extended, and returns the encoded bytes
// packed into a uint64_t: the bytes are copied into the start of the
// integer's storage and the remaining bytes stay zero.
INLINE uint64_t upb_vencode32(uint32_t val) {
  uint64_t ret = 0;
  char encoded[UPB_PB_VARINT_MAX_LEN];
  size_t bytes = upb_vencode64(val, encoded);

  // A 32-bit value never needs more than 5 varint bytes.
  assert(bytes <= 5);
  memcpy(&ret, encoded, bytes);
  // Sanity check that the packed result fits in 40 bits.
  assert(ret <= 0xffffffffff);
  return ret;
}

#ifdef __cplusplus
}  /* extern "C" */
#endif

#endif  /* UPB_VARINT_DECODER_H_ */
Split varint decoders into separate .h file. This makes it easier to benchmark and test the multiple possible implementations of varint decoding. 14 years ago			`/*`
			`* upb - a minimalist implementation of protocol buffers.`
			`*`
Update copyright to be Google Inc. This doesn't reflect any material change in how I will be working on upb, and I have no problem making this change. It's still open source under the BSD license, and I'll still be working on it well beyond the hours that constitute a normal job. 14 years ago			`* Copyright (c) 2011 Google Inc. See LICENSE for details.`
			`* Author: Josh Haberman <jhaberman@gmail.com>`
			`*`
Refactor varint encoding/decoding. 14 years ago			`* A number of routines for varint manipulation (we keep them all around to`
			`* have multiple approaches available for benchmarking).`
Split varint decoders into separate .h file. This makes it easier to benchmark and test the multiple possible implementations of varint decoding. 14 years ago			`*/`

			`#ifndef UPB_VARINT_DECODER_H_`
			`#define UPB_VARINT_DECODER_H_`

			`#include <stdint.h>`
			`#include <string.h>`
Directory restructure. Includes are now via upb/foo.h. Files specific to the protobuf format are now in upb/pb (the core library is concerned with message definitions, handlers, and byte streams, but knows nothing about any particular serialization format). 14 years ago			`#include "upb/upb.h"`
Split varint decoders into separate .h file. This makes it easier to benchmark and test the multiple possible implementations of varint decoding. 14 years ago
			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`// The maximum number of bytes that it takes to encode a 64-bit varint.`
			`// Note that with a better encoding this could be 9 (TODO: write up a`
			`// wiki document about this).`
			`#define UPB_PB_VARINT_MAX_LEN 10`

			`/* Zig-zag encoding/decoding **************************************************/`

			`INLINE int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); }`
			`INLINE int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); }`
			`INLINE uint32_t upb_zzenc_32(int32_t n) { return (n << 1) ^ (n >> 31); }`
			`INLINE uint64_t upb_zzenc_64(int64_t n) { return (n << 1) ^ (n >> 63); }`

Refactor varint encoding/decoding. 14 years ago			`/* Decoding *******************************************************************/`

Split varint decoders into separate .h file. This makes it easier to benchmark and test the multiple possible implementations of varint decoding. 14 years ago			`// All decoding functions return this struct by value.`
			`typedef struct {`
			`const char *p; // NULL if the varint was unterminated.`
			`uint64_t val;`
			`} upb_decoderet;`

			`// A basic branch-based decoder, uses 32-bit values to get good performance`
			`// on 32-bit architectures (but performs well on 64-bits also).`
First rough version of the JIT. It can successfully parse SpeedMessage1. Preliminary results: 750MB/s on Core2 2.4GHz. This number is 2.5x proto2. This isn't apples-to-apples, because proto2 is parsing to a struct and we are just doing stream parsing, but for apps that are currently using proto2, this is the improvement they would see if they could move to stream-based processing. Unfortunately perf-regression-test.py is broken, and I'm not 100% sure why. It would be nice to fix it first (to ensure that there are no performance regressions for the table-based decoder) but I'm really impatient to get the JIT checked in. 14 years ago			`INLINE upb_decoderet upb_vdecode_branch32(const char *p) {`
Split varint decoders into separate .h file. This makes it easier to benchmark and test the multiple possible implementations of varint decoding. 14 years ago			`upb_decoderet r = {NULL, 0};`
			`uint32_t low, high = 0;`
			`uint32_t b;`
			`b = *(p++); low = (b & 0x7f) ; if(!(b & 0x80)) goto done;`
			`b = *(p++); low \|= (b & 0x7f) << 7; if(!(b & 0x80)) goto done;`
			`b = *(p++); low \|= (b & 0x7f) << 14; if(!(b & 0x80)) goto done;`
			`b = *(p++); low \|= (b & 0x7f) << 21; if(!(b & 0x80)) goto done;`
			`b = *(p++); low \|= (b & 0x7f) << 28;`
			`high = (b & 0x7f) >> 4; if(!(b & 0x80)) goto done;`
			`b = *(p++); high \|= (b & 0x7f) << 3; if(!(b & 0x80)) goto done;`
			`b = *(p++); high \|= (b & 0x7f) << 10; if(!(b & 0x80)) goto done;`
			`b = *(p++); high \|= (b & 0x7f) << 17; if(!(b & 0x80)) goto done;`
			`b = *(p++); high \|= (b & 0x7f) << 24; if(!(b & 0x80)) goto done;`
			`b = *(p++); high \|= (b & 0x7f) << 31; if(!(b & 0x80)) goto done;`
			`return r;`

			`done:`
			`r.val = ((uint64_t)high << 32) \| low;`
			`r.p = p;`
			`return r;`
			`}`

			`// Like the previous, but uses 64-bit values.`
First rough version of the JIT. It can successfully parse SpeedMessage1. Preliminary results: 750MB/s on Core2 2.4GHz. This number is 2.5x proto2. This isn't apples-to-apples, because proto2 is parsing to a struct and we are just doing stream parsing, but for apps that are currently using proto2, this is the improvement they would see if they could move to stream-based processing. Unfortunately perf-regression-test.py is broken, and I'm not 100% sure why. It would be nice to fix it first (to ensure that there are no performance regressions for the table-based decoder) but I'm really impatient to get the JIT checked in. 14 years ago			`INLINE upb_decoderet upb_vdecode_branch64(const char *p) {`
Split varint decoders into separate .h file. This makes it easier to benchmark and test the multiple possible implementations of varint decoding. 14 years ago			`uint64_t val;`
			`uint64_t b;`
Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`upb_decoderet r = {NULL, 0};`
Split varint decoders into separate .h file. This makes it easier to benchmark and test the multiple possible implementations of varint decoding. 14 years ago			`b = *(p++); val = (b & 0x7f) ; if(!(b & 0x80)) goto done;`
			`b = *(p++); val \|= (b & 0x7f) << 7; if(!(b & 0x80)) goto done;`
			`b = *(p++); val \|= (b & 0x7f) << 14; if(!(b & 0x80)) goto done;`
			`b = *(p++); val \|= (b & 0x7f) << 21; if(!(b & 0x80)) goto done;`
			`b = *(p++); val \|= (b & 0x7f) << 28; if(!(b & 0x80)) goto done;`
			`b = *(p++); val \|= (b & 0x7f) << 35; if(!(b & 0x80)) goto done;`
			`b = *(p++); val \|= (b & 0x7f) << 42; if(!(b & 0x80)) goto done;`
			`b = *(p++); val \|= (b & 0x7f) << 49; if(!(b & 0x80)) goto done;`
			`b = *(p++); val \|= (b & 0x7f) << 56; if(!(b & 0x80)) goto done;`
			`b = *(p++); val \|= (b & 0x7f) << 63; if(!(b & 0x80)) goto done;`
			`return r;`

			`done:`
			`r.val = val;`
			`r.p = p;`
			`return r;`
			`}`

First rough version of the JIT. It can successfully parse SpeedMessage1. Preliminary results: 750MB/s on Core2 2.4GHz. This number is 2.5x proto2. This isn't apples-to-apples, because proto2 is parsing to a struct and we are just doing stream parsing, but for apps that are currently using proto2, this is the improvement they would see if they could move to stream-based processing. Unfortunately perf-regression-test.py is broken, and I'm not 100% sure why. It would be nice to fix it first (to ensure that there are no performance regressions for the table-based decoder) but I'm really impatient to get the JIT checked in. 14 years ago			`// Decodes a varint of at most 8 bytes without branching (except for error).`
Decoder redesign in preparation for packed fields and start/endseq. 14 years ago			`upb_decoderet upb_vdecode_max8_wright(upb_decoderet r);`
Split varint decoders into separate .h file. This makes it easier to benchmark and test the multiple possible implementations of varint decoding. 14 years ago
First rough version of the JIT. It can successfully parse SpeedMessage1. Preliminary results: 750MB/s on Core2 2.4GHz. This number is 2.5x proto2. This isn't apples-to-apples, because proto2 is parsing to a struct and we are just doing stream parsing, but for apps that are currently using proto2, this is the improvement they would see if they could move to stream-based processing. Unfortunately perf-regression-test.py is broken, and I'm not 100% sure why. It would be nice to fix it first (to ensure that there are no performance regressions for the table-based decoder) but I'm really impatient to get the JIT checked in. 14 years ago			`// Another implementation of the previous.`
Decoder redesign in preparation for packed fields and start/endseq. 14 years ago			`upb_decoderet upb_vdecode_max8_massimino(upb_decoderet r);`
Split varint decoders into separate .h file. This makes it easier to benchmark and test the multiple possible implementations of varint decoding. 14 years ago
First rough version of the JIT. It can successfully parse SpeedMessage1. Preliminary results: 750MB/s on Core2 2.4GHz. This number is 2.5x proto2. This isn't apples-to-apples, because proto2 is parsing to a struct and we are just doing stream parsing, but for apps that are currently using proto2, this is the improvement they would see if they could move to stream-based processing. Unfortunately perf-regression-test.py is broken, and I'm not 100% sure why. It would be nice to fix it first (to ensure that there are no performance regressions for the table-based decoder) but I'm really impatient to get the JIT checked in. 14 years ago			`// Template for a function that checks the first two bytes with branching`
			`// and dispatches 2-10 bytes with a separate function.`
Major refactoring: upb_string is gone in favor of upb_strref. 14 years ago			`#define UPB_VARINT_DECODER_CHECK2(name, decode_max8_function) \`
			`INLINE upb_decoderet upb_vdecode_check2_ ## name(const char *_p) { \`
			`uint8_t p = (uint8_t)_p; \`
			`if ((p & 0x80) == 0) { upb_decoderet r = {_p + 1, p & 0x7f}; return r; } \`
			`upb_decoderet r = {_p + 2, (p & 0x7f) \| (((p + 1) & 0x7f) << 7)}; \`
			`if ((*(p + 1) & 0x80) == 0) return r; \`
			`return decode_max8_function(r); \`
First rough version of the JIT. It can successfully parse SpeedMessage1. Preliminary results: 750MB/s on Core2 2.4GHz. This number is 2.5x proto2. This isn't apples-to-apples, because proto2 is parsing to a struct and we are just doing stream parsing, but for apps that are currently using proto2, this is the improvement they would see if they could move to stream-based processing. Unfortunately perf-regression-test.py is broken, and I'm not 100% sure why. It would be nice to fix it first (to ensure that there are no performance regressions for the table-based decoder) but I'm really impatient to get the JIT checked in. 14 years ago			`}`

			`UPB_VARINT_DECODER_CHECK2(wright, upb_vdecode_max8_wright);`
			`UPB_VARINT_DECODER_CHECK2(massimino, upb_vdecode_max8_massimino);`
			`#undef UPB_VARINT_DECODER_CHECK2`

			`// Our canonical functions for decoding varints, based on the currently`
			`// favored best-performing implementations.`
			`INLINE upb_decoderet upb_vdecode_fast(const char *p) {`
Switch to non-branching varint decoder. 14 years ago			`// Use nobranch2 on 64-bit, branch32 on 32-bit.`
			`if (sizeof(long) == 8)`
First rough version of the JIT. It can successfully parse SpeedMessage1. Preliminary results: 750MB/s on Core2 2.4GHz. This number is 2.5x proto2. This isn't apples-to-apples, because proto2 is parsing to a struct and we are just doing stream parsing, but for apps that are currently using proto2, this is the improvement they would see if they could move to stream-based processing. Unfortunately perf-regression-test.py is broken, and I'm not 100% sure why. It would be nice to fix it first (to ensure that there are no performance regressions for the table-based decoder) but I'm really impatient to get the JIT checked in. 14 years ago			`return upb_vdecode_check2_massimino(p);`
Switch to non-branching varint decoder. 14 years ago			`else`
First rough version of the JIT. It can successfully parse SpeedMessage1. Preliminary results: 750MB/s on Core2 2.4GHz. This number is 2.5x proto2. This isn't apples-to-apples, because proto2 is parsing to a struct and we are just doing stream parsing, but for apps that are currently using proto2, this is the improvement they would see if they could move to stream-based processing. Unfortunately perf-regression-test.py is broken, and I'm not 100% sure why. It would be nice to fix it first (to ensure that there are no performance regressions for the table-based decoder) but I'm really impatient to get the JIT checked in. 14 years ago			`return upb_vdecode_branch32(p);`
			`}`

			`INLINE upb_decoderet upb_vdecode_max8_fast(upb_decoderet r) {`
			`return upb_vdecode_max8_massimino(r);`
Switch to non-branching varint decoder. 14 years ago			`}`
Split varint decoders into separate .h file. This makes it easier to benchmark and test the multiple possible implementations of varint decoding. 14 years ago
Refactor varint encoding/decoding. 14 years ago
			`/* Encoding *******************************************************************/`

Refinement of upb_bytesrc interface. Added a upb_byteregion that tracks a region of the input buffer; decoders use this instead of using a upb_bytesrc directly. upb_byteregion is also used as the way of passing a string to a upb_handlers callback. This symmetry makes decoders compose better; if you want to take a parsed string and decode it as something else, you can take the string directly from the callback and feed it as input to another parser. A commented-out version of a pinning interface is present; I decline to actually implement it (and accept its extra complexity) until/unless it is clear that it is actually a win. But it is included as a proof-of-concept, to show that it fits well with the existing interface. 13 years ago			`INLINE int upb_value_size(uint64_t val) {`
Refactor varint encoding/decoding. 14 years ago			`#ifdef __GNUC__`
			`int high_bit = 63 - __builtin_clzll(val); // 0-based, undef if val == 0.`
			`#else`
			`int high_bit = 0;`
			`uint64_t tmp = val;`
			`while(tmp >>= 1) high_bit++;`
			`#endif`
			`return val == 0 ? 1 : high_bit / 8 + 1;`
			`}`

Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`// Encodes a 64-bit varint into buf (which must be >=UPB_PB_VARINT_MAX_LEN`
			`// bytes long), returning how many bytes were used.`
			`//`
			`// TODO: benchmark and optimize if necessary.`
			`INLINE size_t upb_vencode64(uint64_t val, char *buf) {`
			`if (val == 0) { buf[0] = 0; return 1; }`
			`size_t i = 0;`
			`while (val) {`
			`uint8_t byte = val & 0x7f;`
			`val >>= 7;`
			`if (val) byte \|= 0x80;`
			`buf[i++] = byte;`
			`}`
			`return i;`
			`}`

Decoder redesign in preparation for packed fields and start/endseq. 14 years ago			`// Encodes a 32-bit varint, not sign-extended.`
			`INLINE uint64_t upb_vencode32(uint32_t val) {`
Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`char buf[UPB_PB_VARINT_MAX_LEN];`
			`size_t bytes = upb_vencode64(val, buf);`
Refactor varint encoding/decoding. 14 years ago			`uint64_t ret = 0;`
Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`assert(bytes <= 5);`
			`memcpy(&ret, buf, bytes);`
			`assert(ret <= 0xffffffffff);`
Refactor varint encoding/decoding. 14 years ago			`return ret;`
			`}`

Split varint decoders into separate .h file. This makes it easier to benchmark and test the multiple possible implementations of varint decoding. 14 years ago			`#ifdef __cplusplus`
			`} /* extern "C" */`
			`#endif`

			`#endif /* UPB_VARINT_DECODER_H_ */`