Add (but do not activate) an SSE varint decoder.

14 years ago · ee84a7da16
parent bd1dfd397e
commit ee84a7da16
2 changed files with 55 additions and 9 deletions
--- a/stream/upb_decoder.c
+++ b/stream/upb_decoder.c
@ -16,8 +16,47 @@
 // The key fast-path varint-decoding routine.  Here we can assume we have at
 // least UPB_MAX_VARINT_ENCODED_SIZE bytes available (the SSE variant loads a
 // full 128-bit register, so it needs 16 readable bytes).  There are a lot of
 // possibilities for optimization/experimentation here.
-INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val,
-                                   upb_status *status) {
+
+#ifdef USE_SSE_VARINT_DECODING
+#include <emmintrin.h>
+
+// SSE2 variant of the fast-path varint decoder.
+//
+// Decodes one varint starting at *ptr.  On success, stores the decoded value
+// in *val, advances *ptr past the varint and returns true.  If none of the
+// first 10 bytes terminates the varint, records an error in "s" and returns
+// false without touching *ptr or *val.
+//
+// This unconditionally loads 16 bytes starting at *ptr, so the caller must
+// guarantee that at least 16 bytes are readable there.
+//
+// This works, but is empirically slower than the branchy version below.  Why?
+// Most varints are very short.  Next step: use branches for 1/2-byte varints,
+// but use the SSE version for 3-10 byte varints.
+INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val, upb_status *s) {
+  const char *p = *ptr;
+
+  // Collect the high (continuation) bit of each of the next 16 bytes into a
+  // 16-bit mask: bit i is set iff byte i has its continuation bit set.
+  __m128i val128 = _mm_loadu_si128((const __m128i*)p);
+  unsigned int continuation_bits = _mm_movemask_epi8(val128);
+
+  // The varint ends at the first byte whose continuation bit is clear.  ffs()
+  // is 1-based, so its result is the encoded length in bytes (17 when all 16
+  // loaded bytes have the continuation bit set).
+  unsigned int terminator_bits = ~continuation_bits;
+  int varint_length = __builtin_ffs(terminator_bits);
+  if (varint_length > 10) {
+    upb_seterr(s, UPB_ERROR, "Unterminated varint");
+    return false;
+  }
+
+  // Bytes 0-1: drop the continuation bits and close the gap, leaving 14
+  // payload bits.  Byte 0 must end up in the low bits, so this relies on
+  // little-endian byte order.
+  uint16_t twob;
+  memcpy(&twob, p, 2);
+  twob &= 0x7f7f;
+  twob = ((twob & 0xff00) >> 1) | (twob & 0xff);
+
+  // Bytes 2-9: drop the continuation bits, then merge neighbouring groups
+  // pairwise (7 -> 14 -> 28 -> 56 contiguous payload bits).
+  uint64_t eightb;
+  memcpy(&eightb, p + 2, 8);
+  eightb &= 0x7f7f7f7f7f7f7f7fULL;
+  eightb = ((eightb & 0xff00ff00ff00ff00ULL) >> 1) | (eightb & 0x00ff00ff00ff00ffULL);
+  eightb = ((eightb & 0xffff0000ffff0000ULL) >> 2) | (eightb & 0x0000ffff0000ffffULL);
+  eightb = ((eightb & 0xffffffff00000000ULL) >> 4) | (eightb & 0x00000000ffffffffULL);
+
+  // 14 + 56 = 70 payload bits; the bits that do not fit in 64 are shifted out.
+  uint64_t all_bits = twob | (eightb << 14);
+
+  // Keep only the bits that belong to this varint; the bytes after it were
+  // loaded too and hold unrelated data.  A 10-byte varint covers 70 bits,
+  // which cannot be expressed as a 64-bit shift, so it keeps everything.
+  int varint_bits = varint_length * 7;
+  uint64_t mask = varint_bits >= 64 ? (uint64_t)-1 : (1ULL << varint_bits) - 1;
+  *val = all_bits & mask;
+  *ptr = p + varint_length;
+  return true;
+}
+
+#else
+
+INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val, upb_status *s) {
  const char *p = *ptr;
  uint32_t low, high = 0;
  uint32_t b;
@ -33,14 +72,17 @@ INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val,
  b = *(p++); high |= (b & 0x7f) << 24; if(!(b & 0x80)) goto done;
  b = *(p++); high |= (b & 0x7f) << 31; if(!(b & 0x80)) goto done;

-  upb_seterr(status, UPB_ERROR, "Unterminated varint");
+  upb_seterr(s, UPB_ERROR, "Unterminated varint");
  return false;
+
 done:
-  *ptr = p;
  *val = ((uint64_t)high << 32) | low;
+  *ptr = p;
  return true;
 }

+#endif
+

 /* Decoding/Buffering of individual values ************************************/

@ -163,7 +205,7 @@ done:
 }

 INLINE bool upb_decode_varint(upb_decoder *d, upb_dstate *s, upb_value *val) {
-  if (s->len >= UPB_MAX_VARINT_ENCODED_SIZE) {
+  if (s->len >= 16) {
    // Common (fast) case.
    uint64_t val64;
    const char *p = s->ptr;
@ -315,7 +357,9 @@ void upb_decoder_run(upb_src *src, upb_status *status) {
      CHECK_FLOW(upb_dispatch_unknownval(&d->dispatcher, tag.field_number, val));
    } else if (!upb_check_type(tag.wire_type, f->type)) {
      // TODO: put more details in this error msg.
-      upb_seterr(status, UPB_ERROR, "Field had incorrect type.");
+      upb_seterr(status, UPB_ERROR, "Field had incorrect type, name: " UPB_STRFMT, UPB_STRARG(f->name));
+      upb_printerr(status);
+      *(int*)0 = 0;
      goto err;
    }

--- a/tests/tests.c
+++ b/tests/tests.c
@ -17,17 +17,18 @@ static void test_get_v_uint64_t()
 {
 #define TEST(name, bytes, val) {\
    upb_status status = UPB_STATUS_INIT; \
-    const char name[] = bytes; \
+    const char name[] = bytes "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" ; \
    const char *name ## _buf = name; \
    uint64_t name ## _val = 0; \
    upb_decode_varint_fast(&name ## _buf, &name ## _val, &status); \
    ASSERT(upb_ok(&status)); \
    ASSERT(name ## _val == val); \
-    ASSERT(name ## _buf == name + sizeof(name) - 1);  /* - 1 for NULL */ \
+    ASSERT(name ## _buf == name + sizeof(name) - 16);  /* skip the 0xff padding and the NUL */ \
  }

  TEST(zero,   "\x00",                                                      0ULL);
  TEST(one,    "\x01",                                                      1ULL);
+  TEST(twob,   "\x81\x14",                                              0xa01ULL);
  TEST(twob,   "\x81\x03",                                              0x181ULL);
  TEST(threeb, "\x81\x83\x07",                                        0x1c181ULL);
  TEST(fourb,  "\x81\x83\x87\x0f",                                  0x1e1c181ULL);
@ -39,7 +40,7 @@ static void test_get_v_uint64_t()
  TEST(tenb,   "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x83\x07", 0x8303fdf9f1e1c181ULL);
 #undef TEST

-  char twelvebyte[] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01};
+  char twelvebyte[16] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01};
  const char *twelvebyte_buf = twelvebyte;
  uint64_t twelvebyte_val = 0;
  upb_status status = UPB_STATUS_INIT;
@ -214,6 +215,7 @@ static void test_upb_symtab() {
  }
  upb_status status = UPB_STATUS_INIT;
  upb_parsedesc(s, descriptor, &status);
+  upb_printerr(&status);
  ASSERT(upb_ok(&status));
  upb_status_uninit(&status);
  upb_string_unref(descriptor);