diff --git a/src/upb.h b/src/upb.h
index 0dfcd5e65b..38808ac663 100644
--- a/src/upb.h
+++ b/src/upb.h
@@ -54,9 +54,16 @@ INLINE size_t upb_align_up(size_t val, size_t align) {
 
 // The maximum number of fields that any one .proto type can have.  Note that
 // this is very different than the max field number.  It is hard to imagine a
-// scenario where more than 32k fields (each with its own name and field number)
-// makes sense.
-#define UPB_MAX_FIELDS (1<<15)
+// scenario where more than 2k fields (each with its own name and field number)
+// makes sense.  The .proto file to describe it would be 2000 lines long and
+// contain 2000 unique names.
+//
+// With this limit we can store a has-bit offset in 8 bits (2**8 * 8 = 2048)
+// and we can store a value offset in 16 bits, since the maximum message
+// size is 16,640 bytes (2**8 has-bits + 2048 * 8-byte value).  Note that
+// strings and arrays are not counted in this, only the *pointer* to them is.
+// An individual string or array is unaffected by this 16k byte limit.
+#define UPB_MAX_FIELDS (2048)
 typedef int16_t upb_field_count_t;
 
 // Nested type names are separated by periods.
diff --git a/src/upb_decoder_x86.dasc b/src/upb_decoder_x86.dasc
index ecd98999e3..5e85bd9104 100644
--- a/src/upb_decoder_x86.dasc
+++ b/src/upb_decoder_x86.dasc
@@ -188,9 +188,11 @@ void __attribute__((noinline)) __jit_debug_register_code() { __asm__ __volatile_
 |// TODO: optimize for 0 (xor) and 32-bits.
 |.macro loadfval, f
 || if (f->fval.val.uint64 == 0) {
-|    xor ARG2_32, ARG2_32
+|    xor     ARG2_32, ARG2_32
+|| } else if (f->fval.val.uint64 < 0xffffffff) {
+|    mov     ARG2_32, f->fval.val.uint64
 || } else {
-|    mov   ARG2_64, f->fval.val.uint64
+|    mov64   ARG2_64, f->fval.val.uint64
 || }
 |.endmacro
 
diff --git a/src/upb_msg.c b/src/upb_msg.c
index cf0625e769..2cc503e3fc 100644
--- a/src/upb_msg.c
+++ b/src/upb_msg.c
@@ -70,16 +70,29 @@ upb_array *upb_array_new(void) {
   return arr;
 }
 
-void upb_array_recycle(upb_array **_arr, upb_fielddef *f) {
+void __attribute__((noinline)) upb_array_dorecycle(upb_array **_arr) {
   upb_array *arr = *_arr;
   if(arr && upb_atomic_only(&arr->refcount)) {
     arr->len = 0;
   } else {
-    upb_array_unref(arr, f);
+    if (arr) {
+      bool was_lastref = upb_atomic_unref(&arr->refcount);
+      (void)was_lastref;
+      assert(!was_lastref);   // If it was, we would have just recycled.
+    }
     *_arr = upb_array_new();
   }
 }
 
+void upb_array_recycle(upb_array **_arr) {
+  upb_array *arr = *_arr;
+  if(arr && upb_atomic_only(&arr->refcount)) {
+    arr->len = 0;
+  } else {
+    upb_array_dorecycle(_arr);
+  }
+}
+
 void _upb_array_free(upb_array *arr, upb_fielddef *f) {
   if (upb_elem_ismm(f)) {
     // Need to release refs on sub-objects.
@@ -93,20 +106,26 @@ void _upb_array_free(upb_array *arr, upb_fielddef *f) {
   free(arr);
 }
 
-void upb_array_resize(upb_array *arr, upb_fielddef *f, upb_arraylen_t len) {
-  size_t type_size = upb_types[f->type].size;
+void __attribute__((noinline)) upb_array_doresize(
+    upb_array *arr, size_t type_size, upb_arraylen_t len) {
   upb_arraylen_t old_size = arr->size;
-  if (old_size < len) {
-    // Need to resize.
-    size_t new_size = upb_round_up_pow2(len);
-    arr->ptr = realloc(arr->ptr, new_size * type_size);
-    arr->size = new_size;
-    memset(arr->ptr + (old_size * type_size), 0,
-           (new_size - old_size) * type_size);
-  }
+  size_t new_size = upb_round_up_pow2(len);
+  arr->ptr = realloc(arr->ptr, new_size * type_size);
+  arr->size = new_size;
+  memset(arr->ptr + (old_size * type_size), 0,
+         (new_size - old_size) * type_size);
+}
+
+void upb_array_resizefortypesize(upb_array *arr, size_t type_size,
+                                 upb_arraylen_t len) {
+  if (arr->size < len) upb_array_doresize(arr, type_size, len);
   arr->len = len;
 }
 
+void upb_array_resize(upb_array *arr, upb_fielddef *f, upb_arraylen_t len) {
+  upb_array_resizefortypesize(arr, upb_types[f->type].size, len);
+}
+
 
 /* upb_msg ********************************************************************/
 
@@ -136,6 +155,11 @@ void upb_msg_recycle(upb_msg **_msg, upb_msgdef *msgdef) {
     upb_msg_clear(msg, msgdef);
   } else {
     upb_msg_unref(msg, msgdef);
+    if (msg) {
+      bool was_lastref = upb_atomic_unref(&msg->refcount);
+      (void)was_lastref;
+      assert(!was_lastref);
+    }
     *_msg = upb_msg_new(msgdef);
   }
 }
@@ -249,7 +273,7 @@ static upb_valueptr upb_msg_getappendptr(upb_msg *msg, upb_fielddef *f) {
     // Create/recycle/resize the array if necessary, and find a pointer to
     // a newly-appended element.
     if (!upb_msg_has(msg, f)) {
-      upb_array_recycle(p.arr, f);
+      upb_array_recycle(p.arr);
       upb_msg_sethas(msg, f);
     }
     assert(*p.arr != NULL);
@@ -260,30 +284,6 @@ static upb_valueptr upb_msg_getappendptr(upb_msg *msg, upb_fielddef *f) {
   return p;
 }
 
-static void upb_msg_appendval(upb_msg *msg, upb_fielddef *f, upb_value val) {
-  upb_valueptr p = upb_msg_getappendptr(msg, f);
-  if (upb_isstring(f)) {
-    // We do:
-    //  - upb_string_recycle(), upb_string_substr() instead of
-    //  - upb_string_unref(), upb_string_getref()
-    // because we can conveniently cache these upb_string objects in the
-    // upb_msg, whereas the upb_src who is sending us these strings may not
-    // have a good way of caching them.  This saves the upb_src from allocating
-    // new upb_strings all the time to give us.
-    //
-    // If you were using this to copy one upb_msg to another this would
-    // allocate string objects whereas a upb_string_getref could have avoided
-    // those allocations completely; if this is an issue, we could make it an
-    // option of the upb_msgsink which behavior is desired.
-    upb_string *src = upb_value_getstr(val);
-    upb_string_recycle(p.str);
-    upb_string_substr(*p.str, src, 0, upb_string_len(src));
-  } else {
-    upb_value_write(p, val, f->type);
-  }
-  upb_msg_sethas(msg, f);
-}
-
 upb_msg *upb_msg_appendmsg(upb_msg *msg, upb_fielddef *f, upb_msgdef *msgdef) {
   upb_valueptr p = upb_msg_getappendptr(msg, f);
   if (upb_isarray(f) || !upb_msg_has(msg, f)) {
@@ -294,44 +294,221 @@ upb_msg *upb_msg_appendmsg(upb_msg *msg, upb_fielddef *f, upb_msgdef *msgdef) {
 }
 
 
-/* upb_msg dynhandlers  *******************************************************/
+/* upb_msg handlers ***********************************************************/
+
+#if UPB_MAX_FIELDS > 2048
+#error "We're using an 8-bit integer to store a has_offset."
+#endif
+typedef struct {
+  uint8_t has_offset;
+  uint8_t has_mask;
+  uint16_t val_offset;
+  uint16_t msg_size;
+  uint8_t set_flags_bytes;
+  uint8_t padding;
+} upb_msgsink_fval;
+
+static upb_msgsink_fval upb_msgsink_unpackfval(upb_value fval) {
+  assert(sizeof(upb_msgsink_fval) == 8);
+  upb_msgsink_fval ret;
+  uint64_t fval_u64 = upb_value_getuint64(fval);
+  memcpy(&ret, &fval_u64, 8);
+  return ret;
+}
 
-static upb_flow_t upb_dmsgsink_value(void *_m, upb_value fval, upb_value val) {
-  upb_msg *m = _m;
-  upb_fielddef *f = upb_value_getfielddef(fval);
-  if (upb_isstring(f)) {
-    //fprintf(stderr, "dmsg_value!  this=%p f=%p name=" UPB_STRFMT ", "
-    //        UPB_STRFMT "\n", m, f, UPB_STRARG(f->name), UPB_STRARG(val.val.str));
-  } else {
-    //fprintf(stderr, "dmsg_value!  this=%p f=%p name=" UPB_STRFMT ", %llu\n",
-    //        m, f, UPB_STRARG(f->name), val.val.uint64);
+static uint64_t upb_msgsink_packfval(uint8_t has_offset, uint8_t has_mask,
+                                     uint16_t val_offset, uint16_t msg_size,
+                                     uint8_t set_flags_bytes) {
+  upb_msgsink_fval fval = {
+      has_offset, has_mask, val_offset, msg_size, set_flags_bytes, 0};
+  uint64_t ret = 0;
+  memcpy(&ret, &fval, sizeof(fval));
+  return ret;
+}
+
+upb_valueptr __attribute__((noinline)) upb_array_recycleorresize(
+    upb_msg *m, upb_msgsink_fval fval, size_t type_size) {
+  upb_array **arr = (void*)&m->data[fval.val_offset];
+  if (!(m->data[fval.has_offset] & fval.has_mask)) {
+    upb_array_recycle(arr);
+    m->data[fval.has_offset] |= fval.has_mask;
   }
-  upb_msg_appendval(m, f, val);
-  return UPB_CONTINUE;
+  upb_arraylen_t len = upb_array_len(*arr);
+  upb_array_resizefortypesize(*arr, type_size, len+1);
+  return _upb_array_getptrforsize(*arr, type_size, len);
 }
 
-static upb_sflow_t upb_dmsgsink_startsubmsg(void *_m, upb_value fval) {
+#define SCALAR_VALUE_CB_PAIR(type, ctype)                                     \
+  upb_flow_t upb_msgsink_ ## type ## value(void *_m, upb_value _fval,         \
+                                                  upb_value val) {            \
+    upb_msg *m = _m;                                                          \
+    upb_msgsink_fval fval = upb_msgsink_unpackfval(_fval);                    \
+    m->data[fval.has_offset] |= fval.has_mask;                                \
+    *(ctype*)&m->data[fval.val_offset] = upb_value_get ## type(val);          \
+    return UPB_CONTINUE;                                                      \
+  }                                                                           \
+                                                                              \
+  upb_flow_t upb_msgsink_ ## type ## value_r(void *_m, upb_value _fval,       \
+                                                    upb_value val) {          \
+    upb_msg *m = _m;                                                          \
+    upb_msgsink_fval fval = upb_msgsink_unpackfval(_fval);                    \
+    upb_array *arr = *(upb_array**)&m->data[fval.val_offset];                 \
+    if (!(m->data[fval.has_offset] & fval.has_mask)) {                        \
+      if(arr && upb_atomic_only(&arr->refcount)) {                            \
+        m->data[fval.has_offset] |= fval.has_mask;                            \
+        arr->len = 0;                                                         \
+      } else {                                                                \
+        goto slow;                                                            \
+      }                                                                       \
+    } else if (arr->len == arr->size) goto slow;                              \
+    upb_valueptr p = _upb_array_getptrforsize(arr, sizeof(ctype),             \
+                                              arr->len++);                    \
+    *(ctype*)p._void = upb_value_get ## type(val);                            \
+    return UPB_CONTINUE;                                                      \
+  slow:                                                                       \
+    p = upb_array_recycleorresize(m, fval, sizeof(ctype));                    \
+    *(ctype*)p._void = upb_value_get ## type(val);                            \
+    return UPB_CONTINUE;                                                      \
+  }                                                                           \
+
+SCALAR_VALUE_CB_PAIR(double, double)
+SCALAR_VALUE_CB_PAIR(float, float)
+SCALAR_VALUE_CB_PAIR(int32, int32_t)
+SCALAR_VALUE_CB_PAIR(int64, int64_t)
+SCALAR_VALUE_CB_PAIR(uint32, uint32_t)
+SCALAR_VALUE_CB_PAIR(uint64, uint64_t)
+SCALAR_VALUE_CB_PAIR(bool, bool)
+
+upb_flow_t upb_msgsink_strvalue(void *_m, upb_value _fval, upb_value val) {
   upb_msg *m = _m;
-  upb_fielddef *f = upb_value_getfielddef(fval);
-  //fprintf(stderr, "dmsg_startsubmsg!  " UPB_STRFMT " %p\n", UPB_STRARG(fval.val.fielddef->name), f);
-  upb_msgdef *msgdef = upb_downcast_msgdef(f->def);
-  void *p = upb_msg_appendmsg(m, f, msgdef);
-  //printf("Continuing with: %p\n", p);
-  return UPB_CONTINUE_WITH(p);
+  upb_msgsink_fval fval = upb_msgsink_unpackfval(_fval);
+  m->data[fval.has_offset] |= fval.has_mask;
+  // We do:
+  //  - upb_string_recycle(), upb_string_substr() instead of
+  //  - upb_string_unref(), upb_string_getref()
+  // because we can conveniently cache these upb_string objects in the
+  // upb_msg, whereas the upb_src who is sending us these strings may not
+  // have a good way of caching them.  This saves the upb_src from allocating
+  // new upb_strings all the time to give us.
+  //
+  // If you were using this to copy one upb_msg to another this would
+  // allocate string objects whereas a upb_string_getref could have avoided
+  // those allocations completely; if this is an issue, we could make it an
+  // option of the upb_msgsink which behavior is desired.
+  upb_string *src = upb_value_getstr(val);
+  upb_string **dst = (void*)&m->data[fval.val_offset];
+  upb_string_recycle(dst);
+  upb_string_substr(*dst, src, 0, upb_string_len(src));
+  return UPB_CONTINUE;
 }
 
-static upb_flow_t endsubmsg(void *closure, upb_value fval) {
-  upb_fielddef *f = upb_value_getfielddef(fval);
-  //fprintf(stderr, "dmsg_endsubmsg!  " UPB_STRFMT " %p\n", UPB_STRARG(f->name), f);
+upb_flow_t upb_msgsink_strvalue_r(void *_m, upb_value _fval,
+                                  upb_value val) {
+  upb_msg *m = _m;
+  upb_msgsink_fval fval = upb_msgsink_unpackfval(_fval);
+  upb_array *arr = *(upb_array**)&m->data[fval.val_offset];
+  if (!(m->data[fval.has_offset] & fval.has_mask)) {
+    if(arr && upb_atomic_only(&arr->refcount)) {
+      m->data[fval.has_offset] |= fval.has_mask;
+      arr->len = 0;
+    } else {
+      goto slow;
+    }
+  } else if (arr->len == arr->size) goto slow;
+  ++arr->len;
+  upb_valueptr p = _upb_array_getptrforsize(arr, sizeof(void*),
+                                            upb_array_len(arr));
+  upb_string *src = upb_value_getstr(val);
+  upb_string_recycle(p.str);
+  upb_string_substr(*p.str, src, 0, upb_string_len(src));
+  return UPB_CONTINUE;
+slow:
+  p = upb_array_recycleorresize(m, fval, sizeof(void*));
+  src = upb_value_getstr(val);
+  upb_string_recycle(p.str);
+  upb_string_substr(*p.str, src, 0, upb_string_len(src));
   return UPB_CONTINUE;
 }
 
+
+upb_sflow_t upb_msgsink_startsubmsg(void *_m, upb_value _fval) {
+  upb_msg *m = _m;
+  upb_msgsink_fval fval = upb_msgsink_unpackfval(_fval);
+  upb_msgdef md;
+  md.size = fval.msg_size;
+  md.set_flags_bytes = fval.set_flags_bytes;
+  upb_fielddef f;
+  f.set_bit_mask = fval.has_mask;
+  f.set_bit_offset = fval.has_offset;
+  f.label = UPB_LABEL(OPTIONAL);  // Just not repeated.
+  f.type = UPB_TYPE(MESSAGE);
+  f.byte_offset = fval.val_offset;
+  return UPB_CONTINUE_WITH(upb_msg_appendmsg(m, &f, &md));
+}
+
+upb_sflow_t upb_msgsink_startsubmsg_r(void *_m, upb_value _fval) {
+  upb_msg *m = _m;
+  upb_msgsink_fval fval = upb_msgsink_unpackfval(_fval);
+  upb_msgdef md;
+  md.size = fval.msg_size;
+  md.set_flags_bytes = fval.set_flags_bytes;
+  upb_fielddef f;
+  f.set_bit_mask = fval.has_mask;
+  f.set_bit_offset = fval.has_offset;
+  f.label = UPB_LABEL(REPEATED);
+  f.type = UPB_TYPE(MESSAGE);
+  f.byte_offset = fval.val_offset;
+  return UPB_CONTINUE_WITH(upb_msg_appendmsg(m, &f, &md));
+}
+
 void upb_msg_regdhandlers(upb_handlers *h) {
-  upb_register_all(h,
-      NULL,  // startmsg
-      NULL,  // endmsg
-      &upb_dmsgsink_value,
-      &upb_dmsgsink_startsubmsg,
-      endsubmsg,  // endsubmsg
-      NULL); // unknown
+  upb_register_all(h, NULL, NULL, NULL, NULL, NULL, NULL);
+  for (int i = 0; i < h->msgs_len; i++) {
+    upb_handlers_msgent *m = &h->msgs[i];
+    upb_inttable_iter iter = upb_inttable_begin(&m->fieldtab);
+    for(; !upb_inttable_done(iter);
+        iter = upb_inttable_next(&m->fieldtab, iter)) {
+      upb_handlers_fieldent *fe = upb_inttable_iter_value(iter);
+      upb_fielddef *f = upb_value_getfielddef(fe->fval);
+      uint16_t msg_size = 0;
+      uint8_t set_flags_bytes = 0;
+      if (upb_issubmsg(f)) {
+        upb_msgdef *md = upb_downcast_msgdef(f->def);
+        msg_size = md->size;
+        set_flags_bytes = md->set_flags_bytes;
+      }
+      upb_value_setuint64(&fe->fval,
+          upb_msgsink_packfval(f->set_bit_offset, f->set_bit_mask,
+                               f->byte_offset, msg_size, set_flags_bytes));
+#define CASE(upb_type, type) \
+    case UPB_TYPE(upb_type): \
+        fe->cb.value = upb_isarray(f) ? \
+            upb_msgsink_ ## type ## value_r : upb_msgsink_ ## type ## value; \
+        break;
+      switch (f->type) {
+        CASE(DOUBLE,   double)
+        CASE(FLOAT,    float)
+        CASE(INT32,    int32)
+        CASE(INT64,    int64)
+        CASE(UINT32,   uint32)
+        CASE(UINT64,   uint64)
+        CASE(SINT32,   int32)
+        CASE(SINT64,   int64)
+        CASE(FIXED32,  uint32)
+        CASE(FIXED64,  uint64)
+        CASE(SFIXED32, int32)
+        CASE(SFIXED64, int64)
+        CASE(BOOL,     bool)
+        CASE(ENUM,     int32)
+        CASE(STRING,   str)
+        CASE(BYTES,    str)
+#undef CASE
+        case UPB_TYPE(MESSAGE):
+        case UPB_TYPE(GROUP):
+          fe->cb.startsubmsg =
+              upb_isarray(f) ? upb_msgsink_startsubmsg_r : upb_msgsink_startsubmsg;
+          break;
+      }
+    }
+  }
 }
diff --git a/src/upb_msg.h b/src/upb_msg.h
index 6785131cae..b172134f04 100644
--- a/src/upb_msg.h
+++ b/src/upb_msg.h
@@ -150,20 +150,25 @@ struct _upb_array {
 };
 
 void _upb_array_free(upb_array *a, upb_fielddef *f);
-INLINE upb_valueptr _upb_array_getptr(upb_array *a, upb_fielddef *f,
-                                      uint32_t elem) {
+INLINE upb_valueptr _upb_array_getptrforsize(upb_array *a, size_t type_size,
+                                             uint32_t elem) {
   upb_valueptr p;
-  p._void = &a->ptr[elem * upb_types[f->type].size];
+  p._void = &a->ptr[elem * type_size];
   return p;
 }
 
+INLINE upb_valueptr _upb_array_getptr(upb_array *a, upb_fielddef *f,
+                                      uint32_t elem) {
+  return _upb_array_getptrforsize(a, upb_types[f->type].size, elem);
+}
+
 upb_array *upb_array_new(void);
 
 INLINE void upb_array_unref(upb_array *a, upb_fielddef *f) {
   if (a && upb_atomic_unref(&a->refcount)) _upb_array_free(a, f);
 }
 
-void upb_array_recycle(upb_array **arr, upb_fielddef *f);
+void upb_array_recycle(upb_array **arr);
 
 INLINE uint32_t upb_array_len(upb_array *a) {
   return a->len;