Changes from Google-internal development.

* JSON parser expanded to handle split buffers.
* Bugfix to the protobuf decoder.
10 years ago · 87fc2c516b
parent d18475ae57
commit 87fc2c516b
14 changed files with 1499 additions and 770 deletions
--- a/2
+++ b/2
@ -15,7 +15,7 @@
 # Threading:
 # * -DUPB_THREAD_UNSAFE: remove all thread-safety.

-.PHONY: all lib clean tests test benchmark descriptorgen amalgamate
+.PHONY: all lib clean tests test descriptorgen amalgamate
 .PHONY: clean_leave_profile

 # Prevents the deletion of intermediate files.
--- a/tests/json/test_json.cc
+++ b/tests/json/test_json.cc
@ -6,6 +6,7 @@
 * A set of tests for JSON parsing and serialization.
 */

+#include "tests/test_util.h"
 #include "tests/upb_test.h"
 #include "upb/handlers.h"
 #include "upb/symtab.h"
@ -27,6 +28,8 @@ struct TestCase {
  const char* expected;
 };

+bool verbose = false;
+
 static TestCase kTestRoundtripMessages[] = {
  // Test most fields here.
  {
@ -190,6 +193,51 @@ class StringSink {
  std::string s_;
 };

+// Parses json_src with the JSON parser, feeding the input as two buffers split
+// at byte offset "seam", pipes the parse events straight into the JSON
+// printer, and checks that the printed text equals json_expected.  Fails the
+// test on a parse error or on any difference in the output.
+void test_json_roundtrip_message(const char* json_src,
+                                 const char* json_expected,
+                                 const upb::Handlers* serialize_handlers,
+                                 int seam) {
+  upb::Status st;
+  upb::json::Parser parser(&st);
+  upb::json::Printer printer(serialize_handlers);
+  StringSink data_sink;
+
+  parser.ResetOutput(printer.input());
+  printer.ResetOutput(data_sink.Sink());
+
+  upb::BytesSink* input = parser.input();
+  void *sub;
+  size_t len = strlen(json_src);
+  size_t ofs = 0;
+
+  // Feed [0, seam) and then [seam, len); the whole input must be consumed.
+  bool ok = input->Start(0, &sub) &&
+            parse_buffer(input, sub, json_src, 0, seam, &ofs, &st, verbose) &&
+            parse_buffer(input, sub, json_src, seam, len, &ofs, &st, verbose) &&
+            ofs == len;
+
+  if (ok) {
+    if (verbose) {
+      fprintf(stderr, "calling end()\n");
+    }
+    ok = input->End();
+  }
+
+  if (!ok) {
+    fprintf(stderr, "upb parse error: %s\n", st.error_message());
+  }
+  ASSERT(ok);
+
+  // Compare lengths as well as bytes.  Comparing only data_sink.Data().size()
+  // bytes would accept output that is merely a prefix of the expected text
+  // (including empty output), and could read past the end of json_expected
+  // when the output is longer than expected.
+  const size_t expected_len = strlen(json_expected);
+  if (expected_len != data_sink.Data().size() ||
+      memcmp(json_expected, data_sink.Data().data(), expected_len) != 0) {
+    fprintf(stderr,
+            "JSON parse/serialize roundtrip result differs:\n"
+            "Original:\n%s\nParsed/Serialized:\n%s\n",
+            json_src, data_sink.Data().c_str());
+    abort();
+  }
+}
+
 // Starts with a message in JSON format, parses and directly serializes again,
 // and compares the result.
 void test_json_roundtrip() {
@ -200,36 +248,14 @@ void test_json_roundtrip() {

  for (const TestCase* test_case = kTestRoundtripMessages;
       test_case->input != NULL; test_case++) {
+    const char *expected =
+        (test_case->expected == EXPECT_SAME) ?
+        test_case->input :
+        test_case->expected;

-    const char *json_src = test_case->input;
-    const char *json_expected = test_case->expected;
-    if (json_expected == EXPECT_SAME) {
-      json_expected = json_src;
-    }
-
-    upb::Status st;
-    upb::json::Parser parser(&st);
-    upb::json::Printer printer(serialize_handlers.get());
-    StringSink data_sink;
-
-    parser.ResetOutput(printer.input());
-    printer.ResetOutput(data_sink.Sink());
-
-    bool ok = upb::BufferSource::PutBuffer(json_src, strlen(json_src),
-                                           parser.input());
-    if (!ok) {
-      fprintf(stderr, "upb parse error: %s\n", st.error_message());
-    }
-    ASSERT(ok);
-
-    if (memcmp(json_expected,
-               data_sink.Data().data(),
-               data_sink.Data().size())) {
-      fprintf(stderr,
-              "JSON parse/serialize roundtrip result differs:\n"
-              "Original:\n%s\nParsed/Serialized:\n%s\n",
-              json_src, data_sink.Data().c_str());
-      abort();
+    for (int i = 0; i < strlen(test_case->input); i++) {
+      test_json_roundtrip_message(test_case->input, expected,
+                                  serialize_handlers.get(), i);
    }
  }
 }
--- a/tests/pb/test_decoder.cc
+++ b/tests/pb/test_decoder.cc
@ -36,11 +36,17 @@
 #include <stdlib.h>
 #include <string.h>

+#include "tests/test_util.h"
 #include "tests/upb_test.h"
+
+#ifdef AMALGAMATED
+#include "upb.h"
+#else  // AMALGAMATED
 #include "upb/handlers.h"
 #include "upb/pb/decoder.h"
 #include "upb/pb/varint.int.h"
 #include "upb/upb.h"
+#endif  // !AMALGAMATED

 #undef PRINT_FAILURE
 #define PRINT_FAILURE(expr)                                           \
@ -62,7 +68,6 @@ uint32_t filter_hash = 0;
 double completed;
 double total;
 double *count;
-upb::BufferHandle global_handle;

 enum TestMode {
  COUNT_ONLY = 1,
@ -525,55 +530,16 @@ void CheckBytesParsed(const upb::pb::Decoder& decoder, size_t ofs) {
  ASSERT(ofs <= (decoder.BytesParsed() + MAX_BUFFERED));
 }

-bool parse(upb::pb::Decoder* decoder, void* subc, const char* buf,
-           size_t start, size_t end, size_t* ofs, upb::Status* status) {
+static bool parse(upb::pb::Decoder* decoder, void* subc, const char* buf,
+                  size_t start, size_t end, size_t* ofs, upb::Status* status) {
  CheckBytesParsed(*decoder, *ofs);
-  upb::BytesSink* s = decoder->input();
-  start = UPB_MAX(start, *ofs);
-  if (start <= end) {
-    size_t len = end - start;
-    if (filter_hash) {
-      fprintf(stderr, "Calling parse(%zu) for bytes %zu-%zu of the input\n",
-              len, start, end);
-    }
-    size_t parsed = s->PutBuffer(subc, buf + start, len, &global_handle);
-    if (filter_hash) {
-      if (parsed == len) {
-        fprintf(stderr,
-                "parse(%zu) = %zu, complete byte count indicates success\n",
-                len, len);
-      } else if (parsed > len) {
-        fprintf(stderr,
-                "parse(%zu) = %zu, long byte count indicates success and skip"
-                "of the next %zu bytes\n",
-                len, parsed, parsed - len);
-      } else {
-        fprintf(stderr,
-                "parse(%zu) = %zu, short byte count indicates failure; "
-                "last %zu bytes were not consumed\n",
-                len, parsed, len - parsed);
-      }
-    }
-    if (status->ok() != (parsed >= len)) {
-      if (status->ok()) {
-        fprintf(stderr,
-                "Error: decode function returned short byte count but set no "
-                "error status\n");
-      } else {
-        fprintf(stderr,
-                "Error: decode function returned complete byte count but set "
-                "error status\n");
-      }
-      fprintf(stderr, "Status: %s, parsed=%zu, len=%zu\n",
-              status->error_message(), parsed, len);
-      ASSERT(false);
-    }
-    if (!status->ok())
-      return false;
-    *ofs += parsed;
+  bool ret = parse_buffer(decoder->input(), subc, buf, start, end, ofs, status,
+                          filter_hash != 0);
+  if (ret) {
    CheckBytesParsed(*decoder, *ofs);
  }
-  return true;
+
+  return ret;
 }

 #define LINE(x) x "\n"
@ -1148,7 +1114,41 @@ void test_emptyhandlers(bool allowjit) {
  upb::reffed_ptr<upb::Handlers> h(upb::Handlers::New(md.get()));
  bool ok = h->Freeze(NULL);
  ASSERT(ok);
-  NewMethod(h.get(), allowjit);
+upb::reffed_ptr<const upb::pb::DecoderMethod> method =
+      NewMethod(h.get(), allowjit);
+  ASSERT(method.get());
+
+  // TODO: also test the case where a message has fields, but the fields are
+  // submessage fields and have no handlers. This also results in a decoder
+  // method with no field-handling code.
+
+  // Ensure that the method can run with empty and non-empty input.
+  string test_unknown_field_msg =
+    cat(tag(1, UPB_WIRE_TYPE_VARINT), varint(42),
+        tag(2, UPB_WIRE_TYPE_DELIMITED), delim("My test data"));
+  const struct {
+    const char* data;
+    size_t length;
+  } testdata[] = {
+    { "", 0 },
+    { test_unknown_field_msg.data(), test_unknown_field_msg.size() },
+    { NULL, 0 },
+  };
+  for (int i = 0; testdata[i].data; i++) {
+    upb::Status status;
+    upb::pb::Decoder decoder(method.get(), &status);
+    upb::Sink sink(global_handlers, &closures[0]);
+    decoder.ResetOutput(&sink);
+    upb::BytesSink* input = decoder.input();
+    void* subc;
+    ASSERT(input->Start(0, &subc));
+    size_t ofs = 0;
+    ASSERT(parse_buffer(input, subc,
+                        testdata[i].data, 0, testdata[i].length,
+                        &ofs, &status, false));
+    ASSERT(ofs == testdata[i].length);
+    ASSERT(input->End());
+  }
 }

 void run_tests(bool use_jit) {
@ -1166,7 +1166,7 @@ void run_tests(bool use_jit) {
  test_invalid();
  test_valid();

-  test_emptyhandlers(false);
+  test_emptyhandlers(use_jit);
 }

 void run_test_suite() {
--- a/tests/test_table.cc
+++ b/tests/test_table.cc
@ -15,7 +15,6 @@
 #include <set>
 #include <string>
 #include <vector>
-#include "tests/test_util.h"
 #include "tests/upb_test.h"
 #include "upb/table.int.h"

@ -214,7 +213,8 @@ void test_inttable(int32_t *keys, uint16_t num_entries, const char *desc) {
    x += (uintptr_t)ok;
  }
  double total = get_usertime() - before;
-  printf("%s/s\n", eng(i/total, 3, false));
+  printf("%ld/s\n", (long)(i/total));
+  double upb_seq_i = i / 100;  // For later percentage calculation.

  printf("upb_inttable(rand): ");
  fflush(stdout);
@ -227,7 +227,8 @@ void test_inttable(int32_t *keys, uint16_t num_entries, const char *desc) {
    x += (uintptr_t)ok;
  }
  total = get_usertime() - before;
-  printf("%s/s\n", eng(i/total, 3, false));
+  printf("%ld/s\n", (long)(i/total));
+  double upb_rand_i = i / 100;  // For later percentage calculation.

  printf("std::map<int32_t, int32_t>(seq): ");
  fflush(stdout);
@ -238,7 +239,7 @@ void test_inttable(int32_t *keys, uint16_t num_entries, const char *desc) {
    x += m[key];
  }
  total = get_usertime() - before;
-  printf("%s/s\n", eng(i/total, 3, false));
+  printf("%ld/s (%0.1f%% of upb)\n", (long)(i/total), i / upb_seq_i);

  printf("std::map<int32_t, int32_t>(rand): ");
  fflush(stdout);
@ -249,7 +250,7 @@ void test_inttable(int32_t *keys, uint16_t num_entries, const char *desc) {
    x += m[key];
  }
  total = get_usertime() - before;
-  printf("%s/s\n", eng(i/total, 3, false));
+  printf("%ld/s (%0.1f%% of upb)\n", (long)(i/total), i / upb_rand_i);

  printf("__gnu_cxx::hash_map<uint32_t, uint32_t>(seq): ");
  fflush(stdout);
@ -260,7 +261,7 @@ void test_inttable(int32_t *keys, uint16_t num_entries, const char *desc) {
    x += hm[key];
  }
  total = get_usertime() - before;
-  printf("%s/s\n", eng(i/total, 3, false));
+  printf("%ld/s (%0.1f%% of upb)\n", (long)(i/total), i / upb_seq_i);

  printf("__gnu_cxx::hash_map<uint32_t, uint32_t>(rand): ");
  fflush(stdout);
@ -272,7 +273,7 @@ void test_inttable(int32_t *keys, uint16_t num_entries, const char *desc) {
  }
  total = get_usertime() - before;
  if (x == INT_MAX) abort();
-  printf("%s/s\n\n", eng(i/total, 3, false));
+  printf("%ld/s (%0.1f%% of upb)\n\n", (long)(i/total), i / upb_rand_i);
  upb_inttable_uninit(&table);
  delete rand_order;
 }
@ -308,7 +309,7 @@ extern "C" {

 int run_tests(int argc, char *argv[]) {
  for (int i = 1; i < argc; i++) {
-    if (strcmp(argv[i], "--benchmark") == 0) benchmark = true;
+    if (strcmp(argv[i], "benchmark") == 0) benchmark = true;
  }

  vector<std::string> keys;
--- a/tests/test_util.h
+++ b/tests/test_util.h
@ -1,53 +1,89 @@
-/* Function for printing numbers using si prefixes (k, M, G, etc.).
- * From http://www.cs.tut.fi/~jkorpela/c/eng.html */
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2014 Google Inc.  See LICENSE for details.
+ *
+ * Common functionality for tests.
+ */

-#define PREFIX_START (-24)
-/* Smallest power of then for which there is a prefix defined.
-   If the set of prefixes will be extended, change this constant
-   and update the table "prefix". */
+#ifndef UPB_TEST_UTIL_H_
+#define UPB_TEST_UTIL_H_

 #include <stdio.h>
 #include <math.h>
+#include "tests/upb_test.h"
+#include "upb/sink.h"

-static char *eng(double value, int digits, int numeric)
-{
-  static const char *prefix[] = {
-  "y", "z", "a", "f", "p", "n", "u", "m", "",
-  "k", "M", "G", "T", "P", "E", "Z", "Y"
-  };
-#define PREFIX_END (PREFIX_START+\
-(int)((sizeof(prefix)/sizeof(char *)-1)*3))
-
-      int expof10;
-      static char result[100];
-      char *res = result;
-
-      if (value < 0.)
-        {
-            *res++ = '-';
-            value = -value;
-        }
-
-      expof10 = (int) log10(value);
-      if(expof10 > 0)
-        expof10 = (expof10/3)*3;
-      else
-        expof10 = (-expof10+3)/3*(-3); 
-
-      value *= pow(10,-expof10);
-
-      if (value >= 1000.)
-         { value /= 1000.0; expof10 += 3; }
-      else if(value >= 100.0)
-         digits -= 2;
-      else if(value >= 10.0)
-         digits -= 1;
-
-      if(numeric || (expof10 < PREFIX_START) ||    
-                    (expof10 > PREFIX_END))
-        sprintf(res, "%.*fe%d", digits-1, value, expof10); 
-      else
-        sprintf(res, "%.*f %s", digits-1, value, 
-          prefix[(expof10-PREFIX_START)/3]);
-      return result;
+upb::BufferHandle global_handle;
+
+// Puts a region of the given buffer [start, end) into the given sink (which
+// probably represents a parser).  Can gracefully handle the case where the
+// parser returns a "parsed" length that is less or greater than the input
+// buffer length, and tracks the overall parse offset in *ofs.
+//
+// The region actually sent begins at max(start, *ofs), so bytes the sink has
+// already consumed or skipped are not sent again.  Returns false if the status
+// is not ok after the call, and true otherwise (including when nothing is
+// sent because the adjusted start lies beyond end).
+//
+// Pass verbose=true to print detailed diagnostics to stderr.
+bool parse_buffer(upb::BytesSink* sink, void* subc, const char* buf,
+                  size_t start, size_t end, size_t* ofs,
+                  upb::Status* status, bool verbose) {
+  start = UPB_MAX(start, *ofs);
+
+  if (start <= end) {
+    size_t len = end - start;
+
+    // Copy buffer into a separate, temporary buffer.
+    // This is necessary to verify that the parser is not erroneously
+    // reading outside the specified bounds.
+    //
+    // Request at least one byte: malloc(0) is allowed to return NULL, which
+    // would otherwise be mistaken for an allocation failure when len == 0.
+    char *buf2 = (char*)malloc(len > 0 ? len : 1);
+    ASSERT(buf2 != NULL);
+    memcpy(buf2, buf + start, len);
+
+    if (verbose) {
+      fprintf(stderr, "Calling parse(%zu) for bytes %zu-%zu of the input\n",
+              len, start, end);
+    }
+
+    size_t parsed = sink->PutBuffer(subc, buf2, len, &global_handle);
+    free(buf2);
+
+    if (verbose) {
+      if (parsed == len) {
+        fprintf(stderr,
+                "parse(%zu) = %zu, complete byte count indicates success\n",
+                len, len);
+      } else if (parsed > len) {
+        fprintf(stderr,
+                "parse(%zu) = %zu, long byte count indicates success and skip"
+                "of the next %zu bytes\n",
+                len, parsed, parsed - len);
+      } else {
+        fprintf(stderr,
+                "parse(%zu) = %zu, short byte count indicates failure; "
+                "last %zu bytes were not consumed\n",
+                len, parsed, len - parsed);
+      }
+    }
+
+    // The byte count and the status must agree: a short count means failure
+    // and requires an error status; a complete or long count means success.
+    if (status->ok() != (parsed >= len)) {
+      if (status->ok()) {
+        fprintf(stderr,
+                "Error: decode function returned short byte count but set no "
+                "error status\n");
+      } else {
+        fprintf(stderr,
+                "Error: decode function returned complete byte count but set "
+                "error status\n");
+      }
+      fprintf(stderr, "Status: %s, parsed=%zu, len=%zu\n",
+              status->error_message(), parsed, len);
+      ASSERT(false);
+    }
+
+    if (!status->ok())
+      return false;
+
+    *ofs += parsed;
+  }
+  return true;
 }
+
+#endif
--- a/upb/json/parser.c
+++ b/upb/json/parser.c
--- a/upb/json/parser.h
+++ b/upb/json/parser.h
@ -69,15 +69,24 @@ UPB_DEFINE_STRUCT0(upb_json_parser,
  int parser_stack[UPB_JSON_MAX_DEPTH];
  int parser_top;

-  // A pointer to the beginning of whatever text we are currently parsing.
-  const char *text_begin;
+  // The handle for the current buffer.
+  const upb_bufhandle *handle;

-  // We have to accumulate text for member names, integers, unicode escapes, and
-  // base64 partial results.
+  // Accumulate buffer.  See details in parser.rl.
  const char *accumulated;
  size_t accumulated_len;
-  // TODO: add members and code for allocating a buffer when necessary (when the
-  // member spans input buffers or contains escapes).
+  char *accumulate_buf;
+  size_t accumulate_buf_size;
+
+  // Multi-part text data.  See details in parser.rl.
+  int multipart_state;
+  upb_selector_t string_selector;
+
+  // Input capture.  See details in parser.rl.
+  const char *capture;
+
+  // Intermediate result of parsing a unicode escape sequence.
+  uint32_t digit;
 ));

 UPB_BEGIN_EXTERN_C
--- a/upb/json/parser.rl
+++ b/upb/json/parser.rl
@ -33,6 +33,9 @@

 #define PARSER_CHECK_RETURN(x) if (!(x)) return false

+// Used to signal that a capture has been suspended.
+static char suspend_capture;
+
 static upb_selector_t getsel_for_handlertype(upb_json_parser *p,
                                             upb_handlertype_t type) {
  upb_selector_t sel;
@ -46,41 +49,6 @@ static upb_selector_t parser_getsel(upb_json_parser *p) {
      p, upb_handlers_getprimitivehandlertype(p->top->f));
 }

-static void start_member(upb_json_parser *p) {
-  assert(!p->top->f);
-  assert(!p->accumulated);
-  p->accumulated_len = 0;
-}
-
-static bool end_member(upb_json_parser *p) {
-  // TODO(haberman): support keys that span buffers or have escape sequences.
-  assert(!p->top->f);
-  assert(p->accumulated);
-  const upb_fielddef *f =
-      upb_msgdef_ntof(p->top->m, p->accumulated, p->accumulated_len);
-
-  if (!f) {
-    // TODO(haberman): Ignore unknown fields if requested/configured to do so.
-    upb_status_seterrf(p->status, "No such field: %.*s\n",
-                       (int)p->accumulated_len, p->accumulated);
-    return false;
-  }
-
-  p->top->f = f;
-  p->accumulated = NULL;
-
-  return true;
-}
-
-static void start_object(upb_json_parser *p) {
-  upb_sink_startmsg(&p->top->sink);
-}
-
-static void end_object(upb_json_parser *p) {
-  upb_status status;
-  upb_sink_endmsg(&p->top->sink, &status);
-}
-
 static bool check_stack(upb_json_parser *p) {
  if ((p->top + 1) == p->limit) {
    upb_status_seterrmsg(p->status, "Nesting too deep");
@ -90,83 +58,28 @@ static bool check_stack(upb_json_parser *p) {
  return true;
 }

-static bool start_subobject(upb_json_parser *p) {
-  assert(p->top->f);
-
-  if (!upb_fielddef_issubmsg(p->top->f)) {
-    upb_status_seterrf(p->status,
-                       "Object specified for non-message/group field: %s",
-                       upb_fielddef_name(p->top->f));
-    return false;
-  }
-
-  if (!check_stack(p)) return false;
-
-  upb_jsonparser_frame *inner = p->top + 1;
-
-  upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSUBMSG);
-  upb_sink_startsubmsg(&p->top->sink, sel, &inner->sink);
-  inner->m = upb_fielddef_msgsubdef(p->top->f);
-  inner->f = NULL;
-  p->top = inner;
+// There are GCC/Clang built-ins for overflow checking which we could start
+// using if there was any performance benefit to it.

+static bool checked_add(size_t a, size_t b, size_t *c) {
+  if (SIZE_MAX - a < b) return false;  // a + b would overflow; *c untouched.
+  *c = a + b;  // The sum is known to fit in size_t here.
   return true;
 }

-static void end_subobject(upb_json_parser *p) {
-  p->top--;
-  upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSUBMSG);
-  upb_sink_endsubmsg(&p->top->sink, sel);
-}
-
-static bool start_array(upb_json_parser *p) {
-  assert(p->top->f);
-
-  if (!upb_fielddef_isseq(p->top->f)) {
-    upb_status_seterrf(p->status,
-                       "Array specified for non-repeated field: %s",
-                       upb_fielddef_name(p->top->f));
-    return false;
+static size_t saturating_multiply(size_t a, size_t b) {
+  // size_t is unsigned, so this is defined behavior even on overflow.
+  size_t ret = a * b;
+  if (b != 0 && ret / b != a) {  // Wrapped: dividing back does not give a.
+    ret = SIZE_MAX;  // Clamp to the largest size_t instead of wrapping.
   }
-
-  if (!check_stack(p)) return false;
-
-  upb_jsonparser_frame *inner = p->top + 1;
-  upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSEQ);
-  upb_sink_startseq(&p->top->sink, sel, &inner->sink);
-  inner->m = p->top->m;
-  inner->f = p->top->f;
-  p->top = inner;
-
-  return true;
+  return ret;
 }

-static void end_array(upb_json_parser *p) {
-  assert(p->top > p->stack);

-  p->top--;
-  upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSEQ);
-  upb_sink_endseq(&p->top->sink, sel);
-}
+/* Base64 decoding ************************************************************/

-static void clear_member(upb_json_parser *p) { p->top->f = NULL; }
-
-static bool parser_putbool(upb_json_parser *p, bool val) {
-  if (upb_fielddef_type(p->top->f) != UPB_TYPE_BOOL) {
-    upb_status_seterrf(p->status,
-                       "Boolean value specified for non-bool field: %s",
-                       upb_fielddef_name(p->top->f));
-    return false;
-  }
-
-  bool ok = upb_sink_putbool(&p->top->sink, parser_getsel(p), val);
-  UPB_ASSERT_VAR(ok, ok);
-  return true;
-}
-
-static void start_text(upb_json_parser *p, const char *ptr) {
-  p->text_begin = ptr;
-}
+// TODO(haberman): make this streaming.

 static const signed char b64table[] = {
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
@ -286,89 +199,323 @@ badpadding:
  return false;
 }

-static bool end_text(upb_json_parser *p, const char *ptr, bool is_num) {
-  assert(!p->accumulated);  // TODO: handle this case.
-  p->accumulated = p->text_begin;
-  p->accumulated_len = ptr - p->text_begin;

-  if (p->top->f && upb_fielddef_isstring(p->top->f)) {
-    // This is a string field (as opposed to a member name).
-    upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STRING);
-    if (upb_fielddef_type(p->top->f) == UPB_TYPE_BYTES) {
-      PARSER_CHECK_RETURN(base64_push(p, sel, p->accumulated,
-                                      p->accumulated_len));
-    } else {
-      upb_sink_putstring(&p->top->sink, sel, p->accumulated, p->accumulated_len, NULL);
+/* Accumulate buffer **********************************************************/
+
+// Functionality for accumulating a buffer.
+//
+// Some parts of the parser need an entire value as a contiguous string.  For
+// example, to look up a member name in a hash table, or to turn a string into
+// a number, the relevant library routines need the input string to be in
+// contiguous memory, even if the value spanned two or more buffers in the
+// input.  These routines handle that.
+//
+// In the common case we can just point to the input buffer to get this
+// contiguous string and avoid any actual copy.  So we optimistically begin
+// this way.  But there are a few cases where we must instead copy into a
+// separate buffer:
+//
+//   1. The string was not contiguous in the input (it spanned buffers).
+//
+//   2. The string included escape sequences that need to be interpreted to get
+//      the true value in a contiguous buffer.
+
+static void assert_accumulate_empty(upb_json_parser *p) {
+  UPB_UNUSED(p);  // Presumably keeps p "used" when asserts are compiled out.
+  assert(p->accumulated == NULL);  // No data pointer is pending...
+  assert(p->accumulated_len == 0);  // ...and no length is recorded.
+}
+
+static void accumulate_clear(upb_json_parser *p) {
+  p->accumulated = NULL;  // Forget the data; accumulate_buf is not freed.
+  p->accumulated_len = 0;
+}
+
+// Used internally by accumulate_append().  Grows accumulate_buf to >= need.
+static bool accumulate_realloc(upb_json_parser *p, size_t need) {
+  size_t new_size = UPB_MAX(p->accumulate_buf_size, 128);  // At least 128.
+  while (new_size < need) {
+    new_size = saturating_multiply(new_size, 2);  // Double until it fits.
+  }
+
+  void *mem = realloc(p->accumulate_buf, new_size);
+  if (!mem) {
+    upb_status_seterrmsg(p->status, "Out of memory allocating buffer.");
+    return false;  // accumulate_buf and its recorded size are left as-is.
+  }
+
+  p->accumulate_buf = mem;
+  p->accumulate_buf_size = new_size;
+  return true;
+}
+
+// Logically appends the given data to the append buffer.
+// If "can_alias" is true, we will try to avoid actually copying, but the buffer
+// must be valid until the next accumulate_append() call (if any).
+//
+// Returns false, after setting an error on p->status, if the total length
+// overflows size_t or the accumulate buffer cannot be grown.
+static bool accumulate_append(upb_json_parser *p, const char *buf, size_t len,
+                              bool can_alias) {
+  if (!p->accumulated && can_alias) {
+    p->accumulated = buf;
+    p->accumulated_len = len;
+    return true;
+  }
+
+  // Compare without relying on the subtraction alone: while "accumulated"
+  // still aliases the input, accumulated_len may exceed accumulate_buf_size,
+  // and the unsigned difference would wrap around and skip the reallocation
+  // that the copies below depend on.
+  if (p->accumulated_len > p->accumulate_buf_size ||
+      p->accumulate_buf_size - p->accumulated_len < len) {
+    size_t need;
+    if (!checked_add(p->accumulated_len, len, &need)) {
+      upb_status_seterrmsg(p->status, "Integer overflow.");
+      return false;
     }
-    p->accumulated = NULL;
-  } else if (p->top->f &&
-             upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM &&
-             !is_num) {
-
-    // Enum case: resolve enum symbolic name to integer value.
-    const upb_enumdef *enumdef =
-        (const upb_enumdef*)upb_fielddef_subdef(p->top->f);
-
-    int32_t int_val = 0;
-    if (upb_enumdef_ntoi(enumdef, p->accumulated, p->accumulated_len,
-                         &int_val)) {
-      upb_selector_t sel = parser_getsel(p);
-      upb_sink_putint32(&p->top->sink, sel, int_val);
-    } else {
-      upb_status_seterrmsg(p->status, "Enum value name unknown");
+
+    if (!accumulate_realloc(p, need)) {
       return false;
     }
-    p->accumulated = NULL;
   }

+  // If the data so far lives elsewhere (it aliased an input buffer), move it
+  // into our own buffer before appending to it.
+  if (p->accumulated != p->accumulate_buf) {
+    memcpy(p->accumulate_buf, p->accumulated, p->accumulated_len);
+    p->accumulated = p->accumulate_buf;
+  }
+
+  memcpy(p->accumulate_buf + p->accumulated_len, buf, len);
+  p->accumulated_len += len;
   return true;
 }

-static bool start_stringval(upb_json_parser *p) {
-  assert(p->top->f);
+// Returns a pointer to the data accumulated since the last accumulate_clear()
+// call, and writes the length to *len.  This will point either to the input
+// buffer or a temporary accumulate buffer.
+static const char *accumulate_getptr(upb_json_parser *p, size_t *len) {
+  assert(p->accumulated);  // Something must have been accumulated first.
+  *len = p->accumulated_len;
+  return p->accumulated;
+}

-  if (upb_fielddef_isstring(p->top->f)) {
-    if (!check_stack(p)) return false;

-    // Start a new parser frame: parser frames correspond one-to-one with
-    // handler frames, and string events occur in a sub-frame.
-    upb_jsonparser_frame *inner = p->top + 1;
-    upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSTR);
-    upb_sink_startstr(&p->top->sink, sel, 0, &inner->sink);
-    inner->m = p->top->m;
-    inner->f = p->top->f;
-    p->top = inner;
+/* Multi-part text data *******************************************************/
+
+// When we have text data in the input, it can often come in multiple segments.
+// For example, there may be some raw string data followed by an escape
+// sequence.  The two segments are processed with different logic.  Also buffer
+// seams in the input can cause multiple segments.
+//
+// As we see segments, there are two main cases for how we want to process them:
+//
+//  1. we want to push the captured input directly to string handlers.
+//
+//  2. we need to accumulate all the parts into a contiguous buffer for further
+//     processing (field name lookup, string->number conversion, etc).
+
+// This is the set of states for p->multipart_state.
+enum {
+  // We are not currently processing multipart data.
+  MULTIPART_INACTIVE = 0,
+
+  // We are processing multipart data by accumulating it into a contiguous
+  // buffer.
+  MULTIPART_ACCUMULATE = 1,
+
+  // We are processing multipart data by pushing each part directly to the
+  // current string handlers.
+  MULTIPART_PUSHEAGERLY = 2
+};

-    return true;
-  } else if (upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM) {
-    // Do nothing -- symbolic enum names in quotes remain in the
-    // current parser frame.
+// Start a multi-part text value where we accumulate the data for processing at
+// the end.
+static void multipart_startaccum(upb_json_parser *p) {
+  assert_accumulate_empty(p);  // A previous value must have been cleared.
+  assert(p->multipart_state == MULTIPART_INACTIVE);  // No nested values.
+  p->multipart_state = MULTIPART_ACCUMULATE;
+}
+
+// Start a multi-part text value where we immediately push text data to a string
+// value with the given selector.
+static void multipart_start(upb_json_parser *p, upb_selector_t sel) {
+  assert_accumulate_empty(p);  // A previous value must have been cleared.
+  assert(p->multipart_state == MULTIPART_INACTIVE);  // No nested values.
+  p->multipart_state = MULTIPART_PUSHEAGERLY;
+  p->string_selector = sel;  // Read back by multipart_text() for each part.
+}
+
+static bool multipart_text(upb_json_parser *p, const char *buf, size_t len,
+                           bool can_alias) {
+  switch (p->multipart_state) {  // Route one part according to the mode.
+    case MULTIPART_INACTIVE:  // No multipart value was started.
+      upb_status_seterrmsg(
+          p->status, "Internal error: unexpected state MULTIPART_INACTIVE");
+      return false;
+
+    case MULTIPART_ACCUMULATE:  // Collect the part for processing at the end.
+      if (!accumulate_append(p, buf, len, can_alias)) {
+        return false;
+      }
+      break;
+
+    case MULTIPART_PUSHEAGERLY: {  // Hand the part to the string handler now.
+      const upb_bufhandle *handle = can_alias ? p->handle : NULL;
+      upb_sink_putstring(&p->top->sink, p->string_selector, buf, len, handle);
+      break;  // NOTE(review): the putstring result is not checked.
+    }
+  }
+
+  return true;
+}
+
+// Note: this invalidates the accumulate buffer!  Call only after reading its
+// contents.
+static void multipart_end(upb_json_parser *p) {
+  assert(p->multipart_state != MULTIPART_INACTIVE);  // Must match a start.
+  p->multipart_state = MULTIPART_INACTIVE;
+  accumulate_clear(p);  // Resets the accumulated pointer and length.
+}
+
+
+/* Input capture **************************************************************/
+
+// Functionality for capturing a region of the input as text.  Gracefully
+// handles the case where a buffer seam occurs in the middle of the captured
+// region.
+
+static void capture_begin(upb_json_parser *p, const char *ptr) {
+  assert(p->multipart_state != MULTIPART_INACTIVE);  // Needs a destination.
+  assert(p->capture == NULL);  // Captures do not nest.
+  p->capture = ptr;  // Remember where the captured region starts.
+}
+
+static bool capture_end(upb_json_parser *p, const char *ptr) {
+  assert(p->capture);  // A capture must be in progress.
+  if (multipart_text(p, p->capture, ptr - p->capture, true)) {
+    p->capture = NULL;  // The region [capture, ptr) was handed off.
     return true;
   } else {
-    upb_status_seterrf(p->status,
-                       "String specified for non-string/non-enum field: %s",
-                       upb_fielddef_name(p->top->f));
     return false;
   }
+}

+// This is called at the end of each input buffer (ie. when we have hit a
+// buffer seam).  If we are in the middle of capturing the input, this
+// processes the unprocessed capture region.
+static void capture_suspend(upb_json_parser *p, const char **ptr) {
+  if (!p->capture) return;  // Not capturing: nothing to flush.
+
+  if (multipart_text(p, p->capture, *ptr - p->capture, false)) {  // No alias.
+    // We use this as a signal that we were in the middle of capturing, and
+    // that capturing should resume at the beginning of the next buffer.
+    //
+    // We can't use *ptr here, because we have no guarantee that this pointer
+    // will be valid when we resume (if the underlying memory is freed, then
+    // using the pointer at all, even to compare to NULL, is likely undefined
+    // behavior).
+    p->capture = &suspend_capture;
+  } else {
+    // Need to back up the pointer to the beginning of the capture, since
+    // we were not able to actually preserve it.
+    *ptr = p->capture;
+  }
 }

-static void end_stringval(upb_json_parser *p) {
-  if (upb_fielddef_isstring(p->top->f)) {
-    upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR);
-    upb_sink_endstr(&p->top->sink, sel);
-    p->top--;
+static void capture_resume(upb_json_parser *p, const char *ptr) {
+  if (p->capture) {  // Only set if capture_suspend() left a capture pending.
+    assert(p->capture == &suspend_capture);
+    p->capture = ptr;  // Continue capturing from this position.
+  }
+}
+
+
+/* Callbacks from the parser **************************************************/
+
+// These are the functions called directly from the parser itself.
+// We define these in the same order as their declarations in the parser.
+
+static char escape_char(char in) {  // Maps an escape letter to its character.
+  switch (in) {
+    case 'r': return '\r';
+    case 't': return '\t';
+    case 'n': return '\n';
+    case 'f': return '\f';
+    case 'b': return '\b';
+    case '/': return '/';
+    case '"': return '"';
+    case '\\': return '\\';
+    default:  // Presumably unreachable: callers pass only the letters above.
+      assert(0);
+      return 'x';  // Arbitrary placeholder when asserts are disabled.
+  }
+}
+
+static bool escape(upb_json_parser *p, const char *ptr) {
+  char ch = escape_char(*ptr);  // Presumably *ptr is the char after '\'.
+  return multipart_text(p, &ch, 1, false);  // ch is a local: do not alias.
+}
+
+static void start_hex(upb_json_parser *p) {
+  p->digit = 0;  // Reset the value that hexdigit() builds up.
+}
+
+static void hexdigit(upb_json_parser *p, const char *ptr) {
+  char ch = *ptr;
+
+  p->digit <<= 4;  // Make room for the next 4-bit hex digit.
+
+  if (ch >= '0' && ch <= '9') {
+    p->digit += (ch - '0');
+  } else if (ch >= 'a' && ch <= 'f') {
+    p->digit += ((ch - 'a') + 10);
+  } else {
+    assert(ch >= 'A' && ch <= 'F');  // Any other input is a caller error.
+    p->digit += ((ch - 'A') + 10);
   }
 }

+static bool end_hex(upb_json_parser *p) {
+  uint32_t codepoint = p->digit;
+
+  // emit the codepoint as UTF-8.
+  char utf8[3]; // support \u0000 -- \uFFFF -- need only three bytes.
+  int length = 0;
+  if (codepoint <= 0x7F) {  // 1 byte: 0xxxxxxx
+    utf8[0] = codepoint;
+    length = 1;
+  } else if (codepoint <= 0x07FF) {  // 2 bytes: 110xxxxx 10xxxxxx
+    utf8[1] = (codepoint & 0x3F) | 0x80;
+    codepoint >>= 6;
+    utf8[0] = (codepoint & 0x1F) | 0xC0;
+    length = 2;
+  } else /* codepoint <= 0xFFFF */ {  // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    utf8[2] = (codepoint & 0x3F) | 0x80;
+    codepoint >>= 6;
+    utf8[1] = (codepoint & 0x3F) | 0x80;
+    codepoint >>= 6;
+    utf8[0] = (codepoint & 0x0F) | 0xE0;
+    length = 3;
+  }
+  // TODO(haberman): Handle high surrogates: if codepoint is a high surrogate
+  // we have to wait for the next escape to get the full code point.
+
+  return multipart_text(p, utf8, length, false);  // utf8 is local: no alias.
+}
+
+static void start_text(upb_json_parser *p, const char *ptr) {
+  capture_begin(p, ptr);  // Text is captured starting at ptr.
+}
+
+static bool end_text(upb_json_parser *p, const char *ptr) {
+  return capture_end(p, ptr);  // False if the captured text was rejected.
+}
+
 static void start_number(upb_json_parser *p, const char *ptr) {
-  start_text(p, ptr);
-  assert(p->accumulated == NULL);
+  multipart_startaccum(p);  // Number text is accumulated, then converted.
+  capture_begin(p, ptr);  // The capture starts at ptr.
 }

-static void end_number(upb_json_parser *p, const char *ptr) {
-  end_text(p, ptr, true);
-  const char *myend = p->accumulated + p->accumulated_len;
+// Finishes the numeric literal that ends at ptr: closes the capture, converts
+// the accumulated text with the strto*() function matching the current field's
+// type, and pushes the value to the current frame's sink.
+//
+// Returns true on success.  Returns false if capture_end() fails, or (with an
+// error recorded in p->status) if the text is out of range for the field type
+// or is not consumed completely by the conversion.
+//
+// NOTE(review): errno is not reset to 0 before the strto*() calls below, so a
+// stale ERANGE would be reported as a parse error; the calls also rely on the
+// accumulated buffer being terminated -- confirm both against the callers.
+static bool end_number(upb_json_parser *p, const char *ptr) {
+  if (!capture_end(p, ptr)) {
+    return false;
+  }
+
+  size_t len;
+  const char *buf = accumulate_getptr(p, &len);
+  const char *myend = buf + len;
  char *end;

  switch (upb_fielddef_type(p->top->f)) {
@ -376,7 +523,7 @@ static void end_number(upb_json_parser *p, const char *ptr) {
    case UPB_TYPE_INT32: {
      long val = strtol(p->accumulated, &end, 0);
      if (val > INT32_MAX || val < INT32_MIN || errno == ERANGE || end != myend)
-        assert(false);
+        goto err;
      else
        upb_sink_putint32(&p->top->sink, parser_getsel(p), val);
      break;
@ -384,7 +531,7 @@ static void end_number(upb_json_parser *p, const char *ptr) {
    case UPB_TYPE_INT64: {
      long long val = strtoll(p->accumulated, &end, 0);
      if (val > INT64_MAX || val < INT64_MIN || errno == ERANGE || end != myend)
-        assert(false);
+        goto err;
      else
        upb_sink_putint64(&p->top->sink, parser_getsel(p), val);
      break;
@ -392,7 +539,7 @@ static void end_number(upb_json_parser *p, const char *ptr) {
    case UPB_TYPE_UINT32: {
      unsigned long val = strtoul(p->accumulated, &end, 0);
      if (val > UINT32_MAX || errno == ERANGE || end != myend)
-        assert(false);
+        goto err;
      else
        upb_sink_putuint32(&p->top->sink, parser_getsel(p), val);
      break;
@ -400,7 +547,7 @@ static void end_number(upb_json_parser *p, const char *ptr) {
    case UPB_TYPE_UINT64: {
      unsigned long long val = strtoull(p->accumulated, &end, 0);
      if (val > UINT64_MAX || errno == ERANGE || end != myend)
-        assert(false);
+        goto err;
      else
        upb_sink_putuint64(&p->top->sink, parser_getsel(p), val);
      break;
@ -408,7 +555,7 @@ static void end_number(upb_json_parser *p, const char *ptr) {
    case UPB_TYPE_DOUBLE: {
      double val = strtod(p->accumulated, &end);
      if (errno == ERANGE || end != myend)
-        assert(false);
+        goto err;
      else
        upb_sink_putdouble(&p->top->sink, parser_getsel(p), val);
      break;
@ -416,7 +563,7 @@ static void end_number(upb_json_parser *p, const char *ptr) {
    case UPB_TYPE_FLOAT: {
      float val = strtof(p->accumulated, &end);
      if (errno == ERANGE || end != myend)
-        assert(false);
+        goto err;
      else
        upb_sink_putfloat(&p->top->sink, parser_getsel(p), val);
      break;
@ -425,84 +572,236 @@ static void end_number(upb_json_parser *p, const char *ptr) {
      assert(false);
  }

-  p->accumulated = NULL;
+  multipart_end(p);
+  return true;
+
+err:
+  // "%.*s" consumes an int precision first and the string pointer second.
+  upb_status_seterrf(p->status, "error parsing number: %.*s", (int)len, buf);
+  multipart_end(p);
+  return false;
 }

-static char escape_char(char in) {
-  switch (in) {
-    case 'r': return '\r';
-    case 't': return '\t';
-    case 'n': return '\n';
-    case 'f': return '\f';
-    case 'b': return '\b';
-    case '/': return '/';
-    case '"': return '"';
-    case '\\': return '\\';
+// Delivers a JSON true/false literal to the current field's sink via
+// upb_sink_putbool().  Fails, recording a message in p->status, when the
+// current field is not of bool type.
+static bool parser_putbool(upb_json_parser *p, bool val) {
+  const bool field_is_bool = upb_fielddef_type(p->top->f) == UPB_TYPE_BOOL;
+
+  if (field_is_bool) {
+    bool ok = upb_sink_putbool(&p->top->sink, parser_getsel(p), val);
+    UPB_ASSERT_VAR(ok, ok);
+    return true;
+  }
+
+  upb_status_seterrf(p->status,
+                     "Boolean value specified for non-bool field: %s",
+                     upb_fielddef_name(p->top->f));
+  return false;
+}
+
+static bool start_stringval(upb_json_parser *p) {  // Begins a quoted string value for the current field (p->top->f).
+  assert(p->top->f);
+
+  if (upb_fielddef_isstring(p->top->f)) {
+    if (!check_stack(p)) return false;  // presumably verifies room for the p->top + 1 frame used below -- confirm in check_stack()
+
+    // Start a new parser frame: parser frames correspond one-to-one with
+    // handler frames, and string events occur in a sub-frame.
+    upb_jsonparser_frame *inner = p->top + 1;
+    upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSTR);
+    upb_sink_startstr(&p->top->sink, sel, 0, &inner->sink);
+    inner->m = p->top->m;  // The string sub-frame keeps the enclosing frame's msgdef and field.
+    inner->f = p->top->f;
+    p->top = inner;
+
+    if (upb_fielddef_type(p->top->f) == UPB_TYPE_STRING) {
+      // For STRING fields we push data directly to the handlers as it is
+      // parsed.  We don't do this yet for BYTES fields, because our base64
+      // decoder is not streaming.
+      //
+      // TODO(haberman): make base64 decoding streaming also.
+      multipart_start(p, getsel_for_handlertype(p, UPB_HANDLER_STRING));
+      return true;
+    } else {
+      multipart_startaccum(p);  // Accumulated in full; end_stringval() base64-decodes it for BYTES fields.
+      return true;
+    }
+  } else if (upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM) {
+    // No need to push a frame -- symbolic enum names in quotes remain in the
+    // current parser frame.
+    //
+    // Enum string values must accumulate so we can look up the value in a table
+    // once it is complete.
+    multipart_startaccum(p);
+    return true;
+  } else {
+    upb_status_seterrf(p->status,
+                       "String specified for non-string/non-enum field: %s",
+                       upb_fielddef_name(p->top->f));
+    return false;  // Any other field type cannot take a quoted string.
+  }
+}
+
+// Finishes a quoted string value for the current field.
+//
+// BYTES: base64-decodes the accumulated text via base64_push(), then falls
+//        through to close the string like STRING does.
+// STRING: emits the ENDSTR event and pops the sub-frame that
+//        start_stringval() pushed.
+// ENUM:  resolves the accumulated symbolic name to its integer value and
+//        emits it as an int32.
+//
+// Returns false on failure; for an unknown enum name or an unexpected field
+// type an error message is recorded in p->status.
+static bool end_stringval(upb_json_parser *p) {
+  bool ok = true;
+
+  switch (upb_fielddef_type(p->top->f)) {
+    case UPB_TYPE_BYTES:
+      if (!base64_push(p, getsel_for_handlertype(p, UPB_HANDLER_STRING),
+                       p->accumulated, p->accumulated_len)) {
+        return false;
+      }
+      // Fall through.
+
+    case UPB_TYPE_STRING: {
+      upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR);
+      upb_sink_endstr(&p->top->sink, sel);
+      p->top--;
+      break;
+    }
+
+    case UPB_TYPE_ENUM: {
+      // Resolve enum symbolic name to integer value.
+      const upb_enumdef *enumdef =
+          (const upb_enumdef*)upb_fielddef_subdef(p->top->f);
+
+      size_t len;
+      const char *buf = accumulate_getptr(p, &len);
+
+      int32_t int_val = 0;
+      ok = upb_enumdef_ntoi(enumdef, buf, len, &int_val);
+
+      if (ok) {
+        upb_selector_t sel = parser_getsel(p);
+        upb_sink_putint32(&p->top->sink, sel, int_val);
+      } else {
+        // "%.*s" requires an int precision; passing a size_t is a varargs
+        // type mismatch on platforms where size_t is wider than int.
+        upb_status_seterrf(p->status, "Enum value unknown: '%.*s'", (int)len,
+                           buf);
+      }
+
+      break;
+    }
+
    default:
-      assert(0);
-      return 'x';
+      assert(false);
+      upb_status_seterrmsg(p->status, "Internal error in JSON decoder");
+      ok = false;
+      break;
  }
+
+  multipart_end(p);
+  return ok;
 }

-static void escape(upb_json_parser *p, const char *ptr) {
-  char ch = escape_char(*ptr);
-  upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STRING);
-  upb_sink_putstring(&p->top->sink, sel, &ch, 1, NULL);
+static void start_member(upb_json_parser *p) {  // Begins accumulating an object member's name.
+  assert(!p->top->f);  // No field may be selected yet; end_member() sets it.
+  multipart_startaccum(p);
 }

-static uint8_t hexdigit(char ch) {
-  if (ch >= '0' && ch <= '9') {
-    return ch - '0';
-  } else if (ch >= 'a' && ch <= 'f') {
-    return ch - 'a' + 10;
-  } else {
-    assert(ch >= 'A' && ch <= 'F');
-    return ch - 'A' + 10;
+static bool end_member(upb_json_parser *p) {  // Resolves the accumulated member name to a field of p->top->m.
+  assert(!p->top->f);
+  size_t len;
+  const char *buf = accumulate_getptr(p, &len);
+
+  const upb_fielddef *f = upb_msgdef_ntof(p->top->m, buf, len);
+
+  if (!f) {
+    // TODO(haberman): Ignore unknown fields if requested/configured to do so.
+    upb_status_seterrf(p->status, "No such field: %.*s\n", (int)len, buf);
+    return false;  // NOTE(review): unlike the success path, this returns without calling multipart_end().
  }
+
+  p->top->f = f;  // The value callbacks that follow read the field from p->top->f.
+  multipart_end(p);
+
+  return true;
 }

-static void start_hex(upb_json_parser *p, const char *ptr) {
-  start_text(p, ptr);
+static void clear_member(upb_json_parser *p) { p->top->f = NULL; }  // Run by the grammar after a member's value; deselects the field.
+
+static bool start_subobject(upb_json_parser *p) {  // Begins a nested object as the value of the current field.
+  assert(p->top->f);
+
+  if (!upb_fielddef_issubmsg(p->top->f)) {
+    upb_status_seterrf(p->status,
+                       "Object specified for non-message/group field: %s",
+                       upb_fielddef_name(p->top->f));
+    return false;
+  }
+
+  if (!check_stack(p)) return false;  // presumably verifies room for the p->top + 1 frame used below -- confirm in check_stack()
+
+  upb_jsonparser_frame *inner = p->top + 1;
+
+  upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSUBMSG);
+  upb_sink_startsubmsg(&p->top->sink, sel, &inner->sink);
+  inner->m = upb_fielddef_msgsubdef(p->top->f);  // The new frame parses members of the field's sub-message type.
+  inner->f = NULL;  // No member selected yet in the new frame.
+  p->top = inner;
+
+  return true;
 }

-static void hex(upb_json_parser *p, const char *end) {
-  const char *start = p->text_begin;
-  UPB_ASSERT_VAR(end, end - start == 4);
-  uint16_t codepoint =
-      (hexdigit(start[0]) << 12) |
-      (hexdigit(start[1]) << 8) |
-      (hexdigit(start[2]) << 4) |
-      hexdigit(start[3]);
-  // emit the codepoint as UTF-8.
-  char utf8[3]; // support \u0000 -- \uFFFF -- need only three bytes.
-  int length = 0;
-  if (codepoint <= 0x7F) {
-    utf8[0] = codepoint;
-    length = 1;
-  } else if (codepoint <= 0x07FF) {
-    utf8[1] = (codepoint & 0x3F) | 0x80;
-    codepoint >>= 6;
-    utf8[0] = (codepoint & 0x1F) | 0xC0;
-    length = 2;
-  } else /* codepoint <= 0xFFFF */ {
-    utf8[2] = (codepoint & 0x3F) | 0x80;
-    codepoint >>= 6;
-    utf8[1] = (codepoint & 0x3F) | 0x80;
-    codepoint >>= 6;
-    utf8[0] = (codepoint & 0x0F) | 0xE0;
-    length = 3;
+static void end_subobject(upb_json_parser *p) {  // Pops the sub-message frame and emits ENDSUBMSG on the parent's sink.
+  p->top--;  // Pop first: the selector lookup and the sink call below both use the parent frame.
+  upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSUBMSG);
+  upb_sink_endsubmsg(&p->top->sink, sel);
+}
+
+static bool start_array(upb_json_parser *p) {  // Begins a JSON array as the value of the current (repeated) field.
+  assert(p->top->f);
+
+  if (!upb_fielddef_isseq(p->top->f)) {
+    upb_status_seterrf(p->status,
+                       "Array specified for non-repeated field: %s",
+                       upb_fielddef_name(p->top->f));
+    return false;
  }
-  // TODO(haberman): Handle high surrogates: if codepoint is a high surrogate
-  // we have to wait for the next escape to get the full code point).

-  upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STRING);
-  upb_sink_putstring(&p->top->sink, sel, utf8, length, NULL);
+  if (!check_stack(p)) return false;  // presumably verifies room for the p->top + 1 frame used below -- confirm in check_stack()
+
+  upb_jsonparser_frame *inner = p->top + 1;
+  upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSEQ);
+  upb_sink_startseq(&p->top->sink, sel, &inner->sink);
+  inner->m = p->top->m;  // The sequence frame keeps the enclosing frame's msgdef and field.
+  inner->f = p->top->f;
+  p->top = inner;
+
+  return true;
+}
+
+static void end_array(upb_json_parser *p) {  // Pops the sequence frame and emits ENDSEQ on the parent's sink.
+  assert(p->top > p->stack);  // There must be a frame to pop.
+
+  p->top--;  // Pop first: the selector lookup and the sink call below both use the parent frame.
+  upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSEQ);
+  upb_sink_endseq(&p->top->sink, sel);
+}
+
+static void start_object(upb_json_parser *p) {  // Emits start-of-message on the current frame's sink.
+  upb_sink_startmsg(&p->top->sink);
+}
+
+// Emits end-of-message on the current frame's sink.
+static void end_object(upb_json_parser *p) {
+  // Hand the sink a cleared status rather than uninitialized stack memory.
+  // NOTE(review): whatever the end-of-message call reports in `status` is
+  // discarded here rather than propagated to p->status.
+  upb_status status;
+  upb_status_clear(&status);
+  upb_sink_endmsg(&p->top->sink, &status);
 }

+
 #define CHECK_RETURN_TOP(x) if (!(x)) goto error

+
+/* The actual parser **********************************************************/
+
 // What follows is the Ragel parser itself.  The language is specified in Ragel
 // and the actions call our C functions above.
+//
+// Ragel has an extensive set of functionality, and we use only a small part of
+// it.  There are many action types but we only use a few:
+//
+//   ">" -- transition into a machine
+//   "%" -- transition out of a machine
+//   "@" -- transition into a final state of a machine.
+//
+// "@" transitions are tricky because a machine can transition into a final
+// state repeatedly.  But in some cases we know this can't happen, for example
+// a string which is delimited by a final '"' can only transition into its
+// final state once, when the closing '"' is seen.
+
 %%{
  machine json;

@ -520,24 +819,30 @@ static void hex(upb_json_parser *p, const char *end) {
  text =
    /[^\\"]/+
      >{ start_text(parser, p); }
-      %{ CHECK_RETURN_TOP(end_text(parser, p, false)); }
+      %{ CHECK_RETURN_TOP(end_text(parser, p)); }
    ;

  unicode_char =
    "\\u"
    /[0-9A-Fa-f]/{4}
-      >{ start_hex(parser, p); }
-      %{ hex(parser, p); }
+      >{ start_hex(parser); }
+      ${ hexdigit(parser, p); }
+      %{ CHECK_RETURN_TOP(end_hex(parser)); }
    ;

  escape_char  =
    "\\"
    /[rtbfn"\/\\]/
-      >{ escape(parser, p); }
+      >{ CHECK_RETURN_TOP(escape(parser, p)); }
+    ;
+
+  string_machine :=
+    (text | unicode_char | escape_char)**
+    '"'
+      @{ fhold; fret; }
    ;

-  string_machine := (text | unicode_char | escape_char)** '"' @{ fret; } ;
-  string       = '"' @{ fcall string_machine; };
+  string       = '"' @{ fcall string_machine; } '"';

  value2 = ^(space | "]" | "}") >{ fhold; fcall value_machine; } ;

@ -545,7 +850,7 @@ static void hex(upb_json_parser *p, const char *end) {
    ws
    string
      >{ start_member(parser); }
-      %{ CHECK_RETURN_TOP(end_member(parser)); }
+      @{ CHECK_RETURN_TOP(end_member(parser)); }
    ws ":" ws
    value2
      %{ clear_member(parser); }
@ -573,10 +878,10 @@ static void hex(upb_json_parser *p, const char *end) {
  value =
    number
      >{ start_number(parser, p); }
-      %{ end_number(parser, p); }
+      %{ CHECK_RETURN_TOP(end_number(parser, p)); }
    | string
      >{ CHECK_RETURN_TOP(start_stringval(parser)); }
-      %{ end_stringval(parser); }
+      @{ CHECK_RETURN_TOP(end_stringval(parser)); }
    | "true"
      %{ CHECK_RETURN_TOP(parser_putbool(parser, true)); }
    | "false"
@ -602,6 +907,7 @@ size_t parse(void *closure, const void *hd, const char *buf, size_t size,
  UPB_UNUSED(hd);
  UPB_UNUSED(handle);
  upb_json_parser *parser = closure;
+  parser->handle = handle;

  // Variables used by Ragel's generated code.
  int cs = parser->current_state;
@ -611,10 +917,14 @@ size_t parse(void *closure, const void *hd, const char *buf, size_t size,
  const char *p = buf;
  const char *pe = buf + size;

+  capture_resume(parser, buf);
+
  %% write exec;

  if (p != pe) {
    upb_status_seterrf(parser->status, "Parse error at %s\n", p);
+  } else {
+    capture_suspend(parser, &p);
  }

 error:
@ -631,8 +941,13 @@ bool end(void *closure, const void *hd) {
  return true;
 }

+
+/* Public API *****************************************************************/
+
 void upb_json_parser_init(upb_json_parser *p, upb_status *status) {
  p->limit = p->stack + UPB_JSON_MAX_DEPTH;
+  p->accumulate_buf = NULL;
+  p->accumulate_buf_size = 0;
  upb_byteshandler_init(&p->input_handler_);
  upb_byteshandler_setstring(&p->input_handler_, parse, NULL);
  upb_byteshandler_setendstr(&p->input_handler_, end, NULL);
@ -642,6 +957,7 @@ void upb_json_parser_init(upb_json_parser *p, upb_status *status) {

 void upb_json_parser_uninit(upb_json_parser *p) {
  upb_byteshandler_uninit(&p->input_handler_);
+  free(p->accumulate_buf);
 }

 void upb_json_parser_reset(upb_json_parser *p) {
@ -654,9 +970,9 @@ void upb_json_parser_reset(upb_json_parser *p) {
  %% write init;
  p->current_state = cs;
  p->parser_top = top;
-  p->text_begin = NULL;
-  p->accumulated = NULL;
-  p->accumulated_len = 0;
+  accumulate_clear(p);
+  p->multipart_state = MULTIPART_INACTIVE;
+  p->capture = NULL;
 }

 void upb_json_parser_resetoutput(upb_json_parser *p, upb_sink *sink) {
--- a/upb/pb/compile_decoder.c
+++ b/upb/pb/compile_decoder.c
@ -302,6 +302,7 @@ static void putop(compiler *c, opcode op, ...) {
    case OP_SETDELIM:
    case OP_HALT:
    case OP_RET:
+    case OP_DISPATCH:
      put32(c, op);
      break;
    case OP_PARSE_DOUBLE:
@ -382,7 +383,7 @@ const char *upb_pbdecoder_getopname(unsigned int op) {
    OP(ENDSUBMSG), OP(STARTSTR), OP(STRING), OP(ENDSTR), OP(CALL), OP(RET),
    OP(PUSHLENDELIM), OP(PUSHTAGDELIM), OP(SETDELIM), OP(CHECKDELIM),
    OP(BRANCH), OP(TAG1), OP(TAG2), OP(TAGN), OP(SETDISPATCH), OP(POP),
-    OP(SETBIGGROUPNUM), OP(HALT),
+    OP(SETBIGGROUPNUM), OP(DISPATCH), OP(HALT),
  };
  return op > OP_HALT ? names[0] : names[op];
 #undef OP
@ -414,6 +415,7 @@ static void dumpbc(uint32_t *p, uint32_t *end, FILE *f) {
                              upb_handlers_msgdef(method->dest_handlers_)));
        break;
      }
+      case OP_DISPATCH:
      case OP_STARTMSG:
      case OP_ENDMSG:
      case OP_PUSHLENDELIM:
@ -759,6 +761,7 @@ static void compile_method(compiler *c, upb_pbdecodermethod *method) {
  putop(c, OP_SETDISPATCH, &method->dispatch);
  putsel(c, OP_STARTMSG, UPB_STARTMSG_SELECTOR, h);
 label(c, LABEL_FIELD);
+  uint32_t* start_pc = c->pc;
  upb_msg_iter i;
  for(upb_msg_begin(&i, md); !upb_msg_done(&i); upb_msg_next(&i)) {
    const upb_fielddef *f = upb_msg_iter_field(&i);
@ -774,8 +777,18 @@ static void compile_method(compiler *c, upb_pbdecodermethod *method) {
    }
  }

+  // If there were no fields, or if no handlers were defined, we need to
+  // generate a non-empty loop body so that we can at least dispatch for unknown
+  // fields and check for the end of the message.
+  if (c->pc == start_pc) {
+    // Check for end-of-message.
+    putop(c, OP_CHECKDELIM, LABEL_ENDMSG);
+    // Unconditionally dispatch.
+    putop(c, OP_DISPATCH, 0);
+  }
+
  // For now we just loop back to the last field of the message (or if none,
-  // the DISPATCH opcode for the message.
+  // the DISPATCH opcode for the message).
  putop(c, OP_BRANCH, -LABEL_FIELD);

  // Insert both a label and a dispatch table entry for this end-of-msg.
--- a/upb/pb/compile_decoder_x64.dasc
+++ b/upb/pb/compile_decoder_x64.dasc
@ -1124,6 +1124,9 @@ static void jitbytecode(jitcompiler *jc) {
      jittag(jc, tag, arg >> 8, (int8_t)arg, method);
      break;
    }
+    case OP_DISPATCH:
+      |  call   =>jmptarget(jc, &method->dispatch)
+      break;
    case OP_HALT:
      assert(false);
    }
--- a/upb/pb/compile_decoder_x64.h
+++ b/upb/pb/compile_decoder_x64.h
@ -1680,6 +1680,11 @@ static void jitbytecode(jitcompiler *jc) {
      jittag(jc, tag, arg >> 8, (int8_t)arg, method);
      break;
    }
+    case OP_DISPATCH:
+      //|  call   =>jmptarget(jc, &method->dispatch)
+      dasm_put(Dst, 2151, jmptarget(jc, &method->dispatch));
+# 1129 "upb/pb/compile_decoder_x64.dasc"
+      break;
    case OP_HALT:
      assert(false);
    }
@ -1688,5 +1693,5 @@ static void jitbytecode(jitcompiler *jc) {
  asmlabel(jc, "eof");
  //|  nop
  dasm_put(Dst, 1909);
-# 1134 "upb/pb/compile_decoder_x64.dasc"
+# 1137 "upb/pb/compile_decoder_x64.dasc"
 }
--- a/upb/pb/decoder.c
+++ b/upb/pb/decoder.c
@ -801,6 +801,9 @@ size_t upb_pbdecoder_decode(void *closure, const void *hd, const char *buf,
        if (result == DECODE_MISMATCH) goto badtag;
        if (result >= 0) return result;
      })
+      VMCASE(OP_DISPATCH, {
+        CHECK_RETURN(dispatch(d));
+      })
      VMCASE(OP_HALT, {
        return size;
      })
@ -859,7 +862,8 @@ bool upb_pbdecoder_end(void *closure, const void *handler_data) {
      // Rewind from OP_TAG* to OP_CHECKDELIM.
      assert(getop(*d->pc) == OP_TAG1 ||
             getop(*d->pc) == OP_TAG2 ||
-             getop(*d->pc) == OP_TAGN);
+             getop(*d->pc) == OP_TAGN ||
+             getop(*d->pc) == OP_DISPATCH);  // Loop bodies with no field handlers start with OP_DISPATCH.
      d->pc = p;
    }
    upb_pbdecoder_decode(closure, handler_data, &dummy, 0, NULL);
--- a/upb/pb/decoder.int.h
+++ b/upb/pb/decoder.int.h
@ -66,7 +66,9 @@ typedef enum {
                           //   | unused (24)         | opc |
                           //   | upb_inttable* (32 or 64)  |

-  OP_HALT           = 36,  // No arg.
+  OP_DISPATCH       = 36,  // No arg.
+
+  OP_HALT           = 37,  // No arg.
 } opcode;

 #define OP_MAX OP_HALT
--- a/upb/upb.c
+++ b/upb/upb.c
@ -32,8 +32,10 @@ static void nullz(upb_status *status) {
 }

 void upb_status_clear(upb_status *status) {
-  upb_status blank = UPB_STATUS_INIT;
-  upb_status_copy(status, &blank);
+  if (!status) return;  // A NULL status is accepted and ignored.
+  status->ok_ = true;  // Reset the fields in place: ok, code 0, empty message.
+  status->code_ = 0;
+  status->msg[0] = '\0';
 }

 bool upb_ok(const upb_status *status) { return status->ok_; }