mirror of https://github.com/grpc/grpc.git
commit
93cec49197
49 changed files with 715 additions and 497 deletions
@ -0,0 +1,89 @@ |
||||
/*
|
||||
* |
||||
* Copyright 2015, Google Inc. |
||||
* All rights reserved. |
||||
* |
||||
* Redistribution and use in source and binary forms, with or without |
||||
* modification, are permitted provided that the following conditions are |
||||
* met: |
||||
* |
||||
* * Redistributions of source code must retain the above copyright |
||||
* notice, this list of conditions and the following disclaimer. |
||||
* * Redistributions in binary form must reproduce the above |
||||
* copyright notice, this list of conditions and the following disclaimer |
||||
* in the documentation and/or other materials provided with the |
||||
* distribution. |
||||
* * Neither the name of Google Inc. nor the names of its |
||||
* contributors may be used to endorse or promote products derived from |
||||
* this software without specific prior written permission. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||
* |
||||
*/ |
||||
|
||||
#include <grpc/support/log.h> |
||||
#include <grpc/support/time.h> |
||||
#include <stdio.h> |
||||
|
||||
#ifdef GRPC_TIMERS_RDTSC
#if defined(__i386__)
/* Read the CPU time-stamp counter into *clk (32-bit x86).
   The "=A" constraint binds the EDX:EAX register pair, which is exactly
   where RDTSC deposits the 64-bit counter in 32-bit mode. */
static void gpr_get_cycle_counter(long long int *clk) {
  long long int ret;
  __asm__ volatile("rdtsc" : "=A"(ret));
  *clk = ret;
}

// ----------------------------------------------------------------

#elif defined(__x86_64__) || defined(__amd64__)
/* Read the CPU time-stamp counter into *clk (64-bit x86).
   In 64-bit mode RDTSC splits the counter across EAX (low half) and
   EDX (high half); recombine the two 32-bit values into one 64-bit
   count.  `low` and `high` are unsigned, so `high << 32` is well
   defined before the cast. */
static void gpr_get_cycle_counter(long long int *clk) {
  unsigned long long low, high;
  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
  *clk = (long long)(high << 32) | (long long)low;
}
#endif
||||
|
||||
/* Calibration state: the measured TSC frequency and the counter value at
   the calibration epoch.  Both are written once by
   gpr_precise_clock_init() and read by gpr_precise_clock_now(). */
static double cycles_per_second = 0;
static long long int start_cycle;

/* Calibrate the cycle counter against the wall clock.
   Busy-waits to a whole-second boundary, samples the counter, spins for
   ten more seconds, samples again, and derives cycles_per_second from
   the difference.
   NOTE(review): this blocks the calling thread for roughly 10 seconds. */
void gpr_precise_clock_init(void) {
  time_t start;
  long long end_cycle;
  gpr_log(GPR_DEBUG, "Calibrating timers");
  start = time(NULL);
  /* align to a second boundary so the 10s measurement window is exact */
  while (time(NULL) == start)
    ;
  gpr_get_cycle_counter(&start_cycle);
  while (time(NULL) <= start + 10)
    ;
  gpr_get_cycle_counter(&end_cycle);
  cycles_per_second = (double)(end_cycle - start_cycle) / 10.0;
  gpr_log(GPR_DEBUG, "... cycles_per_second = %f\n", cycles_per_second);
}
||||
|
||||
/* Fill *clk with the current time, derived from the cycle counter
   relative to the calibration epoch recorded by
   gpr_precise_clock_init(). */
void gpr_precise_clock_now(gpr_timespec *clk) {
  long long int counter;
  double secs;
  gpr_get_cycle_counter(&counter);
  /* elapsed seconds since calibration */
  secs = (double)(counter - start_cycle) / cycles_per_second;
  clk->clock_type = GPR_CLOCK_PRECISE;
  clk->tv_sec = (time_t)secs;
  /* fractional part of secs, expressed in nanoseconds */
  clk->tv_nsec = (int)(1e9 * (secs - (double)clk->tv_sec));
}
||||
|
||||
#else /* GRPC_TIMERS_RDTSC */
/* Without RDTSC support there is nothing to calibrate. */
void gpr_precise_clock_init(void) {}

/* Fall back to the realtime clock, relabelled as GPR_CLOCK_PRECISE so
   callers always observe a consistent clock_type. */
void gpr_precise_clock_now(gpr_timespec *clk) {
  *clk = gpr_now(GPR_CLOCK_REALTIME);
  clk->clock_type = GPR_CLOCK_PRECISE;
}
#endif /* GRPC_TIMERS_RDTSC */
@ -1,188 +0,0 @@ |
||||
#!/usr/bin/env python2.7 |
||||
# Copyright 2015, Google Inc. |
||||
# All rights reserved. |
||||
# |
||||
# Redistribution and use in source and binary forms, with or without |
||||
# modification, are permitted provided that the following conditions are |
||||
# met: |
||||
# |
||||
# * Redistributions of source code must retain the above copyright |
||||
# notice, this list of conditions and the following disclaimer. |
||||
# * Redistributions in binary form must reproduce the above |
||||
# copyright notice, this list of conditions and the following disclaimer |
||||
# in the documentation and/or other materials provided with the |
||||
# distribution. |
||||
# * Neither the name of Google Inc. nor the names of its |
||||
# contributors may be used to endorse or promote products derived from |
||||
# this software without specific prior written permission. |
||||
# |
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||
|
||||
""" |
||||
Read GRPC basic profiles, analyze the data. |
||||
|
||||
Usage: |
||||
bins/basicprof/qps_smoke_test > log |
||||
cat log | tools/profile_analyzer/profile_analyzer.py |
||||
""" |
||||
|
||||
|
||||
import collections |
||||
import itertools |
||||
import math |
||||
import re |
||||
|
||||
# Create a regex to parse output of the C core basic profiler, |
||||
# as defined in src/core/profiling/basic_timers.c. |
||||
_RE_LINE = re.compile(r'GRPC_LAT_PROF ' + |
||||
r'([0-9]+\.[0-9]+) 0x([0-9a-f]+) ([{}.!]) ([0-9]+) ' + |
||||
r'([^ ]+) ([^ ]+) ([0-9]+)') |
||||
|
||||
Entry = collections.namedtuple( |
||||
'Entry', |
||||
['time', 'thread', 'type', 'tag', 'id', 'file', 'line']) |
||||
|
||||
|
||||
class ImportantMark(object):
  """Snapshot of an '!' profiler mark with its surrounding scope stack.

  Captures the entries for the scopes open when the mark fired, then
  collects up to that many subsequent closing entries from the same
  thread, so the mark can later be timed against both.
  """

  def __init__(self, entry, stack):
    self._entry = entry
    self._pre_stack = stack
    self._post_stack = list()
    # Number of closing '}' entries still expected: one per open scope.
    self._n = len(stack)

  @property
  def entry(self):
    return self._entry

  @property
  def max_depth(self):
    # Decreases as post entries arrive; callers use it to skip marks
    # that have already collected all their closings.
    return self._n

  def append_post_entry(self, post_entry):
    """Record one closing entry, if it belongs to this mark's thread."""
    same_thread = post_entry.thread == self._entry.thread
    if self._n > 0 and same_thread:
      self._post_stack.append(post_entry)
      self._n -= 1

  def get_deltas(self):
    """Return OrderedDict of |mark time - entry time| per stack entry."""
    deltas = collections.OrderedDict()
    for stack_entry in itertools.chain(self._pre_stack, self._post_stack):
      deltas[stack_entry] = abs(self._entry.time - stack_entry.time)
    return deltas
||||
|
||||
|
||||
def print_block_statistics(block_times):
  """Print a 50/90/95/99th-percentile table of block durations per tag.

  block_times: mapping from integer block tag to a list of durations
  observed for that tag.  (Python 2: uses print statements/iteritems.)
  """
  print '{:<12s} {:>12s} {:>12s} {:>12s} {:>12s}'.format(
      'Block tag', '50th p.', '90th p.', '95th p.', '99th p.')
  for tag, tag_times in sorted(block_times.iteritems()):
    times = sorted(tag_times)  # percentile() requires sorted input
    print '{:<12d}: {:>12.3f} {:>12.3f} {:>12.3f} {:>12.3f}'.format(
        tag, percentile(times, 50), percentile(times, 90),
        percentile(times, 95), percentile(times, 99))
  print
||||
|
||||
def print_grouped_imark_statistics(group_key, imarks_group):
  """Print percentile timings for one group of important marks.

  group_key: display label identifying the group (tag/thread@file:line).
  imarks_group: list of ImportantMark instances in that group.  For each
  mark the deltas to its surrounding scope entries are pooled per
  relative location, then reported as 50/90/95/99th percentiles.
  """
  values = collections.OrderedDict()
  for imark in imarks_group:
    deltas = imark.get_deltas()
    for relative_entry, time_delta_us in deltas.iteritems():
      # Pool deltas by the relative entry's identity (tag, type, location).
      key = '{tag} {type} ({file}:{line})'.format(**relative_entry._asdict())
      l = values.setdefault(key, list())
      l.append(time_delta_us)

  print group_key
  print '{:<50s} {:>12s} {:>12s} {:>12s} {:>12s}'.format(
      'Relative mark', '50th p.', '90th p.', '95th p.', '99th p.')
  for key, time_values in values.iteritems():
    time_values = sorted(time_values)  # percentile() requires sorted input
    print '{:<50s}: {:>12.3f} {:>12.3f} {:>12.3f} {:>12.3f}'.format(
        key, percentile(time_values, 50), percentile(time_values, 90),
        percentile(time_values, 95), percentile(time_values, 99))
  print
||||
|
||||
def percentile(vals, percent):
  """Return the linearly interpolated percentile of a sorted sequence.

  Args:
    vals: non-empty, already-sorted sequence of numbers.
    percent: percentile to compute, in the usual 0-100 range.
  """
  assert vals, "Empty input sequence."
  rank = (len(vals) - 1) * (percent / 100.0)
  lower = math.floor(rank)
  upper = math.ceil(rank)
  if lower == upper:
    # The rank lands exactly on an element: no interpolation needed.
    return vals[int(rank)]
  # Weight the two neighbouring elements by their distance from the rank.
  lo_part = vals[int(lower)] * (upper - rank)
  hi_part = vals[int(upper)] * (rank - lower)
  return lo_part + hi_part
||||
|
||||
def entries(f):
  """Yield an Entry for each parseable profiler line in file object f.

  Lines that do not match the profiler format (arbitrary interleaved
  log output) are silently skipped.
  """
  for raw_line in f:
    match = _RE_LINE.match(raw_line)
    if match is None:
      continue
    yield Entry(time=float(match.group(1)),
                thread=match.group(2),
                type=match.group(3),
                tag=int(match.group(4)),
                id=match.group(5),
                file=match.group(6),
                line=match.group(7))
||||
|
||||
def main(f):
  """Parse profiler output from file object f and print timing reports.

  Builds per-tag block durations from matching '{'/'}' pairs and groups
  '!' important marks by tag/thread/location, then prints percentile
  tables for both.  (Python 2: itervalues/iteritems/print statements.)
  """
  percentiles = (50, 90, 95, 99)  # NOTE(review): unused in this function
  # Per-thread map: tag -> stack of currently-open '{' entries.
  threads = collections.defaultdict(lambda: collections.defaultdict(list))
  # Per-tag list of block durations (time between matching '{' and '}').
  times = collections.defaultdict(list)
  # ImportantMark instances grouped by tag/thread@file:line key.
  important_marks = collections.defaultdict(list)
  # Current '{'-nesting depth per thread.
  stack_depth = collections.defaultdict(int)
  for entry in entries(f):
    thread = threads[entry.thread]
    if entry.type == '{':
      thread[entry.tag].append(entry)
      stack_depth[entry.thread] += 1
    if entry.type == '!':
      # Save a snapshot of the current stack inside a new ImportantMark
      # instance.  Get all entries _for any tag in the thread_.
      stack = [e for entries_for_tag in thread.itervalues()
               for e in entries_for_tag]
      imark_group_key = '{tag}/{thread}@{file}:{line}'.format(**entry._asdict())
      important_marks[imark_group_key].append(ImportantMark(entry, stack))
    elif entry.type == '}':
      last = thread[entry.tag].pop()
      times[entry.tag].append(entry.time - last.time)
      # Only offer this closing entry to the last "depth" imarks; deeper
      # closings cannot belong to shallower marks.
      depth = stack_depth[entry.thread]
      for imarks_group in important_marks.itervalues():
        for imark in imarks_group[-depth:]:
          # If at a '}' deeper than where the current imark was found, ignore.
          if depth > imark.max_depth: continue
          imark.append_post_entry(entry)
      stack_depth[entry.thread] -= 1

  print
  print 'Block marks:'
  print '============'
  print_block_statistics(times)

  print
  print 'Important marks:'
  print '================'
  for group_key, imarks_group in important_marks.iteritems():
    print_grouped_imark_statistics(group_key, imarks_group)
||||
|
||||
|
||||
if __name__ == '__main__':
  # If invoked without arguments, read off sys.stdin. If one argument is
  # given, take it as a file name and open it for reading.
  # NOTE(review): the opened file is never explicitly closed; it is
  # released at process exit.
  import sys
  f = sys.stdin
  if len(sys.argv) == 2:
    f = open(sys.argv[1], 'r')
  main(f)
@ -0,0 +1,231 @@ |
||||
#!/usr/bin/env python2.7 |
||||
import argparse |
||||
import collections |
||||
import hashlib |
||||
import itertools |
||||
import json |
||||
import math |
||||
import tabulate |
||||
import time |
||||
|
||||
|
||||
# Unique sentinel keys for the per-line timing dictionaries.  Each is a
# distinct anonymous object, so they can never collide with real data.
SELF_TIME = object()
TIME_FROM_SCOPE_START = object()
TIME_TO_SCOPE_END = object()
TIME_FROM_STACK_START = object()
TIME_TO_STACK_END = object()


# Command-line interface; parsed at import time since this is a script.
# --source: latency trace file to analyze (JSON, one record per line).
# --fmt: any table format supported by the tabulate package.
argp = argparse.ArgumentParser(description='Process output of basic_prof builds')
argp.add_argument('--source', default='latency_trace.txt', type=str)
argp.add_argument('--fmt', choices=tabulate.tabulate_formats, default='simple')
args = argp.parse_args()
||||
|
||||
class LineItem(object):
  """One rendered report row: a single profiler event plus timing stats.

  Wraps one parsed trace record; the `times` dict accumulates relative
  timings (keyed by the module's sentinel constants) as scopes close.
  """

  def __init__(self, line, indent):
    # Where the event came from.
    self.filename = line['file']
    self.fileline = line['line']
    self.tag = line['tag']
    self.important = line['imp']
    # When it happened; end_time is filled in once the scope closes.
    self.start_time = line['t']
    self.end_time = None
    # Presentation and aggregation state.
    self.indent = indent
    self.times = {}
||||
|
||||
|
||||
class ScopeBuilder(object):
  """Tracks one open '{' scope while its call stack is being built.

  Appends its own LineItem to the owning CallStackBuilder on creation
  and back-fills scope-relative timings for its children on finish().
  """

  def __init__(self, call_stack_builder, line):
    self.call_stack_builder = call_stack_builder
    # Depth equals the number of scopes already open on the stack.
    self.indent = len(call_stack_builder.stk)
    self.top_line = LineItem(line, self.indent)
    call_stack_builder.lines.append(self.top_line)
    # Children of this scope begin right after the line just appended.
    self.first_child_pos = len(call_stack_builder.lines)

  def mark(self, line):
    """Record a zero-duration '.'/'!' mark inside this scope."""
    line_item = LineItem(line, self.indent + 1)
    line_item.end_time = line_item.start_time  # marks are instantaneous
    self.call_stack_builder.lines.append(line_item)

  def finish(self, line):
    """Close the scope with its matching '}' record.

    Sets this scope's end time and SELF_TIME, then fills
    TIME_FROM_SCOPE_START / TIME_TO_SCOPE_END for every child line that
    a nested (already-closed) scope has not filled in first.
    """
    assert line['tag'] == self.top_line.tag, 'expected %s, got %s' % (self.top_line.tag, line['tag'])
    final_time_stamp = line['t']
    assert self.top_line.end_time is None
    self.top_line.end_time = final_time_stamp
    assert SELF_TIME not in self.top_line.times
    self.top_line.times[SELF_TIME] = final_time_stamp - self.top_line.start_time
    for line in self.call_stack_builder.lines[self.first_child_pos:]:
      if TIME_FROM_SCOPE_START not in line.times:
        line.times[TIME_FROM_SCOPE_START] = line.start_time - self.top_line.start_time
        line.times[TIME_TO_SCOPE_END] = final_time_stamp - line.end_time
||||
|
||||
|
||||
class CallStackBuilder(object):
  """Accumulates trace lines for one thread until its stack fully unwinds."""

  def __init__(self):
    self.stk = []                    # open ScopeBuilders, innermost last
    self.signature = hashlib.md5()   # fingerprint of the event sequence
    self.lines = []                  # every LineItem seen, in order

  def finish(self):
    """Seal the stack: freeze the signature, add whole-stack timings.

    After this call `signature` is the hex digest string (not the hash
    object), so add() must not be called again on this builder.
    """
    start_time = self.lines[0].start_time
    end_time = self.lines[0].end_time
    self.signature = self.signature.hexdigest()
    for line in self.lines:
      line.times[TIME_FROM_STACK_START] = line.start_time - start_time
      line.times[TIME_TO_STACK_END] = end_time - line.end_time

  def add(self, line):
    """Consume one parsed trace record.

    Returns True once the outermost scope has closed (stack complete),
    False while more records are expected.  Raises on unknown types.
    """
    line_type = line['type']
    # NOTE(review): md5.update() with a str is Python-2 only; Python 3
    # would require bytes here.
    self.signature.update(line_type)
    self.signature.update(line['tag'])
    if line_type == '{':
      self.stk.append(ScopeBuilder(self, line))
      return False
    elif line_type == '}':
      self.stk.pop().finish(line)
      if not self.stk:
        self.finish()
        return True
      return False
    elif line_type == '.' or line_type == '!':
      # Marks attach to the innermost open scope.
      self.stk[-1].mark(line)
      return False
    else:
      raise Exception('Unknown line type: \'%s\'' % line_type)
||||
|
||||
|
||||
class CallStack(object):
  """Aggregate of every completed CallStackBuilder sharing one signature.

  Per-line timings are promoted to lists of samples so percentiles can
  be computed across all occurrences of the same call-stack shape.
  (Python 2: izip/iteritems/itervalues.)
  """

  def __init__(self, initial_call_stack_builder):
    self.count = 1
    self.signature = initial_call_stack_builder.signature
    self.lines = initial_call_stack_builder.lines
    # Promote each scalar timing to a singleton sample list.
    for line in self.lines:
      for key, val in line.times.items():
        line.times[key] = [val]

  def add(self, call_stack_builder):
    """Merge another completed builder with the same signature."""
    assert self.signature == call_stack_builder.signature
    self.count += 1
    assert len(self.lines) == len(call_stack_builder.lines)
    for lsum, line in itertools.izip(self.lines, call_stack_builder.lines):
      assert lsum.tag == line.tag
      assert lsum.times.keys() == line.times.keys()
      for k, lst in lsum.times.iteritems():
        lst.append(line.times[k])

  def finish(self):
    """Sort every sample list so percentile() can be applied directly."""
    for line in self.lines:
      for lst in line.times.itervalues():
        lst.sort()
||||
|
||||
# Per-thread in-progress stack builders.
builder = collections.defaultdict(CallStackBuilder)
# Completed stacks keyed by md5 signature.
# NOTE(review): declared as defaultdict(CallStack), but CallStack requires
# a constructor argument; the default factory is never actually invoked
# because membership is checked explicitly below.
call_stacks = collections.defaultdict(CallStack)

lines = 0
start = time.time()
with open(args.source) as f:
  for line in f:
    lines += 1
    inf = json.loads(line)
    thd = inf['thd']
    cs = builder[thd]
    if cs.add(inf):
      # Stack fully unwound: fold it into the per-signature aggregate
      # and start fresh for this thread.
      if cs.signature in call_stacks:
        call_stacks[cs.signature].add(cs)
      else:
        call_stacks[cs.signature] = CallStack(cs)
      del builder[thd]
time_taken = time.time() - start

# Report the most frequent call stacks first.
call_stacks = sorted(call_stacks.values(), key=lambda cs: cs.count, reverse=True)
total_stacks = 0
for cs in call_stacks:
  total_stacks += cs.count
  cs.finish()
||||
|
||||
def percentile(N, percent, key=lambda x: x):
  """Interpolated percentile of an already-sorted list.

  @parameter N - list of values.  Note N MUST BE already sorted.
  @parameter percent - a float value from 0.0 to 1.0.
  @parameter key - optional accessor mapping an element of N to its value.

  @return - the interpolated percentile, or None when N is empty.
  """
  if not N:
    return None
  rank = (len(N) - 1) * percent
  lo = math.floor(rank)
  hi = math.ceil(rank)
  if lo == hi:
    # The rank lands exactly on an element.
    return key(N[int(rank)])
  # Interpolate between the two neighbouring elements.
  lo_part = key(N[int(lo)]) * (hi - rank)
  hi_part = key(N[int(hi)]) * (rank - lo)
  return lo_part + hi_part
||||
|
||||
def tidy_tag(tag):
  """Strip the common 'GRPC_PTAG_' prefix from a profiler tag, if present."""
  # str.startswith is the idiomatic (and self-documenting) form of the
  # original slice comparison tag[0:10] == 'GRPC_PTAG_'.
  if tag.startswith('GRPC_PTAG_'):
    return tag[10:]
  return tag
||||
|
||||
def time_string(values):
  """Format a sorted sample list as '50th/90th/99th' percentile string.

  Values are in seconds in the trace; the report shows microseconds.
  (Removed the unused local `num_values` from the original.)
  """
  return '%.1f/%.1f/%.1f' % (
      1e6 * percentile(values, 0.5),
      1e6 * percentile(values, 0.9),
      1e6 * percentile(values, 0.99))
||||
|
||||
def time_format(idx):
  """Build a FORMAT column renderer for the timing keyed by `idx`.

  The returned callable formats line.times[idx] via time_string(), or
  produces the empty string when the line carries no such timing.
  """
  def ent(line, idx=idx):
    return time_string(line.times[idx]) if idx in line.times else ''
  return ent
||||
|
||||
# Report columns as (header, renderer) pairs; each renderer maps a
# LineItem to the string shown in that column.  'SELF' appears twice so
# self-time sits beside both the stack-relative and scope-relative groups.
FORMAT = [
  ('TAG', lambda line: '..'*line.indent + tidy_tag(line.tag)),
  # Basename of the emitting source file, plus its line number.
  ('LOC', lambda line: '%s:%d' % (line.filename[line.filename.rfind('/')+1:], line.fileline)),
  ('FROM_STACK_START', time_format(TIME_FROM_STACK_START)),
  ('SELF', time_format(SELF_TIME)),
  ('TO_STACK_END', time_format(TIME_TO_STACK_END)),
  ('FROM_SCOPE_START', time_format(TIME_FROM_SCOPE_START)),
  ('SELF', time_format(SELF_TIME)),
  ('TO_SCOPE_END', time_format(TIME_TO_SCOPE_END)),
]

# Per-output-format banner printed above each call stack's table.
BANNER = {
  'simple': 'Count: %(count)d',
  'html': '<h1>Count: %(count)d</h1>'
}
||||
|
||||
# Emit the report.  (Python 2 print statements throughout.)
if args.fmt == 'html':
  print '<html>'
  print '<head>'
  print '<title>Profile Report</title>'
  print '</head>'

accounted_for = 0
for cs in call_stacks:
  if args.fmt in BANNER:
    print BANNER[args.fmt] % {
      'count': cs.count,
    }
  # Column headers come from FORMAT's first elements.
  header, _ = zip(*FORMAT)
  table = []
  for line in cs.lines:
    fields = []
    for _, fn in FORMAT:
      fields.append(fn(line))
    table.append(fields)
  print tabulate.tabulate(table, header, tablefmt=args.fmt)
  accounted_for += cs.count
  # Stop once 99% of observed stacks have been reported; the long tail
  # of rare stacks adds noise without insight.
  if accounted_for > .99 * total_stacks:
    break

if args.fmt == 'html':
  print '</html>'
||||
|
@ -0,0 +1,29 @@ |
||||
#!/bin/bash
# Build the basicprof benchmark binaries, run each one to produce a
# latency trace, then analyze every trace (in parallel) into reports/.

set -ex

# Run from the repository root (three levels up from this script).
# Quoted so paths containing spaces don't word-split.
cd "$(dirname "$0")/../../.."

# Benchmarks to profile; word-splitting on $BINS is intentional below.
BINS="sync_unary_ping_pong_test sync_streaming_ping_pong_test"
# $(...) instead of deprecated backticks.
CPUS=$(python -c 'import multiprocessing; print multiprocessing.cpu_count()')

make CONFIG=basicprof -j"$CPUS" $BINS

mkdir -p reports

echo '<html><head></head><body>' > reports/index.html
for bin in $BINS
do
  # Each run writes latency_trace.txt in the CWD; keep a per-binary copy.
  bins/basicprof/$bin
  mv latency_trace.txt "$bin.trace"
  echo "<a href='$bin.txt'>$bin</a><br/>" >> reports/index.html
done
for bin in $BINS
do
  # Analyses are independent; run them concurrently and wait at the end.
  tools/profiling/latency_profile/profile_analyzer.py \
    --source="$bin.trace" --fmt=simple > "reports/$bin.txt" &
done
echo '</body></html>' >> reports/index.html

wait
Loading…
Reference in new issue