Merge pull request #8854 from dgquintas/backoff

Updated backoff to spec.
pull/8944/head
Craig Tiller 8 years ago committed by GitHub
commit ad158893de
  1. 7
      doc/connection-backoff.md
  2. 10
      src/core/ext/client_channel/subchannel.c
  3. 18
      src/core/ext/lb_policy/grpclb/grpclb.c
  4. 16
      src/core/ext/resolver/dns/native/dns_resolver.c
  5. 37
      src/core/lib/support/backoff.c
  6. 7
      src/core/lib/support/backoff.h
  7. 4
      test/core/client_channel/lb_policies_test.c
  8. 10
      test/core/end2end/goaway_server_test.c
  9. 10
      test/core/end2end/tests/connectivity.c
  10. 19
      test/core/end2end/tests/simple_delayed_request.c
  11. 89
      test/core/support/backoff_test.c

@ -7,9 +7,10 @@ requests) and instead do some form of exponential backoff.
We have several parameters:
1. INITIAL_BACKOFF (how long to wait after the first failure before retrying)
2. MULTIPLIER (factor with which to multiply backoff after a failed retry)
3. MAX_BACKOFF (upper bound on backoff)
4. MIN_CONNECT_TIMEOUT (minimum time we're willing to give a connection to
1. MULTIPLIER (factor with which to multiply backoff after a failed retry)
1. JITTER (by how much to randomize backoffs).
1. MAX_BACKOFF (upper bound on backoff)
1. MIN_CONNECT_TIMEOUT (minimum time we're willing to give a connection to
complete)
## Proposed Backoff Algorithm

@ -336,14 +336,16 @@ grpc_subchannel *grpc_subchannel_create(grpc_exec_ctx *exec_ctx,
int initial_backoff_ms =
GRPC_SUBCHANNEL_INITIAL_CONNECT_BACKOFF_SECONDS * 1000;
int max_backoff_ms = GRPC_SUBCHANNEL_RECONNECT_MAX_BACKOFF_SECONDS * 1000;
int min_backoff_ms = GRPC_SUBCHANNEL_MIN_CONNECT_TIMEOUT_SECONDS * 1000;
bool fixed_reconnect_backoff = false;
if (c->args) {
for (size_t i = 0; i < c->args->num_args; i++) {
if (0 == strcmp(c->args->args[i].key,
"grpc.testing.fixed_reconnect_backoff")) {
"grpc.testing.fixed_reconnect_backoff_ms")) {
GPR_ASSERT(c->args->args[i].type == GRPC_ARG_INTEGER);
fixed_reconnect_backoff = true;
initial_backoff_ms = max_backoff_ms = grpc_channel_arg_get_integer(
initial_backoff_ms = min_backoff_ms = max_backoff_ms =
grpc_channel_arg_get_integer(
&c->args->args[i],
(grpc_integer_options){initial_backoff_ms, 100, INT_MAX});
} else if (0 == strcmp(c->args->args[i].key,
@ -362,11 +364,11 @@ grpc_subchannel *grpc_subchannel_create(grpc_exec_ctx *exec_ctx,
}
}
gpr_backoff_init(
&c->backoff_state,
&c->backoff_state, initial_backoff_ms,
fixed_reconnect_backoff ? 1.0
: GRPC_SUBCHANNEL_RECONNECT_BACKOFF_MULTIPLIER,
fixed_reconnect_backoff ? 0.0 : GRPC_SUBCHANNEL_RECONNECT_JITTER,
initial_backoff_ms, max_backoff_ms);
min_backoff_ms, max_backoff_ms);
gpr_mu_init(&c->mu);
return grpc_subchannel_index_register(exec_ctx, key, c);

@ -123,10 +123,11 @@
#include "src/core/lib/surface/channel.h"
#include "src/core/lib/transport/static_metadata.h"
#define BACKOFF_MULTIPLIER 1.6
#define BACKOFF_JITTER 0.2
#define BACKOFF_MIN_SECONDS 10
#define BACKOFF_MAX_SECONDS 60
#define GRPC_GRPCLB_MIN_CONNECT_TIMEOUT_SECONDS 20
#define GRPC_GRPCLB_INITIAL_CONNECT_BACKOFF_SECONDS 1
#define GRPC_GRPCLB_RECONNECT_BACKOFF_MULTIPLIER 1.6
#define GRPC_GRPCLB_RECONNECT_MAX_BACKOFF_SECONDS 120
#define GRPC_GRPCLB_RECONNECT_JITTER 0.2
int grpc_lb_glb_trace = 0;
@ -1107,9 +1108,12 @@ static void lb_call_init_locked(glb_lb_policy *glb_policy) {
grpc_closure_init(&glb_policy->lb_on_response_received,
lb_on_response_received, glb_policy);
gpr_backoff_init(&glb_policy->lb_call_backoff_state, BACKOFF_MULTIPLIER,
BACKOFF_JITTER, BACKOFF_MIN_SECONDS * 1000,
BACKOFF_MAX_SECONDS * 1000);
gpr_backoff_init(&glb_policy->lb_call_backoff_state,
GRPC_GRPCLB_INITIAL_CONNECT_BACKOFF_SECONDS,
GRPC_GRPCLB_RECONNECT_BACKOFF_MULTIPLIER,
GRPC_GRPCLB_RECONNECT_JITTER,
GRPC_GRPCLB_MIN_CONNECT_TIMEOUT_SECONDS * 1000,
GRPC_GRPCLB_RECONNECT_MAX_BACKOFF_SECONDS * 1000);
}
static void lb_call_destroy_locked(glb_lb_policy *glb_policy) {

@ -46,10 +46,11 @@
#include "src/core/lib/support/backoff.h"
#include "src/core/lib/support/string.h"
#define BACKOFF_MULTIPLIER 1.6
#define BACKOFF_JITTER 0.2
#define BACKOFF_MIN_SECONDS 1
#define BACKOFF_MAX_SECONDS 120
#define GRPC_DNS_MIN_CONNECT_TIMEOUT_SECONDS 1
#define GRPC_DNS_INITIAL_CONNECT_BACKOFF_SECONDS 1
#define GRPC_DNS_RECONNECT_BACKOFF_MULTIPLIER 1.6
#define GRPC_DNS_RECONNECT_MAX_BACKOFF_SECONDS 120
#define GRPC_DNS_RECONNECT_JITTER 0.2
typedef struct {
/** base class: must be first */
@ -269,8 +270,11 @@ static grpc_resolver *dns_create(grpc_resolver_args *args,
server_name_arg.value.string = (char *)path;
r->channel_args =
grpc_channel_args_copy_and_add(args->args, &server_name_arg, 1);
gpr_backoff_init(&r->backoff_state, BACKOFF_MULTIPLIER, BACKOFF_JITTER,
BACKOFF_MIN_SECONDS * 1000, BACKOFF_MAX_SECONDS * 1000);
gpr_backoff_init(&r->backoff_state, GRPC_DNS_INITIAL_CONNECT_BACKOFF_SECONDS,
GRPC_DNS_RECONNECT_BACKOFF_MULTIPLIER,
GRPC_DNS_RECONNECT_JITTER,
GRPC_DNS_MIN_CONNECT_TIMEOUT_SECONDS * 1000,
GRPC_DNS_RECONNECT_MAX_BACKOFF_SECONDS * 1000);
return &r->base;
}

@ -35,8 +35,10 @@
#include <grpc/support/useful.h>
void gpr_backoff_init(gpr_backoff *backoff, double multiplier, double jitter,
void gpr_backoff_init(gpr_backoff *backoff, int64_t initial_connect_timeout,
double multiplier, double jitter,
int64_t min_timeout_millis, int64_t max_timeout_millis) {
backoff->initial_connect_timeout = initial_connect_timeout;
backoff->multiplier = multiplier;
backoff->jitter = jitter;
backoff->min_timeout_millis = min_timeout_millis;
@ -45,9 +47,10 @@ void gpr_backoff_init(gpr_backoff *backoff, double multiplier, double jitter,
}
gpr_timespec gpr_backoff_begin(gpr_backoff *backoff, gpr_timespec now) {
backoff->current_timeout_millis = backoff->min_timeout_millis;
return gpr_time_add(
now, gpr_time_from_millis(backoff->current_timeout_millis, GPR_TIMESPAN));
backoff->current_timeout_millis = backoff->initial_connect_timeout;
const int64_t first_timeout =
GPR_MAX(backoff->current_timeout_millis, backoff->min_timeout_millis);
return gpr_time_add(now, gpr_time_from_millis(first_timeout, GPR_TIMESPAN));
}
/* Generate a random number between 0 and 1. */
@ -57,20 +60,28 @@ static double generate_uniform_random_number(uint32_t *rng_state) {
}
gpr_timespec gpr_backoff_step(gpr_backoff *backoff, gpr_timespec now) {
double new_timeout_millis =
const double new_timeout_millis =
backoff->multiplier * (double)backoff->current_timeout_millis;
double jitter_range = backoff->jitter * new_timeout_millis;
double jitter =
backoff->current_timeout_millis =
GPR_MIN((int64_t)new_timeout_millis, backoff->max_timeout_millis);
const double jitter_range_width = backoff->jitter * new_timeout_millis;
const double jitter =
(2 * generate_uniform_random_number(&backoff->rng_state) - 1) *
jitter_range;
jitter_range_width;
backoff->current_timeout_millis =
GPR_CLAMP((int64_t)(new_timeout_millis + jitter),
backoff->min_timeout_millis, backoff->max_timeout_millis);
return gpr_time_add(
(int64_t)((double)(backoff->current_timeout_millis) + jitter);
const gpr_timespec current_deadline = gpr_time_add(
now, gpr_time_from_millis(backoff->current_timeout_millis, GPR_TIMESPAN));
const gpr_timespec min_deadline = gpr_time_add(
now, gpr_time_from_millis(backoff->min_timeout_millis, GPR_TIMESPAN));
return gpr_time_max(current_deadline, min_deadline);
}
void gpr_backoff_reset(gpr_backoff *backoff) {
// forces step() to return a timeout of min_timeout_millis
backoff->current_timeout_millis = 0;
backoff->current_timeout_millis = backoff->initial_connect_timeout;
}

@ -37,7 +37,9 @@
#include <grpc/support/time.h>
typedef struct {
/// const: multiplier between retry attempts
/// const: how long to wait after the first failure before retrying
int64_t initial_connect_timeout;
/// const: factor with which to multiply backoff after a failed retry
double multiplier;
/// const: amount to randomize backoffs
double jitter;
@ -54,7 +56,8 @@ typedef struct {
} gpr_backoff;
/// Initialize backoff machinery - does not need to be destroyed
void gpr_backoff_init(gpr_backoff *backoff, double multiplier, double jitter,
void gpr_backoff_init(gpr_backoff *backoff, int64_t initial_connect_timeout,
double multiplier, double jitter,
int64_t min_timeout_millis, int64_t max_timeout_millis);
/// Begin retry loop: returns a timespec for the NEXT retry

@ -483,7 +483,7 @@ void run_spec(const test_spec *spec) {
gpr_asprintf(&client_hostport, "ipv4:%s", servers_hostports_str);
arg_array[0].type = GRPC_ARG_INTEGER;
arg_array[0].key = "grpc.testing.fixed_reconnect_backoff";
arg_array[0].key = "grpc.testing.fixed_reconnect_backoff_ms";
arg_array[0].value.integer = RETRY_TIMEOUT;
arg_array[1].type = GRPC_ARG_STRING;
arg_array[1].key = GRPC_ARG_LB_POLICY_NAME;
@ -521,7 +521,7 @@ static grpc_channel *create_client(const servers_fixture *f) {
gpr_asprintf(&client_hostport, "ipv4:%s", servers_hostports_str);
arg_array[0].type = GRPC_ARG_INTEGER;
arg_array[0].key = "grpc.testing.fixed_reconnect_backoff";
arg_array[0].key = "grpc.testing.fixed_reconnect_backoff_ms";
arg_array[0].value.integer = RETRY_TIMEOUT;
arg_array[1].type = GRPC_ARG_STRING;
arg_array[1].key = GRPC_ARG_LB_POLICY_NAME;

@ -126,8 +126,16 @@ int main(int argc, char **argv) {
char *addr;
grpc_channel_args client_args;
grpc_arg arg_array[1];
arg_array[0].type = GRPC_ARG_INTEGER;
arg_array[0].key = "grpc.testing.fixed_reconnect_backoff_ms";
arg_array[0].value.integer = 1000;
client_args.args = arg_array;
client_args.num_args = 1;
/* create a channel that picks first amongst the servers */
grpc_channel *chan = grpc_insecure_channel_create("test", NULL, NULL);
grpc_channel *chan = grpc_insecure_channel_create("test", &client_args, NULL);
/* and an initial call to them */
grpc_call *call1 = grpc_channel_create_call(
chan, NULL, GRPC_PROPAGATE_DEFAULTS, cq, "/foo", "127.0.0.1",

@ -68,7 +68,15 @@ static void test_connectivity(grpc_end2end_test_config config) {
gpr_thd_options thdopt = gpr_thd_options_default();
gpr_thd_id thdid;
config.init_client(&f, NULL);
grpc_channel_args client_args;
grpc_arg arg_array[1];
arg_array[0].type = GRPC_ARG_INTEGER;
arg_array[0].key = "grpc.testing.fixed_reconnect_backoff_ms";
arg_array[0].value.integer = 1000;
client_args.args = arg_array;
client_args.num_args = 1;
config.init_client(&f, &client_args);
ce.channel = f.client;
ce.cq = f.cq;

@ -200,21 +200,36 @@ static void simple_delayed_request_body(grpc_end2end_test_config config,
static void test_simple_delayed_request_short(grpc_end2end_test_config config) {
grpc_end2end_test_fixture f;
grpc_channel_args client_args;
grpc_arg arg_array[1];
arg_array[0].type = GRPC_ARG_INTEGER;
arg_array[0].key = "grpc.testing.fixed_reconnect_backoff_ms";
arg_array[0].value.integer = 1000;
client_args.args = arg_array;
client_args.num_args = 1;
gpr_log(GPR_INFO, "%s/%s", "test_simple_delayed_request_short", config.name);
f = config.create_fixture(NULL, NULL);
simple_delayed_request_body(config, &f, NULL, NULL, 100000);
simple_delayed_request_body(config, &f, &client_args, NULL, 100000);
end_test(&f);
config.tear_down_data(&f);
}
static void test_simple_delayed_request_long(grpc_end2end_test_config config) {
grpc_end2end_test_fixture f;
grpc_channel_args client_args;
grpc_arg arg_array[1];
arg_array[0].type = GRPC_ARG_INTEGER;
arg_array[0].key = "grpc.testing.fixed_reconnect_backoff_ms";
arg_array[0].value.integer = 1000;
client_args.args = arg_array;
client_args.num_args = 1;
gpr_log(GPR_INFO, "%s/%s", "test_simple_delayed_request_long", config.name);
f = config.create_fixture(NULL, NULL);
/* This timeout should be longer than a single retry */
simple_delayed_request_body(config, &f, NULL, NULL, 1500000);
simple_delayed_request_body(config, &f, &client_args, NULL, 1500000);
end_test(&f);
config.tear_down_data(&f);
}

@ -39,61 +39,110 @@
static void test_constant_backoff(void) {
gpr_backoff backoff;
gpr_backoff_init(&backoff, 1.0, 0.0, 1000, 1000);
gpr_backoff_init(&backoff, 200 /* initial timeout */, 1.0 /* multiplier */,
0.0 /* jitter */, 100 /* min timeout */,
1000 /* max timeout */);
gpr_timespec now = gpr_time_0(GPR_TIMESPAN);
gpr_timespec next = gpr_backoff_begin(&backoff, now);
GPR_ASSERT(gpr_time_to_millis(gpr_time_sub(next, now)) == 1000);
GPR_ASSERT(gpr_time_to_millis(gpr_time_sub(next, now)) == 200);
for (int i = 0; i < 10000; i++) {
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_to_millis(gpr_time_sub(next, now)) == 1000);
GPR_ASSERT(gpr_time_to_millis(gpr_time_sub(next, now)) == 200);
now = next;
}
}
static void test_no_jitter_backoff(void) {
static void test_min_connect(void) {
gpr_backoff backoff;
gpr_backoff_init(&backoff, 2.0, 0.0, 1, 513);
gpr_backoff_init(&backoff, 100 /* initial timeout */, 1.0 /* multiplier */,
0.0 /* jitter */, 200 /* min timeout */,
1000 /* max timeout */);
gpr_timespec now = gpr_time_0(GPR_TIMESPAN);
gpr_timespec next = gpr_backoff_begin(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(1, GPR_TIMESPAN), next) == 0);
GPR_ASSERT(gpr_time_to_millis(gpr_time_sub(next, now)) == 200);
}
static void test_no_jitter_backoff(void) {
gpr_backoff backoff;
gpr_backoff_init(&backoff, 2 /* initial timeout */, 2.0 /* multiplier */,
0.0 /* jitter */, 1 /* min timeout */,
513 /* max timeout */);
// x_1 = 2
// x_n = 2**i + x_{i-1} ( = 2**(n+1) - 2 )
gpr_timespec now = gpr_time_0(GPR_TIMESPAN);
gpr_timespec next = gpr_backoff_begin(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(2, GPR_TIMESPAN), next) == 0);
now = next;
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(3, GPR_TIMESPAN), next) == 0);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(6, GPR_TIMESPAN), next) == 0);
now = next;
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(7, GPR_TIMESPAN), next) == 0);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(14, GPR_TIMESPAN), next) == 0);
now = next;
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(15, GPR_TIMESPAN), next) == 0);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(30, GPR_TIMESPAN), next) == 0);
now = next;
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(31, GPR_TIMESPAN), next) == 0);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(62, GPR_TIMESPAN), next) == 0);
now = next;
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(63, GPR_TIMESPAN), next) == 0);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(126, GPR_TIMESPAN), next) == 0);
now = next;
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(127, GPR_TIMESPAN), next) == 0);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(254, GPR_TIMESPAN), next) == 0);
now = next;
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(255, GPR_TIMESPAN), next) == 0);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(510, GPR_TIMESPAN), next) == 0);
now = next;
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(511, GPR_TIMESPAN), next) == 0);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(1022, GPR_TIMESPAN), next) == 0);
now = next;
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(1023, GPR_TIMESPAN), next) == 0);
// Hit the maximum timeout. From this point onwards, retries will increase
// only by max timeout.
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(1535, GPR_TIMESPAN), next) == 0);
now = next;
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(1536, GPR_TIMESPAN), next) == 0);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(2048, GPR_TIMESPAN), next) == 0);
now = next;
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(2049, GPR_TIMESPAN), next) == 0);
now = next;
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(2561, GPR_TIMESPAN), next) == 0);
}
static void test_jitter_backoff(void) {
const int64_t initial_timeout = 500;
const double jitter = 0.1;
gpr_backoff backoff;
gpr_backoff_init(&backoff, initial_timeout, 1.0 /* multiplier */, jitter,
100 /* min timeout */, 1000 /* max timeout */);
backoff.rng_state = 0; // force consistent PRNG
gpr_timespec now = gpr_time_0(GPR_TIMESPAN);
gpr_timespec next = gpr_backoff_begin(&backoff, now);
GPR_ASSERT(gpr_time_to_millis(gpr_time_sub(next, now)) == 500);
int64_t expected_next_lower_bound =
(int64_t)((double)initial_timeout * (1 - jitter));
int64_t expected_next_upper_bound =
(int64_t)((double)initial_timeout * (1 + jitter));
for (int i = 0; i < 10000; i++) {
next = gpr_backoff_step(&backoff, now);
GPR_ASSERT(gpr_time_cmp(gpr_time_from_millis(2562, GPR_TIMESPAN), next) == 0);
// next-now must be within (jitter*100)% of the previous timeout.
const int64_t timeout_millis = gpr_time_to_millis(gpr_time_sub(next, now));
GPR_ASSERT(timeout_millis >= expected_next_lower_bound);
GPR_ASSERT(timeout_millis <= expected_next_upper_bound);
expected_next_lower_bound =
(int64_t)((double)timeout_millis * (1 - jitter));
expected_next_upper_bound =
(int64_t)((double)timeout_millis * (1 + jitter));
now = next;
}
}
int main(int argc, char **argv) {
@ -101,7 +150,9 @@ int main(int argc, char **argv) {
gpr_time_init();
test_constant_backoff();
test_min_connect();
test_no_jitter_backoff();
test_jitter_backoff();
return 0;
}

Loading…
Cancel
Save