[chttp2] Add an experiment to separate liveness checks from ping timeouts (#34647)

Just seeing data flowing in after a ping is enough to establish liveness
of a connection, and so we can limit keepalive timeouts to that. Ping
timeouts are necessary for protocol correctness, but may be stuck behind
other traffic, so give them a little more of a grace period.

---------

Co-authored-by: ctiller <ctiller@users.noreply.github.com>
pull/34612/head
Craig Tiller 1 year ago committed by GitHub
parent 2b2a34d8dd
commit 594d4ed34b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 49
      src/core/ext/transport/chttp2/transport/chttp2_transport.cc
  2. 14
      src/core/ext/transport/chttp2/transport/internal.h
  3. 30
      src/core/ext/transport/chttp2/transport/writing.cc
  4. 24
      src/core/lib/experiments/experiments.cc
  5. 11
      src/core/lib/experiments/experiments.h
  6. 9
      src/core/lib/experiments/experiments.yaml
  7. 2
      src/core/lib/experiments/rollouts.yaml
  8. 2
      test/core/end2end/tests/keepalive_timeout.cc

@ -407,6 +407,12 @@ static void read_channel_args(grpc_chttp2_transport* t,
? grpc_core::Duration::Infinity()
: (t->is_client ? g_default_client_keepalive_timeout
: g_default_server_keepalive_timeout)));
t->ping_timeout = std::max(
grpc_core::Duration::Zero(),
channel_args.GetDurationFromIntMillis(GRPC_ARG_PING_TIMEOUT_MS)
.value_or(t->keepalive_time == grpc_core::Duration::Infinity()
? grpc_core::Duration::Infinity()
: grpc_core::Duration::Minutes(1)));
if (t->is_client) {
t->keepalive_permit_without_calls =
channel_args.GetBool(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS)
@ -739,6 +745,12 @@ static void close_transport_locked(grpc_chttp2_transport* t,
t->closed_with_error = error;
connectivity_state_set(t, GRPC_CHANNEL_SHUTDOWN, absl::Status(),
"close_transport");
if (t->keepalive_ping_timeout_handle !=
grpc_event_engine::experimental::EventEngine::TaskHandle::kInvalid) {
t->event_engine->Cancel(std::exchange(
t->keepalive_ping_timeout_handle,
grpc_event_engine::experimental::EventEngine::TaskHandle::kInvalid));
}
if (t->settings_ack_watchdog !=
grpc_event_engine::experimental::EventEngine::TaskHandle::kInvalid) {
t->event_engine->Cancel(std::exchange(
@ -1772,6 +1784,27 @@ void grpc_chttp2_ack_ping(grpc_chttp2_transport* t, uint64_t id) {
}
}
void grpc_chttp2_keepalive_timeout(
grpc_core::RefCountedPtr<grpc_chttp2_transport> t) {
t->combiner->Run(
grpc_core::NewClosure([t](grpc_error_handle) {
gpr_log(GPR_INFO, "%s: Keepalive timeout. Closing transport.",
std::string(t->peer_string.as_string_view()).c_str());
send_goaway(
t.get(),
grpc_error_set_int(GRPC_ERROR_CREATE("keepalive_timeout"),
grpc_core::StatusIntProperty::kHttp2Error,
GRPC_HTTP2_ENHANCE_YOUR_CALM),
/*immediate_disconnect_hint=*/true);
close_transport_locked(
t.get(),
grpc_error_set_int(GRPC_ERROR_CREATE("keepalive timeout"),
grpc_core::StatusIntProperty::kRpcStatus,
GRPC_STATUS_UNAVAILABLE));
}),
absl::OkStatus());
}
void grpc_chttp2_ping_timeout(
grpc_core::RefCountedPtr<grpc_chttp2_transport> t) {
t->combiner->Run(
@ -1835,6 +1868,8 @@ class GracefulGoaway : public grpc_core::RefCounted<GracefulGoaway> {
grpc_chttp2_goaway_append((1u << 31) - 1, 0, grpc_empty_slice(), &t->qbuf);
t->keepalive_timeout =
std::min(t->keepalive_timeout, grpc_core::Duration::Seconds(20));
t->ping_timeout =
std::min(t->ping_timeout, grpc_core::Duration::Seconds(20));
send_ping_locked(
t, nullptr, GRPC_CLOSURE_INIT(&on_ping_ack_, OnPingAck, this, nullptr));
grpc_chttp2_initiate_write(t, GRPC_CHTTP2_INITIATE_WRITE_GOAWAY_SENT);
@ -2754,6 +2789,20 @@ static void read_action_parse_loop_locked(
static void read_action_locked(
grpc_core::RefCountedPtr<grpc_chttp2_transport> t,
grpc_error_handle error) {
// got an incoming read, cancel any pending keepalive timers
t->keepalive_incoming_data_wanted = false;
if (t->keepalive_ping_timeout_handle !=
grpc_event_engine::experimental::EventEngine::TaskHandle::kInvalid) {
if (GRPC_TRACE_FLAG_ENABLED(grpc_ping_trace) ||
GRPC_TRACE_FLAG_ENABLED(grpc_keepalive_trace)) {
gpr_log(GPR_INFO,
"%s[%p]: Clear keepalive timer because data was received",
t->is_client ? "CLIENT" : "SERVER", t.get());
}
t->event_engine->Cancel(std::exchange(
t->keepalive_ping_timeout_handle,
grpc_event_engine::experimental::EventEngine::TaskHandle::kInvalid));
}
grpc_error_handle err = error;
if (!err.ok()) {
err = grpc_error_set_int(

@ -458,7 +458,8 @@ struct grpc_chttp2_transport : public grpc_core::KeepsGrpcInitialized {
keepalive_ping_timer_handle;
/// time duration in between pings
grpc_core::Duration keepalive_time;
/// grace period for a ping to complete before watchdog kicks in
/// grace period to wait for data after sending a ping before keepalives
/// timeout
grpc_core::Duration keepalive_timeout;
/// keep-alive state machine state
grpc_chttp2_keepalive_state keepalive_state;
@ -475,6 +476,11 @@ struct grpc_chttp2_transport : public grpc_core::KeepsGrpcInitialized {
uint32_t num_pending_induced_frames = 0;
uint32_t incoming_stream_id = 0;
/// grace period after sending a ping to wait for the ping ack
grpc_core::Duration ping_timeout;
grpc_event_engine::experimental::EventEngine::TaskHandle
keepalive_ping_timeout_handle =
grpc_event_engine::experimental::EventEngine::TaskHandle::kInvalid;
/// grace period before settings timeout expires
grpc_core::Duration settings_timeout;
@ -523,6 +529,8 @@ struct grpc_chttp2_transport : public grpc_core::KeepsGrpcInitialized {
bool bdp_ping_started = false;
// True if pings should be acked
bool ack_pings = true;
/// True if the keepalive system wants to see some data incoming
bool keepalive_incoming_data_wanted = false;
// What percentage of rst_stream frames on the server should cause a ping
// frame to be generated.
@ -653,6 +661,8 @@ struct grpc_chttp2_stream {
bool traced = false;
};
#define GRPC_ARG_PING_TIMEOUT_MS "grpc.http2.ping_timeout_ms"
/// Transport writing call flow:
/// grpc_chttp2_initiate_write() is called anywhere that we know bytes need to
/// go out on the wire.
@ -762,6 +772,8 @@ void grpc_chttp2_complete_closure_step(grpc_chttp2_transport* t,
const char* desc,
grpc_core::DebugLocation whence = {});
void grpc_chttp2_keepalive_timeout(
grpc_core::RefCountedPtr<grpc_chttp2_transport> t);
void grpc_chttp2_ping_timeout(
grpc_core::RefCountedPtr<grpc_chttp2_transport> t);

@ -124,6 +124,7 @@ static void maybe_initiate_ping(grpc_chttp2_transport* t) {
const uint64_t id = t->ping_callbacks.StartPing(t->bitgen);
grpc_slice_buffer_add(t->outbuf.c_slice_buffer(),
grpc_chttp2_ping_create(false, id));
t->keepalive_incoming_data_wanted = true;
if (t->channelz_socket != nullptr) {
t->channelz_socket->RecordKeepaliveSent();
}
@ -708,8 +709,11 @@ void grpc_chttp2_end_write(grpc_chttp2_transport* t, grpc_error_handle error) {
t->keepalive_timeout != grpc_core::Duration::Infinity()) {
// Set ping timeout after finishing write so we don't measure our own send
// time.
const auto timeout = grpc_core::IsSeparatePingFromKeepaliveEnabled()
? t->ping_timeout
: t->keepalive_timeout;
auto id = t->ping_callbacks.OnPingTimeout(
t->keepalive_timeout, t->event_engine.get(), [t = t->Ref()] {
timeout, t->event_engine.get(), [t = t->Ref()] {
grpc_core::ApplicationCallbackExecCtx callback_exec_ctx;
grpc_core::ExecCtx exec_ctx;
grpc_chttp2_ping_timeout(t);
@ -717,8 +721,28 @@ void grpc_chttp2_end_write(grpc_chttp2_transport* t, grpc_error_handle error) {
if (GRPC_TRACE_FLAG_ENABLED(grpc_ping_trace) && id.has_value()) {
gpr_log(GPR_INFO,
"%s[%p]: Set ping timeout timer of %s for ping id %" PRIx64,
t->is_client ? "CLIENT" : "SERVER", t,
t->keepalive_timeout.ToString().c_str(), id.value());
t->is_client ? "CLIENT" : "SERVER", t, timeout.ToString().c_str(),
id.value());
}
if (grpc_core::IsSeparatePingFromKeepaliveEnabled() &&
t->keepalive_incoming_data_wanted &&
t->keepalive_timeout < t->ping_timeout &&
t->keepalive_ping_timeout_handle !=
grpc_event_engine::experimental::EventEngine::TaskHandle::
kInvalid) {
if (GRPC_TRACE_FLAG_ENABLED(grpc_ping_trace) ||
GRPC_TRACE_FLAG_ENABLED(grpc_keepalive_trace)) {
gpr_log(GPR_INFO, "%s[%p]: Set keepalive ping timeout timer of %s",
t->is_client ? "CLIENT" : "SERVER", t,
t->keepalive_timeout.ToString().c_str());
}
t->keepalive_ping_timeout_handle =
t->event_engine->RunAfter(t->keepalive_timeout, [t = t->Ref()] {
grpc_core::ApplicationCallbackExecCtx callback_exec_ctx;
grpc_core::ExecCtx exec_ctx;
grpc_chttp2_keepalive_timeout(t);
});
}
}

@ -121,6 +121,12 @@ const char* const description_schedule_cancellation_over_write =
"Allow cancellation op to be scheduled over a write";
const char* const additional_constraints_schedule_cancellation_over_write =
"{}";
const char* const description_separate_ping_from_keepalive =
"Keep a different keepalive timeout (resolution is seeing data after "
"sending a ping) from a ping timeout (resolution is getting a ping ack "
"after sending a ping) The first can be short and determines liveness. The "
"second can be longer and determines protocol correctness.";
const char* const additional_constraints_separate_ping_from_keepalive = "{}";
const char* const description_server_privacy = "If set, server privacy";
const char* const additional_constraints_server_privacy = "{}";
const char* const description_settings_timeout =
@ -229,6 +235,8 @@ const ExperimentMetadata g_experiment_metadata[] = {
{"schedule_cancellation_over_write",
description_schedule_cancellation_over_write,
additional_constraints_schedule_cancellation_over_write, false, true},
{"separate_ping_from_keepalive", description_separate_ping_from_keepalive,
additional_constraints_separate_ping_from_keepalive, true, true},
{"server_privacy", description_server_privacy,
additional_constraints_server_privacy, false, false},
{"settings_timeout", description_settings_timeout,
@ -355,6 +363,12 @@ const char* const description_schedule_cancellation_over_write =
"Allow cancellation op to be scheduled over a write";
const char* const additional_constraints_schedule_cancellation_over_write =
"{}";
const char* const description_separate_ping_from_keepalive =
"Keep a different keepalive timeout (resolution is seeing data after "
"sending a ping) from a ping timeout (resolution is getting a ping ack "
"after sending a ping) The first can be short and determines liveness. The "
"second can be longer and determines protocol correctness.";
const char* const additional_constraints_separate_ping_from_keepalive = "{}";
const char* const description_server_privacy = "If set, server privacy";
const char* const additional_constraints_server_privacy = "{}";
const char* const description_settings_timeout =
@ -463,6 +477,8 @@ const ExperimentMetadata g_experiment_metadata[] = {
{"schedule_cancellation_over_write",
description_schedule_cancellation_over_write,
additional_constraints_schedule_cancellation_over_write, false, true},
{"separate_ping_from_keepalive", description_separate_ping_from_keepalive,
additional_constraints_separate_ping_from_keepalive, true, true},
{"server_privacy", description_server_privacy,
additional_constraints_server_privacy, false, false},
{"settings_timeout", description_settings_timeout,
@ -589,6 +605,12 @@ const char* const description_schedule_cancellation_over_write =
"Allow cancellation op to be scheduled over a write";
const char* const additional_constraints_schedule_cancellation_over_write =
"{}";
const char* const description_separate_ping_from_keepalive =
"Keep a different keepalive timeout (resolution is seeing data after "
"sending a ping) from a ping timeout (resolution is getting a ping ack "
"after sending a ping) The first can be short and determines liveness. The "
"second can be longer and determines protocol correctness.";
const char* const additional_constraints_separate_ping_from_keepalive = "{}";
const char* const description_server_privacy = "If set, server privacy";
const char* const additional_constraints_server_privacy = "{}";
const char* const description_settings_timeout =
@ -697,6 +719,8 @@ const ExperimentMetadata g_experiment_metadata[] = {
{"schedule_cancellation_over_write",
description_schedule_cancellation_over_write,
additional_constraints_schedule_cancellation_over_write, false, true},
{"separate_ping_from_keepalive", description_separate_ping_from_keepalive,
additional_constraints_separate_ping_from_keepalive, true, true},
{"server_privacy", description_server_privacy,
additional_constraints_server_privacy, false, false},
{"settings_timeout", description_settings_timeout,

@ -103,6 +103,8 @@ inline bool IsRegisteredMethodLookupInTransportEnabled() { return true; }
inline bool IsRoundRobinDelegateToPickFirstEnabled() { return true; }
inline bool IsRstpitEnabled() { return false; }
inline bool IsScheduleCancellationOverWriteEnabled() { return false; }
#define GRPC_EXPERIMENT_IS_INCLUDED_SEPARATE_PING_FROM_KEEPALIVE
inline bool IsSeparatePingFromKeepaliveEnabled() { return true; }
inline bool IsServerPrivacyEnabled() { return false; }
#define GRPC_EXPERIMENT_IS_INCLUDED_SETTINGS_TIMEOUT
inline bool IsSettingsTimeoutEnabled() { return true; }
@ -165,6 +167,8 @@ inline bool IsRegisteredMethodLookupInTransportEnabled() { return true; }
inline bool IsRoundRobinDelegateToPickFirstEnabled() { return true; }
inline bool IsRstpitEnabled() { return false; }
inline bool IsScheduleCancellationOverWriteEnabled() { return false; }
#define GRPC_EXPERIMENT_IS_INCLUDED_SEPARATE_PING_FROM_KEEPALIVE
inline bool IsSeparatePingFromKeepaliveEnabled() { return true; }
inline bool IsServerPrivacyEnabled() { return false; }
#define GRPC_EXPERIMENT_IS_INCLUDED_SETTINGS_TIMEOUT
inline bool IsSettingsTimeoutEnabled() { return true; }
@ -227,6 +231,8 @@ inline bool IsRegisteredMethodLookupInTransportEnabled() { return true; }
inline bool IsRoundRobinDelegateToPickFirstEnabled() { return true; }
inline bool IsRstpitEnabled() { return false; }
inline bool IsScheduleCancellationOverWriteEnabled() { return false; }
#define GRPC_EXPERIMENT_IS_INCLUDED_SEPARATE_PING_FROM_KEEPALIVE
inline bool IsSeparatePingFromKeepaliveEnabled() { return true; }
inline bool IsServerPrivacyEnabled() { return false; }
#define GRPC_EXPERIMENT_IS_INCLUDED_SETTINGS_TIMEOUT
inline bool IsSettingsTimeoutEnabled() { return true; }
@ -272,6 +278,7 @@ enum ExperimentIds {
kExperimentIdRoundRobinDelegateToPickFirst,
kExperimentIdRstpit,
kExperimentIdScheduleCancellationOverWrite,
kExperimentIdSeparatePingFromKeepalive,
kExperimentIdServerPrivacy,
kExperimentIdSettingsTimeout,
kExperimentIdTarpit,
@ -393,6 +400,10 @@ inline bool IsRstpitEnabled() {
inline bool IsScheduleCancellationOverWriteEnabled() {
return IsExperimentEnabled(kExperimentIdScheduleCancellationOverWrite);
}
#define GRPC_EXPERIMENT_IS_INCLUDED_SEPARATE_PING_FROM_KEEPALIVE
inline bool IsSeparatePingFromKeepaliveEnabled() {
return IsExperimentEnabled(kExperimentIdSeparatePingFromKeepalive);
}
#define GRPC_EXPERIMENT_IS_INCLUDED_SERVER_PRIVACY
inline bool IsServerPrivacyEnabled() {
return IsExperimentEnabled(kExperimentIdServerPrivacy);

@ -205,6 +205,15 @@
expiry: 2024/01/01
owner: vigneshbabu@google.com
test_tags: []
- name: separate_ping_from_keepalive
description:
Keep a different keepalive timeout (resolution is seeing data after sending a ping)
from a ping timeout (resolution is getting a ping ack after sending a ping)
The first can be short and determines liveness.
The second can be longer and determines protocol correctness.
expiry: 2024/03/03
owner: ctiller@google.com
test_tags: []
- name: server_privacy
description:
If set, server privacy

@ -102,6 +102,8 @@
default: false
- name: schedule_cancellation_over_write
default: false
- name: separate_ping_from_keepalive
default: true
- name: server_privacy
default: false
- name: settings_timeout

@ -24,6 +24,7 @@
#include <grpc/impl/channel_arg_names.h>
#include <grpc/status.h>
#include "src/core/ext/transport/chttp2/transport/internal.h"
#include "src/core/lib/channel/channel_args.h"
#include "src/core/lib/config/config_vars.h"
#include "src/core/lib/gprpp/time.h"
@ -41,6 +42,7 @@ CORE_END2END_TEST(Http2SingleHopTest, KeepaliveTimeout) {
InitClient(ChannelArgs()
.Set(GRPC_ARG_KEEPALIVE_TIME_MS, 10)
.Set(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, 0)
.Set(GRPC_ARG_PING_TIMEOUT_MS, 0)
.Set(GRPC_ARG_HTTP2_BDP_PROBE, false));
auto c = NewClientCall("/foo").Timeout(Duration::Minutes(1)).Create();
IncomingMetadata server_initial_metadata;

Loading…
Cancel
Save