xDS circuit breaking support

pull/24180/head
Donna Dionne 4 years ago
parent 86f93801e5
commit e7d7b7de86
  1. src/core/ext/filters/client_channel/lb_policy/xds/cds.cc (7 changed lines)
  2. src/core/ext/filters/client_channel/lb_policy/xds/eds.cc (100 changed lines)
  3. src/core/ext/xds/xds_api.cc (27 changed lines)
  4. src/core/ext/xds/xds_api.h (6 changed lines)
  5. src/proto/grpc/testing/xds/BUILD (1 changed line)
  6. src/proto/grpc/testing/xds/cds_for_test.proto (17 changed lines)
  7. src/proto/grpc/testing/xds/v3/BUILD (1 changed line)
  8. src/proto/grpc/testing/xds/v3/cluster.proto (17 changed lines)
  9. test/cpp/end2end/test_service_impl.h (10 changed lines)
  10. test/cpp/end2end/xds_end2end_test.cc (73 changed lines)

@@ -314,15 +314,18 @@ void CdsLb::OnClusterChanged(XdsApi::CdsUpdate cluster_data) {
if (GRPC_TRACE_FLAG_ENABLED(grpc_cds_lb_trace)) {
gpr_log(GPR_INFO,
"[cdslb %p] received CDS update from xds client %p: "
"eds_service_name=%s lrs_load_reporting_server_name=%s",
"eds_service_name=%s lrs_load_reporting_server_name=%s "
"max_concurrent_requests=%d",
this, xds_client_.get(), cluster_data.eds_service_name.c_str(),
cluster_data.lrs_load_reporting_server_name.has_value()
? cluster_data.lrs_load_reporting_server_name.value().c_str()
: "(unset)");
: "(unset)",
cluster_data.max_concurrent_requests);
}
// Construct config for child policy.
Json::Object child_config = {
{"clusterName", config_->cluster()},
{"max_concurrent_requests", cluster_data.max_concurrent_requests},
{"localityPickingPolicy",
Json::Array{
Json::Object{

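For reference, the child policy config that CdsLb constructs here would carry the new field alongside the existing ones, roughly like the sketch below (the cluster name and limit are illustrative, and the picking-policy arrays are elided):

    {
      "clusterName": "example_cluster",
      "max_concurrent_requests": 1024,
      "localityPickingPolicy": [ ... ],
      "endpointPickingPolicy": [ ... ]
    }
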
@@ -36,6 +36,7 @@
#include "src/core/ext/xds/xds_client.h"
#include "src/core/ext/xds/xds_client_stats.h"
#include "src/core/lib/channel/channel_args.h"
#include "src/core/lib/gpr/string.h"
#include "src/core/lib/gprpp/orphanable.h"
#include "src/core/lib/gprpp/ref_counted_ptr.h"
#include "src/core/lib/iomgr/timer.h"
@@ -58,13 +59,15 @@ class EdsLbConfig : public LoadBalancingPolicy::Config {
public:
EdsLbConfig(std::string cluster_name, std::string eds_service_name,
absl::optional<std::string> lrs_load_reporting_server_name,
Json locality_picking_policy, Json endpoint_picking_policy)
Json locality_picking_policy, Json endpoint_picking_policy,
uint32_t max_concurrent_requests)
: cluster_name_(std::move(cluster_name)),
eds_service_name_(std::move(eds_service_name)),
lrs_load_reporting_server_name_(
std::move(lrs_load_reporting_server_name)),
locality_picking_policy_(std::move(locality_picking_policy)),
endpoint_picking_policy_(std::move(endpoint_picking_policy)) {}
endpoint_picking_policy_(std::move(endpoint_picking_policy)),
max_concurrent_requests_(max_concurrent_requests) {}
const char* name() const override { return kEds; }
@@ -79,6 +82,9 @@ class EdsLbConfig : public LoadBalancingPolicy::Config {
const Json& endpoint_picking_policy() const {
return endpoint_picking_policy_;
}
const uint32_t max_concurrent_requests() const {
return max_concurrent_requests_;
}
private:
std::string cluster_name_;
@@ -86,6 +92,7 @@ class EdsLbConfig : public LoadBalancingPolicy::Config {
absl::optional<std::string> lrs_load_reporting_server_name_;
Json locality_picking_policy_;
Json endpoint_picking_policy_;
uint32_t max_concurrent_requests_;
};
// EDS LB policy.
@@ -145,14 +152,16 @@ class EdsLb : public LoadBalancingPolicy {
// A picker that handles drops.
class DropPicker : public SubchannelPicker {
public:
explicit DropPicker(EdsLb* eds_policy);
explicit DropPicker(RefCountedPtr<EdsLb> eds_policy);
PickResult Pick(PickArgs args) override;
private:
RefCountedPtr<EdsLb> eds_policy_;
RefCountedPtr<XdsApi::EdsUpdate::DropConfig> drop_config_;
RefCountedPtr<XdsClusterDropStats> drop_stats_;
RefCountedPtr<ChildPickerWrapper> child_picker_;
uint32_t max_concurrent_requests_;
};
class Helper : public ChannelControlHelper {
@@ -236,6 +245,8 @@ class EdsLb : public LoadBalancingPolicy {
RefCountedPtr<XdsApi::EdsUpdate::DropConfig> drop_config_;
RefCountedPtr<XdsClusterDropStats> drop_stats_;
// Current number of concurrent requests.
Atomic<uint32_t> concurrent_requests_{0};
OrphanablePtr<LoadBalancingPolicy> child_policy_;
@@ -249,13 +260,16 @@ class EdsLb : public LoadBalancingPolicy {
// EdsLb::DropPicker
//
EdsLb::DropPicker::DropPicker(EdsLb* eds_policy)
: drop_config_(eds_policy->drop_config_),
drop_stats_(eds_policy->drop_stats_),
child_picker_(eds_policy->child_picker_) {
EdsLb::DropPicker::DropPicker(RefCountedPtr<EdsLb> eds_policy)
: eds_policy_(std::move(eds_policy)),
drop_config_(eds_policy_->drop_config_),
drop_stats_(eds_policy_->drop_stats_),
child_picker_(eds_policy_->child_picker_),
max_concurrent_requests_(
eds_policy_->config_->max_concurrent_requests()) {
if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_eds_trace)) {
gpr_log(GPR_INFO, "[edslb %p] constructed new drop picker %p", eds_policy,
this);
gpr_log(GPR_INFO, "[edslb %p] constructed new drop picker %p",
eds_policy_.get(), this);
}
}
@@ -268,6 +282,17 @@ EdsLb::PickResult EdsLb::DropPicker::Pick(PickArgs args) {
result.type = PickResult::PICK_COMPLETE;
return result;
}
// Check whether we have exceeded the max concurrent requests limit.
uint32_t current = eds_policy_->concurrent_requests_.FetchAdd(1);
if (current >= max_concurrent_requests_) {
eds_policy_->concurrent_requests_.FetchSub(1);
if (drop_stats_ != nullptr) {
drop_stats_->AddUncategorizedDrops();
}
PickResult result;
result.type = PickResult::PICK_COMPLETE;
return result;
}
// If we're not dropping all calls, we should always have a child picker.
if (child_picker_ == nullptr) { // Should never happen.
PickResult result;
@@ -276,10 +301,30 @@ EdsLb::PickResult EdsLb::DropPicker::Pick(PickArgs args) {
grpc_error_set_int(GRPC_ERROR_CREATE_FROM_STATIC_STRING(
"eds drop picker not given any child picker"),
GRPC_ERROR_INT_GRPC_STATUS, GRPC_STATUS_INTERNAL);
eds_policy_->concurrent_requests_.FetchSub(1);
return result;
}
// Not dropping, so delegate to child's picker.
return child_picker_->Pick(args);
PickResult result = child_picker_->Pick(args);
if (result.type == PickResult::PICK_COMPLETE) {
EdsLb* eds_policy = static_cast<EdsLb*>(
eds_policy_->Ref(DEBUG_LOCATION, "DropPickPicker+call").release());
auto original_recv_trailing_metadata_ready =
result.recv_trailing_metadata_ready;
result.recv_trailing_metadata_ready =
[original_recv_trailing_metadata_ready, eds_policy](
grpc_error* error, MetadataInterface* metadata,
CallState* call_state) {
if (original_recv_trailing_metadata_ready != nullptr) {
original_recv_trailing_metadata_ready(error, metadata, call_state);
}
eds_policy->concurrent_requests_.FetchSub(1);
eds_policy->Unref(DEBUG_LOCATION, "DropPickPicker+call");
};
} else {
eds_policy_->concurrent_requests_.FetchSub(1);
}
return result;
}
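The drop logic above boils down to an optimistic check-then-undo pattern on an atomic counter: increment first, drop and decrement if the previous value was already at the limit, and otherwise decrement when the call finishes (via the wrapped recv_trailing_metadata_ready callback) or when the pick fails. A minimal standalone sketch of that pattern, using std::atomic in place of gRPC's Atomic wrapper (the names here are illustrative, not part of the patch):

    #include <atomic>
    #include <cstdint>

    // Illustrative helper: admits at most `max` concurrent calls.
    class ConcurrencyGate {
     public:
      explicit ConcurrencyGate(uint32_t max) : max_(max) {}

      // Optimistically reserve a slot. Returns false (and undoes the
      // reservation) if the limit was already reached.
      bool TryAcquire() {
        if (in_flight_.fetch_add(1, std::memory_order_relaxed) >= max_) {
          in_flight_.fetch_sub(1, std::memory_order_relaxed);
          return false;  // Caller should drop the call.
        }
        return true;
      }

      // Must be called exactly once per successful TryAcquire(), e.g. from
      // the call's trailing-metadata callback.
      void Release() { in_flight_.fetch_sub(1, std::memory_order_relaxed); }

     private:
      const uint32_t max_;
      std::atomic<uint32_t> in_flight_{0};
    };
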
//
@@ -469,9 +514,14 @@ void EdsLb::UpdateLocked(UpdateArgs args) {
grpc_channel_args_destroy(args_);
args_ = args.args;
args.args = nullptr;
const bool lrs_server_changed =
is_initial_update || config_->lrs_load_reporting_server_name() !=
old_config->lrs_load_reporting_server_name();
const bool max_concurrent_requests_changed =
is_initial_update || config_->max_concurrent_requests() !=
old_config->max_concurrent_requests();
// Update drop stats for load reporting if needed.
if (is_initial_update || config_->lrs_load_reporting_server_name() !=
old_config->lrs_load_reporting_server_name()) {
if (lrs_server_changed) {
drop_stats_.reset();
if (config_->lrs_load_reporting_server_name().has_value()) {
const auto key = GetLrsClusterKey();
@@ -479,6 +529,8 @@ void EdsLb::UpdateLocked(UpdateArgs args) {
config_->lrs_load_reporting_server_name().value(),
key.first /*cluster_name*/, key.second /*eds_service_name*/);
}
}
if (lrs_server_changed || max_concurrent_requests_changed) {
MaybeUpdateDropPickerLocked();
}
// Update child policy if needed.
@@ -815,14 +867,16 @@ void EdsLb::MaybeUpdateDropPickerLocked() {
// If we're dropping all calls, report READY, regardless of what (or
// whether) the child has reported.
if (drop_config_ != nullptr && drop_config_->drop_all()) {
channel_control_helper()->UpdateState(GRPC_CHANNEL_READY, absl::Status(),
absl::make_unique<DropPicker>(this));
channel_control_helper()->UpdateState(
GRPC_CHANNEL_READY, absl::Status(),
absl::make_unique<DropPicker>(Ref(DEBUG_LOCATION, "DropPicker")));
return;
}
// Update only if we have a child picker.
if (child_picker_ != nullptr) {
channel_control_helper()->UpdateState(child_state_, child_status_,
absl::make_unique<DropPicker>(this));
channel_control_helper()->UpdateState(
child_state_, child_status_,
absl::make_unique<DropPicker>(Ref(DEBUG_LOCATION, "DropPicker")));
}
}
@@ -938,13 +992,25 @@ class EdsLbFactory : public LoadBalancingPolicyFactory {
"endpointPickingPolicy", &parse_error, 1));
GRPC_ERROR_UNREF(parse_error);
}
// Max concurrent requests.
uint32_t max_concurrent_requests = 1024;
it = json.object_value().find("max_concurrent_requests");
if (it != json.object_value().end()) {
if (it->second.type() != Json::Type::NUMBER) {
error_list.push_back(GRPC_ERROR_CREATE_FROM_STATIC_STRING(
"field:max_concurrent_requests error:must be of type number"));
} else {
max_concurrent_requests =
gpr_parse_nonnegative_int(it->second.string_value().c_str());
}
}
// Construct config.
if (error_list.empty()) {
return MakeRefCounted<EdsLbConfig>(
std::move(cluster_name), std::move(eds_service_name),
std::move(lrs_load_reporting_server_name),
std::move(locality_picking_policy),
std::move(endpoint_picking_policy));
std::move(endpoint_picking_policy), max_concurrent_requests);
} else {
*error = GRPC_ERROR_CREATE_FROM_VECTOR(
"eds_experimental LB policy config", &error_list);

@@ -42,6 +42,7 @@
#include "src/core/lib/iomgr/error.h"
#include "src/core/lib/iomgr/sockaddr_utils.h"
#include "envoy/config/cluster/v3/circuit_breaker.upb.h"
#include "envoy/config/cluster/v3/cluster.upb.h"
#include "envoy/config/core/v3/address.upb.h"
#include "envoy/config/core/v3/base.upb.h"
@@ -1838,6 +1839,32 @@ grpc_error* CdsResponseParse(
}
cds_update.lrs_load_reporting_server_name.emplace("");
}
// The Cluster resource encodes the circuit breaking parameters in a list of
// Thresholds messages, where each message specifies the parameters for a
// particular RoutingPriority. We look only at the first entry in the list
// for priority DEFAULT, and default to 1024 if it is not present.
if (envoy_config_cluster_v3_Cluster_has_circuit_breakers(cluster)) {
const envoy_config_cluster_v3_CircuitBreakers* circuit_breakers =
envoy_config_cluster_v3_Cluster_circuit_breakers(cluster);
size_t num_thresholds;
const envoy_config_cluster_v3_CircuitBreakers_Thresholds* const*
thresholds = envoy_config_cluster_v3_CircuitBreakers_thresholds(
circuit_breakers, &num_thresholds);
for (size_t i = 0; i < num_thresholds; ++i) {
const auto* threshold = thresholds[i];
if (envoy_config_cluster_v3_CircuitBreakers_Thresholds_priority(
threshold) == envoy_config_core_v3_DEFAULT) {
const google_protobuf_UInt32Value* max_requests =
envoy_config_cluster_v3_CircuitBreakers_Thresholds_max_requests(
threshold);
if (max_requests != nullptr) {
cds_update.max_concurrent_requests =
google_protobuf_UInt32Value_value(max_requests);
}
break;
}
}
}
}
return GRPC_ERROR_NONE;
}
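In textproto form, a Cluster resource carrying this setting might look like the fragment below (values illustrative; only the circuit-breaking fields are shown), matching the Thresholds shape defined in the test protos later in this commit:

    circuit_breakers {
      thresholds {
        priority: DEFAULT
        max_requests { value: 100 }
      }
    }
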

@@ -178,11 +178,15 @@ class XdsApi {
// If set to the empty string, will use the same server we obtained the CDS
// data from.
absl::optional<std::string> lrs_load_reporting_server_name;
// Maximum number of outstanding requests that can be made to the
// upstream cluster.
uint32_t max_concurrent_requests = 1024;
bool operator==(const CdsUpdate& other) const {
return eds_service_name == other.eds_service_name &&
lrs_load_reporting_server_name ==
other.lrs_load_reporting_server_name;
other.lrs_load_reporting_server_name &&
max_concurrent_requests == other.max_concurrent_requests;
}
};

@@ -35,6 +35,7 @@ grpc_proto_library(
srcs = [
"cds_for_test.proto",
],
well_known_protos = True,
)
grpc_proto_library(

@@ -27,6 +27,8 @@ syntax = "proto3";
package envoy.api.v2;
import "google/protobuf/wrappers.proto";
// Aggregated Discovery Service (ADS) options. This is currently empty, but when
// set in :ref:`ConfigSource <envoy_api_msg_core.ConfigSource>` can be used to
// specify that ADS is to be used.
@@ -57,6 +59,19 @@ message ConfigSource {
}
}
enum RoutingPriority {
DEFAULT = 0;
HIGH = 1;
}
message CircuitBreakers {
message Thresholds {
RoutingPriority priority = 1;
google.protobuf.UInt32Value max_requests = 4;
}
repeated Thresholds thresholds = 1;
}
message Cluster {
// Refer to :ref:`service discovery type <arch_overview_service_discovery_types>`
// for an explanation on each type.
@@ -153,5 +168,7 @@ message Cluster {
// Configuration to use for EDS updates for the Cluster.
EdsClusterConfig eds_cluster_config = 3;
CircuitBreakers circuit_breakers = 10;
ConfigSource lrs_server = 42;
}

@@ -81,6 +81,7 @@ grpc_proto_library(
srcs = [
"cluster.proto",
],
well_known_protos = True,
deps = [
"config_source_proto",
],

@@ -20,6 +20,21 @@ package envoy.config.cluster.v3;
import "src/proto/grpc/testing/xds/v3/config_source.proto";
import "google/protobuf/wrappers.proto";
enum RoutingPriority {
DEFAULT = 0;
HIGH = 1;
}
message CircuitBreakers {
message Thresholds {
RoutingPriority priority = 1;
google.protobuf.UInt32Value max_requests = 4;
}
repeated Thresholds thresholds = 1;
}
// [#protodoc-title: Cluster configuration]
// Configuration for a single upstream cluster.
@@ -127,6 +142,8 @@ message Cluster {
// when picking a host in the cluster.
LbPolicy lb_policy = 6;
CircuitBreakers circuit_breakers = 10;
// [#not-implemented-hide:]
// If present, tells the client where to send load reports via LRS. If not present, the
// client will fall back to a client-side default, which may be either (a) don't send any

@@ -167,6 +167,7 @@ class TestMultipleServiceImpl : public RpcService {
{
std::unique_lock<std::mutex> lock(mu_);
signal_client_ = true;
++rpcs_waiting_for_client_cancel_;
}
while (!context->IsCancelled()) {
gpr_sleep_until(gpr_time_add(
@@ -174,6 +175,10 @@ class TestMultipleServiceImpl : public RpcService {
gpr_time_from_micros(request->param().client_cancel_after_us(),
GPR_TIMESPAN)));
}
{
std::unique_lock<std::mutex> lock(mu_);
--rpcs_waiting_for_client_cancel_;
}
return Status::CANCELLED;
} else if (request->has_param() &&
request->param().server_cancel_after_us()) {
@@ -425,12 +430,17 @@ class TestMultipleServiceImpl : public RpcService {
}
void ClientWaitUntilRpcStarted() { signaller_.ClientWaitUntilRpcStarted(); }
void SignalServerToContinue() { signaller_.SignalServerToContinue(); }
uint64_t RpcsWaitingForClientCancel() {
std::unique_lock<std::mutex> lock(mu_);
return rpcs_waiting_for_client_cancel_;
}
private:
bool signal_client_;
std::mutex mu_;
TestServiceSignaller signaller_;
std::unique_ptr<std::string> host_;
uint64_t rpcs_waiting_for_client_cancel_ = 0;
};
class CallbackTestServiceImpl

@@ -86,7 +86,9 @@ namespace {
using std::chrono::system_clock;
using ::envoy::config::cluster::v3::CircuitBreakers;
using ::envoy::config::cluster::v3::Cluster;
using ::envoy::config::cluster::v3::RoutingPriority;
using ::envoy::config::endpoint::v3::ClusterLoadAssignment;
using ::envoy::config::endpoint::v3::HealthStatus;
using ::envoy::config::listener::v3::Listener;
@@ -2259,6 +2261,77 @@ TEST_P(XdsResolverOnlyTest, DefaultRouteSpecifiesSlashPrefix) {
WaitForAllBackends();
}
TEST_P(XdsResolverOnlyTest, CircuitBreaking) {
class TestRpc {
public:
TestRpc() {}
void StartRpc(grpc::testing::EchoTestService::Stub* stub) {
sender_thread_ = std::thread([this, stub]() {
EchoResponse response;
EchoRequest request;
request.mutable_param()->set_client_cancel_after_us(1 * 1000 * 1000);
request.set_message(kRequestMessage);
status_ = stub->Echo(&context_, request, &response);
});
}
void CancelRpc() {
context_.TryCancel();
sender_thread_.join();
}
private:
std::thread sender_thread_;
ClientContext context_;
Status status_;
};
constexpr size_t kMaxConcurrentRequests = 10;
SetNextResolution({});
SetNextResolutionForLbChannelAllBalancers();
// Populate new EDS resources.
AdsServiceImpl::EdsResourceArgs args({
{"locality0", GetBackendPorts(0, 1)},
});
balancers_[0]->ads_service()->SetEdsResource(
AdsServiceImpl::BuildEdsResource(args));
// Update CDS resource to set max concurrent requests.
Cluster cluster = balancers_[0]->ads_service()->default_cluster();
auto* threshold = cluster.mutable_circuit_breakers()->add_thresholds();
threshold->set_priority(RoutingPriority::DEFAULT);
threshold->mutable_max_requests()->set_value(kMaxConcurrentRequests);
balancers_[0]->ads_service()->SetCdsResource(cluster);
// Send exactly kMaxConcurrentRequests long-lived RPCs.
TestRpc rpcs[kMaxConcurrentRequests];
for (size_t i = 0; i < kMaxConcurrentRequests; ++i) {
rpcs[i].StartRpc(stub_.get());
}
// Wait for all RPCs to be in flight.
while (backends_[0]->backend_service()->RpcsWaitingForClientCancel() <
kMaxConcurrentRequests) {
gpr_sleep_until(gpr_time_add(gpr_now(GPR_CLOCK_REALTIME),
gpr_time_from_micros(1 * 1000, GPR_TIMESPAN)));
}
// Sending an RPC now should fail; the error message should tell us
// that we hit the max concurrent requests limit and the call was dropped.
Status status = SendRpc();
EXPECT_FALSE(status.ok());
EXPECT_EQ(status.error_message(), "Call dropped by load balancing policy");
// Cancel one RPC to allow another one through.
rpcs[0].CancelRpc();
status = SendRpc();
EXPECT_TRUE(status.ok());
for (size_t i = 1; i < kMaxConcurrentRequests; ++i) {
rpcs[i].CancelRpc();
}
// Make sure RPCs go to the correct backend:
EXPECT_EQ(kMaxConcurrentRequests + 1,
backends_[0]->backend_service()->request_count());
}
TEST_P(XdsResolverOnlyTest, MultipleChannelsShareXdsClient) {
const char* kNewServerName = "new-server.example.com";
Listener listener = balancers_[0]->ads_service()->default_listener();
