[pick_first] implement non-per-call metrics (#35984)

As per gRFC A78 (https://github.com/grpc/proposal/pull/419).

Closes #35984

COPYBARA_INTEGRATE_REVIEW=https://github.com/grpc/grpc/pull/35984 from markdroth:non_per_call_metrics_pf 02db004d5a
PiperOrigin-RevId: 611475162
pull/36029/head
Mark D. Roth 12 months ago committed by Copybara-Service
parent e39bd50716
commit de3e6f8234
  1. 1
      CMakeLists.txt
  2. 2
      build_autogenerated.yaml
  3. 1
      src/core/BUILD
  4. 43
      src/core/load_balancing/pick_first/pick_first.cc
  5. 1
      test/core/client_channel/lb_policy/BUILD
  6. 121
      test/core/client_channel/lb_policy/pick_first_test.cc

1
CMakeLists.txt generated

@ -21105,6 +21105,7 @@ add_executable(pick_first_test
test/core/client_channel/lb_policy/pick_first_test.cc
test/core/event_engine/event_engine_test_utils.cc
test/core/event_engine/fuzzing_event_engine/fuzzing_event_engine.cc
test/core/util/fake_stats_plugin.cc
)
if(WIN32 AND MSVC)
if(BUILD_SHARED_LIBS)

@ -13477,12 +13477,14 @@ targets:
- test/core/client_channel/lb_policy/lb_policy_test_lib.h
- test/core/event_engine/event_engine_test_utils.h
- test/core/event_engine/fuzzing_event_engine/fuzzing_event_engine.h
- test/core/util/fake_stats_plugin.h
- test/core/util/scoped_env_var.h
src:
- test/core/event_engine/fuzzing_event_engine/fuzzing_event_engine.proto
- test/core/client_channel/lb_policy/pick_first_test.cc
- test/core/event_engine/event_engine_test_utils.cc
- test/core/event_engine/fuzzing_event_engine/fuzzing_event_engine.cc
- test/core/util/fake_stats_plugin.cc
deps:
- gtest
- protobuf

@ -5479,6 +5479,7 @@ grpc_cc_library(
"json_object_loader",
"lb_policy",
"lb_policy_factory",
"metrics",
"resolved_address",
"subchannel_interface",
"time",

@ -44,6 +44,7 @@
#include "src/core/load_balancing/health_check_client.h"
#include "src/core/lib/address_utils/sockaddr_utils.h"
#include "src/core/lib/channel/channel_args.h"
#include "src/core/lib/channel/metrics.h"
#include "src/core/lib/config/core_configuration.h"
#include "src/core/lib/debug/trace.h"
#include "src/core/lib/experiments/experiments.h"
@ -78,6 +79,25 @@ namespace {
constexpr absl::string_view kPickFirst = "pick_first";
const auto kMetricDisconnections =
GlobalInstrumentsRegistry::RegisterUInt64Counter(
"grpc.lb.pick_first.disconnections",
"EXPERIMENTAL. Number of times the selected subchannel becomes "
"disconnected.",
"{disconnection}", {kMetricLabelTarget}, {}, false);
const auto kMetricConnectionAttemptsSucceeded =
GlobalInstrumentsRegistry::RegisterUInt64Counter(
"grpc.lb.pick_first.connection_attempts_succeeded",
"EXPERIMENTAL. Number of successful connection attempts.",
"{attempt}", {kMetricLabelTarget}, {}, false);
const auto kMetricConnectionAttemptsFailed =
GlobalInstrumentsRegistry::RegisterUInt64Counter(
"grpc.lb.pick_first.connection_attempts_failed",
"EXPERIMENTAL. Number of failed connection attempts.",
"{attempt}", {kMetricLabelTarget}, {}, false);
class PickFirstConfig : public LoadBalancingPolicy::Config {
public:
absl::string_view name() const override { return kPickFirst; }
@ -675,6 +695,9 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange(
p->latest_pending_subchannel_list_.get());
}
if (subchannel_list_->shutting_down_ || pending_watcher_ == nullptr) return;
auto& stats_plugins =
subchannel_list_->policy_->channel_control_helper()
->GetStatsPluginGroup();
// The notification must be for a subchannel in either the current or
// latest pending subchannel lists.
GPR_ASSERT(subchannel_list_ == p->subchannel_list_.get() ||
@ -693,6 +716,9 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange(
}
// Any state change is considered to be a failure of the existing
// connection.
stats_plugins.AddCounter(
kMetricDisconnections, 1,
{subchannel_list_->policy_->channel_control_helper()->GetTarget()}, {});
// TODO(roth): We could check the connectivity states of all the
// subchannels here, just in case one of them happens to be READY,
// and we could switch to that rather than going IDLE.
@ -749,6 +775,16 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange(
if (!IsPickFirstHappyEyeballsEnabled()) {
subchannel_list_->in_transient_failure_ = false;
}
// We consider it a successful connection attempt only if the
// previous state was CONNECTING. In particular, we don't want to
// increment this counter if we got a new address list and found the
// existing connection already in state READY.
if (old_state == GRPC_CHANNEL_CONNECTING) {
stats_plugins.AddCounter(
kMetricConnectionAttemptsSucceeded, 1,
{subchannel_list_->policy_->channel_control_helper()->GetTarget()},
{});
}
ProcessUnselectedReadyLocked();
return;
}
@ -778,6 +814,13 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange(
subchannel_list_->StartConnectingNextSubchannel();
return;
}
// We've already started trying to connect. Any subchannel that
// reports TF is a connection attempt failure.
if (new_state == GRPC_CHANNEL_TRANSIENT_FAILURE) {
stats_plugins.AddCounter(
kMetricConnectionAttemptsFailed, 1,
{subchannel_list_->policy_->channel_control_helper()->GetTarget()}, {});
}
if (!IsPickFirstHappyEyeballsEnabled()) {
// Ignore any other updates for subchannels we're not currently trying to
// connect to.

@ -56,6 +56,7 @@ grpc_cc_test(
":lb_policy_test_lib",
"//src/core:channel_args",
"//src/core:grpc_lb_policy_pick_first",
"//test/core/util:fake_stats_plugin",
"//test/core/util:grpc_test_util",
"//test/core/util:scoped_env_var",
],

@ -37,6 +37,7 @@
#include <grpc/grpc.h>
#include <grpc/support/json.h>
#include "src/core/lib/channel/metrics.h"
#include "src/core/lib/experiments/experiments.h"
#include "src/core/lib/gprpp/debug_location.h"
#include "src/core/lib/gprpp/orphanable.h"
@ -48,6 +49,7 @@
#include "src/core/load_balancing/lb_policy.h"
#include "src/core/resolver/endpoint_addresses.h"
#include "test/core/client_channel/lb_policy/lb_policy_test_lib.h"
#include "test/core/util/fake_stats_plugin.h"
#include "test/core/util/test_config.h"
namespace grpc_core {
@ -1107,6 +1109,125 @@ TEST_F(PickFirstTest, ShufflingDisabled) {
}
}
TEST_F(PickFirstTest, MetricDefinitionDisconnections) {
const auto* descriptor =
GlobalInstrumentsRegistryTestPeer::FindMetricDescriptorByName(
"grpc.lb.pick_first.disconnections");
ASSERT_NE(descriptor, nullptr);
EXPECT_EQ(descriptor->value_type,
GlobalInstrumentsRegistry::ValueType::kUInt64);
EXPECT_EQ(descriptor->instrument_type,
GlobalInstrumentsRegistry::InstrumentType::kCounter);
EXPECT_EQ(descriptor->enable_by_default, false);
EXPECT_EQ(descriptor->name, "grpc.lb.pick_first.disconnections");
EXPECT_EQ(descriptor->unit, "{disconnection}");
EXPECT_THAT(descriptor->label_keys, ::testing::ElementsAre("grpc.target"));
EXPECT_THAT(descriptor->optional_label_keys, ::testing::ElementsAre());
}
TEST_F(PickFirstTest, MetricDefinitionConnectionAttemptsSucceeded) {
const auto* descriptor =
GlobalInstrumentsRegistryTestPeer::FindMetricDescriptorByName(
"grpc.lb.pick_first.connection_attempts_succeeded");
ASSERT_NE(descriptor, nullptr);
EXPECT_EQ(descriptor->value_type,
GlobalInstrumentsRegistry::ValueType::kUInt64);
EXPECT_EQ(descriptor->instrument_type,
GlobalInstrumentsRegistry::InstrumentType::kCounter);
EXPECT_EQ(descriptor->enable_by_default, false);
EXPECT_EQ(descriptor->name,
"grpc.lb.pick_first.connection_attempts_succeeded");
EXPECT_EQ(descriptor->unit, "{attempt}");
EXPECT_THAT(descriptor->label_keys, ::testing::ElementsAre("grpc.target"));
EXPECT_THAT(descriptor->optional_label_keys, ::testing::ElementsAre());
}
TEST_F(PickFirstTest, MetricDefinitionConnectionAttemptsFailed) {
const auto* descriptor =
GlobalInstrumentsRegistryTestPeer::FindMetricDescriptorByName(
"grpc.lb.pick_first.connection_attempts_failed");
ASSERT_NE(descriptor, nullptr);
EXPECT_EQ(descriptor->value_type,
GlobalInstrumentsRegistry::ValueType::kUInt64);
EXPECT_EQ(descriptor->instrument_type,
GlobalInstrumentsRegistry::InstrumentType::kCounter);
EXPECT_EQ(descriptor->enable_by_default, false);
EXPECT_EQ(descriptor->name, "grpc.lb.pick_first.connection_attempts_failed");
EXPECT_EQ(descriptor->unit, "{attempt}");
EXPECT_THAT(descriptor->label_keys, ::testing::ElementsAre("grpc.target"));
EXPECT_THAT(descriptor->optional_label_keys, ::testing::ElementsAre());
}
TEST_F(PickFirstTest, MetricValues) {
const auto kDisconnections =
GlobalInstrumentsRegistryTestPeer::FindUInt64CounterHandleByName(
"grpc.lb.pick_first.disconnections")
.value();
const auto kConnectionAttemptsSucceeded =
GlobalInstrumentsRegistryTestPeer::FindUInt64CounterHandleByName(
"grpc.lb.pick_first.connection_attempts_succeeded")
.value();
const auto kConnectionAttemptsFailed =
GlobalInstrumentsRegistryTestPeer::FindUInt64CounterHandleByName(
"grpc.lb.pick_first.connection_attempts_failed")
.value();
const absl::string_view kLabelValues[] = {target_};
auto stats_plugin = std::make_shared<FakeStatsPlugin>(
nullptr, /*use_disabled_by_default_metrics=*/true);
stats_plugin_group_.push_back(stats_plugin);
// Send an update containing two addresses.
constexpr std::array<absl::string_view, 2> kAddresses = {
"ipv4:127.0.0.1:443", "ipv4:127.0.0.1:444"};
absl::Status status = ApplyUpdate(
BuildUpdate(kAddresses, MakePickFirstConfig(false)), lb_policy());
EXPECT_TRUE(status.ok()) << status;
// LB policy should have created a subchannel for both addresses.
auto* subchannel = FindSubchannel(kAddresses[0]);
ASSERT_NE(subchannel, nullptr);
auto* subchannel2 = FindSubchannel(kAddresses[1]);
ASSERT_NE(subchannel2, nullptr);
// When the LB policy receives the first subchannel's initial connectivity
// state notification (IDLE), it will request a connection.
EXPECT_TRUE(subchannel->ConnectionRequested());
// This causes the subchannel to start to connect, so it reports
// CONNECTING.
subchannel->SetConnectivityState(GRPC_CHANNEL_CONNECTING);
// LB policy should have reported CONNECTING state.
ExpectConnectingUpdate();
// The second subchannel should not be connecting.
EXPECT_FALSE(subchannel2->ConnectionRequested());
// The first subchannel's connection attempt fails.
subchannel->SetConnectivityState(GRPC_CHANNEL_TRANSIENT_FAILURE,
absl::UnavailableError("failed to connect"));
EXPECT_THAT(stats_plugin->GetCounterValue(kConnectionAttemptsFailed,
kLabelValues, {}),
::testing::Optional(1));
// The LB policy will start a connection attempt on the second subchannel.
EXPECT_TRUE(subchannel2->ConnectionRequested());
// This causes the subchannel to start to connect, so it reports
// CONNECTING.
subchannel2->SetConnectivityState(GRPC_CHANNEL_CONNECTING);
// The connection attempt succeeds.
subchannel2->SetConnectivityState(GRPC_CHANNEL_READY);
EXPECT_THAT(stats_plugin->GetCounterValue(kConnectionAttemptsSucceeded,
kLabelValues, {}),
::testing::Optional(1));
// The LB policy will report CONNECTING some number of times (doesn't
// matter how many) and then report READY.
auto picker = WaitForConnected();
ASSERT_NE(picker, nullptr);
// Picker should return the same subchannel repeatedly.
for (size_t i = 0; i < 3; ++i) {
EXPECT_EQ(ExpectPickComplete(picker.get()), kAddresses[1]);
}
// Now the subchannel becomes disconnected.
subchannel2->SetConnectivityState(GRPC_CHANNEL_IDLE);
ExpectReresolutionRequest();
ExpectState(GRPC_CHANNEL_IDLE);
EXPECT_THAT(stats_plugin->GetCounterValue(kDisconnections, kLabelValues, {}),
::testing::Optional(1));
}
class PickFirstHealthCheckingEnabledTest : public PickFirstTest {
protected:
PickFirstHealthCheckingEnabledTest()

Loading…
Cancel
Save