diff --git a/CMakeLists.txt b/CMakeLists.txt index 7fd4d76b79b..31baf6469d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21105,6 +21105,7 @@ add_executable(pick_first_test test/core/client_channel/lb_policy/pick_first_test.cc test/core/event_engine/event_engine_test_utils.cc test/core/event_engine/fuzzing_event_engine/fuzzing_event_engine.cc + test/core/util/fake_stats_plugin.cc ) if(WIN32 AND MSVC) if(BUILD_SHARED_LIBS) diff --git a/build_autogenerated.yaml b/build_autogenerated.yaml index 0c05a5ee5a1..2c44a4f10a5 100644 --- a/build_autogenerated.yaml +++ b/build_autogenerated.yaml @@ -13477,12 +13477,14 @@ targets: - test/core/client_channel/lb_policy/lb_policy_test_lib.h - test/core/event_engine/event_engine_test_utils.h - test/core/event_engine/fuzzing_event_engine/fuzzing_event_engine.h + - test/core/util/fake_stats_plugin.h - test/core/util/scoped_env_var.h src: - test/core/event_engine/fuzzing_event_engine/fuzzing_event_engine.proto - test/core/client_channel/lb_policy/pick_first_test.cc - test/core/event_engine/event_engine_test_utils.cc - test/core/event_engine/fuzzing_event_engine/fuzzing_event_engine.cc + - test/core/util/fake_stats_plugin.cc deps: - gtest - protobuf diff --git a/src/core/BUILD b/src/core/BUILD index 22063aaa10c..d329d8c287d 100644 --- a/src/core/BUILD +++ b/src/core/BUILD @@ -5479,6 +5479,7 @@ grpc_cc_library( "json_object_loader", "lb_policy", "lb_policy_factory", + "metrics", "resolved_address", "subchannel_interface", "time", diff --git a/src/core/load_balancing/pick_first/pick_first.cc b/src/core/load_balancing/pick_first/pick_first.cc index 406f388ed34..79c04248031 100644 --- a/src/core/load_balancing/pick_first/pick_first.cc +++ b/src/core/load_balancing/pick_first/pick_first.cc @@ -44,6 +44,7 @@ #include "src/core/load_balancing/health_check_client.h" #include "src/core/lib/address_utils/sockaddr_utils.h" #include "src/core/lib/channel/channel_args.h" +#include "src/core/lib/channel/metrics.h" #include "src/core/lib/config/core_configuration.h" #include "src/core/lib/debug/trace.h" #include "src/core/lib/experiments/experiments.h" @@ -78,6 +79,25 @@ namespace { constexpr absl::string_view kPickFirst = "pick_first"; +const auto kMetricDisconnections = + GlobalInstrumentsRegistry::RegisterUInt64Counter( + "grpc.lb.pick_first.disconnections", + "EXPERIMENTAL. Number of times the selected subchannel becomes " + "disconnected.", + "{disconnection}", {kMetricLabelTarget}, {}, false); + +const auto kMetricConnectionAttemptsSucceeded = + GlobalInstrumentsRegistry::RegisterUInt64Counter( + "grpc.lb.pick_first.connection_attempts_succeeded", + "EXPERIMENTAL. Number of successful connection attempts.", + "{attempt}", {kMetricLabelTarget}, {}, false); + +const auto kMetricConnectionAttemptsFailed = + GlobalInstrumentsRegistry::RegisterUInt64Counter( + "grpc.lb.pick_first.connection_attempts_failed", + "EXPERIMENTAL. Number of failed connection attempts.", + "{attempt}", {kMetricLabelTarget}, {}, false); + class PickFirstConfig : public LoadBalancingPolicy::Config { public: absl::string_view name() const override { return kPickFirst; } @@ -675,6 +695,9 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( p->latest_pending_subchannel_list_.get()); } if (subchannel_list_->shutting_down_ || pending_watcher_ == nullptr) return; + auto& stats_plugins = + subchannel_list_->policy_->channel_control_helper() + ->GetStatsPluginGroup(); // The notification must be for a subchannel in either the current or // latest pending subchannel lists. GPR_ASSERT(subchannel_list_ == p->subchannel_list_.get() || @@ -693,6 +716,9 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( } // Any state change is considered to be a failure of the existing // connection. + stats_plugins.AddCounter( + kMetricDisconnections, 1, + {subchannel_list_->policy_->channel_control_helper()->GetTarget()}, {}); // TODO(roth): We could check the connectivity states of all the // subchannels here, just in case one of them happens to be READY, // and we could switch to that rather than going IDLE. @@ -749,6 +775,16 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( if (!IsPickFirstHappyEyeballsEnabled()) { subchannel_list_->in_transient_failure_ = false; } + // We consider it a successful connection attempt only if the + // previous state was CONNECTING. In particular, we don't want to + // increment this counter if we got a new address list and found the + // existing connection already in state READY. + if (old_state == GRPC_CHANNEL_CONNECTING) { + stats_plugins.AddCounter( + kMetricConnectionAttemptsSucceeded, 1, + {subchannel_list_->policy_->channel_control_helper()->GetTarget()}, + {}); + } ProcessUnselectedReadyLocked(); return; } @@ -778,6 +814,13 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( subchannel_list_->StartConnectingNextSubchannel(); return; } + // We've already started trying to connect. Any subchannel that + // reports TF is a connection attempt failure. + if (new_state == GRPC_CHANNEL_TRANSIENT_FAILURE) { + stats_plugins.AddCounter( + kMetricConnectionAttemptsFailed, 1, + {subchannel_list_->policy_->channel_control_helper()->GetTarget()}, {}); + } if (!IsPickFirstHappyEyeballsEnabled()) { // Ignore any other updates for subchannels we're not currently trying to // connect to. diff --git a/test/core/client_channel/lb_policy/BUILD b/test/core/client_channel/lb_policy/BUILD index 1f5d398a76f..292fe932664 100644 --- a/test/core/client_channel/lb_policy/BUILD +++ b/test/core/client_channel/lb_policy/BUILD @@ -56,6 +56,7 @@ grpc_cc_test( ":lb_policy_test_lib", "//src/core:channel_args", "//src/core:grpc_lb_policy_pick_first", + "//test/core/util:fake_stats_plugin", "//test/core/util:grpc_test_util", "//test/core/util:scoped_env_var", ], diff --git a/test/core/client_channel/lb_policy/pick_first_test.cc b/test/core/client_channel/lb_policy/pick_first_test.cc index 77ba738d1a0..05330ec9337 100644 --- a/test/core/client_channel/lb_policy/pick_first_test.cc +++ b/test/core/client_channel/lb_policy/pick_first_test.cc @@ -37,6 +37,7 @@ #include #include +#include "src/core/lib/channel/metrics.h" #include "src/core/lib/experiments/experiments.h" #include "src/core/lib/gprpp/debug_location.h" #include "src/core/lib/gprpp/orphanable.h" @@ -48,6 +49,7 @@ #include "src/core/load_balancing/lb_policy.h" #include "src/core/resolver/endpoint_addresses.h" #include "test/core/client_channel/lb_policy/lb_policy_test_lib.h" +#include "test/core/util/fake_stats_plugin.h" #include "test/core/util/test_config.h" namespace grpc_core { @@ -1107,6 +1109,125 @@ TEST_F(PickFirstTest, ShufflingDisabled) { } } +TEST_F(PickFirstTest, MetricDefinitionDisconnections) { + const auto* descriptor = + GlobalInstrumentsRegistryTestPeer::FindMetricDescriptorByName( + "grpc.lb.pick_first.disconnections"); + ASSERT_NE(descriptor, nullptr); + EXPECT_EQ(descriptor->value_type, + GlobalInstrumentsRegistry::ValueType::kUInt64); + EXPECT_EQ(descriptor->instrument_type, + GlobalInstrumentsRegistry::InstrumentType::kCounter); + EXPECT_EQ(descriptor->enable_by_default, false); + EXPECT_EQ(descriptor->name, "grpc.lb.pick_first.disconnections"); + EXPECT_EQ(descriptor->unit, "{disconnection}"); + EXPECT_THAT(descriptor->label_keys, ::testing::ElementsAre("grpc.target")); + EXPECT_THAT(descriptor->optional_label_keys, ::testing::ElementsAre()); +} + +TEST_F(PickFirstTest, MetricDefinitionConnectionAttemptsSucceeded) { + const auto* descriptor = + GlobalInstrumentsRegistryTestPeer::FindMetricDescriptorByName( + "grpc.lb.pick_first.connection_attempts_succeeded"); + ASSERT_NE(descriptor, nullptr); + EXPECT_EQ(descriptor->value_type, + GlobalInstrumentsRegistry::ValueType::kUInt64); + EXPECT_EQ(descriptor->instrument_type, + GlobalInstrumentsRegistry::InstrumentType::kCounter); + EXPECT_EQ(descriptor->enable_by_default, false); + EXPECT_EQ(descriptor->name, + "grpc.lb.pick_first.connection_attempts_succeeded"); + EXPECT_EQ(descriptor->unit, "{attempt}"); + EXPECT_THAT(descriptor->label_keys, ::testing::ElementsAre("grpc.target")); + EXPECT_THAT(descriptor->optional_label_keys, ::testing::ElementsAre()); +} + +TEST_F(PickFirstTest, MetricDefinitionConnectionAttemptsFailed) { + const auto* descriptor = + GlobalInstrumentsRegistryTestPeer::FindMetricDescriptorByName( + "grpc.lb.pick_first.connection_attempts_failed"); + ASSERT_NE(descriptor, nullptr); + EXPECT_EQ(descriptor->value_type, + GlobalInstrumentsRegistry::ValueType::kUInt64); + EXPECT_EQ(descriptor->instrument_type, + GlobalInstrumentsRegistry::InstrumentType::kCounter); + EXPECT_EQ(descriptor->enable_by_default, false); + EXPECT_EQ(descriptor->name, "grpc.lb.pick_first.connection_attempts_failed"); + EXPECT_EQ(descriptor->unit, "{attempt}"); + EXPECT_THAT(descriptor->label_keys, ::testing::ElementsAre("grpc.target")); + EXPECT_THAT(descriptor->optional_label_keys, ::testing::ElementsAre()); +} + +TEST_F(PickFirstTest, MetricValues) { + const auto kDisconnections = + GlobalInstrumentsRegistryTestPeer::FindUInt64CounterHandleByName( + "grpc.lb.pick_first.disconnections") + .value(); + const auto kConnectionAttemptsSucceeded = + GlobalInstrumentsRegistryTestPeer::FindUInt64CounterHandleByName( + "grpc.lb.pick_first.connection_attempts_succeeded") + .value(); + const auto kConnectionAttemptsFailed = + GlobalInstrumentsRegistryTestPeer::FindUInt64CounterHandleByName( + "grpc.lb.pick_first.connection_attempts_failed") + .value(); + const absl::string_view kLabelValues[] = {target_}; + auto stats_plugin = std::make_shared( + nullptr, /*use_disabled_by_default_metrics=*/true); + stats_plugin_group_.push_back(stats_plugin); + // Send an update containing two addresses. + constexpr std::array kAddresses = { + "ipv4:127.0.0.1:443", "ipv4:127.0.0.1:444"}; + absl::Status status = ApplyUpdate( + BuildUpdate(kAddresses, MakePickFirstConfig(false)), lb_policy()); + EXPECT_TRUE(status.ok()) << status; + // LB policy should have created a subchannel for both addresses. + auto* subchannel = FindSubchannel(kAddresses[0]); + ASSERT_NE(subchannel, nullptr); + auto* subchannel2 = FindSubchannel(kAddresses[1]); + ASSERT_NE(subchannel2, nullptr); + // When the LB policy receives the first subchannel's initial connectivity + // state notification (IDLE), it will request a connection. + EXPECT_TRUE(subchannel->ConnectionRequested()); + // This causes the subchannel to start to connect, so it reports + // CONNECTING. + subchannel->SetConnectivityState(GRPC_CHANNEL_CONNECTING); + // LB policy should have reported CONNECTING state. + ExpectConnectingUpdate(); + // The second subchannel should not be connecting. + EXPECT_FALSE(subchannel2->ConnectionRequested()); + // The first subchannel's connection attempt fails. + subchannel->SetConnectivityState(GRPC_CHANNEL_TRANSIENT_FAILURE, + absl::UnavailableError("failed to connect")); + EXPECT_THAT(stats_plugin->GetCounterValue(kConnectionAttemptsFailed, + kLabelValues, {}), + ::testing::Optional(1)); + // The LB policy will start a connection attempt on the second subchannel. + EXPECT_TRUE(subchannel2->ConnectionRequested()); + // This causes the subchannel to start to connect, so it reports + // CONNECTING. + subchannel2->SetConnectivityState(GRPC_CHANNEL_CONNECTING); + // The connection attempt succeeds. + subchannel2->SetConnectivityState(GRPC_CHANNEL_READY); + EXPECT_THAT(stats_plugin->GetCounterValue(kConnectionAttemptsSucceeded, + kLabelValues, {}), + ::testing::Optional(1)); + // The LB policy will report CONNECTING some number of times (doesn't + // matter how many) and then report READY. + auto picker = WaitForConnected(); + ASSERT_NE(picker, nullptr); + // Picker should return the same subchannel repeatedly. + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(ExpectPickComplete(picker.get()), kAddresses[1]); + } + // Now the subchannel becomes disconnected. + subchannel2->SetConnectivityState(GRPC_CHANNEL_IDLE); + ExpectReresolutionRequest(); + ExpectState(GRPC_CHANNEL_IDLE); + EXPECT_THAT(stats_plugin->GetCounterValue(kDisconnections, kLabelValues, {}), + ::testing::Optional(1)); +} + class PickFirstHealthCheckingEnabledTest : public PickFirstTest { protected: PickFirstHealthCheckingEnabledTest()