From 083bbee4805c14ce62e6c9535fe936f68b854c4f Mon Sep 17 00:00:00 2001 From: "Mark D. Roth" Date: Fri, 14 Jul 2023 15:59:42 -0700 Subject: [PATCH] [LB policies] revert changes for dualstack design (#33718) This reverts the following PRs: #32692 #33087 #33093 #33427 #33568 These changes seem to have introduced some flaky crashes. Reverting while I investigate. --- CMakeLists.txt | 2 - Makefile | 2 - Package.swift | 3 +- build_autogenerated.yaml | 6 +- config.m4 | 1 - config.w32 | 1 - gRPC-C++.podspec | 4 +- gRPC-Core.podspec | 5 +- grpc.gemspec | 3 +- grpc.gyp | 2 - package.xml | 3 +- src/core/BUILD | 41 +- .../client_channel/lb_policy/endpoint_list.cc | 188 ---- .../client_channel/lb_policy/endpoint_list.h | 214 ----- .../lb_policy/health_check_client.cc | 47 +- .../lb_policy/health_check_client_internal.h | 5 +- .../outlier_detection/outlier_detection.cc | 113 ++- .../outlier_detection/outlier_detection.h | 7 + .../lb_policy/pick_first/pick_first.cc | 531 +++-------- .../lb_policy/pick_first/pick_first.h | 16 - .../lb_policy/ring_hash/ring_hash.cc | 891 +++++++++--------- .../lb_policy/round_robin/round_robin.cc | 438 +++++---- .../lb_policy/subchannel_list.h | 488 ++++++++++ .../weighted_round_robin.cc | 652 +++++++------ .../lb_policy/xds/xds_override_host.cc | 41 +- .../ext/filters/client_channel/subchannel.h | 2 - src/core/lib/resolver/server_address.h | 1 - src/python/grpcio/grpc_core_dependencies.py | 1 - .../lb_policy/lb_policy_test_lib.h | 146 +-- .../lb_policy/outlier_detection_test.cc | 13 +- .../lb_policy/pick_first_test.cc | 31 +- .../lb_policy/round_robin_test.cc | 4 +- .../lb_policy/weighted_round_robin_test.cc | 4 +- .../lb_policy/xds_override_host_test.cc | 36 +- test/cpp/end2end/client_lb_end2end_test.cc | 15 +- .../end2end/xds/xds_ring_hash_end2end_test.cc | 6 +- tools/doxygen/Doxyfile.c++.internal | 3 +- tools/doxygen/Doxyfile.core.internal | 3 +- 38 files changed, 1899 insertions(+), 2070 deletions(-) delete mode 100644 src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc delete mode 100644 src/core/ext/filters/client_channel/lb_policy/endpoint_list.h create mode 100644 src/core/ext/filters/client_channel/lb_policy/subchannel_list.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a89ee75fa1..325581d4b3c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1698,7 +1698,6 @@ add_library(grpc src/core/ext/filters/client_channel/http_proxy.cc src/core/ext/filters/client_channel/lb_policy/address_filtering.cc src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc - src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.cc @@ -2740,7 +2739,6 @@ add_library(grpc_unsecure src/core/ext/filters/client_channel/http_proxy.cc src/core/ext/filters/client_channel/lb_policy/address_filtering.cc src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc - src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.cc diff --git a/Makefile b/Makefile index e536932400b..2d516569002 100644 --- a/Makefile +++ b/Makefile @@ -980,7 +980,6 @@ LIBGRPC_SRC = \ src/core/ext/filters/client_channel/http_proxy.cc \ src/core/ext/filters/client_channel/lb_policy/address_filtering.cc \ src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc \ - src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc \ src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc \ src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc \ src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.cc \ @@ -1875,7 +1874,6 @@ LIBGRPC_UNSECURE_SRC = \ src/core/ext/filters/client_channel/http_proxy.cc \ src/core/ext/filters/client_channel/lb_policy/address_filtering.cc \ src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc \ - src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc \ src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc \ src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc \ src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.cc \ diff --git a/Package.swift b/Package.swift index 08a48cb81a7..9c40122a673 100644 --- a/Package.swift +++ b/Package.swift @@ -148,8 +148,6 @@ let package = Package( "src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h", "src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc", "src/core/ext/filters/client_channel/lb_policy/child_policy_handler.h", - "src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc", - "src/core/ext/filters/client_channel/lb_policy/endpoint_list.h", "src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc", "src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.h", "src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc", @@ -175,6 +173,7 @@ let package = Package( "src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.h", "src/core/ext/filters/client_channel/lb_policy/rls/rls.cc", "src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.cc", + "src/core/ext/filters/client_channel/lb_policy/subchannel_list.h", "src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.cc", "src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.h", "src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc", diff --git a/build_autogenerated.yaml b/build_autogenerated.yaml index ccef561955f..f8a523b124c 100644 --- a/build_autogenerated.yaml +++ b/build_autogenerated.yaml @@ -231,7 +231,6 @@ libs: - src/core/ext/filters/client_channel/lb_policy/address_filtering.h - src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h - src/core/ext/filters/client_channel/lb_policy/child_policy_handler.h - - src/core/ext/filters/client_channel/lb_policy/endpoint_list.h - src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.h - src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.h - src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.h @@ -244,6 +243,7 @@ libs: - src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.h - src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.h - src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.h + - src/core/ext/filters/client_channel/lb_policy/subchannel_list.h - src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.h - src/core/ext/filters/client_channel/lb_policy/xds/xds_channel_args.h - src/core/ext/filters/client_channel/lb_policy/xds/xds_override_host.h @@ -1037,7 +1037,6 @@ libs: - src/core/ext/filters/client_channel/http_proxy.cc - src/core/ext/filters/client_channel/lb_policy/address_filtering.cc - src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc - - src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc - src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc - src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc - src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.cc @@ -1958,7 +1957,6 @@ libs: - src/core/ext/filters/client_channel/lb_policy/address_filtering.h - src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h - src/core/ext/filters/client_channel/lb_policy/child_policy_handler.h - - src/core/ext/filters/client_channel/lb_policy/endpoint_list.h - src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.h - src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.h - src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.h @@ -1970,6 +1968,7 @@ libs: - src/core/ext/filters/client_channel/lb_policy/oob_backend_metric_internal.h - src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.h - src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.h + - src/core/ext/filters/client_channel/lb_policy/subchannel_list.h - src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.h - src/core/ext/filters/client_channel/local_subchannel_pool.h - src/core/ext/filters/client_channel/resolver/dns/c_ares/dns_resolver_ares.h @@ -2371,7 +2370,6 @@ libs: - src/core/ext/filters/client_channel/http_proxy.cc - src/core/ext/filters/client_channel/lb_policy/address_filtering.cc - src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc - - src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc - src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc - src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc - src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.cc diff --git a/config.m4 b/config.m4 index f1113780744..e7ddb42f6c3 100644 --- a/config.m4 +++ b/config.m4 @@ -59,7 +59,6 @@ if test "$PHP_GRPC" != "no"; then src/core/ext/filters/client_channel/http_proxy.cc \ src/core/ext/filters/client_channel/lb_policy/address_filtering.cc \ src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc \ - src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc \ src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc \ src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc \ src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.cc \ diff --git a/config.w32 b/config.w32 index 2019cef3b90..64851c79e43 100644 --- a/config.w32 +++ b/config.w32 @@ -24,7 +24,6 @@ if (PHP_GRPC != "no") { "src\\core\\ext\\filters\\client_channel\\http_proxy.cc " + "src\\core\\ext\\filters\\client_channel\\lb_policy\\address_filtering.cc " + "src\\core\\ext\\filters\\client_channel\\lb_policy\\child_policy_handler.cc " + - "src\\core\\ext\\filters\\client_channel\\lb_policy\\endpoint_list.cc " + "src\\core\\ext\\filters\\client_channel\\lb_policy\\grpclb\\client_load_reporting_filter.cc " + "src\\core\\ext\\filters\\client_channel\\lb_policy\\grpclb\\grpclb.cc " + "src\\core\\ext\\filters\\client_channel\\lb_policy\\grpclb\\grpclb_balancer_addresses.cc " + diff --git a/gRPC-C++.podspec b/gRPC-C++.podspec index 97e621bb6fd..ffc0b135179 100644 --- a/gRPC-C++.podspec +++ b/gRPC-C++.podspec @@ -263,7 +263,6 @@ Pod::Spec.new do |s| 'src/core/ext/filters/client_channel/lb_policy/address_filtering.h', 'src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h', 'src/core/ext/filters/client_channel/lb_policy/child_policy_handler.h', - 'src/core/ext/filters/client_channel/lb_policy/endpoint_list.h', 'src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.h', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.h', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.h', @@ -276,6 +275,7 @@ Pod::Spec.new do |s| 'src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.h', 'src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.h', 'src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.h', + 'src/core/ext/filters/client_channel/lb_policy/subchannel_list.h', 'src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.h', 'src/core/ext/filters/client_channel/lb_policy/xds/xds_channel_args.h', 'src/core/ext/filters/client_channel/lb_policy/xds/xds_override_host.h', @@ -1327,7 +1327,6 @@ Pod::Spec.new do |s| 'src/core/ext/filters/client_channel/lb_policy/address_filtering.h', 'src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h', 'src/core/ext/filters/client_channel/lb_policy/child_policy_handler.h', - 'src/core/ext/filters/client_channel/lb_policy/endpoint_list.h', 'src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.h', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.h', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.h', @@ -1340,6 +1339,7 @@ Pod::Spec.new do |s| 'src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.h', 'src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.h', 'src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.h', + 'src/core/ext/filters/client_channel/lb_policy/subchannel_list.h', 'src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.h', 'src/core/ext/filters/client_channel/lb_policy/xds/xds_channel_args.h', 'src/core/ext/filters/client_channel/lb_policy/xds/xds_override_host.h', diff --git a/gRPC-Core.podspec b/gRPC-Core.podspec index 90676648594..36f40caf386 100644 --- a/gRPC-Core.podspec +++ b/gRPC-Core.podspec @@ -249,8 +249,6 @@ Pod::Spec.new do |s| 'src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h', 'src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc', 'src/core/ext/filters/client_channel/lb_policy/child_policy_handler.h', - 'src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc', - 'src/core/ext/filters/client_channel/lb_policy/endpoint_list.h', 'src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc', 'src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.h', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc', @@ -276,6 +274,7 @@ Pod::Spec.new do |s| 'src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.h', 'src/core/ext/filters/client_channel/lb_policy/rls/rls.cc', 'src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.cc', + 'src/core/ext/filters/client_channel/lb_policy/subchannel_list.h', 'src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.cc', 'src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.h', 'src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc', @@ -2079,7 +2078,6 @@ Pod::Spec.new do |s| 'src/core/ext/filters/client_channel/lb_policy/address_filtering.h', 'src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h', 'src/core/ext/filters/client_channel/lb_policy/child_policy_handler.h', - 'src/core/ext/filters/client_channel/lb_policy/endpoint_list.h', 'src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.h', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.h', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.h', @@ -2092,6 +2090,7 @@ Pod::Spec.new do |s| 'src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.h', 'src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.h', 'src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.h', + 'src/core/ext/filters/client_channel/lb_policy/subchannel_list.h', 'src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.h', 'src/core/ext/filters/client_channel/lb_policy/xds/xds_channel_args.h', 'src/core/ext/filters/client_channel/lb_policy/xds/xds_override_host.h', diff --git a/grpc.gemspec b/grpc.gemspec index edccfbfed4c..f3dccb93dc2 100644 --- a/grpc.gemspec +++ b/grpc.gemspec @@ -154,8 +154,6 @@ Gem::Specification.new do |s| s.files += %w( src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h ) s.files += %w( src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc ) s.files += %w( src/core/ext/filters/client_channel/lb_policy/child_policy_handler.h ) - s.files += %w( src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc ) - s.files += %w( src/core/ext/filters/client_channel/lb_policy/endpoint_list.h ) s.files += %w( src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc ) s.files += %w( src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.h ) s.files += %w( src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc ) @@ -181,6 +179,7 @@ Gem::Specification.new do |s| s.files += %w( src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.h ) s.files += %w( src/core/ext/filters/client_channel/lb_policy/rls/rls.cc ) s.files += %w( src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.cc ) + s.files += %w( src/core/ext/filters/client_channel/lb_policy/subchannel_list.h ) s.files += %w( src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.cc ) s.files += %w( src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.h ) s.files += %w( src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc ) diff --git a/grpc.gyp b/grpc.gyp index ca498cf2516..4571485bff2 100644 --- a/grpc.gyp +++ b/grpc.gyp @@ -284,7 +284,6 @@ 'src/core/ext/filters/client_channel/http_proxy.cc', 'src/core/ext/filters/client_channel/lb_policy/address_filtering.cc', 'src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc', - 'src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc', 'src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.cc', @@ -1119,7 +1118,6 @@ 'src/core/ext/filters/client_channel/http_proxy.cc', 'src/core/ext/filters/client_channel/lb_policy/address_filtering.cc', 'src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc', - 'src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc', 'src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.cc', diff --git a/package.xml b/package.xml index f4ef5d88d8e..db3a373a327 100644 --- a/package.xml +++ b/package.xml @@ -136,8 +136,6 @@ - - @@ -163,6 +161,7 @@ + diff --git a/src/core/BUILD b/src/core/BUILD index fd5828ef742..24e0e676570 100644 --- a/src/core/BUILD +++ b/src/core/BUILD @@ -4582,40 +4582,32 @@ grpc_cc_library( "//:grpc_trace", "//:orphanable", "//:ref_counted_ptr", - "//:sockaddr_utils", "//:work_serializer", ], ) grpc_cc_library( - name = "lb_endpoint_list", - srcs = [ - "ext/filters/client_channel/lb_policy/endpoint_list.cc", - ], + name = "grpc_lb_subchannel_list", hdrs = [ - "ext/filters/client_channel/lb_policy/endpoint_list.h", + "ext/filters/client_channel/lb_policy/subchannel_list.h", ], external_deps = [ - "absl/functional:any_invocable", "absl/status", - "absl/status:statusor", "absl/types:optional", ], language = "c++", deps = [ "channel_args", - "delegating_helper", - "grpc_lb_policy_pick_first", - "json", + "dual_ref_counted", + "gpr_manual_constructor", + "health_check_client", + "iomgr_fwd", "lb_policy", - "lb_policy_registry", - "pollset_set", "subchannel_interface", - "//:config", "//:debug_location", "//:gpr", "//:grpc_base", - "//:orphanable", + "//:grpc_client_channel", "//:ref_counted_ptr", "//:server_address", "//:work_serializer", @@ -4642,8 +4634,8 @@ grpc_cc_library( deps = [ "channel_args", "env", - "health_check_client", - "iomgr_fwd", + "grpc_lb_subchannel_list", + "grpc_outlier_detection_header", "json", "json_args", "json_object_loader", @@ -4659,6 +4651,7 @@ grpc_cc_library( "//:orphanable", "//:ref_counted_ptr", "//:server_address", + "//:work_serializer", ], ) @@ -4683,18 +4676,16 @@ grpc_cc_library( deps = [ "channel_args", "closure", - "delegating_helper", "error", - "grpc_lb_policy_pick_first", + "grpc_lb_subchannel_list", "grpc_service_config", "json", "json_args", "json_object_loader", "lb_policy", "lb_policy_factory", - "lb_policy_registry", - "pollset_set", "ref_counted", + "subchannel_interface", "unique_type_name", "validation_errors", "//:config", @@ -4727,10 +4718,11 @@ grpc_cc_library( language = "c++", deps = [ "channel_args", + "grpc_lb_subchannel_list", "json", - "lb_endpoint_list", "lb_policy", "lb_policy_factory", + "subchannel_interface", "//:config", "//:debug_location", "//:gpr", @@ -4772,16 +4764,15 @@ grpc_cc_library( "absl/status:statusor", "absl/strings", "absl/types:optional", - "absl/types:variant", ], language = "c++", deps = [ "channel_args", "grpc_backend_metric_data", + "grpc_lb_subchannel_list", "json", "json_args", "json_object_loader", - "lb_endpoint_list", "lb_policy", "lb_policy_factory", "ref_counted", @@ -4819,6 +4810,7 @@ grpc_cc_library( "time", "validation_errors", "//:gpr_platform", + "//:server_address", ], ) @@ -4845,6 +4837,7 @@ grpc_cc_library( "lb_policy", "lb_policy_factory", "lb_policy_registry", + "match", "pollset_set", "ref_counted", "subchannel_interface", diff --git a/src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc b/src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc deleted file mode 100644 index 9269359d748..00000000000 --- a/src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc +++ /dev/null @@ -1,188 +0,0 @@ -// -// Copyright 2015 gRPC authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -#include - -#include "src/core/ext/filters/client_channel/lb_policy/endpoint_list.h" - -#include - -#include -#include -#include -#include - -#include "absl/status/status.h" -#include "absl/status/statusor.h" -#include "absl/types/optional.h" - -#include -#include -#include - -#include "src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.h" -#include "src/core/lib/channel/channel_args.h" -#include "src/core/lib/config/core_configuration.h" -#include "src/core/lib/gprpp/debug_location.h" -#include "src/core/lib/gprpp/orphanable.h" -#include "src/core/lib/gprpp/ref_counted_ptr.h" -#include "src/core/lib/iomgr/pollset_set.h" -#include "src/core/lib/json/json.h" -#include "src/core/lib/load_balancing/delegating_helper.h" -#include "src/core/lib/load_balancing/lb_policy.h" -#include "src/core/lib/load_balancing/lb_policy_registry.h" -#include "src/core/lib/resolver/server_address.h" - -namespace grpc_core { - -// -// EndpointList::Endpoint::Helper -// - -class EndpointList::Endpoint::Helper - : public LoadBalancingPolicy::DelegatingChannelControlHelper { - public: - explicit Helper(RefCountedPtr endpoint) - : endpoint_(std::move(endpoint)) {} - - ~Helper() override { endpoint_.reset(DEBUG_LOCATION, "Helper"); } - - RefCountedPtr CreateSubchannel( - ServerAddress address, const ChannelArgs& args) override { - return endpoint_->CreateSubchannel(std::move(address), args); - } - - void UpdateState( - grpc_connectivity_state state, const absl::Status& status, - RefCountedPtr picker) override { - auto old_state = std::exchange(endpoint_->connectivity_state_, state); - endpoint_->picker_ = std::move(picker); - endpoint_->OnStateUpdate(old_state, state, status); - } - - private: - LoadBalancingPolicy::ChannelControlHelper* parent_helper() const override { - return endpoint_->endpoint_list_->channel_control_helper(); - } - - RefCountedPtr endpoint_; -}; - -// -// EndpointList::Endpoint -// - -void EndpointList::Endpoint::Init( - const ServerAddress& address, const ChannelArgs& args, - std::shared_ptr work_serializer) { - ChannelArgs child_args = - args.Set(GRPC_ARG_INTERNAL_PICK_FIRST_ENABLE_HEALTH_CHECKING, true) - .Set(GRPC_ARG_INTERNAL_PICK_FIRST_OMIT_STATUS_MESSAGE_PREFIX, true); - LoadBalancingPolicy::Args lb_policy_args; - lb_policy_args.work_serializer = std::move(work_serializer); - lb_policy_args.args = child_args; - lb_policy_args.channel_control_helper = - std::make_unique(Ref(DEBUG_LOCATION, "Helper")); - child_policy_ = - CoreConfiguration::Get().lb_policy_registry().CreateLoadBalancingPolicy( - "pick_first", std::move(lb_policy_args)); - if (GPR_UNLIKELY(endpoint_list_->tracer_ != nullptr)) { - gpr_log(GPR_INFO, "[%s %p] endpoint %p: created child policy %p", - endpoint_list_->tracer_, endpoint_list_->policy_.get(), this, - child_policy_.get()); - } - // Add our interested_parties pollset_set to that of the newly created - // child policy. This will make the child policy progress upon activity on - // this policy, which in turn is tied to the application's call. - grpc_pollset_set_add_pollset_set( - child_policy_->interested_parties(), - endpoint_list_->policy_->interested_parties()); - // Construct pick_first config. - auto config = - CoreConfiguration::Get().lb_policy_registry().ParseLoadBalancingConfig( - Json::FromArray( - {Json::FromObject({{"pick_first", Json::FromObject({})}})})); - GPR_ASSERT(config.ok()); - // Update child policy. - LoadBalancingPolicy::UpdateArgs update_args; - update_args.addresses.emplace().emplace_back(address); - update_args.args = child_args; - update_args.config = std::move(*config); - // TODO(roth): If the child reports a non-OK status with the update, - // we need to propagate that back to the resolver somehow. - (void)child_policy_->UpdateLocked(std::move(update_args)); -} - -void EndpointList::Endpoint::Orphan() { - // Remove pollset_set linkage. - grpc_pollset_set_del_pollset_set( - child_policy_->interested_parties(), - endpoint_list_->policy_->interested_parties()); - child_policy_.reset(); - picker_.reset(); - Unref(); -} - -void EndpointList::Endpoint::ResetBackoffLocked() { - if (child_policy_ != nullptr) child_policy_->ResetBackoffLocked(); -} - -void EndpointList::Endpoint::ExitIdleLocked() { - if (child_policy_ != nullptr) child_policy_->ExitIdleLocked(); -} - -size_t EndpointList::Endpoint::Index() const { - for (size_t i = 0; i < endpoint_list_->endpoints_.size(); ++i) { - if (endpoint_list_->endpoints_[i].get() == this) return i; - } - return -1; -} - -RefCountedPtr EndpointList::Endpoint::CreateSubchannel( - ServerAddress address, const ChannelArgs& args) { - return endpoint_list_->channel_control_helper()->CreateSubchannel( - std::move(address), args); -} - -// -// EndpointList -// - -void EndpointList::Init( - const ServerAddressList& addresses, const ChannelArgs& args, - absl::AnyInvocable( - RefCountedPtr, const ServerAddress&, const ChannelArgs&)> - create_endpoint) { - for (const ServerAddress& address : addresses) { - endpoints_.push_back( - create_endpoint(Ref(DEBUG_LOCATION, "Endpoint"), address, args)); - } -} - -void EndpointList::ResetBackoffLocked() { - for (const auto& endpoint : endpoints_) { - endpoint->ResetBackoffLocked(); - } -} - -bool EndpointList::AllEndpointsSeenInitialState() const { - for (const auto& endpoint : endpoints_) { - if (!endpoint->connectivity_state().has_value()) return false; - } - return true; -} - -} // namespace grpc_core diff --git a/src/core/ext/filters/client_channel/lb_policy/endpoint_list.h b/src/core/ext/filters/client_channel/lb_policy/endpoint_list.h deleted file mode 100644 index 66fce2871e4..00000000000 --- a/src/core/ext/filters/client_channel/lb_policy/endpoint_list.h +++ /dev/null @@ -1,214 +0,0 @@ -// -// Copyright 2015 gRPC authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -#ifndef GRPC_SRC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_ENDPOINT_LIST_H -#define GRPC_SRC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_ENDPOINT_LIST_H - -#include - -#include - -#include -#include -#include - -#include "absl/functional/any_invocable.h" -#include "absl/status/status.h" -#include "absl/types/optional.h" - -#include - -#include "src/core/lib/channel/channel_args.h" -#include "src/core/lib/gprpp/debug_location.h" -#include "src/core/lib/gprpp/orphanable.h" -#include "src/core/lib/gprpp/ref_counted_ptr.h" -#include "src/core/lib/gprpp/work_serializer.h" -#include "src/core/lib/load_balancing/lb_policy.h" -#include "src/core/lib/load_balancing/subchannel_interface.h" -#include "src/core/lib/resolver/server_address.h" - -namespace grpc_core { - -// A list of endpoints for use in a petiole LB policy. Each endpoint may -// have one or more addresses, which will be passed down to a pick_first -// child policy. -// -// To use this, a petiole policy must define its own subclass of both -// EndpointList and EndpointList::Endpoint, like so: -/* -class MyEndpointList : public EndpointList { - public: - MyEndpointList(RefCountedPtr lb_policy, - const ServerAddressList& addresses, const ChannelArgs& args) - : EndpointList(std::move(lb_policy), - GRPC_TRACE_FLAG_ENABLED(grpc_my_tracer) - ? "MyEndpointList" - : nullptr) { - Init(addresses, args, - [&](RefCountedPtr endpoint_list, - const ServerAddress& address, const ChannelArgs& args) { - return MakeOrphanable( - std::move(endpoint_list), address, args, - policy()->work_serializer()); - }); - } - - private: - class MyEndpoint : public Endpoint { - public: - MyEndpoint(RefCountedPtr endpoint_list, - const ServerAddress& address, const ChannelArgs& args, - std::shared_ptr work_serializer) - : Endpoint(std::move(endpoint_list)) { - Init(address, args, std::move(work_serializer)); - } - - private: - void OnStateUpdate( - absl::optional old_state, - grpc_connectivity_state new_state, - const absl::Status& status) override { - // ...handle connectivity state change... - } - }; - - LoadBalancingPolicy::ChannelControlHelper* channel_control_helper() - const override { - return policy()->channel_control_helper(); - } -}; -*/ -// TODO(roth): Consider wrapping this in an LB policy subclass for petiole -// policies to inherit from. -class EndpointList : public InternallyRefCounted { - public: - // An individual endpoint. - class Endpoint : public InternallyRefCounted { - public: - ~Endpoint() override { endpoint_list_.reset(DEBUG_LOCATION, "Endpoint"); } - - void Orphan() override; - - void ResetBackoffLocked(); - void ExitIdleLocked(); - - absl::optional connectivity_state() const { - return connectivity_state_; - } - RefCountedPtr picker() const { - return picker_; - } - - protected: - // We use two-phase initialization here to ensure that the vtable is - // initialized before we need to use it. Subclass must invoke Init() - // from inside its ctor. - explicit Endpoint(RefCountedPtr endpoint_list) - : endpoint_list_(std::move(endpoint_list)) {} - - void Init(const ServerAddress& address, const ChannelArgs& args, - std::shared_ptr work_serializer); - - // Templated for convenience, to provide a short-hand for - // down-casting in the caller. - template - T* endpoint_list() const { - return static_cast(endpoint_list_.get()); - } - - // Templated for convenience, to provide a short-hand for down-casting - // in the caller. - template - T* policy() const { - return endpoint_list_->policy(); - } - - // Returns the index of this endpoint within the EndpointList. - // Intended for trace logging. - size_t Index() const; - - private: - class Helper; - - // Called when the child policy reports a connectivity state update. - virtual void OnStateUpdate( - absl::optional old_state, - grpc_connectivity_state new_state, const absl::Status& status) = 0; - - // Called to create a subchannel. Subclasses may override. - virtual RefCountedPtr CreateSubchannel( - ServerAddress address, const ChannelArgs& args); - - RefCountedPtr endpoint_list_; - - OrphanablePtr child_policy_; - absl::optional connectivity_state_; - RefCountedPtr picker_; - }; - - ~EndpointList() override { policy_.reset(DEBUG_LOCATION, "EndpointList"); } - - void Orphan() override { - endpoints_.clear(); - Unref(); - } - - size_t size() const { return endpoints_.size(); } - - const std::vector>& endpoints() const { - return endpoints_; - } - - void ResetBackoffLocked(); - - protected: - // We use two-phase initialization here to ensure that the vtable is - // initialized before we need to use it. Subclass must invoke Init() - // from inside its ctor. - EndpointList(RefCountedPtr policy, const char* tracer) - : policy_(std::move(policy)), tracer_(tracer) {} - - void Init(const ServerAddressList& addresses, const ChannelArgs& args, - absl::AnyInvocable( - RefCountedPtr, const ServerAddress&, - const ChannelArgs&)> - create_endpoint); - - // Templated for convenience, to provide a short-hand for down-casting - // in the caller. - template - T* policy() const { - return static_cast(policy_.get()); - } - - // Returns true if all endpoints have seen their initial connectivity - // state notification. - bool AllEndpointsSeenInitialState() const; - - private: - // Returns the parent policy's helper. Needed because the accessor - // method is protected on LoadBalancingPolicy. - virtual LoadBalancingPolicy::ChannelControlHelper* channel_control_helper() - const = 0; - - RefCountedPtr policy_; - const char* tracer_; - std::vector> endpoints_; -}; - -} // namespace grpc_core - -#endif // GRPC_SRC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_ENDPOINT_LIST_H diff --git a/src/core/ext/filters/client_channel/lb_policy/health_check_client.cc b/src/core/ext/filters/client_channel/lb_policy/health_check_client.cc index 49710b67ad7..f9b02cdfe53 100644 --- a/src/core/ext/filters/client_channel/lb_policy/health_check_client.cc +++ b/src/core/ext/filters/client_channel/lb_policy/health_check_client.cc @@ -28,7 +28,6 @@ #include "absl/status/status.h" #include "absl/status/statusor.h" -#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "upb/base/string_view.h" @@ -45,7 +44,6 @@ #include "src/core/ext/filters/client_channel/lb_policy/health_check_client_internal.h" #include "src/core/ext/filters/client_channel/subchannel.h" #include "src/core/ext/filters/client_channel/subchannel_stream_client.h" -#include "src/core/lib/address_utils/sockaddr_utils.h" #include "src/core/lib/channel/channel_args.h" #include "src/core/lib/channel/channel_trace.h" #include "src/core/lib/debug/trace.h" @@ -116,7 +114,7 @@ void HealthProducer::HealthChecker::Orphan() { void HealthProducer::HealthChecker::AddWatcherLocked(HealthWatcher* watcher) { watchers_.insert(watcher); - if (state_.has_value()) watcher->Notify(*state_, status_); + watcher->Notify(state_, status_); } bool HealthProducer::HealthChecker::RemoveWatcherLocked( @@ -130,18 +128,13 @@ void HealthProducer::HealthChecker::OnConnectivityStateChangeLocked( if (state == GRPC_CHANNEL_READY) { // We should already be in CONNECTING, and we don't want to change // that until we see the initial response on the stream. - if (!state_.has_value()) { - state_ = GRPC_CHANNEL_CONNECTING; - status_ = absl::OkStatus(); - } else { - GPR_ASSERT(state_ == GRPC_CHANNEL_CONNECTING); - } + GPR_ASSERT(state_ == GRPC_CHANNEL_CONNECTING); // Start the health watch stream. StartHealthStreamLocked(); } else { state_ = state; status_ = status; - NotifyWatchersLocked(*state_, status_); + NotifyWatchersLocked(state_, status_); // We're not connected, so stop health checking. stream_client_.reset(); } @@ -184,21 +177,12 @@ void HealthProducer::HealthChecker::NotifyWatchersLocked( void HealthProducer::HealthChecker::OnHealthWatchStatusChange( grpc_connectivity_state state, const absl::Status& status) { if (state == GRPC_CHANNEL_SHUTDOWN) return; - // Prepend the subchannel's address to the status if needed. - absl::Status use_status; - if (!status.ok()) { - std::string address_str = - grpc_sockaddr_to_uri(&producer_->subchannel_->address()) - .value_or(""); - use_status = absl::Status( - status.code(), absl::StrCat(address_str, ": ", status.message())); - } work_serializer_->Schedule( - [self = Ref(), state, status = std::move(use_status)]() mutable { + [self = Ref(), state, status]() { MutexLock lock(&self->producer_->mu_); if (self->stream_client_ != nullptr) { self->state_ = state; - self->status_ = std::move(status); + self->status_ = status; for (HealthWatcher* watcher : self->watchers_) { watcher->Notify(state, self->status_); } @@ -378,7 +362,7 @@ void HealthProducer::AddWatcher( grpc_pollset_set_add_pollset_set(interested_parties_, watcher->interested_parties()); if (!health_check_service_name.has_value()) { - if (state_.has_value()) watcher->Notify(*state_, status_); + watcher->Notify(state_, status_); non_health_watchers_.insert(watcher); } else { auto it = @@ -435,13 +419,6 @@ void HealthProducer::OnConnectivityStateChange(grpc_connectivity_state state, // HealthWatcher::~HealthWatcher() { - if (GRPC_TRACE_FLAG_ENABLED(grpc_health_check_client_trace)) { - gpr_log(GPR_INFO, - "HealthWatcher %p: unregistering from producer %p " - "(health_check_service_name=\"%s\")", - this, producer_.get(), - health_check_service_name_.value_or("N/A").c_str()); - } if (producer_ != nullptr) { producer_->RemoveWatcher(this, health_check_service_name_); } @@ -468,13 +445,6 @@ void HealthWatcher::SetSubchannel(Subchannel* subchannel) { if (created) producer_->Start(subchannel->Ref()); // Register ourself with the producer. producer_->AddWatcher(this, health_check_service_name_); - if (GRPC_TRACE_FLAG_ENABLED(grpc_health_check_client_trace)) { - gpr_log(GPR_INFO, - "HealthWatcher %p: registered with producer %p (created=%d, " - "health_check_service_name=\"%s\")", - this, producer_.get(), created, - health_check_service_name_.value_or("N/A").c_str()); - } } void HealthWatcher::Notify(grpc_connectivity_state state, absl::Status status) { @@ -500,11 +470,6 @@ MakeHealthCheckWatcher( health_check_service_name = args.GetOwnedString(GRPC_ARG_HEALTH_CHECK_SERVICE_NAME); } - if (GRPC_TRACE_FLAG_ENABLED(grpc_health_check_client_trace)) { - gpr_log(GPR_INFO, - "creating HealthWatcher -- health_check_service_name=\"%s\"", - health_check_service_name.value_or("N/A").c_str()); - } return std::make_unique(std::move(work_serializer), std::move(health_check_service_name), std::move(watcher)); diff --git a/src/core/ext/filters/client_channel/lb_policy/health_check_client_internal.h b/src/core/ext/filters/client_channel/lb_policy/health_check_client_internal.h index eee94904ebb..d606e42ae87 100644 --- a/src/core/ext/filters/client_channel/lb_policy/health_check_client_internal.h +++ b/src/core/ext/filters/client_channel/lb_policy/health_check_client_internal.h @@ -127,8 +127,7 @@ class HealthProducer : public Subchannel::DataProducerInterface { std::shared_ptr work_serializer_ = std::make_shared(); - absl::optional state_ - ABSL_GUARDED_BY(&HealthProducer::mu_); + grpc_connectivity_state state_ ABSL_GUARDED_BY(&HealthProducer::mu_); absl::Status status_ ABSL_GUARDED_BY(&HealthProducer::mu_); OrphanablePtr stream_client_ ABSL_GUARDED_BY(&HealthProducer::mu_); @@ -144,7 +143,7 @@ class HealthProducer : public Subchannel::DataProducerInterface { grpc_pollset_set* interested_parties_; Mutex mu_; - absl::optional state_ ABSL_GUARDED_BY(&mu_); + grpc_connectivity_state state_ ABSL_GUARDED_BY(&mu_); absl::Status status_ ABSL_GUARDED_BY(&mu_); RefCountedPtr connected_subchannel_ ABSL_GUARDED_BY(&mu_); diff --git a/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc b/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc index 8b52dbe8108..3cdf715b402 100644 --- a/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc +++ b/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc @@ -50,6 +50,7 @@ #include "src/core/lib/config/core_configuration.h" #include "src/core/lib/debug/trace.h" #include "src/core/lib/gprpp/debug_location.h" +#include "src/core/lib/gprpp/match.h" #include "src/core/lib/gprpp/orphanable.h" #include "src/core/lib/gprpp/ref_counted.h" #include "src/core/lib/gprpp/ref_counted_ptr.h" @@ -124,9 +125,12 @@ class OutlierDetectionLb : public LoadBalancingPolicy { class SubchannelWrapper : public DelegatingSubchannel { public: SubchannelWrapper(RefCountedPtr subchannel_state, - RefCountedPtr subchannel) + RefCountedPtr subchannel, + bool disable_via_raw_connectivity_watch) : DelegatingSubchannel(std::move(subchannel)), - subchannel_state_(std::move(subchannel_state)) { + subchannel_state_(std::move(subchannel_state)), + disable_via_raw_connectivity_watch_( + disable_via_raw_connectivity_watch) { if (subchannel_state_ != nullptr) { subchannel_state_->AddSubchannel(this); if (subchannel_state_->ejection_time().has_value()) { @@ -145,6 +149,12 @@ class OutlierDetectionLb : public LoadBalancingPolicy { void Uneject(); + void WatchConnectivityState( + std::unique_ptr watcher) override; + + void CancelConnectivityStateWatch( + ConnectivityStateWatcherInterface* watcher) override; + void AddDataWatcher(std::unique_ptr watcher) override; RefCountedPtr subchannel_state() const { @@ -152,6 +162,11 @@ class OutlierDetectionLb : public LoadBalancingPolicy { } private: + // TODO(roth): As a temporary hack, this needs to handle watchers + // stored as both unique_ptr<> and shared_ptr<>, since the former is + // used for raw connectivity state watches and the latter is used + // for health watches. This hack will go away as part of implementing + // dualstack backend support. class WatcherWrapper : public SubchannelInterface::ConnectivityStateWatcherInterface { public: @@ -161,10 +176,16 @@ class OutlierDetectionLb : public LoadBalancingPolicy { bool ejected) : watcher_(std::move(health_watcher)), ejected_(ejected) {} + WatcherWrapper(std::unique_ptr< + SubchannelInterface::ConnectivityStateWatcherInterface> + watcher, + bool ejected) + : watcher_(std::move(watcher)), ejected_(ejected) {} + void Eject() { ejected_ = true; if (last_seen_state_.has_value()) { - watcher_->OnConnectivityStateChange( + watcher()->OnConnectivityStateChange( GRPC_CHANNEL_TRANSIENT_FAILURE, absl::UnavailableError( "subchannel ejected by outlier detection")); @@ -174,8 +195,8 @@ class OutlierDetectionLb : public LoadBalancingPolicy { void Uneject() { ejected_ = false; if (last_seen_state_.has_value()) { - watcher_->OnConnectivityStateChange(*last_seen_state_, - last_seen_status_); + watcher()->OnConnectivityStateChange(*last_seen_state_, + last_seen_status_); } } @@ -190,16 +211,30 @@ class OutlierDetectionLb : public LoadBalancingPolicy { status = absl::UnavailableError( "subchannel ejected by outlier detection"); } - watcher_->OnConnectivityStateChange(new_state, status); + watcher()->OnConnectivityStateChange(new_state, status); } } grpc_pollset_set* interested_parties() override { - return watcher_->interested_parties(); + return watcher()->interested_parties(); } private: - std::shared_ptr + SubchannelInterface::ConnectivityStateWatcherInterface* watcher() const { + return Match( + watcher_, + [](const std::shared_ptr< + SubchannelInterface::ConnectivityStateWatcherInterface>& + watcher) { return watcher.get(); }, + [](const std::unique_ptr< + SubchannelInterface::ConnectivityStateWatcherInterface>& + watcher) { return watcher.get(); }); + } + + absl::variant, + std::unique_ptr< + SubchannelInterface::ConnectivityStateWatcherInterface>> watcher_; absl::optional last_seen_state_; absl::Status last_seen_status_; @@ -207,8 +242,12 @@ class OutlierDetectionLb : public LoadBalancingPolicy { }; RefCountedPtr subchannel_state_; + const bool disable_via_raw_connectivity_watch_; bool ejected_ = false; - WatcherWrapper* watcher_wrapper_ = nullptr; + std::map + watchers_; + WatcherWrapper* watcher_wrapper_ = nullptr; // For health watching. }; class SubchannelState : public RefCounted { @@ -389,14 +428,50 @@ class OutlierDetectionLb : public LoadBalancingPolicy { void OutlierDetectionLb::SubchannelWrapper::Eject() { ejected_ = true; + // Ejecting the subchannel may cause the child policy to cancel the watch, + // so we need to be prepared for the map to be modified while we are + // iterating. + for (auto it = watchers_.begin(); it != watchers_.end();) { + WatcherWrapper* watcher = it->second; + ++it; + watcher->Eject(); + } if (watcher_wrapper_ != nullptr) watcher_wrapper_->Eject(); } void OutlierDetectionLb::SubchannelWrapper::Uneject() { ejected_ = false; + for (auto& watcher : watchers_) { + watcher.second->Uneject(); + } if (watcher_wrapper_ != nullptr) watcher_wrapper_->Uneject(); } +void OutlierDetectionLb::SubchannelWrapper::WatchConnectivityState( + std::unique_ptr watcher) { + if (disable_via_raw_connectivity_watch_) { + wrapped_subchannel()->WatchConnectivityState(std::move(watcher)); + return; + } + ConnectivityStateWatcherInterface* watcher_ptr = watcher.get(); + auto watcher_wrapper = + std::make_unique(std::move(watcher), ejected_); + watchers_.emplace(watcher_ptr, watcher_wrapper.get()); + wrapped_subchannel()->WatchConnectivityState(std::move(watcher_wrapper)); +} + +void OutlierDetectionLb::SubchannelWrapper::CancelConnectivityStateWatch( + ConnectivityStateWatcherInterface* watcher) { + if (disable_via_raw_connectivity_watch_) { + wrapped_subchannel()->CancelConnectivityStateWatch(watcher); + return; + } + auto it = watchers_.find(watcher); + if (it == watchers_.end()) return; + wrapped_subchannel()->CancelConnectivityStateWatch(it->second); + watchers_.erase(it); +} + void OutlierDetectionLb::SubchannelWrapper::AddDataWatcher( std::unique_ptr watcher) { auto* w = static_cast(watcher.get()); @@ -702,12 +777,22 @@ OrphanablePtr OutlierDetectionLb::CreateChildPolicyLocked( RefCountedPtr OutlierDetectionLb::Helper::CreateSubchannel( ServerAddress address, const ChannelArgs& args) { if (parent()->shutting_down_) return nullptr; + // If the address has the DisableOutlierDetectionAttribute attribute, + // ignore it for raw connectivity state updates. + // TODO(roth): This is a hack to prevent outlier_detection from + // working with pick_first, as per discussion in + // https://github.com/grpc/grpc/issues/32967. Remove this as part of + // implementing dualstack backend support. + const bool disable_via_raw_connectivity_watch = + address.args().GetInt(GRPC_ARG_OUTLIER_DETECTION_DISABLE) == 1; RefCountedPtr subchannel_state; std::string key = MakeKeyForAddress(address); if (GRPC_TRACE_FLAG_ENABLED(grpc_outlier_detection_lb_trace)) { gpr_log(GPR_INFO, - "[outlier_detection_lb %p] using key %s for subchannel address %s", - parent(), key.c_str(), address.ToString().c_str()); + "[outlier_detection_lb %p] using key %s for subchannel " + "address %s, disable_via_raw_connectivity_watch=%d", + parent(), key.c_str(), address.ToString().c_str(), + disable_via_raw_connectivity_watch); } if (!key.empty()) { auto it = parent()->subchannel_state_map_.find(key); @@ -716,8 +801,10 @@ RefCountedPtr OutlierDetectionLb::Helper::CreateSubchannel( } } auto subchannel = MakeRefCounted( - subchannel_state, parent()->channel_control_helper()->CreateSubchannel( - std::move(address), args)); + subchannel_state, + parent()->channel_control_helper()->CreateSubchannel(std::move(address), + args), + disable_via_raw_connectivity_watch); if (subchannel_state != nullptr) { subchannel_state->AddSubchannel(subchannel.get()); } diff --git a/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.h b/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.h index 4118e99555c..c8c7f52afd3 100644 --- a/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.h +++ b/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.h @@ -28,6 +28,7 @@ #include "src/core/lib/json/json.h" #include "src/core/lib/json/json_args.h" #include "src/core/lib/json/json_object_loader.h" +#include "src/core/lib/resolver/server_address.h" namespace grpc_core { @@ -89,6 +90,12 @@ struct OutlierDetectionConfig { ValidationErrors* errors); }; +// TODO(roth): This is a horrible hack used to disable outlier detection +// when used with the pick_first policy. Remove this as part of +// implementing the dualstack backend design. +#define GRPC_ARG_OUTLIER_DETECTION_DISABLE \ + GRPC_ARG_NO_SUBCHANNEL_PREFIX "outlier_detection_disable" + } // namespace grpc_core #endif // GRPC_SRC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_OUTLIER_DETECTION_OUTLIER_DETECTION_H diff --git a/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc b/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc index 3c6ccd585c5..4b84091950c 100644 --- a/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc +++ b/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc @@ -16,8 +16,6 @@ #include -#include "src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.h" - #include #include @@ -35,21 +33,22 @@ #include "absl/strings/string_view.h" #include "absl/types/optional.h" +#include #include #include -#include "src/core/ext/filters/client_channel/lb_policy/health_check_client.h" +#include "src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.h" +#include "src/core/ext/filters/client_channel/lb_policy/subchannel_list.h" #include "src/core/lib/channel/channel_args.h" #include "src/core/lib/config/core_configuration.h" #include "src/core/lib/debug/trace.h" #include "src/core/lib/gpr/string.h" -#include "src/core/lib/gprpp/crash.h" #include "src/core/lib/gprpp/debug_location.h" #include "src/core/lib/gprpp/env.h" #include "src/core/lib/gprpp/orphanable.h" #include "src/core/lib/gprpp/ref_counted_ptr.h" #include "src/core/lib/gprpp/validation_errors.h" -#include "src/core/lib/iomgr/iomgr_fwd.h" +#include "src/core/lib/gprpp/work_serializer.h" #include "src/core/lib/json/json.h" #include "src/core/lib/json/json_args.h" #include "src/core/lib/json/json_object_loader.h" @@ -118,87 +117,52 @@ class PickFirst : public LoadBalancingPolicy { private: ~PickFirst() override; - class SubchannelList : public InternallyRefCounted { - public: - class SubchannelData { - public: - SubchannelData(SubchannelList* subchannel_list, - RefCountedPtr subchannel); - - SubchannelInterface* subchannel() const { return subchannel_.get(); } - absl::optional connectivity_state() const { - return connectivity_state_; - } - - // Returns the index into the subchannel list of this object. - size_t Index() const { - return static_cast(this - - &subchannel_list_->subchannels_.front()); - } - - // Resets the connection backoff. - void ResetBackoffLocked() { - if (subchannel_ != nullptr) subchannel_->ResetBackoff(); - } - - // Cancels any pending connectivity watch and unrefs the subchannel. - void ShutdownLocked(); - - private: - // Watcher for subchannel connectivity state. - class Watcher - : public SubchannelInterface::ConnectivityStateWatcherInterface { - public: - Watcher(SubchannelData* subchannel_data, - RefCountedPtr subchannel_list) - : subchannel_data_(subchannel_data), - subchannel_list_(std::move(subchannel_list)) {} - - ~Watcher() override { - subchannel_list_.reset(DEBUG_LOCATION, "Watcher dtor"); - } + class PickFirstSubchannelList; - void OnConnectivityStateChange(grpc_connectivity_state new_state, - absl::Status status) override { - subchannel_data_->OnConnectivityStateChange(new_state, - std::move(status)); - } + class PickFirstSubchannelData + : public SubchannelData { + public: + PickFirstSubchannelData( + SubchannelList* + subchannel_list, + const ServerAddress& address, + RefCountedPtr subchannel) + : SubchannelData(subchannel_list, address, std::move(subchannel)) {} + + void ProcessConnectivityChangeLocked( + absl::optional old_state, + grpc_connectivity_state new_state) override; + + // Processes the connectivity change to READY for an unselected subchannel. + void ProcessUnselectedReadyLocked(); + }; - grpc_pollset_set* interested_parties() override { - return subchannel_list_->policy()->interested_parties(); - } + class PickFirstSubchannelList + : public SubchannelList { + public: + PickFirstSubchannelList(PickFirst* policy, ServerAddressList addresses, + const ChannelArgs& args) + : SubchannelList(policy, + (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace) + ? "PickFirstSubchannelList" + : nullptr), + std::move(addresses), policy->channel_control_helper(), + args) { + // Need to maintain a ref to the LB policy as long as we maintain + // any references to subchannels, since the subchannels' + // pollset_sets will include the LB policy's pollset_set. + policy->Ref(DEBUG_LOCATION, "subchannel_list").release(); + // Note that we do not start trying to connect to any subchannel here, + // since we will wait until we see the initial connectivity state for all + // subchannels before doing that. + } - private: - SubchannelData* subchannel_data_; - RefCountedPtr subchannel_list_; - }; - - // This method will be invoked once soon after instantiation to report - // the current connectivity state, and it will then be invoked again - // whenever the connectivity state changes. - void OnConnectivityStateChange(grpc_connectivity_state new_state, - absl::Status status); - - // Processes the connectivity change to READY for an unselected - // subchannel. - void ProcessUnselectedReadyLocked(); - - // Backpointer to owning subchannel list. Not owned. - SubchannelList* subchannel_list_; - // The subchannel. - RefCountedPtr subchannel_; - // Will be non-null when the subchannel's state is being watched. - SubchannelInterface::ConnectivityStateWatcherInterface* pending_watcher_ = - nullptr; - // Data updated by the watcher. - absl::optional connectivity_state_; - absl::Status connectivity_status_; - }; - - SubchannelList(RefCountedPtr policy, ServerAddressList addresses, - const ChannelArgs& args); - - ~SubchannelList() override; + ~PickFirstSubchannelList() override { + PickFirst* p = static_cast(policy()); + p->Unref(DEBUG_LOCATION, "subchannel_list"); + } bool in_transient_failure() const { return in_transient_failure_; } void set_in_transient_failure(bool in_transient_failure) { @@ -208,63 +172,15 @@ class PickFirst : public LoadBalancingPolicy { size_t attempting_index() const { return attempting_index_; } void set_attempting_index(size_t index) { attempting_index_ = index; } - // The number of subchannels in the list. - size_t size() const { return subchannels_.size(); } - - // Returns true if the subchannel list is shutting down. - bool shutting_down() const { return shutting_down_; } - - // Accessors. - PickFirst* policy() const { return policy_.get(); } - - // Resets connection backoff of all subchannels. - void ResetBackoffLocked(); - - // Returns true if all subchannels have seen their initial - // connectivity state notifications. - bool AllSubchannelsSeenInitialState(); - - void Orphan() override; - private: - // Backpointer to owning policy. - RefCountedPtr policy_; - - ChannelArgs args_; - - // The list of subchannels. - std::vector subchannels_; - - // Is this list shutting down? This may be true due to the shutdown of the - // policy itself or because a newer update has arrived while this one hadn't - // finished processing. - bool shutting_down_ = false; + std::shared_ptr work_serializer() const override { + return static_cast(policy())->work_serializer(); + } bool in_transient_failure_ = false; size_t attempting_index_ = 0; }; - class HealthWatcher - : public SubchannelInterface::ConnectivityStateWatcherInterface { - public: - explicit HealthWatcher(RefCountedPtr policy) - : policy_(std::move(policy)) {} - - ~HealthWatcher() override { - policy_.reset(DEBUG_LOCATION, "HealthWatcher dtor"); - } - - void OnConnectivityStateChange(grpc_connectivity_state new_state, - absl::Status status) override; - - grpc_pollset_set* interested_parties() override { - return policy_->interested_parties(); - } - - private: - RefCountedPtr policy_; - }; - class Picker : public SubchannelPicker { public: explicit Picker(RefCountedPtr subchannel) @@ -282,24 +198,14 @@ class PickFirst : public LoadBalancingPolicy { void AttemptToConnectUsingLatestUpdateArgsLocked(); - void UnsetSelectedSubchannel(); - - // Whether we should enable health watching. - const bool enable_health_watch_; - // Whether we should omit our status message prefix. - const bool omit_status_message_prefix_; // Lateset update args. UpdateArgs latest_update_args_; // All our subchannels. - OrphanablePtr subchannel_list_; + RefCountedPtr subchannel_list_; // Latest pending subchannel list. - OrphanablePtr latest_pending_subchannel_list_; + RefCountedPtr latest_pending_subchannel_list_; // Selected subchannel in \a subchannel_list_. - SubchannelList::SubchannelData* selected_ = nullptr; - // Health watcher for the selected subchannel. - SubchannelInterface::ConnectivityStateWatcherInterface* health_watcher_ = - nullptr; - SubchannelInterface::DataWatcherInterface* health_data_watcher_ = nullptr; + PickFirstSubchannelData* selected_ = nullptr; // Are we in IDLE state? bool idle_ = false; // Are we shut down? @@ -308,16 +214,7 @@ class PickFirst : public LoadBalancingPolicy { absl::BitGen bit_gen_; }; -PickFirst::PickFirst(Args args) - : LoadBalancingPolicy(std::move(args)), - enable_health_watch_( - channel_args() - .GetBool(GRPC_ARG_INTERNAL_PICK_FIRST_ENABLE_HEALTH_CHECKING) - .value_or(false)), - omit_status_message_prefix_( - channel_args() - .GetBool(GRPC_ARG_INTERNAL_PICK_FIRST_OMIT_STATUS_MESSAGE_PREFIX) - .value_or(false)) { +PickFirst::PickFirst(Args args) : LoadBalancingPolicy(std::move(args)) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { gpr_log(GPR_INFO, "Pick First %p created.", this); } @@ -336,7 +233,6 @@ void PickFirst::ShutdownLocked() { gpr_log(GPR_INFO, "Pick First %p Shutting down", this); } shutdown_ = true; - UnsetSelectedSubchannel(); subchannel_list_.reset(); latest_pending_subchannel_list_.reset(); } @@ -372,11 +268,13 @@ void PickFirst::AttemptToConnectUsingLatestUpdateArgsLocked() { "[PF %p] Shutting down previous pending subchannel list %p", this, latest_pending_subchannel_list_.get()); } - latest_pending_subchannel_list_ = MakeOrphanable( - Ref(), std::move(addresses), latest_update_args_.args); + latest_pending_subchannel_list_ = MakeRefCounted( + this, std::move(addresses), latest_update_args_.args); + latest_pending_subchannel_list_->StartWatchingLocked( + latest_update_args_.args); // Empty update or no valid subchannels. Put the channel in // TRANSIENT_FAILURE and request re-resolution. - if (latest_pending_subchannel_list_->size() == 0) { + if (latest_pending_subchannel_list_->num_subchannels() == 0) { absl::Status status = latest_update_args_.addresses.ok() ? absl::UnavailableError(absl::StrCat( @@ -389,8 +287,9 @@ void PickFirst::AttemptToConnectUsingLatestUpdateArgsLocked() { } // If the new update is empty or we don't yet have a selected subchannel in // the current list, replace the current subchannel list immediately. - if (latest_pending_subchannel_list_->size() == 0 || selected_ == nullptr) { - UnsetSelectedSubchannel(); + if (latest_pending_subchannel_list_->num_subchannels() == 0 || + selected_ == nullptr) { + selected_ = nullptr; if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace) && subchannel_list_ != nullptr) { gpr_log(GPR_INFO, "[PF %p] Shutting down previous subchannel list %p", @@ -411,6 +310,8 @@ absl::Status PickFirst::UpdateLocked(UpdateArgs args) { this, args.addresses.status().ToString().c_str()); } } + // Add GRPC_ARG_INHIBIT_HEALTH_CHECKING channel arg. + args.args = args.args.Set(GRPC_ARG_INHIBIT_HEALTH_CHECKING, 1); // Set return status based on the address list. absl::Status status; if (!args.addresses.ok()) { @@ -423,6 +324,19 @@ absl::Status PickFirst::UpdateLocked(UpdateArgs args) { absl::c_shuffle(*args.addresses, bit_gen_); } } + // TODO(roth): This is a hack to disable outlier_detection when used + // with pick_first, for the reasons described in + // https://github.com/grpc/grpc/issues/32967. Remove this when + // implementing the dualstack design. + if (args.addresses.ok()) { + ServerAddressList addresses; + for (const auto& address : *args.addresses) { + addresses.emplace_back( + address.address(), + address.args().Set(GRPC_ARG_OUTLIER_DETECTION_DISABLE, 1)); + } + args.addresses = std::move(addresses); + } // If the update contains a resolver error and we have a previous update // that was not a resolver error, keep using the previous addresses. if (!args.addresses.ok() && latest_update_args_.config != nullptr) { @@ -438,122 +352,18 @@ absl::Status PickFirst::UpdateLocked(UpdateArgs args) { return status; } -void PickFirst::UnsetSelectedSubchannel() { - if (selected_ != nullptr && health_data_watcher_ != nullptr) { - selected_->subchannel()->CancelDataWatcher(health_data_watcher_); - } - selected_ = nullptr; - health_watcher_ = nullptr; - health_data_watcher_ = nullptr; -} - -// -// PickFirst::HealthWatcher -// - -void PickFirst::HealthWatcher::OnConnectivityStateChange( - grpc_connectivity_state new_state, absl::Status status) { - if (policy_->health_watcher_ != this) return; - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { - gpr_log(GPR_INFO, "[PF %p] health watch state update: %s (%s)", - policy_.get(), ConnectivityStateName(new_state), - status.ToString().c_str()); - } - switch (new_state) { - case GRPC_CHANNEL_READY: - policy_->channel_control_helper()->UpdateState( - GRPC_CHANNEL_READY, absl::OkStatus(), - MakeRefCounted(policy_->selected_->subchannel()->Ref())); - break; - case GRPC_CHANNEL_IDLE: - // If the subchannel becomes disconnected, the health watcher - // might happen to see the change before the raw connectivity - // state watcher does. In this case, ignore it, since the raw - // connectivity state watcher will handle it shortly. - break; - case GRPC_CHANNEL_CONNECTING: - policy_->channel_control_helper()->UpdateState( - new_state, absl::OkStatus(), - MakeRefCounted(policy_->Ref())); - break; - case GRPC_CHANNEL_TRANSIENT_FAILURE: - policy_->channel_control_helper()->UpdateState( - GRPC_CHANNEL_TRANSIENT_FAILURE, status, - MakeRefCounted(status)); - break; - case GRPC_CHANNEL_SHUTDOWN: - Crash("health watcher reported state SHUTDOWN"); - } -} - -// -// PickFirst::SubchannelList::SubchannelData -// - -PickFirst::SubchannelList::SubchannelData::SubchannelData( - SubchannelList* subchannel_list, - RefCountedPtr subchannel) - : subchannel_list_(subchannel_list), subchannel_(std::move(subchannel)) { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { - gpr_log(GPR_INFO, - "[PF %p] subchannel list %p index %" PRIuPTR " of %" PRIuPTR - " (subchannel %p): starting watch", - subchannel_list_->policy(), subchannel_list_, Index(), - subchannel_list_->size(), subchannel_.get()); - } - auto watcher = std::make_unique( - this, subchannel_list_->Ref(DEBUG_LOCATION, "Watcher")); - pending_watcher_ = watcher.get(); - subchannel_->WatchConnectivityState(std::move(watcher)); -} - -void PickFirst::SubchannelList::SubchannelData::ShutdownLocked() { - if (subchannel_ != nullptr) { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { - gpr_log(GPR_INFO, - "[PF %p] subchannel list %p index %" PRIuPTR " of %" PRIuPTR - " (subchannel %p): cancelling watch and unreffing subchannel", - subchannel_list_->policy(), subchannel_list_, Index(), - subchannel_list_->size(), subchannel_.get()); - } - subchannel_->CancelConnectivityStateWatch(pending_watcher_); - pending_watcher_ = nullptr; - subchannel_.reset(); - } -} - -void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( - grpc_connectivity_state new_state, absl::Status status) { - PickFirst* p = subchannel_list_->policy(); - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { - gpr_log( - GPR_INFO, - "[PF %p] subchannel list %p index %" PRIuPTR " of %" PRIuPTR - " (subchannel %p): connectivity changed: old_state=%s, new_state=%s, " - "status=%s, shutting_down=%d, pending_watcher=%p, " - "p->selected_=%p, p->subchannel_list_=%p, " - "p->latest_pending_subchannel_list_=%p", - p, subchannel_list_, Index(), subchannel_list_->size(), - subchannel_.get(), - (connectivity_state_.has_value() - ? ConnectivityStateName(*connectivity_state_) - : "N/A"), - ConnectivityStateName(new_state), status.ToString().c_str(), - subchannel_list_->shutting_down(), pending_watcher_, p->selected_, - p->subchannel_list_.get(), p->latest_pending_subchannel_list_.get()); - } - if (subchannel_list_->shutting_down() || pending_watcher_ == nullptr) return; +void PickFirst::PickFirstSubchannelData::ProcessConnectivityChangeLocked( + absl::optional old_state, + grpc_connectivity_state new_state) { + PickFirst* p = static_cast(subchannel_list()->policy()); // The notification must be for a subchannel in either the current or // latest pending subchannel lists. - GPR_ASSERT(subchannel_list_ == p->subchannel_list_.get() || - subchannel_list_ == p->latest_pending_subchannel_list_.get()); + GPR_ASSERT(subchannel_list() == p->subchannel_list_.get() || + subchannel_list() == p->latest_pending_subchannel_list_.get()); GPR_ASSERT(new_state != GRPC_CHANNEL_SHUTDOWN); - absl::optional old_state = connectivity_state_; - connectivity_state_ = new_state; - connectivity_status_ = status; // Handle updates for the currently selected subchannel. if (p->selected_ == this) { - GPR_ASSERT(subchannel_list_ == p->subchannel_list_.get()); + GPR_ASSERT(subchannel_list() == p->subchannel_list_.get()); if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { gpr_log(GPR_INFO, "Pick First %p selected subchannel connectivity changed to %s", p, @@ -570,15 +380,17 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( p, p->latest_pending_subchannel_list_.get(), p->subchannel_list_.get()); } - p->UnsetSelectedSubchannel(); + p->selected_ = nullptr; p->subchannel_list_ = std::move(p->latest_pending_subchannel_list_); // Set our state to that of the pending subchannel list. if (p->subchannel_list_->in_transient_failure()) { absl::Status status = absl::UnavailableError(absl::StrCat( "selected subchannel failed; switching to pending update; " "last failure: ", - p->subchannel_list_->subchannels_.back() - .connectivity_status_.ToString())); + p->subchannel_list_ + ->subchannel(p->subchannel_list_->num_subchannels()) + ->connectivity_status() + .ToString())); p->channel_control_helper()->UpdateState( GRPC_CHANNEL_TRANSIENT_FAILURE, status, MakeRefCounted(status)); @@ -598,7 +410,7 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( // and we could switch to that rather than going IDLE. // Enter idle. p->idle_ = true; - p->UnsetSelectedSubchannel(); + p->selected_ = nullptr; p->subchannel_list_.reset(); p->channel_control_helper()->UpdateState( GRPC_CHANNEL_IDLE, absl::Status(), @@ -616,7 +428,7 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( // select in place of the current one. // If the subchannel is READY, use it. if (new_state == GRPC_CHANNEL_READY) { - subchannel_list_->set_in_transient_failure(false); + subchannel_list()->set_in_transient_failure(false); ProcessUnselectedReadyLocked(); return; } @@ -626,36 +438,36 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( // Otherwise, do nothing, since we'll continue to wait until all of // the subchannels report their state. if (!old_state.has_value()) { - if (subchannel_list_->AllSubchannelsSeenInitialState()) { - subchannel_list_->subchannels_.front().subchannel_->RequestConnection(); + if (subchannel_list()->AllSubchannelsSeenInitialState()) { + subchannel_list()->subchannel(0)->subchannel()->RequestConnection(); } return; } // Ignore any other updates for subchannels we're not currently trying to // connect to. - if (Index() != subchannel_list_->attempting_index()) return; + if (Index() != subchannel_list()->attempting_index()) return; // Otherwise, process connectivity state. switch (new_state) { case GRPC_CHANNEL_READY: // Already handled this case above, so this should not happen. GPR_UNREACHABLE_CODE(break); case GRPC_CHANNEL_TRANSIENT_FAILURE: { - size_t next_index = (Index() + 1) % subchannel_list_->size(); - subchannel_list_->set_attempting_index(next_index); - SubchannelData& sd = subchannel_list_->subchannels_[next_index]; + size_t next_index = (Index() + 1) % subchannel_list()->num_subchannels(); + subchannel_list()->set_attempting_index(next_index); + PickFirstSubchannelData* sd = subchannel_list()->subchannel(next_index); // If we're tried all subchannels, set state to TRANSIENT_FAILURE. - if (sd.Index() == 0) { + if (sd->Index() == 0) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { gpr_log(GPR_INFO, "Pick First %p subchannel list %p failed to connect to " "all subchannels", - p, subchannel_list_); + p, subchannel_list()); } - subchannel_list_->set_in_transient_failure(true); + subchannel_list()->set_in_transient_failure(true); // In case 2, swap to the new subchannel list. This means reporting // TRANSIENT_FAILURE and dropping the existing (working) connection, // but we can't ignore what the control plane has told us. - if (subchannel_list_ == p->latest_pending_subchannel_list_.get()) { + if (subchannel_list() == p->latest_pending_subchannel_list_.get()) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { gpr_log(GPR_INFO, "Pick First %p promoting pending subchannel list %p to " @@ -663,19 +475,17 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( p, p->latest_pending_subchannel_list_.get(), p->subchannel_list_.get()); } - p->UnsetSelectedSubchannel(); + p->selected_ = nullptr; // owned by p->subchannel_list_ p->subchannel_list_ = std::move(p->latest_pending_subchannel_list_); } // If this is the current subchannel list (either because we were // in case 1 or because we were in case 2 and just promoted it to // be the current list), re-resolve and report new state. - if (subchannel_list_ == p->subchannel_list_.get()) { + if (subchannel_list() == p->subchannel_list_.get()) { p->channel_control_helper()->RequestReresolution(); - absl::Status status = absl::UnavailableError(absl::StrCat( - (p->omit_status_message_prefix_ - ? "" - : "failed to connect to all addresses; last error: "), - connectivity_status_.ToString())); + absl::Status status = absl::UnavailableError( + absl::StrCat("failed to connect to all addresses; last error: ", + connectivity_status().ToString())); p->channel_control_helper()->UpdateState( GRPC_CHANNEL_TRANSIENT_FAILURE, status, MakeRefCounted(status)); @@ -687,21 +497,21 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( // If it's already in CONNECTING, we don't need to do this. // If it's in TRANSIENT_FAILURE, then we will trigger the // connection attempt later when it reports IDLE. - auto sd_state = sd.connectivity_state(); + auto sd_state = sd->connectivity_state(); if (sd_state.has_value() && *sd_state == GRPC_CHANNEL_IDLE) { - sd.subchannel_->RequestConnection(); + sd->subchannel()->RequestConnection(); } break; } case GRPC_CHANNEL_IDLE: { - subchannel_->RequestConnection(); + subchannel()->RequestConnection(); break; } case GRPC_CHANNEL_CONNECTING: { // Only update connectivity state in case 1, and only if we're not // already in TRANSIENT_FAILURE. - if (subchannel_list_ == p->subchannel_list_.get() && - !subchannel_list_->in_transient_failure()) { + if (subchannel_list() == p->subchannel_list_.get() && + !subchannel_list()->in_transient_failure()) { p->channel_control_helper()->UpdateState( GRPC_CHANNEL_CONNECTING, absl::Status(), MakeRefCounted(p->Ref(DEBUG_LOCATION, "QueuePicker"))); @@ -713,8 +523,8 @@ void PickFirst::SubchannelList::SubchannelData::OnConnectivityStateChange( } } -void PickFirst::SubchannelList::SubchannelData::ProcessUnselectedReadyLocked() { - PickFirst* p = static_cast(subchannel_list_->policy()); +void PickFirst::PickFirstSubchannelData::ProcessUnselectedReadyLocked() { + PickFirst* p = static_cast(subchannel_list()->policy()); // If we get here, there are two possible cases: // 1. We do not currently have a selected subchannel, and the update is // for a subchannel in p->subchannel_list_ that we're trying to @@ -724,10 +534,10 @@ void PickFirst::SubchannelList::SubchannelData::ProcessUnselectedReadyLocked() { // for a subchannel in p->latest_pending_subchannel_list_. The // goal here is to find a subchannel from the update that we can // select in place of the current one. - GPR_ASSERT(subchannel_list_ == p->subchannel_list_.get() || - subchannel_list_ == p->latest_pending_subchannel_list_.get()); + GPR_ASSERT(subchannel_list() == p->subchannel_list_.get() || + subchannel_list() == p->latest_pending_subchannel_list_.get()); // Case 2. Promote p->latest_pending_subchannel_list_ to p->subchannel_list_. - if (subchannel_list_ == p->latest_pending_subchannel_list_.get()) { + if (subchannel_list() == p->latest_pending_subchannel_list_.get()) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { gpr_log(GPR_INFO, "Pick First %p promoting pending subchannel list %p to " @@ -739,114 +549,17 @@ void PickFirst::SubchannelList::SubchannelData::ProcessUnselectedReadyLocked() { } // Cases 1 and 2. if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { - gpr_log(GPR_INFO, "Pick First %p selected subchannel %p", p, - subchannel_.get()); + gpr_log(GPR_INFO, "Pick First %p selected subchannel %p", p, subchannel()); } p->selected_ = this; - // If health checking is enabled, start the health watch, but don't - // report a new picker -- we want to stay in CONNECTING while we wait - // for the health status notification. - // If health checking is NOT enabled, report READY. - if (p->enable_health_watch_) { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { - gpr_log(GPR_INFO, "[PF %p] starting health watch", p); - } - auto watcher = std::make_unique( - p->Ref(DEBUG_LOCATION, "HealthWatcher")); - p->health_watcher_ = watcher.get(); - auto health_data_watcher = MakeHealthCheckWatcher( - p->work_serializer(), subchannel_list_->args_, std::move(watcher)); - p->health_data_watcher_ = health_data_watcher.get(); - subchannel_->AddDataWatcher(std::move(health_data_watcher)); - } else { - p->channel_control_helper()->UpdateState( - GRPC_CHANNEL_READY, absl::Status(), - MakeRefCounted(subchannel_->Ref())); - } - // Unref all other subchannels in the list. - for (size_t i = 0; i < subchannel_list_->size(); ++i) { + p->channel_control_helper()->UpdateState( + GRPC_CHANNEL_READY, absl::Status(), + MakeRefCounted(subchannel()->Ref())); + for (size_t i = 0; i < subchannel_list()->num_subchannels(); ++i) { if (i != Index()) { - subchannel_list_->subchannels_[i].ShutdownLocked(); - } - } -} - -// -// PickFirst::SubchannelList -// - -PickFirst::SubchannelList::SubchannelList(RefCountedPtr policy, - ServerAddressList addresses, - const ChannelArgs& args) - : InternallyRefCounted( - GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace) ? "SubchannelList" - : nullptr), - policy_(std::move(policy)), - args_(args.Remove(GRPC_ARG_INTERNAL_PICK_FIRST_ENABLE_HEALTH_CHECKING) - .Remove( - GRPC_ARG_INTERNAL_PICK_FIRST_OMIT_STATUS_MESSAGE_PREFIX)) { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { - gpr_log(GPR_INFO, - "[PF %p] Creating subchannel list %p for %" PRIuPTR - " subchannels - channel args: %s", - policy_.get(), this, addresses.size(), args_.ToString().c_str()); - } - subchannels_.reserve(addresses.size()); - // Create a subchannel for each address. - for (const ServerAddress& address : addresses) { - RefCountedPtr subchannel = - policy_->channel_control_helper()->CreateSubchannel(address, args_); - if (subchannel == nullptr) { - // Subchannel could not be created. - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { - gpr_log(GPR_INFO, - "[PF %p] could not create subchannel for address %s, ignoring", - policy_.get(), address.ToString().c_str()); - } - continue; + subchannel_list()->subchannel(i)->ShutdownLocked(); } - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { - gpr_log(GPR_INFO, - "[PF %p] subchannel list %p index %" PRIuPTR - ": Created subchannel %p for address %s", - policy_.get(), this, subchannels_.size(), subchannel.get(), - address.ToString().c_str()); - } - subchannels_.emplace_back(this, std::move(subchannel)); - } -} - -PickFirst::SubchannelList::~SubchannelList() { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { - gpr_log(GPR_INFO, "[PF %p] Destroying subchannel_list %p", policy_.get(), - this); - } -} - -void PickFirst::SubchannelList::Orphan() { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) { - gpr_log(GPR_INFO, "[PF %p] Shutting down subchannel_list %p", policy_.get(), - this); - } - GPR_ASSERT(!shutting_down_); - shutting_down_ = true; - for (auto& sd : subchannels_) { - sd.ShutdownLocked(); - } - Unref(); -} - -void PickFirst::SubchannelList::ResetBackoffLocked() { - for (auto& sd : subchannels_) { - sd.ResetBackoffLocked(); - } -} - -bool PickFirst::SubchannelList::AllSubchannelsSeenInitialState() { - for (auto& sd : subchannels_) { - if (!sd.connectivity_state().has_value()) return false; } - return true; } // diff --git a/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.h b/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.h index 4525cef730c..fcd71d078a4 100644 --- a/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.h +++ b/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.h @@ -17,22 +17,6 @@ #ifndef GRPC_SRC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_PICK_FIRST_PICK_FIRST_H #define GRPC_SRC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_PICK_FIRST_PICK_FIRST_H -#include - -#include "src/core/lib/resolver/server_address.h" - -// Internal channel arg to enable health checking in pick_first. -// Intended to be used by petiole policies (e.g., round_robin) that -// delegate to pick_first. -#define GRPC_ARG_INTERNAL_PICK_FIRST_ENABLE_HEALTH_CHECKING \ - GRPC_ARG_NO_SUBCHANNEL_PREFIX "pick_first_enable_health_checking" - -// Internal channel arg to tell pick_first to omit the prefix it normally -// adds to error status messages. Intended to be used by petiole policies -// (e.g., round_robin) that want to add their own prefixes. -#define GRPC_ARG_INTERNAL_PICK_FIRST_OMIT_STATUS_MESSAGE_PREFIX \ - GRPC_ARG_NO_SUBCHANNEL_PREFIX "pick_first_omit_status_message_prefix" - namespace grpc_core { // TODO(eostroukhov): Remove once this feature is no longer experimental. bool ShufflePickFirstEnabled(); diff --git a/src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.cc b/src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.cc index ff4ec34fd57..f5767e4679f 100644 --- a/src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.cc +++ b/src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.cc @@ -23,7 +23,6 @@ #include #include -#include #include #include #include @@ -38,8 +37,6 @@ #include "absl/strings/string_view.h" #include "absl/types/optional.h" -#include - #define XXH_INLINE_ALL #include "xxhash.h" @@ -48,12 +45,11 @@ #include #include "src/core/ext/filters/client_channel/client_channel_internal.h" -#include "src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.h" +#include "src/core/ext/filters/client_channel/lb_policy/subchannel_list.h" #include "src/core/lib/address_utils/sockaddr_utils.h" #include "src/core/lib/channel/channel_args.h" #include "src/core/lib/config/core_configuration.h" #include "src/core/lib/debug/trace.h" -#include "src/core/lib/gprpp/crash.h" #include "src/core/lib/gprpp/debug_location.h" #include "src/core/lib/gprpp/orphanable.h" #include "src/core/lib/gprpp/ref_counted.h" @@ -63,12 +59,10 @@ #include "src/core/lib/iomgr/closure.h" #include "src/core/lib/iomgr/error.h" #include "src/core/lib/iomgr/exec_ctx.h" -#include "src/core/lib/iomgr/pollset_set.h" #include "src/core/lib/json/json.h" -#include "src/core/lib/load_balancing/delegating_helper.h" #include "src/core/lib/load_balancing/lb_policy.h" #include "src/core/lib/load_balancing/lb_policy_factory.h" -#include "src/core/lib/load_balancing/lb_policy_registry.h" +#include "src/core/lib/load_balancing/subchannel_interface.h" #include "src/core/lib/resolver/server_address.h" #include "src/core/lib/transport/connectivity_state.h" @@ -146,155 +140,201 @@ class RingHash : public LoadBalancingPolicy { void ResetBackoffLocked() override; private: - // A ring computed based on a config and address list. - class Ring : public RefCounted { + // Forward declaration. + class RingHashSubchannelList; + + // Data for a particular subchannel in a subchannel list. + // This subclass adds the following functionality: + // - Tracks the previous connectivity state of the subchannel, so that + // we know how many subchannels are in each state. + class RingHashSubchannelData + : public SubchannelData { public: - struct RingEntry { - uint64_t hash; - size_t endpoint_index; // Index into RingHash::addresses_. - }; - - Ring(RingHash* ring_hash, RingHashLbConfig* config); - - const std::vector& ring() const { return ring_; } + RingHashSubchannelData( + SubchannelList* + subchannel_list, + const ServerAddress& address, + RefCountedPtr subchannel) + : SubchannelData(subchannel_list, address, std::move(subchannel)), + address_(address) {} + + const ServerAddress& address() const { return address_; } + + grpc_connectivity_state logical_connectivity_state() const { + return logical_connectivity_state_; + } + const absl::Status& logical_connectivity_status() const { + return logical_connectivity_status_; + } private: - std::vector ring_; + // Performs connectivity state updates that need to be done only + // after we have started watching. + void ProcessConnectivityChangeLocked( + absl::optional old_state, + grpc_connectivity_state new_state) override; + + ServerAddress address_; + + // Last logical connectivity state seen. + // Note that this may differ from the state actually reported by the + // subchannel in some cases; for example, once this is set to + // TRANSIENT_FAILURE, we do not change it again until we get READY, + // so we skip any interim stops in CONNECTING. + grpc_connectivity_state logical_connectivity_state_ = GRPC_CHANNEL_IDLE; + absl::Status logical_connectivity_status_; }; - // State for a particular endpoint. Delegates to a pick_first child policy. - class RingHashEndpoint : public InternallyRefCounted { + // A list of subchannels and the ring containing those subchannels. + class RingHashSubchannelList + : public SubchannelList { public: - // index is the index into RingHash::addresses_ of this endpoint. - RingHashEndpoint(RefCountedPtr ring_hash, size_t index) - : ring_hash_(std::move(ring_hash)), index_(index) {} + class Ring : public RefCounted { + public: + struct RingEntry { + uint64_t hash; + size_t subchannel_index; + }; - void Orphan() override; + Ring(RingHashLbConfig* config, RingHashSubchannelList* subchannel_list, + const ChannelArgs& args); - size_t index() const { return index_; } + const std::vector& ring() const { return ring_; } - void UpdateLocked(size_t index); + private: + std::vector ring_; + }; - grpc_connectivity_state connectivity_state() const { - return connectivity_state_; - } + RingHashSubchannelList(RingHash* policy, ServerAddressList addresses, + const ChannelArgs& args); - // Returns info about the endpoint to be stored in the picker. - struct EndpointInfo { - RefCountedPtr endpoint; - RefCountedPtr picker; - grpc_connectivity_state state; - absl::Status status; - }; - EndpointInfo GetInfoForPicker() { - return {Ref(), picker_, connectivity_state_, status_}; + ~RingHashSubchannelList() override { + RingHash* p = static_cast(policy()); + p->Unref(DEBUG_LOCATION, "subchannel_list"); } - void ResetBackoffLocked(); - - // If the child policy does not yet exist, creates it; otherwise, - // asks the child to exit IDLE. - void RequestConnectionLocked(); + RefCountedPtr ring() { return ring_; } + + // Updates the counters of subchannels in each state when a + // subchannel transitions from old_state to new_state. + void UpdateStateCountersLocked(grpc_connectivity_state old_state, + grpc_connectivity_state new_state); + + // Updates the RH policy's connectivity state based on the + // subchannel list's state counters, creating new picker and new ring. + // The index parameter indicates the index into the list of the subchannel + // whose status report triggered the call to + // UpdateRingHashConnectivityStateLocked(). + // connection_attempt_complete is true if the subchannel just + // finished a connection attempt. + void UpdateRingHashConnectivityStateLocked(size_t index, + bool connection_attempt_complete, + absl::Status status); private: - class Helper; - - void CreateChildPolicy(); - void UpdateChildPolicyLocked(); + std::shared_ptr work_serializer() const override { + return static_cast(policy())->work_serializer(); + } - // Called when the child policy reports a connectivity state update. - void OnStateUpdate(grpc_connectivity_state new_state, - const absl::Status& status, - RefCountedPtr picker); + size_t num_idle_; + size_t num_ready_ = 0; + size_t num_connecting_ = 0; + size_t num_transient_failure_ = 0; - // Ref to our parent. - RefCountedPtr ring_hash_; - size_t index_; // Index into RingHash::addresses_ of this endpoint. + RefCountedPtr ring_; - // The pick_first child policy. - OrphanablePtr child_policy_; + // The index of the subchannel currently doing an internally + // triggered connection attempt, if any. + absl::optional internally_triggered_connection_index_; - grpc_connectivity_state connectivity_state_ = GRPC_CHANNEL_IDLE; - absl::Status status_; - RefCountedPtr picker_; + // TODO(roth): If we ever change the helper UpdateState() API to not + // need the status reported for TRANSIENT_FAILURE state (because + // it's not currently actually used for anything outside of the picker), + // then we will no longer need this data member. + absl::Status last_failure_; }; class Picker : public SubchannelPicker { public: - explicit Picker(RefCountedPtr ring_hash) - : ring_hash_(std::move(ring_hash)), - ring_(ring_hash_->ring_), - endpoints_(ring_hash_->addresses_.size()) { - for (const auto& p : ring_hash_->endpoint_map_) { - endpoints_[p.second->index()] = p.second->GetInfoForPicker(); + Picker(RefCountedPtr ring_hash_lb, + RingHashSubchannelList* subchannel_list) + : ring_hash_lb_(std::move(ring_hash_lb)), + ring_(subchannel_list->ring()) { + subchannels_.reserve(subchannel_list->num_subchannels()); + for (size_t i = 0; i < subchannel_list->num_subchannels(); ++i) { + RingHashSubchannelData* subchannel_data = + subchannel_list->subchannel(i); + subchannels_.emplace_back( + SubchannelInfo{subchannel_data->subchannel()->Ref(), + subchannel_data->logical_connectivity_state(), + subchannel_data->logical_connectivity_status()}); } } PickResult Pick(PickArgs args) override; private: - // A fire-and-forget class that schedules endpoint connection attempts + // A fire-and-forget class that schedules subchannel connection attempts // on the control plane WorkSerializer. - class EndpointConnectionAttempter { + class SubchannelConnectionAttempter : public Orphanable { public: - EndpointConnectionAttempter(RefCountedPtr ring_hash, - RefCountedPtr endpoint) - : ring_hash_(std::move(ring_hash)), endpoint_(std::move(endpoint)) { + explicit SubchannelConnectionAttempter( + RefCountedPtr ring_hash_lb) + : ring_hash_lb_(std::move(ring_hash_lb)) { + GRPC_CLOSURE_INIT(&closure_, RunInExecCtx, this, nullptr); + } + + void Orphan() override { // Hop into ExecCtx, so that we're not holding the data plane mutex // while we run control-plane code. - GRPC_CLOSURE_INIT(&closure_, RunInExecCtx, this, nullptr); ExecCtx::Run(DEBUG_LOCATION, &closure_, absl::OkStatus()); } + void AddSubchannel(RefCountedPtr subchannel) { + subchannels_.push_back(std::move(subchannel)); + } + private: static void RunInExecCtx(void* arg, grpc_error_handle /*error*/) { - auto* self = static_cast(arg); - self->ring_hash_->work_serializer()->Run( + auto* self = static_cast(arg); + self->ring_hash_lb_->work_serializer()->Run( [self]() { - if (!self->ring_hash_->shutdown_) { - self->endpoint_->RequestConnectionLocked(); + if (!self->ring_hash_lb_->shutdown_) { + for (auto& subchannel : self->subchannels_) { + subchannel->RequestConnection(); + } } delete self; }, DEBUG_LOCATION); } - RefCountedPtr ring_hash_; - RefCountedPtr endpoint_; + RefCountedPtr ring_hash_lb_; grpc_closure closure_; + std::vector> subchannels_; }; - RefCountedPtr ring_hash_; - RefCountedPtr ring_; - std::vector endpoints_; + struct SubchannelInfo { + RefCountedPtr subchannel; + grpc_connectivity_state state; + absl::Status status; + }; + + RefCountedPtr ring_hash_lb_; + RefCountedPtr ring_; + std::vector subchannels_; }; ~RingHash() override; void ShutdownLocked() override; - // Updates the aggregate policy's connectivity state based on the - // endpoint list's state counters, creating a new picker. - // entered_transient_failure is true if the endpoint has just - // entered TRANSIENT_FAILURE state. - // If the call to this method is triggered by an endpoint entering - // TRANSIENT_FAILURE, then status is the status reported by the endpoint. - void UpdateAggregatedConnectivityStateLocked(bool entered_transient_failure, - absl::Status status); - - // Current address list, channel args, and ring. - ServerAddressList addresses_; - ChannelArgs args_; - RefCountedPtr ring_; - - std::map> endpoint_map_; - - // TODO(roth): If we ever change the helper UpdateState() API to not - // need the status reported for TRANSIENT_FAILURE state (because - // it's not currently actually used for anything outside of the picker), - // then we will no longer need this data member. - absl::Status last_failure_; + // Current config from resolver. + RefCountedPtr config_; + // list of subchannels. + RefCountedPtr subchannel_list_; + RefCountedPtr latest_pending_subchannel_list_; // indicating if we are shutting down. bool shutdown_ = false; }; @@ -317,62 +357,106 @@ RingHash::PickResult RingHash::Picker::Pick(PickArgs args) { absl::InternalError("ring hash value is not a number")); } const auto& ring = ring_->ring(); - // Find the index in the ring to use for this RPC. // Ported from https://github.com/RJ/ketama/blob/master/libketama/ketama.c // (ketama_get_server) NOTE: The algorithm depends on using signed integers - // for lowp, highp, and index. Do not change them! + // for lowp, highp, and first_index. Do not change them! int64_t lowp = 0; int64_t highp = ring.size(); - int64_t index = 0; + int64_t first_index = 0; while (true) { - index = (lowp + highp) / 2; - if (index == static_cast(ring.size())) { - index = 0; + first_index = (lowp + highp) / 2; + if (first_index == static_cast(ring.size())) { + first_index = 0; break; } - uint64_t midval = ring[index].hash; - uint64_t midval1 = index == 0 ? 0 : ring[index - 1].hash; + uint64_t midval = ring[first_index].hash; + uint64_t midval1 = first_index == 0 ? 0 : ring[first_index - 1].hash; if (h <= midval && h > midval1) { break; } if (midval < h) { - lowp = index + 1; + lowp = first_index + 1; } else { - highp = index - 1; + highp = first_index - 1; } if (lowp > highp) { - index = 0; + first_index = 0; break; } } - // Find the first endpoint we can use from the selected index. - for (size_t i = 0; i < ring.size(); ++i) { - const auto& entry = ring[(index + i) % ring.size()]; - const auto& endpoint_info = endpoints_[entry.endpoint_index]; - switch (endpoint_info.state) { - case GRPC_CHANNEL_READY: - return endpoint_info.picker->Pick(args); - case GRPC_CHANNEL_IDLE: - new EndpointConnectionAttempter( - ring_hash_->Ref(DEBUG_LOCATION, "EndpointConnectionAttempter"), - endpoint_info.endpoint); - ABSL_FALLTHROUGH_INTENDED; - case GRPC_CHANNEL_CONNECTING: - return PickResult::Queue(); - default: - break; + OrphanablePtr subchannel_connection_attempter; + auto ScheduleSubchannelConnectionAttempt = + [&](RefCountedPtr subchannel) { + if (subchannel_connection_attempter == nullptr) { + subchannel_connection_attempter = + MakeOrphanable(ring_hash_lb_->Ref( + DEBUG_LOCATION, "SubchannelConnectionAttempter")); + } + subchannel_connection_attempter->AddSubchannel(std::move(subchannel)); + }; + SubchannelInfo& first_subchannel = + subchannels_[ring[first_index].subchannel_index]; + switch (first_subchannel.state) { + case GRPC_CHANNEL_READY: + return PickResult::Complete(first_subchannel.subchannel); + case GRPC_CHANNEL_IDLE: + ScheduleSubchannelConnectionAttempt(first_subchannel.subchannel); + ABSL_FALLTHROUGH_INTENDED; + case GRPC_CHANNEL_CONNECTING: + return PickResult::Queue(); + default: // GRPC_CHANNEL_TRANSIENT_FAILURE + break; + } + ScheduleSubchannelConnectionAttempt(first_subchannel.subchannel); + // Loop through remaining subchannels to find one in READY. + // On the way, we make sure the right set of connection attempts + // will happen. + bool found_second_subchannel = false; + bool found_first_non_failed = false; + for (size_t i = 1; i < ring.size(); ++i) { + const auto& entry = ring[(first_index + i) % ring.size()]; + if (entry.subchannel_index == ring[first_index].subchannel_index) { + continue; + } + SubchannelInfo& subchannel_info = subchannels_[entry.subchannel_index]; + if (subchannel_info.state == GRPC_CHANNEL_READY) { + return PickResult::Complete(subchannel_info.subchannel); + } + if (!found_second_subchannel) { + switch (subchannel_info.state) { + case GRPC_CHANNEL_IDLE: + ScheduleSubchannelConnectionAttempt(subchannel_info.subchannel); + ABSL_FALLTHROUGH_INTENDED; + case GRPC_CHANNEL_CONNECTING: + return PickResult::Queue(); + default: + break; + } + found_second_subchannel = true; + } + if (!found_first_non_failed) { + if (subchannel_info.state == GRPC_CHANNEL_TRANSIENT_FAILURE) { + ScheduleSubchannelConnectionAttempt(subchannel_info.subchannel); + } else { + if (subchannel_info.state == GRPC_CHANNEL_IDLE) { + ScheduleSubchannelConnectionAttempt(subchannel_info.subchannel); + } + found_first_non_failed = true; + } } } return PickResult::Fail(absl::UnavailableError(absl::StrCat( - "ring hash cannot find a connected endpoint; first failure: ", - endpoints_[ring[index].endpoint_index].status.message()))); + "ring hash cannot find a connected subchannel; first failure: ", + first_subchannel.status.ToString()))); } // -// RingHash::Ring +// RingHash::RingHashSubchannelList::Ring // -RingHash::Ring::Ring(RingHash* ring_hash, RingHashLbConfig* config) { +RingHash::RingHashSubchannelList::Ring::Ring( + RingHashLbConfig* config, RingHashSubchannelList* subchannel_list, + const ChannelArgs& args) { // Store the weights while finding the sum. struct AddressWeight { std::string address; @@ -383,15 +467,15 @@ RingHash::Ring::Ring(RingHash* ring_hash, RingHashLbConfig* config) { }; std::vector address_weights; size_t sum = 0; - const ServerAddressList& addresses = ring_hash->addresses_; - address_weights.reserve(addresses.size()); - for (const auto& address : addresses) { + address_weights.reserve(subchannel_list->num_subchannels()); + for (size_t i = 0; i < subchannel_list->num_subchannels(); ++i) { + RingHashSubchannelData* sd = subchannel_list->subchannel(i); + auto weight_arg = sd->address().args().GetInt(GRPC_ARG_ADDRESS_WEIGHT); AddressWeight address_weight; address_weight.address = - grpc_sockaddr_to_string(&address.address(), false).value(); + grpc_sockaddr_to_string(&sd->address().address(), false).value(); // Weight should never be zero, but ignore it just in case, since // that value would screw up the ring-building algorithm. - auto weight_arg = address.args().GetInt(GRPC_ARG_ADDRESS_WEIGHT); if (weight_arg.value_or(0) > 0) { address_weight.weight = *weight_arg; } @@ -415,9 +499,8 @@ RingHash::Ring::Ring(RingHash* ring_hash, RingHashLbConfig* config) { // weights aren't provided, all hosts should get an equal number of hashes. In // the case where this number exceeds the max_ring_size, it's scaled back down // to fit. - const size_t ring_size_cap = - ring_hash->args_.GetInt(GRPC_ARG_RING_HASH_LB_RING_SIZE_CAP) - .value_or(kRingSizeCapDefault); + const size_t ring_size_cap = args.GetInt(GRPC_ARG_RING_HASH_LB_RING_SIZE_CAP) + .value_or(kRingSizeCapDefault); const size_t min_ring_size = std::min(config->min_ring_size(), ring_size_cap); const size_t max_ring_size = std::min(config->max_ring_size(), ring_size_cap); const double scale = std::min( @@ -436,7 +519,7 @@ RingHash::Ring::Ring(RingHash* ring_hash, RingHashLbConfig* config) { double target_hashes = 0.0; uint64_t min_hashes_per_host = ring_size; uint64_t max_hashes_per_host = 0; - for (size_t i = 0; i < addresses.size(); ++i) { + for (size_t i = 0; i < subchannel_list->num_subchannels(); ++i) { const std::string& address_string = address_weights[i].address; hash_key_buffer.assign(address_string.begin(), address_string.end()); hash_key_buffer.emplace_back('_'); @@ -466,133 +549,210 @@ RingHash::Ring::Ring(RingHash* ring_hash, RingHashLbConfig* config) { } // -// RingHash::RingHashEndpoint::Helper +// RingHash::RingHashSubchannelList // -class RingHash::RingHashEndpoint::Helper - : public LoadBalancingPolicy::DelegatingChannelControlHelper { - public: - explicit Helper(RefCountedPtr endpoint) - : endpoint_(std::move(endpoint)) {} - - ~Helper() override { endpoint_.reset(DEBUG_LOCATION, "Helper"); } - - void UpdateState( - grpc_connectivity_state state, const absl::Status& status, - RefCountedPtr picker) override { - endpoint_->OnStateUpdate(state, status, std::move(picker)); +RingHash::RingHashSubchannelList::RingHashSubchannelList( + RingHash* policy, ServerAddressList addresses, const ChannelArgs& args) + : SubchannelList(policy, + (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace) + ? "RingHashSubchannelList" + : nullptr), + std::move(addresses), policy->channel_control_helper(), + args), + num_idle_(num_subchannels()) { + // Need to maintain a ref to the LB policy as long as we maintain + // any references to subchannels, since the subchannels' + // pollset_sets will include the LB policy's pollset_set. + policy->Ref(DEBUG_LOCATION, "subchannel_list").release(); + // Construct the ring. + ring_ = MakeRefCounted(policy->config_.get(), this, args); + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace)) { + gpr_log(GPR_INFO, + "[RH %p] created subchannel list %p with %" PRIuPTR " ring entries", + policy, this, ring_->ring().size()); } +} - private: - LoadBalancingPolicy::ChannelControlHelper* parent_helper() const override { - return endpoint_->ring_hash_->channel_control_helper(); +void RingHash::RingHashSubchannelList::UpdateStateCountersLocked( + grpc_connectivity_state old_state, grpc_connectivity_state new_state) { + if (old_state == GRPC_CHANNEL_IDLE) { + GPR_ASSERT(num_idle_ > 0); + --num_idle_; + } else if (old_state == GRPC_CHANNEL_READY) { + GPR_ASSERT(num_ready_ > 0); + --num_ready_; + } else if (old_state == GRPC_CHANNEL_CONNECTING) { + GPR_ASSERT(num_connecting_ > 0); + --num_connecting_; + } else if (old_state == GRPC_CHANNEL_TRANSIENT_FAILURE) { + GPR_ASSERT(num_transient_failure_ > 0); + --num_transient_failure_; } - - RefCountedPtr endpoint_; -}; - -// -// RingHash::RingHashEndpoint -// - -void RingHash::RingHashEndpoint::Orphan() { - if (child_policy_ != nullptr) { - // Remove pollset_set linkage. - grpc_pollset_set_del_pollset_set(child_policy_->interested_parties(), - ring_hash_->interested_parties()); - child_policy_.reset(); - picker_.reset(); + GPR_ASSERT(new_state != GRPC_CHANNEL_SHUTDOWN); + if (new_state == GRPC_CHANNEL_IDLE) { + ++num_idle_; + } else if (new_state == GRPC_CHANNEL_READY) { + ++num_ready_; + } else if (new_state == GRPC_CHANNEL_CONNECTING) { + ++num_connecting_; + } else if (new_state == GRPC_CHANNEL_TRANSIENT_FAILURE) { + ++num_transient_failure_; } - Unref(); -} - -void RingHash::RingHashEndpoint::UpdateLocked(size_t index) { - index_ = index; - if (child_policy_ != nullptr) UpdateChildPolicyLocked(); -} - -void RingHash::RingHashEndpoint::ResetBackoffLocked() { - if (child_policy_ != nullptr) child_policy_->ResetBackoffLocked(); } -void RingHash::RingHashEndpoint::RequestConnectionLocked() { - if (child_policy_ == nullptr) { - CreateChildPolicy(); +void RingHash::RingHashSubchannelList::UpdateRingHashConnectivityStateLocked( + size_t index, bool connection_attempt_complete, absl::Status status) { + RingHash* p = static_cast(policy()); + // If this is latest_pending_subchannel_list_, then swap it into + // subchannel_list_ as soon as we get the initial connectivity state + // report for every subchannel in the list. + if (p->latest_pending_subchannel_list_.get() == this && + AllSubchannelsSeenInitialState()) { + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace)) { + gpr_log(GPR_INFO, "[RH %p] replacing subchannel list %p with %p", p, + p->subchannel_list_.get(), this); + } + p->subchannel_list_ = std::move(p->latest_pending_subchannel_list_); + } + // Only set connectivity state if this is the current subchannel list. + if (p->subchannel_list_.get() != this) return; + // The overall aggregation rules here are: + // 1. If there is at least one subchannel in READY state, report READY. + // 2. If there are 2 or more subchannels in TRANSIENT_FAILURE state, report + // TRANSIENT_FAILURE. + // 3. If there is at least one subchannel in CONNECTING state, report + // CONNECTING. + // 4. If there is one subchannel in TRANSIENT_FAILURE state and there is + // more than one subchannel, report CONNECTING. + // 5. If there is at least one subchannel in IDLE state, report IDLE. + // 6. Otherwise, report TRANSIENT_FAILURE. + // + // We set start_connection_attempt to true if we match rules 2, 3, or 6. + grpc_connectivity_state state; + bool start_connection_attempt = false; + if (num_ready_ > 0) { + state = GRPC_CHANNEL_READY; + } else if (num_transient_failure_ >= 2) { + state = GRPC_CHANNEL_TRANSIENT_FAILURE; + start_connection_attempt = true; + } else if (num_connecting_ > 0) { + state = GRPC_CHANNEL_CONNECTING; + } else if (num_transient_failure_ == 1 && num_subchannels() > 1) { + state = GRPC_CHANNEL_CONNECTING; + start_connection_attempt = true; + } else if (num_idle_ > 0) { + state = GRPC_CHANNEL_IDLE; } else { - child_policy_->ExitIdleLocked(); + state = GRPC_CHANNEL_TRANSIENT_FAILURE; + start_connection_attempt = true; } -} - -void RingHash::RingHashEndpoint::CreateChildPolicy() { - GPR_ASSERT(child_policy_ == nullptr); - LoadBalancingPolicy::Args lb_policy_args; - lb_policy_args.work_serializer = ring_hash_->work_serializer(); - lb_policy_args.args = - ring_hash_->args_ - .Set(GRPC_ARG_INTERNAL_PICK_FIRST_ENABLE_HEALTH_CHECKING, true) - .Set(GRPC_ARG_INTERNAL_PICK_FIRST_OMIT_STATUS_MESSAGE_PREFIX, true); - lb_policy_args.channel_control_helper = - std::make_unique(Ref(DEBUG_LOCATION, "Helper")); - child_policy_ = - CoreConfiguration::Get().lb_policy_registry().CreateLoadBalancingPolicy( - "pick_first", std::move(lb_policy_args)); - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace)) { - const ServerAddress& address = ring_hash_->addresses_[index_]; - gpr_log(GPR_INFO, - "[RH %p] endpoint %p (index %" PRIuPTR " of %" PRIuPTR - ", %s): created child policy %p", - ring_hash_.get(), this, index_, ring_hash_->addresses_.size(), - address.ToString().c_str(), child_policy_.get()); + // In TRANSIENT_FAILURE, report the last reported failure. + // Otherwise, report OK. + if (state == GRPC_CHANNEL_TRANSIENT_FAILURE) { + if (!status.ok()) { + last_failure_ = absl::UnavailableError(absl::StrCat( + "no reachable subchannels; last error: ", status.ToString())); + } + status = last_failure_; + } else { + status = absl::OkStatus(); + } + // Generate new picker and return it to the channel. + // Note that we use our own picker regardless of connectivity state. + p->channel_control_helper()->UpdateState( + state, status, + MakeRefCounted(p->Ref(DEBUG_LOCATION, "RingHashPicker"), this)); + // While the ring_hash policy is reporting TRANSIENT_FAILURE, it will + // not be getting any pick requests from the priority policy. + // However, because the ring_hash policy does not attempt to + // reconnect to subchannels unless it is getting pick requests, + // it will need special handling to ensure that it will eventually + // recover from TRANSIENT_FAILURE state once the problem is resolved. + // Specifically, it will make sure that it is attempting to connect to + // at least one subchannel at any given time. After a given subchannel + // fails a connection attempt, it will move on to the next subchannel + // in the ring. It will keep doing this until one of the subchannels + // successfully connects, at which point it will report READY and stop + // proactively trying to connect. The policy will remain in + // TRANSIENT_FAILURE until at least one subchannel becomes connected, + // even if subchannels are in state CONNECTING during that time. + // + // Note that we do the same thing when the policy is in state + // CONNECTING, just to ensure that we don't remain in CONNECTING state + // indefinitely if there are no new picks coming in. + if (internally_triggered_connection_index_.has_value() && + *internally_triggered_connection_index_ == index && + connection_attempt_complete) { + internally_triggered_connection_index_.reset(); + } + if (start_connection_attempt && + !internally_triggered_connection_index_.has_value()) { + size_t next_index = (index + 1) % num_subchannels(); + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace)) { + gpr_log(GPR_INFO, + "[RH %p] triggering internal connection attempt for subchannel " + "%p, subchannel_list %p (index %" PRIuPTR " of %" PRIuPTR ")", + p, subchannel(next_index)->subchannel(), this, next_index, + num_subchannels()); + } + internally_triggered_connection_index_ = next_index; + subchannel(next_index)->subchannel()->RequestConnection(); } - // Add our interested_parties pollset_set to that of the newly created - // child policy. This will make the child policy progress upon activity on - // this policy, which in turn is tied to the application's call. - grpc_pollset_set_add_pollset_set(child_policy_->interested_parties(), - ring_hash_->interested_parties()); - UpdateChildPolicyLocked(); } -void RingHash::RingHashEndpoint::UpdateChildPolicyLocked() { - // Construct pick_first config. - auto config = - CoreConfiguration::Get().lb_policy_registry().ParseLoadBalancingConfig( - Json::FromArray( - {Json::FromObject({{"pick_first", Json::FromObject({})}})})); - GPR_ASSERT(config.ok()); - // Update child policy. - LoadBalancingPolicy::UpdateArgs update_args; - update_args.addresses.emplace().emplace_back(ring_hash_->addresses_[index_]); - update_args.args = ring_hash_->args_; - update_args.config = std::move(*config); - // TODO(roth): If the child reports a non-OK status with the update, - // we need to propagate that back to the resolver somehow. - (void)child_policy_->UpdateLocked(std::move(update_args)); -} +// +// RingHash::RingHashSubchannelData +// -void RingHash::RingHashEndpoint::OnStateUpdate( - grpc_connectivity_state new_state, const absl::Status& status, - RefCountedPtr picker) { +void RingHash::RingHashSubchannelData::ProcessConnectivityChangeLocked( + absl::optional old_state, + grpc_connectivity_state new_state) { + RingHash* p = static_cast(subchannel_list()->policy()); if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace)) { gpr_log( GPR_INFO, - "[RH %p] connectivity changed for endpoint %p (%s, child_policy=%p): " - "prev_state=%s new_state=%s (%s)", - ring_hash_.get(), this, - ring_hash_->addresses_[index_].ToString().c_str(), child_policy_.get(), - ConnectivityStateName(connectivity_state_), - ConnectivityStateName(new_state), status.ToString().c_str()); + "[RH %p] connectivity changed for subchannel %p, subchannel_list %p " + "(index %" PRIuPTR " of %" PRIuPTR "): prev_state=%s new_state=%s", + p, subchannel(), subchannel_list(), Index(), + subchannel_list()->num_subchannels(), + ConnectivityStateName(logical_connectivity_state_), + ConnectivityStateName(new_state)); + } + GPR_ASSERT(subchannel() != nullptr); + // If this is not the initial state notification and the new state is + // TRANSIENT_FAILURE or IDLE, re-resolve. + // Note that we don't want to do this on the initial state notification, + // because that would result in an endless loop of re-resolution. + if (old_state.has_value() && (new_state == GRPC_CHANNEL_TRANSIENT_FAILURE || + new_state == GRPC_CHANNEL_IDLE)) { + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace)) { + gpr_log(GPR_INFO, + "[RH %p] Subchannel %p reported %s; requesting re-resolution", p, + subchannel(), ConnectivityStateName(new_state)); + } + p->channel_control_helper()->RequestReresolution(); + } + const bool connection_attempt_complete = new_state != GRPC_CHANNEL_CONNECTING; + // Decide what state to report for the purposes of aggregation and + // picker behavior. + // If the last recorded state was TRANSIENT_FAILURE, ignore the change + // unless the new state is READY (or TF again, in which case we need + // to update the status). + if (logical_connectivity_state_ != GRPC_CHANNEL_TRANSIENT_FAILURE || + new_state == GRPC_CHANNEL_READY || + new_state == GRPC_CHANNEL_TRANSIENT_FAILURE) { + // Update state counters used for aggregation. + subchannel_list()->UpdateStateCountersLocked(logical_connectivity_state_, + new_state); + // Update logical state. + logical_connectivity_state_ = new_state; + logical_connectivity_status_ = connectivity_status(); } - if (child_policy_ == nullptr) return; // Already orphaned. - // Update state. - const bool entered_transient_failure = - connectivity_state_ != GRPC_CHANNEL_TRANSIENT_FAILURE && - new_state == GRPC_CHANNEL_TRANSIENT_FAILURE; - connectivity_state_ = new_state; - status_ = status; - picker_ = std::move(picker); - // Update the aggregated connectivity state. - ring_hash_->UpdateAggregatedConnectivityStateLocked(entered_transient_failure, - status); + // Update the RH policy's connectivity state, creating new picker and new + // ring. + subchannel_list()->UpdateRingHashConnectivityStateLocked( + Index(), connection_attempt_complete, logical_connectivity_status_); } // @@ -609,6 +769,8 @@ RingHash::~RingHash() { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace)) { gpr_log(GPR_INFO, "[RH %p] Destroying Ring Hash policy", this); } + GPR_ASSERT(subchannel_list_ == nullptr); + GPR_ASSERT(latest_pending_subchannel_list_ == nullptr); } void RingHash::ShutdownLocked() { @@ -616,212 +778,73 @@ void RingHash::ShutdownLocked() { gpr_log(GPR_INFO, "[RH %p] Shutting down", this); } shutdown_ = true; - endpoint_map_.clear(); + subchannel_list_.reset(); + latest_pending_subchannel_list_.reset(); } void RingHash::ResetBackoffLocked() { - for (const auto& p : endpoint_map_) { - p.second->ResetBackoffLocked(); + subchannel_list_->ResetBackoffLocked(); + if (latest_pending_subchannel_list_ != nullptr) { + latest_pending_subchannel_list_->ResetBackoffLocked(); } } absl::Status RingHash::UpdateLocked(UpdateArgs args) { - // Check address list. + config_ = std::move(args.config); + ServerAddressList addresses; if (args.addresses.ok()) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace)) { gpr_log(GPR_INFO, "[RH %p] received update with %" PRIuPTR " addresses", this, args.addresses->size()); } - addresses_ = *std::move(args.addresses); + addresses = *std::move(args.addresses); } else { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace)) { gpr_log(GPR_INFO, "[RH %p] received update with addresses error: %s", this, args.addresses.status().ToString().c_str()); } - // If we already have an endpoint list, then keep using the existing + // If we already have a subchannel list, then keep using the existing // list, but still report back that the update was not accepted. - if (!addresses_.empty()) return args.addresses.status(); + if (subchannel_list_ != nullptr) return args.addresses.status(); } - // Save channel args. - args_ = std::move(args.args); - // Build new ring. - ring_ = MakeRefCounted( - this, static_cast(args.config.get())); - // Update endpoint map. - std::map> endpoint_map; - for (size_t i = 0; i < addresses_.size(); ++i) { - const ServerAddress& address = addresses_[i]; - // If present in old map, retain it; otherwise, create a new one. - auto it = endpoint_map_.find(address); - if (it != endpoint_map_.end()) { - it->second->UpdateLocked(i); - endpoint_map.emplace(address, std::move(it->second)); - } else { - endpoint_map.emplace(address, MakeOrphanable(Ref(), i)); - } - } - endpoint_map_ = std::move(endpoint_map); - // If the address list is empty, report TRANSIENT_FAILURE. - // TODO(roth): As part of adding dualstack backend support, we need to - // also handle the case where the list of addresses for a given - // endpoint is empty. - if (addresses_.empty()) { - absl::Status status = - args.addresses.ok() ? absl::UnavailableError(absl::StrCat( - "empty address list: ", args.resolution_note)) - : args.addresses.status(); - channel_control_helper()->UpdateState( - GRPC_CHANNEL_TRANSIENT_FAILURE, status, - MakeRefCounted(status)); - return status; + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace) && + latest_pending_subchannel_list_ != nullptr) { + gpr_log(GPR_INFO, "[RH %p] replacing latest pending subchannel list %p", + this, latest_pending_subchannel_list_.get()); } - // Return a new picker. - UpdateAggregatedConnectivityStateLocked(/*entered_transient_failure=*/false, - absl::OkStatus()); - return absl::OkStatus(); -} - -void RingHash::UpdateAggregatedConnectivityStateLocked( - bool entered_transient_failure, absl::Status status) { - // Count the number of endpoints in each state. - size_t num_idle = 0; - size_t num_connecting = 0; - size_t num_ready = 0; - size_t num_transient_failure = 0; - for (const auto& p : endpoint_map_) { - switch (p.second->connectivity_state()) { - case GRPC_CHANNEL_READY: - ++num_ready; - break; - case GRPC_CHANNEL_IDLE: - ++num_idle; - break; - case GRPC_CHANNEL_CONNECTING: - ++num_connecting; - break; - case GRPC_CHANNEL_TRANSIENT_FAILURE: - ++num_transient_failure; - break; - default: - Crash("child policy should never report SHUTDOWN"); + latest_pending_subchannel_list_ = MakeRefCounted( + this, std::move(addresses), args.args); + latest_pending_subchannel_list_->StartWatchingLocked(args.args); + // If we have no existing list or the new list is empty, immediately + // promote the new list. + // Otherwise, do nothing; the new list will be promoted when the + // initial subchannel states are reported. + if (subchannel_list_ == nullptr || + latest_pending_subchannel_list_->num_subchannels() == 0) { + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace) && + subchannel_list_ != nullptr) { + gpr_log(GPR_INFO, + "[RH %p] empty address list, replacing subchannel list %p", this, + subchannel_list_.get()); } - } - // The overall aggregation rules here are: - // 1. If there is at least one endpoint in READY state, report READY. - // 2. If there are 2 or more endpoints in TRANSIENT_FAILURE state, report - // TRANSIENT_FAILURE. - // 3. If there is at least one endpoint in CONNECTING state, report - // CONNECTING. - // 4. If there is one endpoint in TRANSIENT_FAILURE state and there is - // more than one endpoint, report CONNECTING. - // 5. If there is at least one endpoint in IDLE state, report IDLE. - // 6. Otherwise, report TRANSIENT_FAILURE. - // - // We set start_connection_attempt to true if we match rules 2, 4, or 6. - grpc_connectivity_state state; - bool start_connection_attempt = false; - if (num_ready > 0) { - state = GRPC_CHANNEL_READY; - } else if (num_transient_failure >= 2) { - state = GRPC_CHANNEL_TRANSIENT_FAILURE; - start_connection_attempt = true; - } else if (num_connecting > 0) { - state = GRPC_CHANNEL_CONNECTING; - } else if (num_transient_failure == 1 && addresses_.size() > 1) { - state = GRPC_CHANNEL_CONNECTING; - start_connection_attempt = true; - } else if (num_idle > 0) { - state = GRPC_CHANNEL_IDLE; - } else { - state = GRPC_CHANNEL_TRANSIENT_FAILURE; - start_connection_attempt = true; - } - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace)) { - gpr_log(GPR_INFO, - "[RH %p] setting connectivity state to %s (num_idle=%" PRIuPTR - ", num_connecting=%" PRIuPTR ", num_ready=%" PRIuPTR - ", num_transient_failure=%" PRIuPTR ", size=%" PRIuPTR - ") -- start_connection_attempt=%d", - this, ConnectivityStateName(state), num_idle, num_connecting, - num_ready, num_transient_failure, addresses_.size(), - start_connection_attempt); - } - // In TRANSIENT_FAILURE, report the last reported failure. - // Otherwise, report OK. - if (state == GRPC_CHANNEL_TRANSIENT_FAILURE) { - if (!status.ok()) { - last_failure_ = absl::UnavailableError(absl::StrCat( - "no reachable endpoints; last error: ", status.message())); - } - status = last_failure_; - } else { - status = absl::OkStatus(); - } - // Generate new picker and return it to the channel. - // Note that we use our own picker regardless of connectivity state. - channel_control_helper()->UpdateState( - state, status, - MakeRefCounted(Ref(DEBUG_LOCATION, "RingHashPicker"))); - // While the ring_hash policy is reporting TRANSIENT_FAILURE, it will - // not be getting any pick requests from the priority policy. - // However, because the ring_hash policy does not attempt to - // reconnect to endpoints unless it is getting pick requests, - // it will need special handling to ensure that it will eventually - // recover from TRANSIENT_FAILURE state once the problem is resolved. - // Specifically, it will make sure that it is attempting to connect to - // at least one endpoint at any given time. But we don't want to just - // try to connect to only one endpoint, because if that particular - // endpoint happens to be down but the rest are reachable, we would - // incorrectly fail to recover. - // - // So, to handle this, whenever an endpoint initially enters - // TRANSIENT_FAILURE state (i.e., its initial connection attempt has - // failed), if there are no endpoints currently in CONNECTING state - // (i.e., they are still trying their initial connection attempt), - // then we will trigger a connection attempt for the first endpoint - // that is currently in state IDLE, if any. - // - // Note that once an endpoint enters TRANSIENT_FAILURE state, it will - // stay in that state and automatically retry after appropriate backoff, - // never stopping until it establishes a connection. This means that - // if we stay in TRANSIENT_FAILURE for a long period of time, we will - // eventually be trying *all* endpoints, which probably isn't ideal. - // But it's no different than what can happen if ring_hash is the root - // LB policy and we keep getting picks, so it's not really a new - // problem. If/when it becomes an issue, we can figure out how to - // address it. - // - // Note that we do the same thing when the policy is in state - // CONNECTING, just to ensure that we don't remain in CONNECTING state - // indefinitely if there are no new picks coming in. - if (start_connection_attempt && entered_transient_failure) { - size_t first_idle_index = addresses_.size(); - for (size_t i = 0; i < addresses_.size(); ++i) { - auto it = endpoint_map_.find(addresses_[i]); - GPR_ASSERT(it != endpoint_map_.end()); - if (it->second->connectivity_state() == GRPC_CHANNEL_CONNECTING) { - first_idle_index = addresses_.size(); - break; - } - if (first_idle_index == addresses_.size() && - it->second->connectivity_state() == GRPC_CHANNEL_IDLE) { - first_idle_index = i; - } - } - if (first_idle_index != addresses_.size()) { - auto it = endpoint_map_.find(addresses_[first_idle_index]); - GPR_ASSERT(it != endpoint_map_.end()); - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_ring_hash_trace)) { - gpr_log(GPR_INFO, - "[RH %p] triggering internal connection attempt for endpoint " - "%p (%s) (index %" PRIuPTR " of %" PRIuPTR ")", - this, it->second.get(), - addresses_[first_idle_index].ToString().c_str(), - first_idle_index, addresses_.size()); - } - it->second->RequestConnectionLocked(); + subchannel_list_ = std::move(latest_pending_subchannel_list_); + // If the new list is empty, report TRANSIENT_FAILURE. + if (subchannel_list_->num_subchannels() == 0) { + absl::Status status = + args.addresses.ok() + ? absl::UnavailableError( + absl::StrCat("empty address list: ", args.resolution_note)) + : args.addresses.status(); + channel_control_helper()->UpdateState( + GRPC_CHANNEL_TRANSIENT_FAILURE, status, + MakeRefCounted(status)); + return status; } + // Otherwise, report IDLE. + subchannel_list_->UpdateRingHashConnectivityStateLocked( + /*index=*/0, /*connection_attempt_complete=*/false, absl::OkStatus()); } + return absl::OkStatus(); } // diff --git a/src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.cc b/src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.cc index d883fe0c7cc..4cf71c9c951 100644 --- a/src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.cc +++ b/src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.cc @@ -37,7 +37,7 @@ #include #include -#include "src/core/ext/filters/client_channel/lb_policy/endpoint_list.h" +#include "src/core/ext/filters/client_channel/lb_policy/subchannel_list.h" #include "src/core/lib/channel/channel_args.h" #include "src/core/lib/config/core_configuration.h" #include "src/core/lib/debug/trace.h" @@ -48,6 +48,7 @@ #include "src/core/lib/json/json.h" #include "src/core/lib/load_balancing/lb_policy.h" #include "src/core/lib/load_balancing/lb_policy_factory.h" +#include "src/core/lib/load_balancing/subchannel_interface.h" #include "src/core/lib/resolver/server_address.h" #include "src/core/lib/transport/connectivity_state.h" @@ -73,60 +74,93 @@ class RoundRobin : public LoadBalancingPolicy { void ResetBackoffLocked() override; private: - class RoundRobinEndpointList : public EndpointList { + ~RoundRobin() override; + + // Forward declaration. + class RoundRobinSubchannelList; + + // Data for a particular subchannel in a subchannel list. + // This subclass adds the following functionality: + // - Tracks the previous connectivity state of the subchannel, so that + // we know how many subchannels are in each state. + class RoundRobinSubchannelData + : public SubchannelData { public: - RoundRobinEndpointList(RefCountedPtr round_robin, - const ServerAddressList& addresses, - const ChannelArgs& args) - : EndpointList(std::move(round_robin), - GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace) - ? "RoundRobinEndpointList" - : nullptr) { - Init(addresses, args, - [&](RefCountedPtr endpoint_list, - const ServerAddress& address, const ChannelArgs& args) { - return MakeOrphanable( - std::move(endpoint_list), address, args, - policy()->work_serializer()); - }); + RoundRobinSubchannelData( + SubchannelList* + subchannel_list, + const ServerAddress& address, + RefCountedPtr subchannel) + : SubchannelData(subchannel_list, address, std::move(subchannel)) {} + + absl::optional connectivity_state() const { + return logical_connectivity_state_; } private: - class RoundRobinEndpoint : public Endpoint { - public: - RoundRobinEndpoint(RefCountedPtr endpoint_list, - const ServerAddress& address, const ChannelArgs& args, - std::shared_ptr work_serializer) - : Endpoint(std::move(endpoint_list)) { - Init(address, args, std::move(work_serializer)); - } - - private: - // Called when the child policy reports a connectivity state update. - void OnStateUpdate(absl::optional old_state, - grpc_connectivity_state new_state, - const absl::Status& status) override; - }; - - LoadBalancingPolicy::ChannelControlHelper* channel_control_helper() - const override { - return policy()->channel_control_helper(); + // Performs connectivity state updates that need to be done only + // after we have started watching. + void ProcessConnectivityChangeLocked( + absl::optional old_state, + grpc_connectivity_state new_state) override; + + // Updates the logical connectivity state. + void UpdateLogicalConnectivityStateLocked( + grpc_connectivity_state connectivity_state); + + // The logical connectivity state of the subchannel. + // Note that the logical connectivity state may differ from the + // actual reported state in some cases (e.g., after we see + // TRANSIENT_FAILURE, we ignore any subsequent state changes until + // we see READY). + absl::optional logical_connectivity_state_; + }; + + // A list of subchannels. + class RoundRobinSubchannelList + : public SubchannelList { + public: + RoundRobinSubchannelList(RoundRobin* policy, ServerAddressList addresses, + const ChannelArgs& args) + : SubchannelList(policy, + (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace) + ? "RoundRobinSubchannelList" + : nullptr), + std::move(addresses), policy->channel_control_helper(), + args) { + // Need to maintain a ref to the LB policy as long as we maintain + // any references to subchannels, since the subchannels' + // pollset_sets will include the LB policy's pollset_set. + policy->Ref(DEBUG_LOCATION, "subchannel_list").release(); } - // Updates the counters of children in each state when a - // child transitions from old_state to new_state. + ~RoundRobinSubchannelList() override { + RoundRobin* p = static_cast(policy()); + p->Unref(DEBUG_LOCATION, "subchannel_list"); + } + + // Updates the counters of subchannels in each state when a + // subchannel transitions from old_state to new_state. void UpdateStateCountersLocked( absl::optional old_state, grpc_connectivity_state new_state); - // Ensures that the right child list is used and then updates - // the RR policy's connectivity state based on the child list's + // Ensures that the right subchannel list is used and then updates + // the RR policy's connectivity state based on the subchannel list's // state counters. void MaybeUpdateRoundRobinConnectivityStateLocked( absl::Status status_for_tf); + private: + std::shared_ptr work_serializer() const override { + return static_cast(policy())->work_serializer(); + } + std::string CountersString() const { - return absl::StrCat("num_children=", size(), " num_ready=", num_ready_, + return absl::StrCat("num_subchannels=", num_subchannels(), + " num_ready=", num_ready_, " num_connecting=", num_connecting_, " num_transient_failure=", num_transient_failure_); } @@ -140,9 +174,7 @@ class RoundRobin : public LoadBalancingPolicy { class Picker : public SubchannelPicker { public: - Picker(RoundRobin* parent, - std::vector> - pickers); + Picker(RoundRobin* parent, RoundRobinSubchannelList* subchannel_list); PickResult Pick(PickArgs args) override; @@ -151,20 +183,18 @@ class RoundRobin : public LoadBalancingPolicy { RoundRobin* parent_; std::atomic last_picked_index_; - std::vector> pickers_; + std::vector> subchannels_; }; - ~RoundRobin() override; - void ShutdownLocked() override; - // Current child list. - OrphanablePtr endpoint_list_; - // Latest pending child list. - // When we get an updated address list, we create a new child list - // for it here, and we wait to swap it into endpoint_list_ until the new + // List of subchannels. + RefCountedPtr subchannel_list_; + // Latest pending subchannel list. + // When we get an updated address list, we create a new subchannel list + // for it here, and we wait to swap it into subchannel_list_ until the new // list becomes READY. - OrphanablePtr latest_pending_endpoint_list_; + RefCountedPtr latest_pending_subchannel_list_; bool shutdown_ = false; @@ -175,32 +205,38 @@ class RoundRobin : public LoadBalancingPolicy { // RoundRobin::Picker // -RoundRobin::Picker::Picker( - RoundRobin* parent, - std::vector> pickers) - : parent_(parent), pickers_(std::move(pickers)) { +RoundRobin::Picker::Picker(RoundRobin* parent, + RoundRobinSubchannelList* subchannel_list) + : parent_(parent) { + for (size_t i = 0; i < subchannel_list->num_subchannels(); ++i) { + RoundRobinSubchannelData* sd = subchannel_list->subchannel(i); + if (sd->connectivity_state().value_or(GRPC_CHANNEL_IDLE) == + GRPC_CHANNEL_READY) { + subchannels_.push_back(sd->subchannel()->Ref()); + } + } // For discussion on why we generate a random starting index for // the picker, see https://github.com/grpc/grpc-go/issues/2580. - size_t index = absl::Uniform(parent->bit_gen_, 0, pickers_.size()); + size_t index = + absl::Uniform(parent->bit_gen_, 0, subchannels_.size()); last_picked_index_.store(index, std::memory_order_relaxed); if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { gpr_log(GPR_INFO, - "[RR %p picker %p] created picker from endpoint_list=%p " - "with %" PRIuPTR " READY children; last_picked_index_=%" PRIuPTR, - parent_, this, parent_->endpoint_list_.get(), pickers_.size(), - index); + "[RR %p picker %p] created picker from subchannel_list=%p " + "with %" PRIuPTR " READY subchannels; last_picked_index_=%" PRIuPTR, + parent_, this, subchannel_list, subchannels_.size(), index); } } -RoundRobin::PickResult RoundRobin::Picker::Pick(PickArgs args) { +RoundRobin::PickResult RoundRobin::Picker::Pick(PickArgs /*args*/) { size_t index = last_picked_index_.fetch_add(1, std::memory_order_relaxed) % - pickers_.size(); + subchannels_.size(); if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { gpr_log(GPR_INFO, - "[RR %p picker %p] using picker index %" PRIuPTR ", picker=%p", - parent_, this, index, pickers_[index].get()); + "[RR %p picker %p] returning index %" PRIuPTR ", subchannel=%p", + parent_, this, index, subchannels_[index].get()); } - return pickers_[index]->Pick(args); + return PickResult::Complete(subchannels_[index]); } // @@ -217,8 +253,8 @@ RoundRobin::~RoundRobin() { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { gpr_log(GPR_INFO, "[RR %p] Destroying Round Robin policy", this); } - GPR_ASSERT(endpoint_list_ == nullptr); - GPR_ASSERT(latest_pending_endpoint_list_ == nullptr); + GPR_ASSERT(subchannel_list_ == nullptr); + GPR_ASSERT(latest_pending_subchannel_list_ == nullptr); } void RoundRobin::ShutdownLocked() { @@ -226,14 +262,14 @@ void RoundRobin::ShutdownLocked() { gpr_log(GPR_INFO, "[RR %p] Shutting down", this); } shutdown_ = true; - endpoint_list_.reset(); - latest_pending_endpoint_list_.reset(); + subchannel_list_.reset(); + latest_pending_subchannel_list_.reset(); } void RoundRobin::ResetBackoffLocked() { - endpoint_list_->ResetBackoffLocked(); - if (latest_pending_endpoint_list_ != nullptr) { - latest_pending_endpoint_list_->ResetBackoffLocked(); + subchannel_list_->ResetBackoffLocked(); + if (latest_pending_subchannel_list_ != nullptr) { + latest_pending_subchannel_list_->ResetBackoffLocked(); } } @@ -250,31 +286,28 @@ absl::Status RoundRobin::UpdateLocked(UpdateArgs args) { gpr_log(GPR_INFO, "[RR %p] received update with address error: %s", this, args.addresses.status().ToString().c_str()); } - // If we already have a child list, then keep using the existing + // If we already have a subchannel list, then keep using the existing // list, but still report back that the update was not accepted. - if (endpoint_list_ != nullptr) return args.addresses.status(); + if (subchannel_list_ != nullptr) return args.addresses.status(); } - // Create new child list, replacing the previous pending list, if any. + // Create new subchannel list, replacing the previous pending list, if any. if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace) && - latest_pending_endpoint_list_ != nullptr) { - gpr_log(GPR_INFO, "[RR %p] replacing previous pending child list %p", this, - latest_pending_endpoint_list_.get()); + latest_pending_subchannel_list_ != nullptr) { + gpr_log(GPR_INFO, "[RR %p] replacing previous pending subchannel list %p", + this, latest_pending_subchannel_list_.get()); } - latest_pending_endpoint_list_ = MakeOrphanable( - Ref(DEBUG_LOCATION, "RoundRobinEndpointList"), std::move(addresses), - args.args); + latest_pending_subchannel_list_ = MakeRefCounted( + this, std::move(addresses), args.args); + latest_pending_subchannel_list_->StartWatchingLocked(args.args); // If the new list is empty, immediately promote it to - // endpoint_list_ and report TRANSIENT_FAILURE. - // TODO(roth): As part of adding dualstack backend support, we need to - // also handle the case where the list of addresses for a given - // endpoint is empty. - if (latest_pending_endpoint_list_->size() == 0) { + // subchannel_list_ and report TRANSIENT_FAILURE. + if (latest_pending_subchannel_list_->num_subchannels() == 0) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace) && - endpoint_list_ != nullptr) { - gpr_log(GPR_INFO, "[RR %p] replacing previous child list %p", this, - endpoint_list_.get()); + subchannel_list_ != nullptr) { + gpr_log(GPR_INFO, "[RR %p] replacing previous subchannel list %p", this, + subchannel_list_.get()); } - endpoint_list_ = std::move(latest_pending_endpoint_list_); + subchannel_list_ = std::move(latest_pending_subchannel_list_); absl::Status status = args.addresses.ok() ? absl::UnavailableError(absl::StrCat( "empty address list: ", args.resolution_note)) @@ -285,64 +318,26 @@ absl::Status RoundRobin::UpdateLocked(UpdateArgs args) { return status; } // Otherwise, if this is the initial update, immediately promote it to - // endpoint_list_. - if (endpoint_list_ == nullptr) { - endpoint_list_ = std::move(latest_pending_endpoint_list_); + // subchannel_list_. + if (subchannel_list_.get() == nullptr) { + subchannel_list_ = std::move(latest_pending_subchannel_list_); } return absl::OkStatus(); } // -// RoundRobin::RoundRobinEndpointList::RoundRobinEndpoint -// - -void RoundRobin::RoundRobinEndpointList::RoundRobinEndpoint::OnStateUpdate( - absl::optional old_state, - grpc_connectivity_state new_state, const absl::Status& status) { - auto* rr_endpoint_list = endpoint_list(); - auto* round_robin = policy(); - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { - gpr_log(GPR_INFO, - "[RR %p] connectivity changed for child %p, endpoint_list %p " - "(index %" PRIuPTR " of %" PRIuPTR - "): prev_state=%s new_state=%s " - "(%s)", - round_robin, this, rr_endpoint_list, Index(), - rr_endpoint_list->size(), - (old_state.has_value() ? ConnectivityStateName(*old_state) : "N/A"), - ConnectivityStateName(new_state), status.ToString().c_str()); - } - if (new_state == GRPC_CHANNEL_IDLE) { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { - gpr_log(GPR_INFO, "[RR %p] child %p reported IDLE; requesting connection", - round_robin, this); - } - ExitIdleLocked(); - } - // If state changed, update state counters. - if (!old_state.has_value() || *old_state != new_state) { - rr_endpoint_list->UpdateStateCountersLocked(old_state, new_state); - } - // Update the policy state. - rr_endpoint_list->MaybeUpdateRoundRobinConnectivityStateLocked(status); -} - -// -// RoundRobin::RoundRobinEndpointList +// RoundRobinSubchannelList // -void RoundRobin::RoundRobinEndpointList::UpdateStateCountersLocked( +void RoundRobin::RoundRobinSubchannelList::UpdateStateCountersLocked( absl::optional old_state, grpc_connectivity_state new_state) { - // We treat IDLE the same as CONNECTING, since it will immediately - // transition into that state anyway. if (old_state.has_value()) { GPR_ASSERT(*old_state != GRPC_CHANNEL_SHUTDOWN); if (*old_state == GRPC_CHANNEL_READY) { GPR_ASSERT(num_ready_ > 0); --num_ready_; - } else if (*old_state == GRPC_CHANNEL_CONNECTING || - *old_state == GRPC_CHANNEL_IDLE) { + } else if (*old_state == GRPC_CHANNEL_CONNECTING) { GPR_ASSERT(num_connecting_ > 0); --num_connecting_; } else if (*old_state == GRPC_CHANNEL_TRANSIENT_FAILURE) { @@ -353,90 +348,161 @@ void RoundRobin::RoundRobinEndpointList::UpdateStateCountersLocked( GPR_ASSERT(new_state != GRPC_CHANNEL_SHUTDOWN); if (new_state == GRPC_CHANNEL_READY) { ++num_ready_; - } else if (new_state == GRPC_CHANNEL_CONNECTING || - new_state == GRPC_CHANNEL_IDLE) { + } else if (new_state == GRPC_CHANNEL_CONNECTING) { ++num_connecting_; } else if (new_state == GRPC_CHANNEL_TRANSIENT_FAILURE) { ++num_transient_failure_; } } -void RoundRobin::RoundRobinEndpointList:: +void RoundRobin::RoundRobinSubchannelList:: MaybeUpdateRoundRobinConnectivityStateLocked(absl::Status status_for_tf) { - auto* round_robin = policy(); - // If this is latest_pending_endpoint_list_, then swap it into - // endpoint_list_ in the following cases: - // - endpoint_list_ has no READY children. - // - This list has at least one READY child and we have seen the - // initial connectivity state notification for all children. - // - All of the children in this list are in TRANSIENT_FAILURE. + RoundRobin* p = static_cast(policy()); + // If this is latest_pending_subchannel_list_, then swap it into + // subchannel_list_ in the following cases: + // - subchannel_list_ has no READY subchannels. + // - This list has at least one READY subchannel and we have seen the + // initial connectivity state notification for all subchannels. + // - All of the subchannels in this list are in TRANSIENT_FAILURE. // (This may cause the channel to go from READY to TRANSIENT_FAILURE, // but we're doing what the control plane told us to do.) - if (round_robin->latest_pending_endpoint_list_.get() == this && - (round_robin->endpoint_list_->num_ready_ == 0 || - (num_ready_ > 0 && AllEndpointsSeenInitialState()) || - num_transient_failure_ == size())) { + if (p->latest_pending_subchannel_list_.get() == this && + (p->subchannel_list_->num_ready_ == 0 || + (num_ready_ > 0 && AllSubchannelsSeenInitialState()) || + num_transient_failure_ == num_subchannels())) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { const std::string old_counters_string = - round_robin->endpoint_list_ != nullptr - ? round_robin->endpoint_list_->CountersString() - : ""; - gpr_log(GPR_INFO, - "[RR %p] swapping out child list %p (%s) in favor of %p (%s)", - round_robin, round_robin->endpoint_list_.get(), - old_counters_string.c_str(), this, CountersString().c_str()); + p->subchannel_list_ != nullptr ? p->subchannel_list_->CountersString() + : ""; + gpr_log( + GPR_INFO, + "[RR %p] swapping out subchannel list %p (%s) in favor of %p (%s)", p, + p->subchannel_list_.get(), old_counters_string.c_str(), this, + CountersString().c_str()); } - round_robin->endpoint_list_ = - std::move(round_robin->latest_pending_endpoint_list_); + p->subchannel_list_ = std::move(p->latest_pending_subchannel_list_); } - // Only set connectivity state if this is the current child list. - if (round_robin->endpoint_list_.get() != this) return; - // FIXME: scan children each time instead of keeping counters? + // Only set connectivity state if this is the current subchannel list. + if (p->subchannel_list_.get() != this) return; // First matching rule wins: - // 1) ANY child is READY => policy is READY. - // 2) ANY child is CONNECTING => policy is CONNECTING. - // 3) ALL children are TRANSIENT_FAILURE => policy is TRANSIENT_FAILURE. + // 1) ANY subchannel is READY => policy is READY. + // 2) ANY subchannel is CONNECTING => policy is CONNECTING. + // 3) ALL subchannels are TRANSIENT_FAILURE => policy is TRANSIENT_FAILURE. if (num_ready_ > 0) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { - gpr_log(GPR_INFO, "[RR %p] reporting READY with child list %p", - round_robin, this); + gpr_log(GPR_INFO, "[RR %p] reporting READY with subchannel list %p", p, + this); } - std::vector> pickers; - for (const auto& endpoint : endpoints()) { - auto state = endpoint->connectivity_state(); - if (state.has_value() && *state == GRPC_CHANNEL_READY) { - pickers.push_back(endpoint->picker()); - } - } - GPR_ASSERT(!pickers.empty()); - round_robin->channel_control_helper()->UpdateState( - GRPC_CHANNEL_READY, absl::OkStatus(), - MakeRefCounted(round_robin, std::move(pickers))); + p->channel_control_helper()->UpdateState(GRPC_CHANNEL_READY, absl::Status(), + MakeRefCounted(p, this)); } else if (num_connecting_ > 0) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { - gpr_log(GPR_INFO, "[RR %p] reporting CONNECTING with child list %p", - round_robin, this); + gpr_log(GPR_INFO, "[RR %p] reporting CONNECTING with subchannel list %p", + p, this); } - round_robin->channel_control_helper()->UpdateState( + p->channel_control_helper()->UpdateState( GRPC_CHANNEL_CONNECTING, absl::Status(), - MakeRefCounted(nullptr)); - } else if (num_transient_failure_ == size()) { + MakeRefCounted(p->Ref(DEBUG_LOCATION, "QueuePicker"))); + } else if (num_transient_failure_ == num_subchannels()) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { gpr_log(GPR_INFO, - "[RR %p] reporting TRANSIENT_FAILURE with child list %p: %s", - round_robin, this, status_for_tf.ToString().c_str()); + "[RR %p] reporting TRANSIENT_FAILURE with subchannel list %p: %s", + p, this, status_for_tf.ToString().c_str()); } if (!status_for_tf.ok()) { last_failure_ = absl::UnavailableError( absl::StrCat("connections to all backends failing; last error: ", - status_for_tf.message())); + status_for_tf.ToString())); } - round_robin->channel_control_helper()->UpdateState( + p->channel_control_helper()->UpdateState( GRPC_CHANNEL_TRANSIENT_FAILURE, last_failure_, MakeRefCounted(last_failure_)); } } +// +// RoundRobinSubchannelData +// + +void RoundRobin::RoundRobinSubchannelData::ProcessConnectivityChangeLocked( + absl::optional old_state, + grpc_connectivity_state new_state) { + RoundRobin* p = static_cast(subchannel_list()->policy()); + GPR_ASSERT(subchannel() != nullptr); + // If this is not the initial state notification and the new state is + // TRANSIENT_FAILURE or IDLE, re-resolve. + // Note that we don't want to do this on the initial state notification, + // because that would result in an endless loop of re-resolution. + if (old_state.has_value() && (new_state == GRPC_CHANNEL_TRANSIENT_FAILURE || + new_state == GRPC_CHANNEL_IDLE)) { + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { + gpr_log(GPR_INFO, + "[RR %p] Subchannel %p reported %s; requesting re-resolution", p, + subchannel(), ConnectivityStateName(new_state)); + } + p->channel_control_helper()->RequestReresolution(); + } + if (new_state == GRPC_CHANNEL_IDLE) { + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { + gpr_log(GPR_INFO, + "[RR %p] Subchannel %p reported IDLE; requesting connection", p, + subchannel()); + } + subchannel()->RequestConnection(); + } + // Update logical connectivity state. + UpdateLogicalConnectivityStateLocked(new_state); + // Update the policy state. + subchannel_list()->MaybeUpdateRoundRobinConnectivityStateLocked( + connectivity_status()); +} + +void RoundRobin::RoundRobinSubchannelData::UpdateLogicalConnectivityStateLocked( + grpc_connectivity_state connectivity_state) { + RoundRobin* p = static_cast(subchannel_list()->policy()); + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { + gpr_log( + GPR_INFO, + "[RR %p] connectivity changed for subchannel %p, subchannel_list %p " + "(index %" PRIuPTR " of %" PRIuPTR "): prev_state=%s new_state=%s", + p, subchannel(), subchannel_list(), Index(), + subchannel_list()->num_subchannels(), + (logical_connectivity_state_.has_value() + ? ConnectivityStateName(*logical_connectivity_state_) + : "N/A"), + ConnectivityStateName(connectivity_state)); + } + // Decide what state to report for aggregation purposes. + // If the last logical state was TRANSIENT_FAILURE, then ignore the + // state change unless the new state is READY. + if (logical_connectivity_state_.has_value() && + *logical_connectivity_state_ == GRPC_CHANNEL_TRANSIENT_FAILURE && + connectivity_state != GRPC_CHANNEL_READY) { + return; + } + // If the new state is IDLE, treat it as CONNECTING, since it will + // immediately transition into CONNECTING anyway. + if (connectivity_state == GRPC_CHANNEL_IDLE) { + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_round_robin_trace)) { + gpr_log(GPR_INFO, + "[RR %p] subchannel %p, subchannel_list %p (index %" PRIuPTR + " of %" PRIuPTR "): treating IDLE as CONNECTING", + p, subchannel(), subchannel_list(), Index(), + subchannel_list()->num_subchannels()); + } + connectivity_state = GRPC_CHANNEL_CONNECTING; + } + // If no change, return false. + if (logical_connectivity_state_.has_value() && + *logical_connectivity_state_ == connectivity_state) { + return; + } + // Otherwise, update counters and logical state. + subchannel_list()->UpdateStateCountersLocked(logical_connectivity_state_, + connectivity_state); + logical_connectivity_state_ = connectivity_state; +} + // // factory // diff --git a/src/core/ext/filters/client_channel/lb_policy/subchannel_list.h b/src/core/ext/filters/client_channel/lb_policy/subchannel_list.h new file mode 100644 index 00000000000..87072c7381c --- /dev/null +++ b/src/core/ext/filters/client_channel/lb_policy/subchannel_list.h @@ -0,0 +1,488 @@ +// +// Copyright 2015 gRPC authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifndef GRPC_SRC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_SUBCHANNEL_LIST_H +#define GRPC_SRC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_SUBCHANNEL_LIST_H + +#include + +#include +#include + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/types/optional.h" + +#include +#include +#include + +#include "src/core/ext/filters/client_channel/client_channel_internal.h" +#include "src/core/ext/filters/client_channel/lb_policy/health_check_client.h" +#include "src/core/lib/channel/channel_args.h" +#include "src/core/lib/gprpp/debug_location.h" +#include "src/core/lib/gprpp/dual_ref_counted.h" +#include "src/core/lib/gprpp/manual_constructor.h" +#include "src/core/lib/gprpp/ref_counted_ptr.h" +#include "src/core/lib/gprpp/work_serializer.h" +#include "src/core/lib/iomgr/iomgr_fwd.h" +#include "src/core/lib/load_balancing/lb_policy.h" +#include "src/core/lib/load_balancing/subchannel_interface.h" +#include "src/core/lib/resolver/server_address.h" +#include "src/core/lib/transport/connectivity_state.h" + +// Code for maintaining a list of subchannels within an LB policy. +// +// To use this, callers must create their own subclasses, like so: +// + +// class MySubchannelList; // Forward declaration. + +// class MySubchannelData +// : public SubchannelData { +// public: +// void ProcessConnectivityChangeLocked( +// absl::optional old_state, +// grpc_connectivity_state new_state) override { +// // ...code to handle connectivity changes... +// } +// }; + +// class MySubchannelList +// : public SubchannelList { +// }; + +// +// All methods will be called from within the client_channel work serializer. + +namespace grpc_core { + +// Forward declaration. +template +class SubchannelList; + +// Stores data for a particular subchannel in a subchannel list. +// Callers must create a subclass that implements the +// ProcessConnectivityChangeLocked() method. +template +class SubchannelData { + public: + // Returns a pointer to the subchannel list containing this object. + SubchannelListType* subchannel_list() const { + return static_cast(subchannel_list_); + } + + // Returns the index into the subchannel list of this object. + size_t Index() const { + return static_cast(static_cast(this) - + subchannel_list_->subchannel(0)); + } + + // Returns a pointer to the subchannel. + SubchannelInterface* subchannel() const { return subchannel_.get(); } + + // Returns the cached connectivity state, if any. + absl::optional connectivity_state() { + return connectivity_state_; + } + absl::Status connectivity_status() { return connectivity_status_; } + + // Resets the connection backoff. + void ResetBackoffLocked(); + + // Cancels any pending connectivity watch and unrefs the subchannel. + void ShutdownLocked(); + + protected: + SubchannelData( + SubchannelList* subchannel_list, + const ServerAddress& address, + RefCountedPtr subchannel); + + virtual ~SubchannelData(); + + // This method will be invoked once soon after instantiation to report + // the current connectivity state, and it will then be invoked again + // whenever the connectivity state changes. + virtual void ProcessConnectivityChangeLocked( + absl::optional old_state, + grpc_connectivity_state new_state) = 0; + + private: + // For accessing StartConnectivityWatchLocked(). + friend class SubchannelList; + + // Watcher for subchannel connectivity state. + class Watcher + : public SubchannelInterface::ConnectivityStateWatcherInterface { + public: + Watcher( + SubchannelData* subchannel_data, + WeakRefCountedPtr subchannel_list) + : subchannel_data_(subchannel_data), + subchannel_list_(std::move(subchannel_list)) {} + + ~Watcher() override { + subchannel_list_.reset(DEBUG_LOCATION, "Watcher dtor"); + } + + void OnConnectivityStateChange(grpc_connectivity_state new_state, + absl::Status status) override; + + grpc_pollset_set* interested_parties() override { + return subchannel_list_->policy()->interested_parties(); + } + + private: + SubchannelData* subchannel_data_; + WeakRefCountedPtr subchannel_list_; + }; + + // Starts watching the connectivity state of the subchannel. + // ProcessConnectivityChangeLocked() will be called whenever the + // connectivity state changes. + void StartConnectivityWatchLocked(const ChannelArgs& args); + + // Cancels watching the connectivity state of the subchannel. + void CancelConnectivityWatchLocked(const char* reason); + + // Unrefs the subchannel. + void UnrefSubchannelLocked(const char* reason); + + // Backpointer to owning subchannel list. Not owned. + SubchannelList* subchannel_list_; + // The subchannel. + RefCountedPtr subchannel_; + // Will be non-null when the subchannel's state is being watched. + SubchannelInterface::ConnectivityStateWatcherInterface* pending_watcher_ = + nullptr; + SubchannelInterface::DataWatcherInterface* health_watcher_ = nullptr; + // Data updated by the watcher. + absl::optional connectivity_state_; + absl::Status connectivity_status_; +}; + +// A list of subchannels. +template +class SubchannelList : public DualRefCounted { + public: + // Starts watching the connectivity state of all subchannels. + // Must be called immediately after instantiation. + void StartWatchingLocked(const ChannelArgs& args); + + // The number of subchannels in the list. + size_t num_subchannels() const { return subchannels_.size(); } + + // The data for the subchannel at a particular index. + SubchannelDataType* subchannel(size_t index) { + return subchannels_[index].get(); + } + + // Returns true if the subchannel list is shutting down. + bool shutting_down() const { return shutting_down_; } + + // Accessors. + LoadBalancingPolicy* policy() const { return policy_; } + const char* tracer() const { return tracer_; } + + // Resets connection backoff of all subchannels. + void ResetBackoffLocked(); + + // Returns true if all subchannels have seen their initial + // connectivity state notifications. + bool AllSubchannelsSeenInitialState(); + + void Orphan() override; + + protected: + SubchannelList(LoadBalancingPolicy* policy, const char* tracer, + ServerAddressList addresses, + LoadBalancingPolicy::ChannelControlHelper* helper, + const ChannelArgs& args); + + virtual ~SubchannelList(); + + private: + // For accessing Ref() and Unref(). + friend class SubchannelData; + + virtual std::shared_ptr work_serializer() const = 0; + + // Backpointer to owning policy. + LoadBalancingPolicy* policy_; + + const char* tracer_; + + absl::optional health_check_service_name_; + + // The list of subchannels. + // We use ManualConstructor here to support SubchannelDataType classes + // that are not copyable. + std::vector> subchannels_; + + // Is this list shutting down? This may be true due to the shutdown of the + // policy itself or because a newer update has arrived while this one hadn't + // finished processing. + bool shutting_down_ = false; +}; + +// +// implementation -- no user-servicable parts below +// + +// +// SubchannelData::Watcher +// + +template +void SubchannelData::Watcher:: + OnConnectivityStateChange(grpc_connectivity_state new_state, + absl::Status status) { + if (GPR_UNLIKELY(subchannel_list_->tracer() != nullptr)) { + gpr_log( + GPR_INFO, + "[%s %p] subchannel list %p index %" PRIuPTR " of %" PRIuPTR + " (subchannel %p): connectivity changed: old_state=%s, new_state=%s, " + "status=%s, shutting_down=%d, pending_watcher=%p, health_watcher=%p", + subchannel_list_->tracer(), subchannel_list_->policy(), + subchannel_list_.get(), subchannel_data_->Index(), + subchannel_list_->num_subchannels(), + subchannel_data_->subchannel_.get(), + (subchannel_data_->connectivity_state_.has_value() + ? ConnectivityStateName(*subchannel_data_->connectivity_state_) + : "N/A"), + ConnectivityStateName(new_state), status.ToString().c_str(), + subchannel_list_->shutting_down(), subchannel_data_->pending_watcher_, + subchannel_data_->health_watcher_); + } + if (!subchannel_list_->shutting_down() && + (subchannel_data_->pending_watcher_ != nullptr || + subchannel_data_->health_watcher_ != nullptr)) { + absl::optional old_state = + subchannel_data_->connectivity_state_; + subchannel_data_->connectivity_state_ = new_state; + subchannel_data_->connectivity_status_ = status; + // Call the subclass's ProcessConnectivityChangeLocked() method. + subchannel_data_->ProcessConnectivityChangeLocked(old_state, new_state); + } +} + +// +// SubchannelData +// + +template +SubchannelData::SubchannelData( + SubchannelList* subchannel_list, + const ServerAddress& /*address*/, + RefCountedPtr subchannel) + : subchannel_list_(subchannel_list), subchannel_(std::move(subchannel)) {} + +template +SubchannelData::~SubchannelData() { + GPR_ASSERT(subchannel_ == nullptr); +} + +template +void SubchannelData:: + UnrefSubchannelLocked(const char* reason) { + if (subchannel_ != nullptr) { + if (GPR_UNLIKELY(subchannel_list_->tracer() != nullptr)) { + gpr_log(GPR_INFO, + "[%s %p] subchannel list %p index %" PRIuPTR " of %" PRIuPTR + " (subchannel %p): unreffing subchannel (%s)", + subchannel_list_->tracer(), subchannel_list_->policy(), + subchannel_list_, Index(), subchannel_list_->num_subchannels(), + subchannel_.get(), reason); + } + subchannel_.reset(); + } +} + +template +void SubchannelData::ResetBackoffLocked() { + if (subchannel_ != nullptr) { + subchannel_->ResetBackoff(); + } +} + +template +void SubchannelData:: + StartConnectivityWatchLocked(const ChannelArgs& args) { + if (GPR_UNLIKELY(subchannel_list_->tracer() != nullptr)) { + gpr_log( + GPR_INFO, + "[%s %p] subchannel list %p index %" PRIuPTR " of %" PRIuPTR + " (subchannel %p): starting watch " + "(health_check_service_name=\"%s\")", + subchannel_list_->tracer(), subchannel_list_->policy(), + subchannel_list_, Index(), subchannel_list_->num_subchannels(), + subchannel_.get(), + subchannel_list()->health_check_service_name_.value_or("N/A").c_str()); + } + GPR_ASSERT(pending_watcher_ == nullptr); + GPR_ASSERT(health_watcher_ == nullptr); + auto watcher = std::make_unique( + this, subchannel_list()->WeakRef(DEBUG_LOCATION, "Watcher")); + if (subchannel_list()->health_check_service_name_.has_value()) { + auto health_watcher = MakeHealthCheckWatcher( + subchannel_list_->work_serializer(), args, std::move(watcher)); + health_watcher_ = health_watcher.get(); + subchannel_->AddDataWatcher(std::move(health_watcher)); + } else { + pending_watcher_ = watcher.get(); + subchannel_->WatchConnectivityState(std::move(watcher)); + } +} + +template +void SubchannelData:: + CancelConnectivityWatchLocked(const char* reason) { + if (pending_watcher_ != nullptr) { + if (GPR_UNLIKELY(subchannel_list_->tracer() != nullptr)) { + gpr_log(GPR_INFO, + "[%s %p] subchannel list %p index %" PRIuPTR " of %" PRIuPTR + " (subchannel %p): canceling connectivity watch (%s)", + subchannel_list_->tracer(), subchannel_list_->policy(), + subchannel_list_, Index(), subchannel_list_->num_subchannels(), + subchannel_.get(), reason); + } + subchannel_->CancelConnectivityStateWatch(pending_watcher_); + pending_watcher_ = nullptr; + } else if (health_watcher_ != nullptr) { + if (GPR_UNLIKELY(subchannel_list_->tracer() != nullptr)) { + gpr_log(GPR_INFO, + "[%s %p] subchannel list %p index %" PRIuPTR " of %" PRIuPTR + " (subchannel %p): canceling health watch (%s)", + subchannel_list_->tracer(), subchannel_list_->policy(), + subchannel_list_, Index(), subchannel_list_->num_subchannels(), + subchannel_.get(), reason); + } + subchannel_->CancelDataWatcher(health_watcher_); + health_watcher_ = nullptr; + } +} + +template +void SubchannelData::ShutdownLocked() { + CancelConnectivityWatchLocked("shutdown"); + UnrefSubchannelLocked("shutdown"); +} + +// +// SubchannelList +// + +template +SubchannelList::SubchannelList( + LoadBalancingPolicy* policy, const char* tracer, + ServerAddressList addresses, + LoadBalancingPolicy::ChannelControlHelper* helper, const ChannelArgs& args) + : DualRefCounted(tracer), + policy_(policy), + tracer_(tracer) { + if (!args.GetBool(GRPC_ARG_INHIBIT_HEALTH_CHECKING).value_or(false)) { + health_check_service_name_ = + args.GetOwnedString(GRPC_ARG_HEALTH_CHECK_SERVICE_NAME); + } + if (GPR_UNLIKELY(tracer_ != nullptr)) { + gpr_log(GPR_INFO, + "[%s %p] Creating subchannel list %p for %" PRIuPTR " subchannels", + tracer_, policy, this, addresses.size()); + } + subchannels_.reserve(addresses.size()); + // Create a subchannel for each address. + for (ServerAddress address : addresses) { + RefCountedPtr subchannel = + helper->CreateSubchannel(address, args); + if (subchannel == nullptr) { + // Subchannel could not be created. + if (GPR_UNLIKELY(tracer_ != nullptr)) { + gpr_log(GPR_INFO, + "[%s %p] could not create subchannel for address %s, ignoring", + tracer_, policy_, address.ToString().c_str()); + } + continue; + } + if (GPR_UNLIKELY(tracer_ != nullptr)) { + gpr_log(GPR_INFO, + "[%s %p] subchannel list %p index %" PRIuPTR + ": Created subchannel %p for address %s", + tracer_, policy_, this, subchannels_.size(), subchannel.get(), + address.ToString().c_str()); + } + subchannels_.emplace_back(); + subchannels_.back().Init(this, std::move(address), std::move(subchannel)); + } +} + +template +SubchannelList::~SubchannelList() { + if (GPR_UNLIKELY(tracer_ != nullptr)) { + gpr_log(GPR_INFO, "[%s %p] Destroying subchannel_list %p", tracer_, policy_, + this); + } + for (auto& sd : subchannels_) { + sd.Destroy(); + } +} + +template +void SubchannelList:: + StartWatchingLocked(const ChannelArgs& args) { + for (auto& sd : subchannels_) { + sd->StartConnectivityWatchLocked(args); + } +} + +template +void SubchannelList::Orphan() { + if (GPR_UNLIKELY(tracer_ != nullptr)) { + gpr_log(GPR_INFO, "[%s %p] Shutting down subchannel_list %p", tracer_, + policy_, this); + } + GPR_ASSERT(!shutting_down_); + shutting_down_ = true; + for (auto& sd : subchannels_) { + sd->ShutdownLocked(); + } +} + +template +void SubchannelList::ResetBackoffLocked() { + for (auto& sd : subchannels_) { + sd->ResetBackoffLocked(); + } +} + +template +bool SubchannelList::AllSubchannelsSeenInitialState() { + for (size_t i = 0; i < num_subchannels(); ++i) { + if (!subchannel(i)->connectivity_state().has_value()) return false; + } + return true; +} + +} // namespace grpc_core + +#endif // GRPC_SRC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_SUBCHANNEL_LIST_H diff --git a/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc b/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc index fb169c725d9..ecafa42d5ea 100644 --- a/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc +++ b/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc @@ -39,15 +39,14 @@ #include "absl/strings/str_join.h" #include "absl/strings/string_view.h" #include "absl/types/optional.h" -#include "absl/types/variant.h" #include #include #include #include "src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h" -#include "src/core/ext/filters/client_channel/lb_policy/endpoint_list.h" #include "src/core/ext/filters/client_channel/lb_policy/oob_backend_metric.h" +#include "src/core/ext/filters/client_channel/lb_policy/subchannel_list.h" #include "src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.h" #include "src/core/lib/address_utils/sockaddr_utils.h" #include "src/core/lib/channel/channel_args.h" @@ -153,11 +152,11 @@ class WeightedRoundRobin : public LoadBalancingPolicy { private: // Represents the weight for a given address. - class EndpointWeight : public RefCounted { + class AddressWeight : public RefCounted { public: - EndpointWeight(RefCountedPtr wrr, std::string key) + AddressWeight(RefCountedPtr wrr, std::string key) : wrr_(std::move(wrr)), key_(std::move(key)) {} - ~EndpointWeight() override; + ~AddressWeight() override; void MaybeUpdateWeight(double qps, double eps, double utilization, float error_utilization_penalty); @@ -177,83 +176,109 @@ class WeightedRoundRobin : public LoadBalancingPolicy { Timestamp last_update_time_ ABSL_GUARDED_BY(&mu_) = Timestamp::InfPast(); }; - class WrrEndpointList : public EndpointList { + // Forward declaration. + class WeightedRoundRobinSubchannelList; + + // Data for a particular subchannel in a subchannel list. + // This subclass adds the following functionality: + // - Tracks the previous connectivity state of the subchannel, so that + // we know how many subchannels are in each state. + class WeightedRoundRobinSubchannelData + : public SubchannelData { public: - class WrrEndpoint : public Endpoint { + WeightedRoundRobinSubchannelData( + SubchannelList* subchannel_list, + const ServerAddress& address, RefCountedPtr sc); + + absl::optional connectivity_state() const { + return logical_connectivity_state_; + } + + RefCountedPtr weight() const { return weight_; } + + private: + class OobWatcher : public OobBackendMetricWatcher { public: - WrrEndpoint(RefCountedPtr endpoint_list, - const ServerAddress& address, const ChannelArgs& args, - std::shared_ptr work_serializer) - : Endpoint(std::move(endpoint_list)), - weight_(policy()->GetOrCreateWeight( - address.address())) { - Init(address, args, std::move(work_serializer)); - } + OobWatcher(RefCountedPtr weight, + float error_utilization_penalty) + : weight_(std::move(weight)), + error_utilization_penalty_(error_utilization_penalty) {} - RefCountedPtr weight() const { return weight_; } + void OnBackendMetricReport( + const BackendMetricData& backend_metric_data) override; private: - class OobWatcher : public OobBackendMetricWatcher { - public: - OobWatcher(RefCountedPtr weight, - float error_utilization_penalty) - : weight_(std::move(weight)), - error_utilization_penalty_(error_utilization_penalty) {} - - void OnBackendMetricReport( - const BackendMetricData& backend_metric_data) override; - - private: - RefCountedPtr weight_; - const float error_utilization_penalty_; - }; - - RefCountedPtr CreateSubchannel( - ServerAddress address, const ChannelArgs& args) override; - - // Called when the child policy reports a connectivity state update. - void OnStateUpdate(absl::optional old_state, - grpc_connectivity_state new_state, - const absl::Status& status) override; - - RefCountedPtr weight_; + RefCountedPtr weight_; + const float error_utilization_penalty_; }; - WrrEndpointList(RefCountedPtr wrr, - const ServerAddressList& addresses, const ChannelArgs& args) - : EndpointList(std::move(wrr), - GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace) - ? "WrrEndpointList" - : nullptr) { - Init(addresses, args, - [&](RefCountedPtr endpoint_list, - const ServerAddress& address, const ChannelArgs& args) { - return MakeOrphanable( - std::move(endpoint_list), address, args, - policy()->work_serializer()); - }); + // Performs connectivity state updates that need to be done only + // after we have started watching. + void ProcessConnectivityChangeLocked( + absl::optional old_state, + grpc_connectivity_state new_state) override; + + // Updates the logical connectivity state. + void UpdateLogicalConnectivityStateLocked( + grpc_connectivity_state connectivity_state); + + // The logical connectivity state of the subchannel. + // Note that the logical connectivity state may differ from the + // actual reported state in some cases (e.g., after we see + // TRANSIENT_FAILURE, we ignore any subsequent state changes until + // we see READY). + absl::optional logical_connectivity_state_; + + RefCountedPtr weight_; + }; + + // A list of subchannels. + class WeightedRoundRobinSubchannelList + : public SubchannelList { + public: + WeightedRoundRobinSubchannelList(WeightedRoundRobin* policy, + ServerAddressList addresses, + const ChannelArgs& args) + : SubchannelList(policy, + (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace) + ? "WeightedRoundRobinSubchannelList" + : nullptr), + std::move(addresses), policy->channel_control_helper(), + args) { + // Need to maintain a ref to the LB policy as long as we maintain + // any references to subchannels, since the subchannels' + // pollset_sets will include the LB policy's pollset_set. + policy->Ref(DEBUG_LOCATION, "subchannel_list").release(); } - private: - LoadBalancingPolicy::ChannelControlHelper* channel_control_helper() - const override { - return policy()->channel_control_helper(); + ~WeightedRoundRobinSubchannelList() override { + WeightedRoundRobin* p = static_cast(policy()); + p->Unref(DEBUG_LOCATION, "subchannel_list"); } - // Updates the counters of children in each state when a - // child transitions from old_state to new_state. + // Updates the counters of subchannels in each state when a + // subchannel transitions from old_state to new_state. void UpdateStateCountersLocked( absl::optional old_state, grpc_connectivity_state new_state); - // Ensures that the right child list is used and then updates - // the WRR policy's connectivity state based on the child list's + // Ensures that the right subchannel list is used and then updates + // the aggregated connectivity state based on the subchannel list's // state counters. void MaybeUpdateAggregatedConnectivityStateLocked( absl::Status status_for_tf); + private: + std::shared_ptr work_serializer() const override { + return static_cast(policy())->work_serializer(); + } + std::string CountersString() const { - return absl::StrCat("num_children=", size(), " num_ready=", num_ready_, + return absl::StrCat("num_subchannels=", num_subchannels(), + " num_ready=", num_ready_, " num_connecting=", num_connecting_, " num_transient_failure=", num_transient_failure_); } @@ -270,7 +295,7 @@ class WeightedRoundRobin : public LoadBalancingPolicy { class Picker : public SubchannelPicker { public: Picker(RefCountedPtr wrr, - WrrEndpointList* endpoint_list); + WeightedRoundRobinSubchannelList* subchannel_list); ~Picker() override; @@ -282,34 +307,31 @@ class WeightedRoundRobin : public LoadBalancingPolicy { // A call tracker that collects per-call endpoint utilization reports. class SubchannelCallTracker : public SubchannelCallTrackerInterface { public: - SubchannelCallTracker( - RefCountedPtr weight, float error_utilization_penalty, - std::unique_ptr child_tracker) + SubchannelCallTracker(RefCountedPtr weight, + float error_utilization_penalty) : weight_(std::move(weight)), - error_utilization_penalty_(error_utilization_penalty), - child_tracker_(std::move(child_tracker)) {} + error_utilization_penalty_(error_utilization_penalty) {} - void Start() override; + void Start() override {} void Finish(FinishArgs args) override; private: - RefCountedPtr weight_; + RefCountedPtr weight_; const float error_utilization_penalty_; - std::unique_ptr child_tracker_; }; - // Info stored about each endpoint. - struct EndpointInfo { - EndpointInfo(RefCountedPtr picker, - RefCountedPtr weight) - : picker(std::move(picker)), weight(std::move(weight)) {} + // Info stored about each subchannel. + struct SubchannelInfo { + SubchannelInfo(RefCountedPtr subchannel, + RefCountedPtr weight) + : subchannel(std::move(subchannel)), weight(std::move(weight)) {} - RefCountedPtr picker; - RefCountedPtr weight; + RefCountedPtr subchannel; + RefCountedPtr weight; }; - // Returns the index into endpoints_ to be picked. + // Returns the index into subchannels_ to be picked. size_t PickIndex(); // Builds a new scheduler and swaps it into place, then starts a @@ -319,7 +341,7 @@ class WeightedRoundRobin : public LoadBalancingPolicy { RefCountedPtr wrr_; RefCountedPtr config_; - std::vector endpoints_; + std::vector subchannels_; Mutex scheduler_mu_; std::shared_ptr scheduler_ @@ -337,22 +359,23 @@ class WeightedRoundRobin : public LoadBalancingPolicy { void ShutdownLocked() override; - RefCountedPtr GetOrCreateWeight( + RefCountedPtr GetOrCreateWeight( const grpc_resolved_address& address); RefCountedPtr config_; - // List of endpoints. - OrphanablePtr endpoint_list_; - // Latest pending endpoint list. - // When we get an updated address list, we create a new endpoint list - // for it here, and we wait to swap it into endpoint_list_ until the new + // List of subchannels. + RefCountedPtr subchannel_list_; + // Latest pending subchannel list. + // When we get an updated address list, we create a new subchannel list + // for it here, and we wait to swap it into subchannel_list_ until the new // list becomes READY. - OrphanablePtr latest_pending_endpoint_list_; + RefCountedPtr + latest_pending_subchannel_list_; - Mutex endpoint_weight_map_mu_; - std::map> endpoint_weight_map_ - ABSL_GUARDED_BY(&endpoint_weight_map_mu_); + Mutex address_weight_map_mu_; + std::map> address_weight_map_ + ABSL_GUARDED_BY(&address_weight_map_mu_); bool shutdown_ = false; @@ -363,18 +386,18 @@ class WeightedRoundRobin : public LoadBalancingPolicy { }; // -// WeightedRoundRobin::EndpointWeight +// WeightedRoundRobin::AddressWeight // -WeightedRoundRobin::EndpointWeight::~EndpointWeight() { - MutexLock lock(&wrr_->endpoint_weight_map_mu_); - auto it = wrr_->endpoint_weight_map_.find(key_); - if (it != wrr_->endpoint_weight_map_.end() && it->second == this) { - wrr_->endpoint_weight_map_.erase(it); +WeightedRoundRobin::AddressWeight::~AddressWeight() { + MutexLock lock(&wrr_->address_weight_map_mu_); + auto it = wrr_->address_weight_map_.find(key_); + if (it != wrr_->address_weight_map_.end() && it->second == this) { + wrr_->address_weight_map_.erase(it); } } -void WeightedRoundRobin::EndpointWeight::MaybeUpdateWeight( +void WeightedRoundRobin::AddressWeight::MaybeUpdateWeight( double qps, double eps, double utilization, float error_utilization_penalty) { // Compute weight. @@ -414,7 +437,7 @@ void WeightedRoundRobin::EndpointWeight::MaybeUpdateWeight( last_update_time_ = now; } -float WeightedRoundRobin::EndpointWeight::GetWeight( +float WeightedRoundRobin::AddressWeight::GetWeight( Timestamp now, Duration weight_expiration_period, Duration blackout_period) { MutexLock lock(&mu_); @@ -445,7 +468,7 @@ float WeightedRoundRobin::EndpointWeight::GetWeight( return weight_; } -void WeightedRoundRobin::EndpointWeight::ResetNonEmptySince() { +void WeightedRoundRobin::AddressWeight::ResetNonEmptySince() { MutexLock lock(&mu_); non_empty_since_ = Timestamp::InfFuture(); } @@ -454,13 +477,8 @@ void WeightedRoundRobin::EndpointWeight::ResetNonEmptySince() { // WeightedRoundRobin::Picker::SubchannelCallTracker // -void WeightedRoundRobin::Picker::SubchannelCallTracker::Start() { - if (child_tracker_ != nullptr) child_tracker_->Start(); -} - void WeightedRoundRobin::Picker::SubchannelCallTracker::Finish( FinishArgs args) { - if (child_tracker_ != nullptr) child_tracker_->Finish(args); auto* backend_metric_data = args.backend_metric_accessor->GetBackendMetricData(); double qps = 0; @@ -481,22 +499,23 @@ void WeightedRoundRobin::Picker::SubchannelCallTracker::Finish( // WeightedRoundRobin::Picker // -WeightedRoundRobin::Picker::Picker(RefCountedPtr wrr, - WrrEndpointList* endpoint_list) +WeightedRoundRobin::Picker::Picker( + RefCountedPtr wrr, + WeightedRoundRobinSubchannelList* subchannel_list) : wrr_(std::move(wrr)), config_(wrr_->config_), last_picked_index_(absl::Uniform(wrr_->bit_gen_)) { - for (auto& endpoint : endpoint_list->endpoints()) { - auto* ep = static_cast(endpoint.get()); - if (ep->connectivity_state() == GRPC_CHANNEL_READY) { - endpoints_.emplace_back(ep->picker(), ep->weight()); + for (size_t i = 0; i < subchannel_list->num_subchannels(); ++i) { + WeightedRoundRobinSubchannelData* sd = subchannel_list->subchannel(i); + if (sd->connectivity_state() == GRPC_CHANNEL_READY) { + subchannels_.emplace_back(sd->subchannel()->Ref(), sd->weight()); } } if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { gpr_log(GPR_INFO, - "[WRR %p picker %p] created picker from endpoint_list=%p " + "[WRR %p picker %p] created picker from subchannel_list=%p " "with %" PRIuPTR " subchannels", - wrr_.get(), this, endpoint_list, endpoints_.size()); + wrr_.get(), this, subchannel_list, subchannels_.size()); } BuildSchedulerAndStartTimerLocked(); } @@ -514,30 +533,26 @@ void WeightedRoundRobin::Picker::Orphan() { } wrr_->channel_control_helper()->GetEventEngine()->Cancel(*timer_handle_); timer_handle_.reset(); - wrr_.reset(); } -WeightedRoundRobin::PickResult WeightedRoundRobin::Picker::Pick(PickArgs args) { +WeightedRoundRobin::PickResult WeightedRoundRobin::Picker::Pick( + PickArgs /*args*/) { size_t index = PickIndex(); - GPR_ASSERT(index < endpoints_.size()); - auto& endpoint_info = endpoints_[index]; - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { - gpr_log(GPR_INFO, - "[WRR %p picker %p] returning index %" PRIuPTR ", picker=%p", - wrr_.get(), this, index, endpoint_info.picker.get()); - } - auto result = endpoint_info.picker->Pick(args); + GPR_ASSERT(index < subchannels_.size()); + auto& subchannel_info = subchannels_[index]; // Collect per-call utilization data if needed. + std::unique_ptr subchannel_call_tracker; if (!config_->enable_oob_load_report()) { - auto* complete = absl::get_if(&result.result); - if (complete != nullptr) { - complete->subchannel_call_tracker = - std::make_unique( - endpoint_info.weight, config_->error_utilization_penalty(), - std::move(complete->subchannel_call_tracker)); - } + subchannel_call_tracker = std::make_unique( + subchannel_info.weight, config_->error_utilization_penalty()); + } + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { + gpr_log(GPR_INFO, + "[WRR %p picker %p] returning index %" PRIuPTR ", subchannel=%p", + wrr_.get(), this, index, subchannel_info.subchannel.get()); } - return result; + return PickResult::Complete(subchannel_info.subchannel, + std::move(subchannel_call_tracker)); } size_t WeightedRoundRobin::Picker::PickIndex() { @@ -551,16 +566,16 @@ size_t WeightedRoundRobin::Picker::PickIndex() { if (scheduler != nullptr) return scheduler->Pick(); // We don't have a scheduler (i.e., either all of the weights are 0 or // there is only one subchannel), so fall back to RR. - return last_picked_index_.fetch_add(1) % endpoints_.size(); + return last_picked_index_.fetch_add(1) % subchannels_.size(); } void WeightedRoundRobin::Picker::BuildSchedulerAndStartTimerLocked() { // Build scheduler. const Timestamp now = Timestamp::Now(); std::vector weights; - weights.reserve(endpoints_.size()); - for (const auto& endpoint : endpoints_) { - weights.push_back(endpoint.weight->GetWeight( + weights.reserve(subchannels_.size()); + for (const auto& subchannel : subchannels_) { + weights.push_back(subchannel.weight->GetWeight( now, config_->weight_expiration_period(), config_->blackout_period())); } if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { @@ -586,10 +601,6 @@ void WeightedRoundRobin::Picker::BuildSchedulerAndStartTimerLocked() { scheduler_ = std::move(scheduler); } // Start timer. - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { - gpr_log(GPR_INFO, "[WRR %p picker %p] scheduling timer for %s", wrr_.get(), - this, config_->weight_update_period().ToString().c_str()); - } WeakRefCountedPtr self = WeakRef(); timer_handle_ = wrr_->channel_control_helper()->GetEventEngine()->RunAfter( config_->weight_update_period(), [self = std::move(self)]() mutable { @@ -625,8 +636,8 @@ WeightedRoundRobin::~WeightedRoundRobin() { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { gpr_log(GPR_INFO, "[WRR %p] Destroying Round Robin policy", this); } - GPR_ASSERT(endpoint_list_ == nullptr); - GPR_ASSERT(latest_pending_endpoint_list_ == nullptr); + GPR_ASSERT(subchannel_list_ == nullptr); + GPR_ASSERT(latest_pending_subchannel_list_ == nullptr); } void WeightedRoundRobin::ShutdownLocked() { @@ -634,14 +645,14 @@ void WeightedRoundRobin::ShutdownLocked() { gpr_log(GPR_INFO, "[WRR %p] Shutting down", this); } shutdown_ = true; - endpoint_list_.reset(); - latest_pending_endpoint_list_.reset(); + subchannel_list_.reset(); + latest_pending_subchannel_list_.reset(); } void WeightedRoundRobin::ResetBackoffLocked() { - endpoint_list_->ResetBackoffLocked(); - if (latest_pending_endpoint_list_ != nullptr) { - latest_pending_endpoint_list_->ResetBackoffLocked(); + subchannel_list_->ResetBackoffLocked(); + if (latest_pending_subchannel_list_ != nullptr) { + latest_pending_subchannel_list_->ResetBackoffLocked(); } } @@ -681,28 +692,27 @@ absl::Status WeightedRoundRobin::UpdateLocked(UpdateArgs args) { } // If we already have a subchannel list, then keep using the existing // list, but still report back that the update was not accepted. - if (endpoint_list_ != nullptr) return args.addresses.status(); + if (subchannel_list_ != nullptr) return args.addresses.status(); } // Create new subchannel list, replacing the previous pending list, if any. if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace) && - latest_pending_endpoint_list_ != nullptr) { + latest_pending_subchannel_list_ != nullptr) { gpr_log(GPR_INFO, "[WRR %p] replacing previous pending subchannel list %p", - this, latest_pending_endpoint_list_.get()); + this, latest_pending_subchannel_list_.get()); } - latest_pending_endpoint_list_ = - MakeOrphanable(Ref(), std::move(addresses), args.args); + latest_pending_subchannel_list_ = + MakeRefCounted( + this, std::move(addresses), args.args); + latest_pending_subchannel_list_->StartWatchingLocked(args.args); // If the new list is empty, immediately promote it to - // endpoint_list_ and report TRANSIENT_FAILURE. - // TODO(roth): As part of adding dualstack backend support, we need to - // also handle the case where the list of addresses for a given - // endpoint is empty. - if (latest_pending_endpoint_list_->size() == 0) { + // subchannel_list_ and report TRANSIENT_FAILURE. + if (latest_pending_subchannel_list_->num_subchannels() == 0) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace) && - endpoint_list_ != nullptr) { + subchannel_list_ != nullptr) { gpr_log(GPR_INFO, "[WRR %p] replacing previous subchannel list %p", this, - endpoint_list_.get()); + subchannel_list_.get()); } - endpoint_list_ = std::move(latest_pending_endpoint_list_); + subchannel_list_ = std::move(latest_pending_subchannel_list_); absl::Status status = args.addresses.ok() ? absl::UnavailableError(absl::StrCat( "empty address list: ", args.resolution_note)) @@ -713,126 +723,42 @@ absl::Status WeightedRoundRobin::UpdateLocked(UpdateArgs args) { return status; } // Otherwise, if this is the initial update, immediately promote it to - // endpoint_list_. - if (endpoint_list_.get() == nullptr) { - endpoint_list_ = std::move(latest_pending_endpoint_list_); + // subchannel_list_. + if (subchannel_list_.get() == nullptr) { + subchannel_list_ = std::move(latest_pending_subchannel_list_); } return absl::OkStatus(); } -RefCountedPtr +RefCountedPtr WeightedRoundRobin::GetOrCreateWeight(const grpc_resolved_address& address) { auto key = grpc_sockaddr_to_uri(&address); if (!key.ok()) return nullptr; - MutexLock lock(&endpoint_weight_map_mu_); - auto it = endpoint_weight_map_.find(*key); - if (it != endpoint_weight_map_.end()) { + MutexLock lock(&address_weight_map_mu_); + auto it = address_weight_map_.find(*key); + if (it != address_weight_map_.end()) { auto weight = it->second->RefIfNonZero(); if (weight != nullptr) return weight; } - auto weight = MakeRefCounted( - Ref(DEBUG_LOCATION, "EndpointWeight"), *key); - endpoint_weight_map_.emplace(*key, weight.get()); + auto weight = + MakeRefCounted(Ref(DEBUG_LOCATION, "AddressWeight"), *key); + address_weight_map_.emplace(*key, weight.get()); return weight; } // -// WeightedRoundRobin::WrrEndpointList::WrrEndpoint::OobWatcher -// - -void WeightedRoundRobin::WrrEndpointList::WrrEndpoint::OobWatcher:: - OnBackendMetricReport(const BackendMetricData& backend_metric_data) { - double utilization = backend_metric_data.application_utilization; - if (utilization <= 0) { - utilization = backend_metric_data.cpu_utilization; - } - weight_->MaybeUpdateWeight(backend_metric_data.qps, backend_metric_data.eps, - utilization, error_utilization_penalty_); -} - -// -// WeightedRoundRobin::WrrEndpointList::WrrEndpoint -// - -RefCountedPtr -WeightedRoundRobin::WrrEndpointList::WrrEndpoint::CreateSubchannel( - ServerAddress address, const ChannelArgs& args) { - auto* wrr = policy(); - auto subchannel = - wrr->channel_control_helper()->CreateSubchannel(std::move(address), args); - // Start OOB watch if configured. - if (wrr->config_->enable_oob_load_report()) { - subchannel->AddDataWatcher(MakeOobBackendMetricWatcher( - wrr->config_->oob_reporting_period(), - std::make_unique( - weight_, wrr->config_->error_utilization_penalty()))); - } - return subchannel; -} - -void WeightedRoundRobin::WrrEndpointList::WrrEndpoint::OnStateUpdate( - absl::optional old_state, - grpc_connectivity_state new_state, const absl::Status& status) { - auto* wrr_endpoint_list = endpoint_list(); - auto* wrr = policy(); - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { - gpr_log(GPR_INFO, - "[WRR %p] connectivity changed for child %p, endpoint_list %p " - "(index %" PRIuPTR " of %" PRIuPTR - "): prev_state=%s new_state=%s (%s)", - wrr, this, wrr_endpoint_list, Index(), wrr_endpoint_list->size(), - (old_state.has_value() ? ConnectivityStateName(*old_state) : "N/A"), - ConnectivityStateName(new_state), status.ToString().c_str()); - } - if (new_state == GRPC_CHANNEL_IDLE) { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { - gpr_log(GPR_INFO, - "[WRR %p] child %p reported IDLE; requesting connection", wrr, - this); - } - ExitIdleLocked(); - } else if (new_state == GRPC_CHANNEL_READY) { - // If we transition back to READY state, restart the blackout period. - // Skip this if this is the initial notification for this - // subchannel (which happens whenever we get updated addresses and - // create a new endpoint list). Also skip it if the previous state - // was READY (which should never happen in practice, but we've seen - // at least one bug that caused this in the outlier_detection - // policy, so let's be defensive here). - // - // Note that we cannot guarantee that we will never receive - // lingering callbacks for backend metric reports from the previous - // connection after the new connection has been established, but they - // should be masked by new backend metric reports from the new - // connection by the time the blackout period ends. - if (old_state.has_value() && old_state != GRPC_CHANNEL_READY) { - weight_->ResetNonEmptySince(); - } - } - // If state changed, update state counters. - if (!old_state.has_value() || *old_state != new_state) { - wrr_endpoint_list->UpdateStateCountersLocked(old_state, new_state); - } - // Update the policy state. - wrr_endpoint_list->MaybeUpdateAggregatedConnectivityStateLocked(status); -} - -// -// WeightedRoundRobin::WrrEndpointList +// WeightedRoundRobin::WeightedRoundRobinSubchannelList // -void WeightedRoundRobin::WrrEndpointList::UpdateStateCountersLocked( - absl::optional old_state, - grpc_connectivity_state new_state) { - // We treat IDLE the same as CONNECTING, since it will immediately - // transition into that state anyway. +void WeightedRoundRobin::WeightedRoundRobinSubchannelList:: + UpdateStateCountersLocked(absl::optional old_state, + grpc_connectivity_state new_state) { if (old_state.has_value()) { GPR_ASSERT(*old_state != GRPC_CHANNEL_SHUTDOWN); if (*old_state == GRPC_CHANNEL_READY) { GPR_ASSERT(num_ready_ > 0); --num_ready_; - } else if (*old_state == GRPC_CHANNEL_CONNECTING || - *old_state == GRPC_CHANNEL_IDLE) { + } else if (*old_state == GRPC_CHANNEL_CONNECTING) { GPR_ASSERT(num_connecting_ > 0); --num_connecting_; } else if (*old_state == GRPC_CHANNEL_TRANSIENT_FAILURE) { @@ -843,79 +769,217 @@ void WeightedRoundRobin::WrrEndpointList::UpdateStateCountersLocked( GPR_ASSERT(new_state != GRPC_CHANNEL_SHUTDOWN); if (new_state == GRPC_CHANNEL_READY) { ++num_ready_; - } else if (new_state == GRPC_CHANNEL_CONNECTING || - new_state == GRPC_CHANNEL_IDLE) { + } else if (new_state == GRPC_CHANNEL_CONNECTING) { ++num_connecting_; } else if (new_state == GRPC_CHANNEL_TRANSIENT_FAILURE) { ++num_transient_failure_; } } -void WeightedRoundRobin::WrrEndpointList:: +void WeightedRoundRobin::WeightedRoundRobinSubchannelList:: MaybeUpdateAggregatedConnectivityStateLocked(absl::Status status_for_tf) { - auto* wrr = policy(); - // If this is latest_pending_endpoint_list_, then swap it into - // endpoint_list_ in the following cases: - // - endpoint_list_ has no READY children. - // - This list has at least one READY child and we have seen the - // initial connectivity state notification for all children. - // - All of the children in this list are in TRANSIENT_FAILURE. + WeightedRoundRobin* p = static_cast(policy()); + // If this is latest_pending_subchannel_list_, then swap it into + // subchannel_list_ in the following cases: + // - subchannel_list_ has no READY subchannels. + // - This list has at least one READY subchannel and we have seen the + // initial connectivity state notification for all subchannels. + // - All of the subchannels in this list are in TRANSIENT_FAILURE. // (This may cause the channel to go from READY to TRANSIENT_FAILURE, // but we're doing what the control plane told us to do.) - if (wrr->latest_pending_endpoint_list_.get() == this && - (wrr->endpoint_list_->num_ready_ == 0 || - (num_ready_ > 0 && AllEndpointsSeenInitialState()) || - num_transient_failure_ == size())) { + if (p->latest_pending_subchannel_list_.get() == this && + (p->subchannel_list_->num_ready_ == 0 || + (num_ready_ > 0 && AllSubchannelsSeenInitialState()) || + num_transient_failure_ == num_subchannels())) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { const std::string old_counters_string = - wrr->endpoint_list_ != nullptr ? wrr->endpoint_list_->CountersString() + p->subchannel_list_ != nullptr ? p->subchannel_list_->CountersString() : ""; - gpr_log(GPR_INFO, - "[WRR %p] swapping out endpoint list %p (%s) in favor of %p (%s)", - wrr, wrr->endpoint_list_.get(), old_counters_string.c_str(), this, - CountersString().c_str()); + gpr_log( + GPR_INFO, + "[WRR %p] swapping out subchannel list %p (%s) in favor of %p (%s)", + p, p->subchannel_list_.get(), old_counters_string.c_str(), this, + CountersString().c_str()); } - wrr->endpoint_list_ = std::move(wrr->latest_pending_endpoint_list_); + p->subchannel_list_ = std::move(p->latest_pending_subchannel_list_); } - // Only set connectivity state if this is the current endpoint list. - if (wrr->endpoint_list_.get() != this) return; + // Only set connectivity state if this is the current subchannel list. + if (p->subchannel_list_.get() != this) return; // First matching rule wins: - // 1) ANY child is READY => policy is READY. - // 2) ANY child is CONNECTING => policy is CONNECTING. - // 3) ALL children are TRANSIENT_FAILURE => policy is TRANSIENT_FAILURE. + // 1) ANY subchannel is READY => policy is READY. + // 2) ANY subchannel is CONNECTING => policy is CONNECTING. + // 3) ALL subchannels are TRANSIENT_FAILURE => policy is TRANSIENT_FAILURE. if (num_ready_ > 0) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { - gpr_log(GPR_INFO, "[WRR %p] reporting READY with endpoint list %p", wrr, + gpr_log(GPR_INFO, "[WRR %p] reporting READY with subchannel list %p", p, this); } - wrr->channel_control_helper()->UpdateState( + p->channel_control_helper()->UpdateState( GRPC_CHANNEL_READY, absl::Status(), - MakeRefCounted(wrr->Ref(), this)); + MakeRefCounted(p->Ref(), this)); } else if (num_connecting_ > 0) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { - gpr_log(GPR_INFO, "[WRR %p] reporting CONNECTING with endpoint list %p", - wrr, this); + gpr_log(GPR_INFO, "[WRR %p] reporting CONNECTING with subchannel list %p", + p, this); } - wrr->channel_control_helper()->UpdateState( + p->channel_control_helper()->UpdateState( GRPC_CHANNEL_CONNECTING, absl::Status(), - MakeRefCounted(nullptr)); - } else if (num_transient_failure_ == size()) { + MakeRefCounted(p->Ref(DEBUG_LOCATION, "QueuePicker"))); + } else if (num_transient_failure_ == num_subchannels()) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { - gpr_log(GPR_INFO, - "[WRR %p] reporting TRANSIENT_FAILURE with endpoint list %p: %s", - wrr, this, status_for_tf.ToString().c_str()); + gpr_log( + GPR_INFO, + "[WRR %p] reporting TRANSIENT_FAILURE with subchannel list %p: %s", p, + this, status_for_tf.ToString().c_str()); } if (!status_for_tf.ok()) { last_failure_ = absl::UnavailableError( absl::StrCat("connections to all backends failing; last error: ", status_for_tf.ToString())); } - wrr->channel_control_helper()->UpdateState( + p->channel_control_helper()->UpdateState( GRPC_CHANNEL_TRANSIENT_FAILURE, last_failure_, MakeRefCounted(last_failure_)); } } +// +// WeightedRoundRobin::WeightedRoundRobinSubchannelData::OobWatcher +// + +void WeightedRoundRobin::WeightedRoundRobinSubchannelData::OobWatcher:: + OnBackendMetricReport(const BackendMetricData& backend_metric_data) { + double utilization = backend_metric_data.application_utilization; + if (utilization <= 0) { + utilization = backend_metric_data.cpu_utilization; + } + weight_->MaybeUpdateWeight(backend_metric_data.qps, backend_metric_data.eps, + utilization, error_utilization_penalty_); +} + +// +// WeightedRoundRobin::WeightedRoundRobinSubchannelData +// + +WeightedRoundRobin::WeightedRoundRobinSubchannelData:: + WeightedRoundRobinSubchannelData( + SubchannelList* subchannel_list, + const ServerAddress& address, RefCountedPtr sc) + : SubchannelData(subchannel_list, address, std::move(sc)), + weight_(static_cast(subchannel_list->policy()) + ->GetOrCreateWeight(address.address())) { + // Start OOB watch if configured. + WeightedRoundRobin* p = + static_cast(subchannel_list->policy()); + if (p->config_->enable_oob_load_report()) { + subchannel()->AddDataWatcher(MakeOobBackendMetricWatcher( + p->config_->oob_reporting_period(), + std::make_unique(weight_, + p->config_->error_utilization_penalty()))); + } +} + +void WeightedRoundRobin::WeightedRoundRobinSubchannelData:: + ProcessConnectivityChangeLocked( + absl::optional old_state, + grpc_connectivity_state new_state) { + WeightedRoundRobin* p = + static_cast(subchannel_list()->policy()); + GPR_ASSERT(subchannel() != nullptr); + // If this is not the initial state notification and the new state is + // TRANSIENT_FAILURE or IDLE, re-resolve. + // Note that we don't want to do this on the initial state notification, + // because that would result in an endless loop of re-resolution. + if (old_state.has_value() && (new_state == GRPC_CHANNEL_TRANSIENT_FAILURE || + new_state == GRPC_CHANNEL_IDLE)) { + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { + gpr_log(GPR_INFO, + "[WRR %p] Subchannel %p reported %s; requesting re-resolution", p, + subchannel(), ConnectivityStateName(new_state)); + } + p->channel_control_helper()->RequestReresolution(); + } + if (new_state == GRPC_CHANNEL_IDLE) { + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { + gpr_log(GPR_INFO, + "[WRR %p] Subchannel %p reported IDLE; requesting connection", p, + subchannel()); + } + subchannel()->RequestConnection(); + } else if (new_state == GRPC_CHANNEL_READY) { + // If we transition back to READY state, restart the blackout period. + // Skip this if this is the initial notification for this + // subchannel (which happens whenever we get updated addresses and + // create a new endpoint list). Also skip it if the previous state + // was READY (which should never happen in practice, but we've seen + // at least one bug that caused this in the outlier_detection + // policy, so let's be defensive here). + // + // Note that we cannot guarantee that we will never receive + // lingering callbacks for backend metric reports from the previous + // connection after the new connection has been established, but they + // should be masked by new backend metric reports from the new + // connection by the time the blackout period ends. + if (old_state.has_value() && old_state != GRPC_CHANNEL_READY) { + weight_->ResetNonEmptySince(); + } + } + // Update logical connectivity state. + UpdateLogicalConnectivityStateLocked(new_state); + // Update the policy state. + subchannel_list()->MaybeUpdateAggregatedConnectivityStateLocked( + connectivity_status()); +} + +void WeightedRoundRobin::WeightedRoundRobinSubchannelData:: + UpdateLogicalConnectivityStateLocked( + grpc_connectivity_state connectivity_state) { + WeightedRoundRobin* p = + static_cast(subchannel_list()->policy()); + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { + gpr_log( + GPR_INFO, + "[WRR %p] connectivity changed for subchannel %p, subchannel_list %p " + "(index %" PRIuPTR " of %" PRIuPTR "): prev_state=%s new_state=%s", + p, subchannel(), subchannel_list(), Index(), + subchannel_list()->num_subchannels(), + (logical_connectivity_state_.has_value() + ? ConnectivityStateName(*logical_connectivity_state_) + : "N/A"), + ConnectivityStateName(connectivity_state)); + } + // Decide what state to report for aggregation purposes. + // If the last logical state was TRANSIENT_FAILURE, then ignore the + // state change unless the new state is READY. + if (logical_connectivity_state_.has_value() && + *logical_connectivity_state_ == GRPC_CHANNEL_TRANSIENT_FAILURE && + connectivity_state != GRPC_CHANNEL_READY) { + return; + } + // If the new state is IDLE, treat it as CONNECTING, since it will + // immediately transition into CONNECTING anyway. + if (connectivity_state == GRPC_CHANNEL_IDLE) { + if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_wrr_trace)) { + gpr_log(GPR_INFO, + "[WRR %p] subchannel %p, subchannel_list %p (index %" PRIuPTR + " of %" PRIuPTR "): treating IDLE as CONNECTING", + p, subchannel(), subchannel_list(), Index(), + subchannel_list()->num_subchannels()); + } + connectivity_state = GRPC_CHANNEL_CONNECTING; + } + // If no change, return false. + if (logical_connectivity_state_.has_value() && + *logical_connectivity_state_ == connectivity_state) { + return; + } + // Otherwise, update counters and logical state. + subchannel_list()->UpdateStateCountersLocked(logical_connectivity_state_, + connectivity_state); + logical_connectivity_state_ = connectivity_state; +} + // // factory // diff --git a/src/core/ext/filters/client_channel/lb_policy/xds/xds_override_host.cc b/src/core/ext/filters/client_channel/lb_policy/xds/xds_override_host.cc index 02949cac379..c5445c3dcf2 100644 --- a/src/core/ext/filters/client_channel/lb_policy/xds/xds_override_host.cc +++ b/src/core/ext/filters/client_channel/lb_policy/xds/xds_override_host.cc @@ -18,7 +18,6 @@ #include "src/core/ext/filters/client_channel/lb_policy/xds/xds_override_host.h" -#include #include #include @@ -431,10 +430,7 @@ void XdsOverrideHostLb::ResetBackoffLocked() { absl::Status XdsOverrideHostLb::UpdateLocked(UpdateArgs args) { if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_xds_override_host_trace)) { - gpr_log(GPR_INFO, - "[xds_override_host_lb %p] Received update with %" PRIuPTR - " addresses", - this, args.addresses.ok() ? args.addresses->size() : 0); + gpr_log(GPR_INFO, "[xds_override_host_lb %p] Received update", this); } auto old_config = std::move(config_); // Update config. @@ -502,10 +498,6 @@ OrphanablePtr XdsOverrideHostLb::CreateChildPolicyLocked( absl::StatusOr XdsOverrideHostLb::UpdateAddressMap( absl::StatusOr addresses) { if (!addresses.ok()) { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_xds_override_host_trace)) { - gpr_log(GPR_INFO, "[xds_override_host_lb %p] address error: %s", this, - addresses.status().ToString().c_str()); - } return addresses; } ServerAddressList return_value; @@ -513,30 +505,13 @@ absl::StatusOr XdsOverrideHostLb::UpdateAddressMap( for (const auto& address : *addresses) { XdsHealthStatus status = GetAddressHealthStatus(address); if (status.status() != XdsHealthStatus::kDraining) { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_xds_override_host_trace)) { - gpr_log(GPR_INFO, - "[xds_override_host_lb %p] address %s: not draining, " - "passing to child", - this, address.ToString().c_str()); - } return_value.push_back(address); } else if (!config_->override_host_status_set().Contains(status)) { // Skip draining hosts if not in the override status set. - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_xds_override_host_trace)) { - gpr_log(GPR_INFO, - "[xds_override_host_lb %p] address %s: draining but not in " - "override_host_status set -- ignoring", - this, address.ToString().c_str()); - } continue; } auto key = grpc_sockaddr_to_uri(&address.address()); if (key.ok()) { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_xds_override_host_trace)) { - gpr_log(GPR_INFO, - "[xds_override_host_lb %p] address %s: adding map key %s", this, - address.ToString().c_str(), key->c_str()); - } addresses_for_map.emplace(std::move(*key), status); } } @@ -544,10 +519,6 @@ absl::StatusOr XdsOverrideHostLb::UpdateAddressMap( MutexLock lock(&subchannel_map_mu_); for (auto it = subchannel_map_.begin(); it != subchannel_map_.end();) { if (addresses_for_map.find(it->first) == addresses_for_map.end()) { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_xds_override_host_trace)) { - gpr_log(GPR_INFO, "[xds_override_host_lb %p] removing map key %s", - this, it->first.c_str()); - } it = subchannel_map_.erase(it); } else { ++it; @@ -556,20 +527,10 @@ absl::StatusOr XdsOverrideHostLb::UpdateAddressMap( for (const auto& key_status : addresses_for_map) { auto it = subchannel_map_.find(key_status.first); if (it == subchannel_map_.end()) { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_xds_override_host_trace)) { - gpr_log(GPR_INFO, "[xds_override_host_lb %p] adding map key %s", this, - key_status.first.c_str()); - } subchannel_map_.emplace(std::piecewise_construct, std::forward_as_tuple(key_status.first), std::forward_as_tuple(key_status.second)); } else { - if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_xds_override_host_trace)) { - gpr_log(GPR_INFO, - "[xds_override_host_lb %p] setting EDS health status for " - "%s to %s", - this, key_status.first.c_str(), key_status.second.ToString()); - } it->second.SetEdsHealthStatus(key_status.second); } } diff --git a/src/core/ext/filters/client_channel/subchannel.h b/src/core/ext/filters/client_channel/subchannel.h index f9f88194151..033c711bb6c 100644 --- a/src/core/ext/filters/client_channel/subchannel.h +++ b/src/core/ext/filters/client_channel/subchannel.h @@ -212,8 +212,6 @@ class Subchannel : public DualRefCounted { channelz::SubchannelNode* channelz_node(); - const grpc_resolved_address& address() const { return key_.address(); } - // Starts watching the subchannel's connectivity state. // The first callback to the watcher will be delivered ~immediately. // Subsequent callbacks will be delivered as the subchannel's state diff --git a/src/core/lib/resolver/server_address.h b/src/core/lib/resolver/server_address.h index e5c62638eca..d42a70f9d74 100644 --- a/src/core/lib/resolver/server_address.h +++ b/src/core/lib/resolver/server_address.h @@ -58,7 +58,6 @@ class ServerAddress { ServerAddress& operator=(ServerAddress&& other) noexcept; bool operator==(const ServerAddress& other) const { return Cmp(other) == 0; } - bool operator<(const ServerAddress& other) const { return Cmp(other) < 0; } int Cmp(const ServerAddress& other) const; diff --git a/src/python/grpcio/grpc_core_dependencies.py b/src/python/grpcio/grpc_core_dependencies.py index ad948cd7fcf..eadfa0e6e41 100644 --- a/src/python/grpcio/grpc_core_dependencies.py +++ b/src/python/grpcio/grpc_core_dependencies.py @@ -33,7 +33,6 @@ CORE_SOURCE_FILES = [ 'src/core/ext/filters/client_channel/http_proxy.cc', 'src/core/ext/filters/client_channel/lb_policy/address_filtering.cc', 'src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc', - 'src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc', 'src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc', 'src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.cc', diff --git a/test/core/client_channel/lb_policy/lb_policy_test_lib.h b/test/core/client_channel/lb_policy/lb_policy_test_lib.h index ebd5c90518d..70c4b1bbe34 100644 --- a/test/core/client_channel/lb_policy/lb_policy_test_lib.h +++ b/test/core/client_channel/lb_policy/lb_policy_test_lib.h @@ -57,10 +57,8 @@ #include "src/core/ext/filters/client_channel/client_channel_internal.h" #include "src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h" -#include "src/core/ext/filters/client_channel/lb_policy/health_check_client_internal.h" #include "src/core/ext/filters/client_channel/lb_policy/oob_backend_metric.h" #include "src/core/ext/filters/client_channel/lb_policy/oob_backend_metric_internal.h" -#include "src/core/ext/filters/client_channel/subchannel_interface_internal.h" #include "src/core/ext/filters/client_channel/subchannel_pool_interface.h" #include "src/core/lib/address_utils/parse_address.h" #include "src/core/lib/address_utils/sockaddr_utils.h" @@ -113,10 +111,7 @@ class LoadBalancingPolicyTest : public ::testing::Test { ~FakeSubchannel() override { if (orca_watcher_ != nullptr) { MutexLock lock(&state_->backend_metric_watcher_mu_); - state_->orca_watchers_.erase(orca_watcher_.get()); - } - for (const auto& p : watcher_map_) { - state_->state_tracker_.RemoveWatcher(p.second); + state_->watchers_.erase(orca_watcher_.get()); } } @@ -126,11 +121,6 @@ class LoadBalancingPolicyTest : public ::testing::Test { // Converts between // SubchannelInterface::ConnectivityStateWatcherInterface and // ConnectivityStateWatcherInterface. - // - // We support both unique_ptr<> and shared_ptr<>, since raw - // connectivity watches use the latter but health watches use the - // former. - // TODO(roth): Clean this up. class WatcherWrapper : public AsyncConnectivityStateWatcherInterface { public: WatcherWrapper( @@ -142,59 +132,33 @@ class LoadBalancingPolicyTest : public ::testing::Test { std::move(work_serializer)), watcher_(std::move(watcher)) {} - WatcherWrapper( - std::shared_ptr work_serializer, - std::shared_ptr< - SubchannelInterface::ConnectivityStateWatcherInterface> - watcher) - : AsyncConnectivityStateWatcherInterface( - std::move(work_serializer)), - watcher_(std::move(watcher)) {} - void OnConnectivityStateChange(grpc_connectivity_state new_state, const absl::Status& status) override { - watcher()->OnConnectivityStateChange(new_state, status); + watcher_->OnConnectivityStateChange(new_state, status); } private: - SubchannelInterface::ConnectivityStateWatcherInterface* watcher() - const { - return Match( - watcher_, - [](const std::unique_ptr< - SubchannelInterface::ConnectivityStateWatcherInterface>& - watcher) { return watcher.get(); }, - [](const std::shared_ptr< - SubchannelInterface::ConnectivityStateWatcherInterface>& - watcher) { return watcher.get(); }); - } - - absl::variant< - std::unique_ptr< - SubchannelInterface::ConnectivityStateWatcherInterface>, - std::shared_ptr< - SubchannelInterface::ConnectivityStateWatcherInterface>> + std::unique_ptr watcher_; }; void WatchConnectivityState( std::unique_ptr< SubchannelInterface::ConnectivityStateWatcherInterface> - watcher) override - ABSL_EXCLUSIVE_LOCKS_REQUIRED(*state_->work_serializer_) { - auto* watcher_ptr = watcher.get(); + watcher) override { auto watcher_wrapper = MakeOrphanable( work_serializer_, std::move(watcher)); - watcher_map_[watcher_ptr] = watcher_wrapper.get(); + watcher_map_[watcher.get()] = watcher_wrapper.get(); + MutexLock lock(&state_->mu_); state_->state_tracker_.AddWatcher(GRPC_CHANNEL_SHUTDOWN, std::move(watcher_wrapper)); } void CancelConnectivityStateWatch( - ConnectivityStateWatcherInterface* watcher) override - ABSL_EXCLUSIVE_LOCKS_REQUIRED(*state_->work_serializer_) { + ConnectivityStateWatcherInterface* watcher) override { auto it = watcher_map_.find(watcher); if (it == watcher_map_.end()) return; + MutexLock lock(&state_->mu_); state_->state_tracker_.RemoveWatcher(it->second); watcher_map_.erase(it); } @@ -204,56 +168,19 @@ class LoadBalancingPolicyTest : public ::testing::Test { state_->requested_connection_ = true; } - void AddDataWatcher(std::unique_ptr watcher) - override ABSL_EXCLUSIVE_LOCKS_REQUIRED(*state_->work_serializer_) { + void AddDataWatcher( + std::unique_ptr watcher) override { MutexLock lock(&state_->backend_metric_watcher_mu_); - auto* w = - static_cast(watcher.get()); - if (w->type() == OrcaProducer::Type()) { - GPR_ASSERT(orca_watcher_ == nullptr); - orca_watcher_.reset(static_cast(watcher.release())); - state_->orca_watchers_.insert(orca_watcher_.get()); - } else if (w->type() == HealthProducer::Type()) { - // TODO(roth): Support health checking in test framework. - // For now, we just hard-code this to the raw connectivity state. - GPR_ASSERT(health_watcher_ == nullptr); - GPR_ASSERT(health_watcher_wrapper_ == nullptr); - health_watcher_.reset(static_cast(watcher.release())); - auto connectivity_watcher = health_watcher_->TakeWatcher(); - auto* connectivity_watcher_ptr = connectivity_watcher.get(); - auto watcher_wrapper = MakeOrphanable( - work_serializer_, std::move(connectivity_watcher)); - health_watcher_wrapper_ = watcher_wrapper.get(); - state_->state_tracker_.AddWatcher(GRPC_CHANNEL_SHUTDOWN, - std::move(watcher_wrapper)); - gpr_log(GPR_INFO, - "AddDataWatcher(): added HealthWatch=%p " - "connectivity_watcher=%p watcher_wrapper=%p", - health_watcher_.get(), connectivity_watcher_ptr, - health_watcher_wrapper_); - } + GPR_ASSERT(orca_watcher_ == nullptr); + orca_watcher_.reset(static_cast(watcher.release())); + state_->watchers_.insert(orca_watcher_.get()); } - void CancelDataWatcher(DataWatcherInterface* watcher) override - ABSL_EXCLUSIVE_LOCKS_REQUIRED(*state_->work_serializer_) { + void CancelDataWatcher(DataWatcherInterface* watcher) override { MutexLock lock(&state_->backend_metric_watcher_mu_); - auto* w = static_cast(watcher); - if (w->type() == OrcaProducer::Type()) { - if (orca_watcher_.get() != static_cast(watcher)) return; - state_->orca_watchers_.erase(orca_watcher_.get()); - orca_watcher_.reset(); - } else if (w->type() == HealthProducer::Type()) { - if (health_watcher_.get() != static_cast(watcher)) { - return; - } - gpr_log(GPR_INFO, - "CancelDataWatcher(): cancelling HealthWatch=%p " - "watcher_wrapper=%p", - health_watcher_.get(), health_watcher_wrapper_); - state_->state_tracker_.RemoveWatcher(health_watcher_wrapper_); - health_watcher_wrapper_ = nullptr; - health_watcher_.reset(); - } + if (orca_watcher_.get() != static_cast(watcher)) return; + state_->watchers_.erase(orca_watcher_.get()); + orca_watcher_.reset(); } // Don't need this method, so it's a no-op. @@ -264,16 +191,11 @@ class LoadBalancingPolicyTest : public ::testing::Test { std::map watcher_map_; - std::unique_ptr health_watcher_; - WatcherWrapper* health_watcher_wrapper_ = nullptr; std::unique_ptr orca_watcher_; }; - SubchannelState(absl::string_view address, - std::shared_ptr work_serializer) - : address_(address), - work_serializer_(std::move(work_serializer)), - state_tracker_("LoadBalancingPolicyTest") {} + explicit SubchannelState(absl::string_view address) + : address_(address), state_tracker_("LoadBalancingPolicyTest") {} const std::string& address() const { return address_; } @@ -328,14 +250,10 @@ class LoadBalancingPolicyTest : public ::testing::Test { << "bug in test: " << ConnectivityStateName(state) << " must have OK status: " << status; } - work_serializer_->Run( - [this, state, status, location]() - ABSL_EXCLUSIVE_LOCKS_REQUIRED(*work_serializer_) { - AssertValidConnectivityStateTransition(state_tracker_.state(), - state, location); - state_tracker_.SetState(state, status, "set from test"); - }, - DEBUG_LOCATION); + MutexLock lock(&mu_); + AssertValidConnectivityStateTransition(state_tracker_.state(), state, + location); + state_tracker_.SetState(state, status, "set from test"); } // Indicates if any of the associated SubchannelInterface objects @@ -355,7 +273,7 @@ class LoadBalancingPolicyTest : public ::testing::Test { // Sends an OOB backend metric report to all watchers. void SendOobBackendMetricReport(const BackendMetricData& backend_metrics) { MutexLock lock(&backend_metric_watcher_mu_); - for (const auto* watcher : orca_watchers_) { + for (const auto* watcher : watchers_) { watcher->watcher()->OnBackendMetricReport(backend_metrics); } } @@ -364,7 +282,7 @@ class LoadBalancingPolicyTest : public ::testing::Test { void CheckOobReportingPeriod(Duration expected, SourceLocation location = SourceLocation()) { MutexLock lock(&backend_metric_watcher_mu_); - for (const auto* watcher : orca_watchers_) { + for (const auto* watcher : watchers_) { EXPECT_EQ(watcher->report_interval(), expected) << location.file() << ":" << location.line(); } @@ -372,15 +290,16 @@ class LoadBalancingPolicyTest : public ::testing::Test { private: const std::string address_; - std::shared_ptr work_serializer_; - ConnectivityStateTracker state_tracker_ ABSL_GUARDED_BY(*work_serializer_); + + Mutex mu_; + ConnectivityStateTracker state_tracker_ ABSL_GUARDED_BY(&mu_); Mutex requested_connection_mu_; bool requested_connection_ ABSL_GUARDED_BY(&requested_connection_mu_) = false; Mutex backend_metric_watcher_mu_; - std::set orca_watchers_ + std::set watchers_ ABSL_GUARDED_BY(&backend_metric_watcher_mu_); }; @@ -498,8 +417,7 @@ class LoadBalancingPolicyTest : public ::testing::Test { GPR_ASSERT(address_uri.ok()); it = test_->subchannel_pool_ .emplace(std::piecewise_construct, std::forward_as_tuple(key), - std::forward_as_tuple(std::move(*address_uri), - work_serializer_)) + std::forward_as_tuple(std::move(*address_uri))) .first; } return it->second.CreateSubchannel(work_serializer_); @@ -1009,6 +927,7 @@ class LoadBalancingPolicyTest : public ::testing::Test { // Expect startup with RR with a set of addresses. RefCountedPtr ExpectRoundRobinStartup( absl::Span addresses) { + ExpectConnectingUpdate(); RefCountedPtr picker; for (size_t i = 0; i < addresses.size(); ++i) { auto* subchannel = FindSubchannel(addresses[i]); @@ -1016,7 +935,6 @@ class LoadBalancingPolicyTest : public ::testing::Test { if (subchannel == nullptr) return nullptr; EXPECT_TRUE(subchannel->ConnectionRequested()); subchannel->SetConnectivityState(GRPC_CHANNEL_CONNECTING); - if (i == 0) ExpectConnectingUpdate(); subchannel->SetConnectivityState(GRPC_CHANNEL_READY); if (i == 0) { picker = WaitForConnected(); @@ -1086,7 +1004,7 @@ class LoadBalancingPolicyTest : public ::testing::Test { SubchannelKey key(MakeAddress(address), args); auto it = subchannel_pool_ .emplace(std::piecewise_construct, std::forward_as_tuple(key), - std::forward_as_tuple(address, work_serializer_)) + std::forward_as_tuple(address)) .first; return &it->second; } diff --git a/test/core/client_channel/lb_policy/outlier_detection_test.cc b/test/core/client_channel/lb_policy/outlier_detection_test.cc index 597bfabb8e6..907db1b4d19 100644 --- a/test/core/client_channel/lb_policy/outlier_detection_test.cc +++ b/test/core/client_channel/lb_policy/outlier_detection_test.cc @@ -37,6 +37,7 @@ #include #include "src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h" +#include "src/core/lib/channel/channel_args.h" #include "src/core/lib/gprpp/orphanable.h" #include "src/core/lib/gprpp/ref_counted_ptr.h" #include "src/core/lib/gprpp/time.h" @@ -183,6 +184,8 @@ TEST_F(OutlierDetectionTest, Basic) { absl::Status status = ApplyUpdate( BuildUpdate({kAddressUri}, ConfigBuilder().Build()), lb_policy_.get()); EXPECT_TRUE(status.ok()) << status; + // LB policy should have reported CONNECTING state. + ExpectConnectingUpdate(); // LB policy should have created a subchannel for the address. auto* subchannel = FindSubchannel(kAddressUri); ASSERT_NE(subchannel, nullptr); @@ -191,8 +194,6 @@ TEST_F(OutlierDetectionTest, Basic) { EXPECT_TRUE(subchannel->ConnectionRequested()); // This causes the subchannel to start to connect, so it reports CONNECTING. subchannel->SetConnectivityState(GRPC_CHANNEL_CONNECTING); - // LB policy should have reported CONNECTING state. - ExpectConnectingUpdate(); // When the subchannel becomes connected, it reports READY. subchannel->SetConnectivityState(GRPC_CHANNEL_READY); // The LB policy will report CONNECTING some number of times (doesn't @@ -229,6 +230,8 @@ TEST_F(OutlierDetectionTest, FailurePercentage) { time_cache_.IncrementBy(Duration::Seconds(10)); RunTimerCallback(); gpr_log(GPR_INFO, "### ejection complete"); + // Expect a re-resolution request. + ExpectReresolutionRequest(); // Expect a picker update. std::vector remaining_addresses; for (const auto& addr : kAddresses) { @@ -251,8 +254,10 @@ TEST_F(OutlierDetectionTest, DoesNotWorkWithPickFirst) { .Build()), lb_policy_.get()); EXPECT_TRUE(status.ok()) << status; - // LB policy should have created a subchannel for the first address. - auto* subchannel = FindSubchannel(kAddresses[0]); + // LB policy should have created a subchannel for the first address with + // the GRPC_ARG_INHIBIT_HEALTH_CHECKING channel arg. + auto* subchannel = FindSubchannel( + kAddresses[0], ChannelArgs().Set(GRPC_ARG_INHIBIT_HEALTH_CHECKING, true)); ASSERT_NE(subchannel, nullptr); // When the LB policy receives the subchannel's initial connectivity // state notification (IDLE), it will request a connection. diff --git a/test/core/client_channel/lb_policy/pick_first_test.cc b/test/core/client_channel/lb_policy/pick_first_test.cc index e0571db43c7..46d833d99c2 100644 --- a/test/core/client_channel/lb_policy/pick_first_test.cc +++ b/test/core/client_channel/lb_policy/pick_first_test.cc @@ -33,6 +33,7 @@ #include #include +#include "src/core/lib/channel/channel_args.h" #include "src/core/lib/gprpp/debug_location.h" #include "src/core/lib/gprpp/orphanable.h" #include "src/core/lib/gprpp/ref_counted_ptr.h" @@ -73,7 +74,8 @@ class PickFirstTest : public LoadBalancingPolicyTest { // We will remove entries as each subchannel starts to connect. std::map subchannels; for (auto address : addresses) { - auto* subchannel = FindSubchannel(address); + auto* subchannel = FindSubchannel( + address, ChannelArgs().Set(GRPC_ARG_INHIBIT_HEALTH_CHECKING, true)); ASSERT_NE(subchannel, nullptr); subchannels.emplace(subchannel, address); } @@ -134,10 +136,13 @@ TEST_F(PickFirstTest, FirstAddressWorks) { absl::Status status = ApplyUpdate( BuildUpdate(kAddresses, MakePickFirstConfig(false)), lb_policy_.get()); EXPECT_TRUE(status.ok()) << status; - // LB policy should have created a subchannel for both addresses. - auto* subchannel = FindSubchannel(kAddresses[0]); + // LB policy should have created a subchannel for both addresses with + // the GRPC_ARG_INHIBIT_HEALTH_CHECKING channel arg. + auto* subchannel = FindSubchannel( + kAddresses[0], ChannelArgs().Set(GRPC_ARG_INHIBIT_HEALTH_CHECKING, true)); ASSERT_NE(subchannel, nullptr); - auto* subchannel2 = FindSubchannel(kAddresses[1]); + auto* subchannel2 = FindSubchannel( + kAddresses[1], ChannelArgs().Set(GRPC_ARG_INHIBIT_HEALTH_CHECKING, true)); ASSERT_NE(subchannel2, nullptr); // When the LB policy receives the first subchannel's initial connectivity // state notification (IDLE), it will request a connection. @@ -167,10 +172,13 @@ TEST_F(PickFirstTest, FirstAddressFails) { absl::Status status = ApplyUpdate( BuildUpdate(kAddresses, MakePickFirstConfig(false)), lb_policy_.get()); EXPECT_TRUE(status.ok()) << status; - // LB policy should have created a subchannel for both addresses. - auto* subchannel = FindSubchannel(kAddresses[0]); + // LB policy should have created a subchannel for both addresses with + // the GRPC_ARG_INHIBIT_HEALTH_CHECKING channel arg. + auto* subchannel = FindSubchannel( + kAddresses[0], ChannelArgs().Set(GRPC_ARG_INHIBIT_HEALTH_CHECKING, true)); ASSERT_NE(subchannel, nullptr); - auto* subchannel2 = FindSubchannel(kAddresses[1]); + auto* subchannel2 = FindSubchannel( + kAddresses[1], ChannelArgs().Set(GRPC_ARG_INHIBIT_HEALTH_CHECKING, true)); ASSERT_NE(subchannel2, nullptr); // When the LB policy receives the first subchannel's initial connectivity // state notification (IDLE), it will request a connection. @@ -209,10 +217,13 @@ TEST_F(PickFirstTest, GoesIdleWhenConnectionFailsThenCanReconnect) { absl::Status status = ApplyUpdate( BuildUpdate(kAddresses, MakePickFirstConfig(false)), lb_policy_.get()); EXPECT_TRUE(status.ok()) << status; - // LB policy should have created a subchannel for both addresses. - auto* subchannel = FindSubchannel(kAddresses[0]); + // LB policy should have created a subchannel for both addresses with + // the GRPC_ARG_INHIBIT_HEALTH_CHECKING channel arg. + auto* subchannel = FindSubchannel( + kAddresses[0], ChannelArgs().Set(GRPC_ARG_INHIBIT_HEALTH_CHECKING, true)); ASSERT_NE(subchannel, nullptr); - auto* subchannel2 = FindSubchannel(kAddresses[1]); + auto* subchannel2 = FindSubchannel( + kAddresses[1], ChannelArgs().Set(GRPC_ARG_INHIBIT_HEALTH_CHECKING, true)); ASSERT_NE(subchannel2, nullptr); // When the LB policy receives the first subchannel's initial connectivity // state notification (IDLE), it will request a connection. diff --git a/test/core/client_channel/lb_policy/round_robin_test.cc b/test/core/client_channel/lb_policy/round_robin_test.cc index 092242e66f3..ef82ceadebf 100644 --- a/test/core/client_channel/lb_policy/round_robin_test.cc +++ b/test/core/client_channel/lb_policy/round_robin_test.cc @@ -42,6 +42,8 @@ class RoundRobinTest : public LoadBalancingPolicyTest { void ExpectStartup(absl::Span addresses) { EXPECT_EQ(ApplyUpdate(BuildUpdate(addresses, nullptr), lb_policy_.get()), absl::OkStatus()); + // Expect the initial CONNECTNG update with a picker that queues. + ExpectConnectingUpdate(); // RR should have created a subchannel for each address. for (size_t i = 0; i < addresses.size(); ++i) { auto* subchannel = FindSubchannel(addresses[i]); @@ -50,8 +52,6 @@ class RoundRobinTest : public LoadBalancingPolicyTest { EXPECT_TRUE(subchannel->ConnectionRequested()); // The subchannel will connect successfully. subchannel->SetConnectivityState(GRPC_CHANNEL_CONNECTING); - // Expect the initial CONNECTNG update with a picker that queues. - if (i == 0) ExpectConnectingUpdate(); subchannel->SetConnectivityState(GRPC_CHANNEL_READY); // As each subchannel becomes READY, we should get a new picker that // includes the behavior. Note that there may be any number of diff --git a/test/core/client_channel/lb_policy/weighted_round_robin_test.cc b/test/core/client_channel/lb_policy/weighted_round_robin_test.cc index 536bb779069..77020f7e92c 100644 --- a/test/core/client_channel/lb_policy/weighted_round_robin_test.cc +++ b/test/core/client_channel/lb_policy/weighted_round_robin_test.cc @@ -127,6 +127,8 @@ class WeightedRoundRobinTest : public TimeAwareLoadBalancingPolicyTest { EXPECT_EQ(ApplyUpdate(BuildUpdate(update_addresses, config_builder.Build()), lb_policy_.get()), absl::OkStatus()); + // Expect the initial CONNECTNG update with a picker that queues. + ExpectConnectingUpdate(location); // RR should have created a subchannel for each address. for (size_t i = 0; i < addresses.size(); ++i) { auto* subchannel = FindSubchannel(addresses[i]); @@ -140,8 +142,6 @@ class WeightedRoundRobinTest : public TimeAwareLoadBalancingPolicyTest { << location.line(); // The subchannel will connect successfully. subchannel->SetConnectivityState(GRPC_CHANNEL_CONNECTING); - // Expect the initial CONNECTNG update with a picker that queues. - if (i == 0) ExpectConnectingUpdate(location); subchannel->SetConnectivityState(GRPC_CHANNEL_READY); } return WaitForConnected(location); diff --git a/test/core/client_channel/lb_policy/xds_override_host_test.cc b/test/core/client_channel/lb_policy/xds_override_host_test.cc index 1395542026e..26cfc7a60d4 100644 --- a/test/core/client_channel/lb_policy/xds_override_host_test.cc +++ b/test/core/client_channel/lb_policy/xds_override_host_test.cc @@ -29,7 +29,6 @@ #include #include -#include #include "src/core/ext/filters/stateful_session/stateful_session_filter.h" #include "src/core/ext/xds/xds_health_status.h" @@ -210,31 +209,18 @@ TEST_F(XdsOverrideHostTest, FailedSubchannelIsNotPicked) { EXPECT_EQ(ExpectPickComplete(picker.get(), MakeOverrideHostAttribute(kAddresses[1])), kAddresses[1]); - // Subchannel for address 1 becomes disconnected. - gpr_log(GPR_INFO, "### subchannel 1 reporting IDLE"); auto subchannel = FindSubchannel(kAddresses[1]); ASSERT_NE(subchannel, nullptr); subchannel->SetConnectivityState(GRPC_CHANNEL_IDLE); - gpr_log(GPR_INFO, "### expecting re-resolution request"); ExpectReresolutionRequest(); - gpr_log(GPR_INFO, - "### expecting RR picks to exclude the disconnected subchannel"); ExpectRoundRobinPicks(ExpectState(GRPC_CHANNEL_READY).get(), {kAddresses[0], kAddresses[2]}); - // It starts trying to reconnect... - gpr_log(GPR_INFO, "### subchannel 1 reporting CONNECTING"); subchannel->SetConnectivityState(GRPC_CHANNEL_CONNECTING); - gpr_log(GPR_INFO, "### expecting RR picks again"); ExpectRoundRobinPicks(ExpectState(GRPC_CHANNEL_READY).get(), {kAddresses[0], kAddresses[2]}); - // ...but the connection attempt fails. - gpr_log(GPR_INFO, "### subchannel 1 reporting TRANSIENT_FAILURE"); subchannel->SetConnectivityState(GRPC_CHANNEL_TRANSIENT_FAILURE, absl::ResourceExhaustedError("Hmmmm")); - gpr_log(GPR_INFO, "### expecting re-resolution request"); ExpectReresolutionRequest(); - // The host override is not used. - gpr_log(GPR_INFO, "### checking that host override is not used"); picker = ExpectState(GRPC_CHANNEL_READY); ExpectRoundRobinPicks(picker.get(), {kAddresses[0], kAddresses[2]}, MakeOverrideHostAttribute(kAddresses[1])); @@ -306,12 +292,6 @@ TEST_F(XdsOverrideHostTest, DrainingSubchannelIsConnecting) { EXPECT_EQ(ExpectPickComplete(picker.get(), MakeOverrideHostAttribute(kAddresses[1])), kAddresses[1]); - // Send an update that marks the endpoints with different EDS health - // states, but those states are present in override_host_status. - // The picker should use the DRAINING host when a call's override - // points to that hose, but the host should not be used if there is no - // override pointing to it. - gpr_log(GPR_INFO, "### sending update with DRAINING host"); ApplyUpdateWithHealthStatuses( {{kAddresses[0], XdsHealthStatus::HealthStatus::kUnknown}, {kAddresses[1], XdsHealthStatus::HealthStatus::kDraining}, @@ -319,35 +299,23 @@ TEST_F(XdsOverrideHostTest, DrainingSubchannelIsConnecting) { {"UNKNOWN", "HEALTHY", "DRAINING"}); auto subchannel = FindSubchannel(kAddresses[1]); ASSERT_NE(subchannel, nullptr); + // There are two notifications - one from child policy and one from the parent + // policy due to draining channel update picker = ExpectState(GRPC_CHANNEL_READY); EXPECT_EQ(ExpectPickComplete(picker.get(), MakeOverrideHostAttribute(kAddresses[1])), kAddresses[1]); ExpectRoundRobinPicks(picker.get(), {kAddresses[0], kAddresses[2]}); - // Now the connection to the draining host gets dropped. - // The picker should queue picks where the override host is IDLE. - // All picks without an override host should not use this host. - gpr_log(GPR_INFO, "### closing connection to DRAINING host"); subchannel->SetConnectivityState(GRPC_CHANNEL_IDLE); picker = ExpectState(GRPC_CHANNEL_READY); ExpectPickQueued(picker.get(), MakeOverrideHostAttribute(kAddresses[1])); ExpectRoundRobinPicks(picker.get(), {kAddresses[0], kAddresses[2]}); - // The subchannel should have been asked to reconnect as a result of the - // queued pick above. It will therefore transition into state CONNECTING. - // The pick behavior is the same as above: The picker should queue - // picks where the override host is CONNECTING. All picks without an - // override host should not use this host. - gpr_log(GPR_INFO, "### subchannel starts reconnecting"); EXPECT_TRUE(subchannel->ConnectionRequested()); ExpectQueueEmpty(); subchannel->SetConnectivityState(GRPC_CHANNEL_CONNECTING); picker = ExpectState(GRPC_CHANNEL_READY); ExpectPickQueued(picker.get(), MakeOverrideHostAttribute(kAddresses[1])); ExpectRoundRobinPicks(picker.get(), {kAddresses[0], kAddresses[2]}); - // The subchannel now becomes connected again. - // Now picks with this override host can be completed again. - // Picks without an override host still don't use the draining host. - gpr_log(GPR_INFO, "### subchannel becomes reconnected"); subchannel->SetConnectivityState(GRPC_CHANNEL_READY); picker = ExpectState(GRPC_CHANNEL_READY); EXPECT_EQ(ExpectPickComplete(picker.get(), diff --git a/test/cpp/end2end/client_lb_end2end_test.cc b/test/cpp/end2end/client_lb_end2end_test.cc index d363d5cc640..63ee73045aa 100644 --- a/test/cpp/end2end/client_lb_end2end_test.cc +++ b/test/cpp/end2end/client_lb_end2end_test.cc @@ -2080,8 +2080,7 @@ TEST_F(RoundRobinTest, HealthChecking) { EXPECT_TRUE(WaitForChannelNotReady(channel.get())); CheckRpcSendFailure(DEBUG_LOCATION, stub, StatusCode::UNAVAILABLE, "connections to all backends failing; last error: " - "(ipv6:%5B::1%5D|ipv4:127.0.0.1):[0-9]+: " - "backend unhealthy"); + "UNAVAILABLE: backend unhealthy"); // Clean up. EnableDefaultHealthCheckService(false); } @@ -2139,8 +2138,7 @@ TEST_F(RoundRobinTest, WithHealthCheckingInhibitPerChannel) { EXPECT_FALSE(WaitForChannelReady(channel1.get(), 1)); CheckRpcSendFailure(DEBUG_LOCATION, stub1, StatusCode::UNAVAILABLE, "connections to all backends failing; last error: " - "(ipv6:%5B::1%5D|ipv4:127.0.0.1):[0-9]+: " - "backend unhealthy"); + "UNAVAILABLE: backend unhealthy"); // Second channel should be READY. EXPECT_TRUE(WaitForChannelReady(channel2.get(), 1)); CheckRpcSendOk(DEBUG_LOCATION, stub2); @@ -2185,8 +2183,7 @@ TEST_F(RoundRobinTest, HealthCheckingServiceNamePerChannel) { EXPECT_FALSE(WaitForChannelReady(channel1.get(), 1)); CheckRpcSendFailure(DEBUG_LOCATION, stub1, StatusCode::UNAVAILABLE, "connections to all backends failing; last error: " - "(ipv6:%5B::1%5D|ipv4:127.0.0.1):[0-9]+: " - "backend unhealthy"); + "UNAVAILABLE: backend unhealthy"); // Second channel should be READY. EXPECT_TRUE(WaitForChannelReady(channel2.get(), 1)); CheckRpcSendOk(DEBUG_LOCATION, stub2); @@ -2894,8 +2891,10 @@ TEST_F(ClientLbAddressTest, Basic) { // Make sure that the attributes wind up on the subchannels. std::vector expected; for (const int port : GetServersPorts()) { - expected.emplace_back(absl::StrCat(ipv6_only_ ? "[::1]:" : "127.0.0.1:", - port, " args={test_key=test_value}")); + expected.emplace_back(absl::StrCat( + ipv6_only_ ? "[::1]:" : "127.0.0.1:", port, + " args={grpc.internal.no_subchannel.outlier_detection_disable=1, " + "test_key=test_value}")); } EXPECT_EQ(addresses_seen(), expected); } diff --git a/test/cpp/end2end/xds/xds_ring_hash_end2end_test.cc b/test/cpp/end2end/xds/xds_ring_hash_end2end_test.cc index ef06120bc5b..9a4b112e656 100644 --- a/test/cpp/end2end/xds/xds_ring_hash_end2end_test.cc +++ b/test/cpp/end2end/xds/xds_ring_hash_end2end_test.cc @@ -995,7 +995,7 @@ TEST_P(RingHashTest, ReattemptWhenAllEndpointsUnreachable) { CheckRpcSendFailure( DEBUG_LOCATION, StatusCode::UNAVAILABLE, MakeConnectionFailureRegex( - "ring hash cannot find a connected endpoint; first failure: "), + "ring hash cannot find a connected subchannel; first failure: "), RpcOptions().set_metadata(std::move(metadata))); StartBackend(0); // Ensure we are actively connecting without any traffic. @@ -1034,7 +1034,7 @@ TEST_P(RingHashTest, TransientFailureSkipToAvailableReady) { CheckRpcSendFailure( DEBUG_LOCATION, StatusCode::UNAVAILABLE, MakeConnectionFailureRegex( - "ring hash cannot find a connected endpoint; first failure: "), + "ring hash cannot find a connected subchannel; first failure: "), rpc_options); gpr_log(GPR_INFO, "=== DONE WITH FIRST RPC ==="); EXPECT_EQ(GRPC_CHANNEL_TRANSIENT_FAILURE, channel_->GetState(false)); @@ -1070,7 +1070,7 @@ TEST_P(RingHashTest, TransientFailureSkipToAvailableReady) { CheckRpcSendFailure( DEBUG_LOCATION, StatusCode::UNAVAILABLE, MakeConnectionFailureRegex( - "ring hash cannot find a connected endpoint; first failure: "), + "ring hash cannot find a connected subchannel; first failure: "), rpc_options); gpr_log(GPR_INFO, "=== STARTING BACKEND 1 ==="); StartBackend(1); diff --git a/tools/doxygen/Doxyfile.c++.internal b/tools/doxygen/Doxyfile.c++.internal index 7fc976ec1be..92d52f2675e 100644 --- a/tools/doxygen/Doxyfile.c++.internal +++ b/tools/doxygen/Doxyfile.c++.internal @@ -1113,8 +1113,6 @@ src/core/ext/filters/client_channel/lb_policy/address_filtering.h \ src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h \ src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc \ src/core/ext/filters/client_channel/lb_policy/child_policy_handler.h \ -src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc \ -src/core/ext/filters/client_channel/lb_policy/endpoint_list.h \ src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc \ src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.h \ src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc \ @@ -1140,6 +1138,7 @@ src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.cc \ src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.h \ src/core/ext/filters/client_channel/lb_policy/rls/rls.cc \ src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.cc \ +src/core/ext/filters/client_channel/lb_policy/subchannel_list.h \ src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.cc \ src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.h \ src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc \ diff --git a/tools/doxygen/Doxyfile.core.internal b/tools/doxygen/Doxyfile.core.internal index cb6b9ccda63..7a3e69a4d73 100644 --- a/tools/doxygen/Doxyfile.core.internal +++ b/tools/doxygen/Doxyfile.core.internal @@ -919,8 +919,6 @@ src/core/ext/filters/client_channel/lb_policy/address_filtering.h \ src/core/ext/filters/client_channel/lb_policy/backend_metric_data.h \ src/core/ext/filters/client_channel/lb_policy/child_policy_handler.cc \ src/core/ext/filters/client_channel/lb_policy/child_policy_handler.h \ -src/core/ext/filters/client_channel/lb_policy/endpoint_list.cc \ -src/core/ext/filters/client_channel/lb_policy/endpoint_list.h \ src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc \ src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.h \ src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc \ @@ -946,6 +944,7 @@ src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.cc \ src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.h \ src/core/ext/filters/client_channel/lb_policy/rls/rls.cc \ src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.cc \ +src/core/ext/filters/client_channel/lb_policy/subchannel_list.h \ src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.cc \ src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/static_stride_scheduler.h \ src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc \