From 14374494bf8b1376c7028ceb8e91f1663f297c93 Mon Sep 17 00:00:00 2001 From: Eric Gribkoff Date: Wed, 20 May 2020 19:44:12 -0700 Subject: [PATCH] Reenable --fail_on_failed_rpcs flag These were previously causing flaky tests due to latency in the config push from GCP -> TD, meaning that the clients were launching before the new config was actually available to the client. These tests now launch an "exploratory" client process before initiating the test to ensure that the expected backends are receiving traffic before the test begins. (The existing checks rely purely on the GCP API, e.g., checking if the backend service reports healthy. There can be slight delays before these status changes are actually available in the xDS response sent to proxyless clients.) --- tools/run_tests/run_xds_tests.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tools/run_tests/run_xds_tests.py b/tools/run_tests/run_xds_tests.py index 9b47b28d1ba..c56376c5949 100755 --- a/tools/run_tests/run_xds_tests.py +++ b/tools/run_tests/run_xds_tests.py @@ -198,6 +198,7 @@ _WAIT_FOR_OPERATION_SEC = 300 _INSTANCE_GROUP_SIZE = args.instance_group_size _NUM_TEST_RPCS = 10 * args.qps _WAIT_FOR_STATS_SEC = 180 +_WAIT_FOR_VALID_CONFIG_SEC = 60 _WAIT_FOR_URL_MAP_PATCH_SEC = 300 _CONNECTION_TIMEOUT_SEC = 60 _GCP_API_RETRIES = 5 @@ -226,9 +227,9 @@ _BOOTSTRAP_TEMPLATE = """ # TODO(ericgribkoff) Add change_backend_service to this list once TD no longer # sends an update with no localities when adding the MIG to the backend service # can race with the URL map patch. -# TODO(ericgribkoff) Add new_instance_group_receives_traffic, ping_pong, and -# round_robin when empty update issue is resolved. 
-_TESTS_TO_FAIL_ON_RPC_FAILURE = [] +_TESTS_TO_FAIL_ON_RPC_FAILURE = [ + 'new_instance_group_receives_traffic', 'ping_pong', 'round_robin' +] _TESTS_USING_SECONDARY_IG = [ 'secondary_locality_gets_no_requests_on_partial_primary_failure', 'secondary_locality_gets_requests_on_primary_failure' @@ -1021,6 +1022,15 @@ def wait_for_healthy_backends(gcp, (timeout_sec, result)) +def wait_for_config_propagation(gcp, instance_group, client_cmd, client_env): + """Use client to verify config propagation from GCP->TD->client""" + instance_names = get_instance_names(gcp, instance_group) + client_process = subprocess.Popen(shlex.split(client_cmd), env=client_env) + wait_until_all_rpcs_go_to_given_backends(instance_names, + _WAIT_FOR_VALID_CONFIG_SEC) + client_process.terminate() + + def get_instance_names(gcp, instance_group): instance_names = [] result = gcp.compute.instanceGroups().listInstances( @@ -1205,6 +1215,13 @@ try: test_log_file = open(test_log_filename, 'w+') client_process = None if test_case in _TESTS_TO_FAIL_ON_RPC_FAILURE: + wait_for_config_propagation( + gcp, instance_group, + args.client_cmd.format(server_uri=server_uri, + stats_port=args.stats_port, + qps=args.qps, + fail_on_failed_rpc=False), + client_env) fail_on_failed_rpc = '--fail_on_failed_rpc=true' else: fail_on_failed_rpc = '--fail_on_failed_rpc=false'