Reenable --fail_on_failed_rpcs flag

These were previously causing flaky tests due to latency in the config
push from GCP -> TD, meaning that the clients were launching before the
new config was actually available to the client. These tests now launch
a "exploratory" client process before initiating the test to ensure that
the expected backends are receiving traffic before the test begins.

(The existing checks rely purely on GCP API, e.g., checking if the
backend service reports healthy. There can be slight delays before these
status changes are actually available in the xDS response sent to
proxyless clients).
pull/23022/head
Eric Gribkoff 5 years ago
parent de195d3520
commit 14374494bf
  1. 23
      tools/run_tests/run_xds_tests.py

@ -198,6 +198,7 @@ _WAIT_FOR_OPERATION_SEC = 300
_INSTANCE_GROUP_SIZE = args.instance_group_size
_NUM_TEST_RPCS = 10 * args.qps
_WAIT_FOR_STATS_SEC = 180
_WAIT_FOR_VALID_CONFIG_SEC = 60
_WAIT_FOR_URL_MAP_PATCH_SEC = 300
_CONNECTION_TIMEOUT_SEC = 60
_GCP_API_RETRIES = 5
@ -226,9 +227,9 @@ _BOOTSTRAP_TEMPLATE = """
# TODO(ericgribkoff) Add change_backend_service to this list once TD no longer
# sends an update with no localities when adding the MIG to the backend service
# can race with the URL map patch.
# TODO(ericgribkoff) Add new_instance_group_receives_traffic, ping_pong, and
# round_robin when empty update issue is resolved.
_TESTS_TO_FAIL_ON_RPC_FAILURE = []
_TESTS_TO_FAIL_ON_RPC_FAILURE = [
'new_instance_group_receives_traffic', 'ping_pong', 'round_robin'
]
_TESTS_USING_SECONDARY_IG = [
'secondary_locality_gets_no_requests_on_partial_primary_failure',
'secondary_locality_gets_requests_on_primary_failure'
@ -1021,6 +1022,15 @@ def wait_for_healthy_backends(gcp,
(timeout_sec, result))
def wait_for_config_propagation(gcp, instance_group, client_cmd, client_env):
"""Use client to verify config propagation from GCP->TD->client"""
instance_names = get_instance_names(gcp, instance_group)
client_process = subprocess.Popen(shlex.split(client_cmd), env=client_env)
wait_until_all_rpcs_go_to_given_backends(instance_names,
_WAIT_FOR_VALID_CONFIG_SEC)
client_process.terminate()
def get_instance_names(gcp, instance_group):
instance_names = []
result = gcp.compute.instanceGroups().listInstances(
@ -1205,6 +1215,13 @@ try:
test_log_file = open(test_log_filename, 'w+')
client_process = None
if test_case in _TESTS_TO_FAIL_ON_RPC_FAILURE:
wait_for_config_propagation(
gcp, instance_group,
args.client_cmd.format(server_uri=server_uri,
stats_port=args.stats_port,
qps=args.qps,
fail_on_failed_rpc=False),
client_env)
fail_on_failed_rpc = '--fail_on_failed_rpc=true'
else:
fail_on_failed_rpc = '--fail_on_failed_rpc=false'

Loading…
Cancel
Save