Revert "[xDS interop] Updating the config update timeout to 600s (#26090)" (#26197)

This reverts commit 9bc421c6cf.
Branch: pull/26203/head
Author: Eric Gribkoff, committed via GitHub
Parent: 2ee8c49af0
Commit: cf7f5a0b85

1 changed file: tools/run_tests/run_xds_tests.py (321 changes)
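The substance of the revert: #26090 had replaced the per-test fixed retry counts with a single 600-second wall-clock deadline, and this commit restores the retry-count style. A minimal sketch of the two polling shapes this diff swaps between (helper names hypothetical, not the test code itself):

    import time

    def poll_until_deadline(check, timeout_sec=600):
        # Style removed by this revert: retry until a wall-clock deadline.
        deadline = time.time() + timeout_sec
        while time.time() < deadline:
            if check():
                return
        raise Exception('timed out after %d seconds' % timeout_sec)

    def poll_with_attempts(check, max_attempts=40):
        # Style restored by this revert: a fixed number of attempts, where
        # each attempt bounds its own wait.
        for i in range(max_attempts):
            if check():
                return
        raise Exception('failed after %d attempts' % max_attempts)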

@@ -279,23 +279,14 @@ CLIENT_HOSTS = []
 if args.client_hosts:
     CLIENT_HOSTS = args.client_hosts.split(',')

-# Each of the config propagation in the control plane should finish within 600s.
-# Otherwise, it indicates a bug in the control plane. The config propagation
-# includes all kinds of traffic config update, like updating urlMap, creating
-# the resources for the first time, updating BackendService, and changing the
-# status of endpoints in BackendService.
-_WAIT_FOR_URL_MAP_PATCH_SEC = 600
-# In general, fetching load balancing stats only takes ~10s. However, slow
-# config update could lead to empty EDS or similar symptoms causing the
-# connection to hang for a long period of time. So, we want to extend the stats
-# wait time to be the same as urlMap patch time.
-_WAIT_FOR_STATS_SEC = _WAIT_FOR_URL_MAP_PATCH_SEC
 _DEFAULT_SERVICE_PORT = 80
 _WAIT_FOR_BACKEND_SEC = args.wait_for_backend_sec
 _WAIT_FOR_OPERATION_SEC = 1200
 _INSTANCE_GROUP_SIZE = args.instance_group_size
 _NUM_TEST_RPCS = 10 * args.qps
+_WAIT_FOR_STATS_SEC = 360
+_WAIT_FOR_VALID_CONFIG_SEC = 60
+_WAIT_FOR_URL_MAP_PATCH_SEC = 300
 _CONNECTION_TIMEOUT_SEC = 60
 _GCP_API_RETRIES = 5
 _BOOTSTRAP_TEMPLATE = """
@@ -836,11 +827,8 @@ def test_round_robin(gcp, backend_service, instance_group):
     # creating new backend resources for each individual test case.
     # Each attempt takes 10 seconds. Config propagation can take several
     # minutes.
-    deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
-    logger.info(
-        'Attempting for %d seconds until received the expected distribution',
-        _WAIT_FOR_URL_MAP_PATCH_SEC)
-    while time.time() < deadline:
+    max_attempts = 40
+    for i in range(max_attempts):
         stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
         requests_received = [stats.rpcs_by_peer[x] for x in stats.rpcs_by_peer]
         total_requests_received = sum(requests_received)
@@ -855,8 +843,7 @@ def test_round_robin(gcp, backend_service, instance_group):
                     'RPC peer distribution differs from expected by more than %d '
                     'for instance %s (%s)' % (threshold, instance, stats))
         return
-    raise Exception('RPC failures persisted through after %s seconds' %
-                    _WAIT_FOR_URL_MAP_PATCH_SEC)
+    raise Exception('RPC failures persisted through %d retries' % max_attempts)


 def test_secondary_locality_gets_no_requests_on_partial_primary_failure(
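For reference, the distribution check that test_round_robin retries is an even-split comparison. A sketch of the idea under assumed semantics (the helper name and exact tolerance rule are assumptions, not the test's own code):

    def is_evenly_distributed(rpcs_by_peer, instances, threshold):
        # Each backend should receive roughly total/len(instances) RPCs;
        # any deviation beyond `threshold` fails the attempt.
        total = sum(rpcs_by_peer.get(x, 0) for x in instances)
        expected = total / len(instances)
        return all(
            abs(rpcs_by_peer.get(x, 0) - expected) <= threshold
            for x in instances)

    print(is_evenly_distributed({'a': 34, 'b': 33, 'c': 33},
                                ['a', 'b', 'c'], 2))  # True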
@@ -1320,11 +1307,10 @@ def test_traffic_splitting(gcp, original_backend_service, instance_group,
                                    _WAIT_FOR_STATS_SEC)

         # Verify that weights between two services are expected.
-        deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
-        logger.info(
-            'Attempting for %d seconds until received the expected distribution',
-            _WAIT_FOR_URL_MAP_PATCH_SEC)
-        while time.time() < deadline:
+        retry_count = 10
+        # Each attempt takes about 10 seconds, 10 retries is equivalent to 100
+        # seconds timeout.
+        for i in range(retry_count):
             stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
             got_instance_count = [
                 stats.rpcs_by_peer[i] for i in original_backend_instances
@@ -1338,15 +1324,18 @@ def test_traffic_splitting(gcp, original_backend_service, instance_group,
                 compare_distributions(got_instance_percentage,
                                       expected_instance_percentage, 5)
             except Exception as e:
-                logger.info('Got percentage: %s, expected percentage: %s',
-                            got_instance_percentage,
-                            expected_instance_percentage)
+                logger.info('attempt %d', i)
+                logger.info('got percentage: %s', got_instance_percentage)
+                logger.info('expected percentage: %s',
+                            expected_instance_percentage)
                 logger.info(e)
+                if i == retry_count - 1:
+                    raise Exception(
+                        'RPC distribution (%s) differs from expected (%s)' %
+                        (got_instance_percentage, expected_instance_percentage))
             else:
                 logger.info("success")
-                return
-        raise Exception('RPC distribution (%s) differs from expected (%s)' %
-                        (got_instance_percentage, expected_instance_percentage))
+                break
     finally:
         patch_url_map_backend_service(gcp, original_backend_service)
         patch_backend_service(gcp, alternate_backend_service, [])
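The hunk above retries compare_distributions(got, expected, 5). Assumed semantics of that helper, as a sketch (not the file's actual implementation): each observed percentage must be within the given number of points of the expected percentage.

    def compare_distributions(got_percentages, expected_percentages, tolerance):
        # Raise if any observed share deviates more than `tolerance` points.
        for got, expected in zip(got_percentages, expected_percentages):
            if abs(got - expected) > tolerance:
                raise Exception('got %s%%, expected %s%% (tolerance %s)' %
                                (got, expected, tolerance))

    compare_distributions([78, 22], [80, 20], 5)  # passes: within 5 points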
@@ -1470,22 +1459,23 @@ def test_path_matching(gcp, original_backend_service, instance_group,
             original_backend_instances + alternate_backend_instances,
             _WAIT_FOR_STATS_SEC)

-        deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
-        logger.info(
-            'Attempting for %d seconds until received the expected distribution',
-            _WAIT_FOR_URL_MAP_PATCH_SEC)
-        while time.time() < deadline:
+        retry_count = 80
+        # Each attempt takes about 5 seconds, 80 retries is equivalent to 400
+        # seconds timeout.
+        for i in range(retry_count):
             stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
             if not stats.rpcs_by_method:
                 raise ValueError(
                     'stats.rpcs_by_method is None, the interop client stats service does not support this test case'
                 )
+            logger.info('attempt %d', i)
             if compare_expected_instances(stats, expected_instances):
                 logger.info("success")
-                return
-        raise Exception(
-            'timeout waiting for RPCs to the expected instances: %s' %
-            expected_instances)
+                break
+            elif i == retry_count - 1:
+                raise Exception(
+                    'timeout waiting for RPCs to the expected instances: %s'
+                    % expected_instances)
     finally:
         patch_url_map_backend_service(gcp, original_backend_service)
         patch_backend_service(gcp, alternate_backend_service, [])
@@ -1667,22 +1657,23 @@ def test_header_matching(gcp, original_backend_service, instance_group,
             original_backend_instances + alternate_backend_instances,
             _WAIT_FOR_STATS_SEC)

-        deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
-        logger.info(
-            'Attempting for %d seconds until received the expected distribution',
-            _WAIT_FOR_URL_MAP_PATCH_SEC)
-        while time.time() < deadline:
+        retry_count = 80
+        # Each attempt takes about 5 seconds, 80 retries is equivalent to 400
+        # seconds timeout.
+        for i in range(retry_count):
             stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
             if not stats.rpcs_by_method:
                 raise ValueError(
                     'stats.rpcs_by_method is None, the interop client stats service does not support this test case'
                 )
+            logger.info('attempt %d', i)
             if compare_expected_instances(stats, expected_instances):
                 logger.info("success")
-                return
-        raise Exception(
-            'timeout waiting for RPCs to the expected instances: %s' %
-            expected_instances)
+                break
+            elif i == retry_count - 1:
+                raise Exception(
+                    'timeout waiting for RPCs to the expected instances: %s'
+                    % expected_instances)
     finally:
         patch_url_map_backend_service(gcp, original_backend_service)
         patch_backend_service(gcp, alternate_backend_service, [])
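Both the path-matching and header-matching hunks poll compare_expected_instances(stats, expected_instances). An assumed shape of that check, sketched with plain dicts in place of the proto stats object (names and structure are assumptions):

    def compare_expected_instances(rpcs_by_method, expected_instances):
        # For each RPC method, the set of peers that received traffic must
        # match the expected backend set exactly.
        for rpc_type, expected_peers in expected_instances.items():
            got_peers = set(rpcs_by_method.get(rpc_type, {}))
            if got_peers != set(expected_peers):
                return False
        return True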
@@ -1693,7 +1684,7 @@ def test_circuit_breaking(gcp, original_backend_service, instance_group,
     '''
     Since backend service circuit_breakers configuration cannot be unset,
     which causes trouble for restoring validate_for_proxy flag in target
-    proxy/global forwarding rule. This test uses dedicated backend services.
+    proxy/global forwarding rule. This test uses dedicated backend sevices.

     The url_map and backend services undergoes the following state changes:

     Before test:
@@ -1920,62 +1911,53 @@ def test_timeout(gcp, original_backend_service, instance_group):
     ]

     try:
-        deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
-        logger.info(
-            'Attempting for %d seconds until received the expected distribution',
-            _WAIT_FOR_URL_MAP_PATCH_SEC)
-        attempt_counter = 0
-        while True:
-            attempt_counter += 1
-            try:
-                for (testcase_name, client_config,
-                     expected_results) in test_cases:
-                    logger.info('starting case %s: attempt %d', testcase_name,
-                                attempt_counter)
-                    configure_client(**client_config)
-                    # wait a second to help ensure the client stops sending RPCs with
-                    # the old config. We will make multiple attempts if it is failing,
-                    # but this improves confidence that the test is valid if the
-                    # previous client_config would lead to the same results.
-                    time.sleep(1)
-                    before_stats = get_client_accumulated_stats()
-                    if not before_stats.stats_per_method:
-                        raise ValueError(
-                            'stats.stats_per_method is None, the interop client stats service does not support this test case'
-                        )
-                    logger.info('%s: attempt %d', testcase_name,
-                                attempt_counter)
-
-                    test_runtime_secs = 10
-                    time.sleep(test_runtime_secs)
-                    after_stats = get_client_accumulated_stats()
-
-                    success = True
-                    for rpc, status in expected_results.items():
-                        qty = (
-                            after_stats.stats_per_method[rpc].result[status] -
-                            before_stats.stats_per_method[rpc].result[status])
-                        want = test_runtime_secs * args.qps
-                        # Allow 10% deviation from expectation to reduce flakiness
-                        if qty < (want * .9) or qty > (want * 1.1):
-                            logger.info(
-                                '%s: failed due to %s[%s]: got %d want ~%d',
-                                testcase_name, rpc, status, qty, want)
-                            success = False
-                    if success:
-                        logger.info('success')
-                        return
-                    logger.info('%s attempt %d failed', testcase_name,
-                                attempt_counter)
-                raise RpcDistributionError(
-                    '%s: timeout waiting for expected results: %s; got %s' %
-                    (testcase_name, expected_results,
-                     after_stats.stats_per_method))
-            except RpcDistributionError as e:
-                if time.time() < deadline:
-                    pass
-                else:
-                    raise
+        first_case = True
+        for (testcase_name, client_config, expected_results) in test_cases:
+            logger.info('starting case %s', testcase_name)
+            configure_client(**client_config)
+            # wait a second to help ensure the client stops sending RPCs with
+            # the old config. We will make multiple attempts if it is failing,
+            # but this improves confidence that the test is valid if the
+            # previous client_config would lead to the same results.
+            time.sleep(1)
+            # Each attempt takes 10 seconds; 20 attempts is equivalent to 200
+            # second timeout.
+            attempt_count = 20
+            if first_case:
+                attempt_count = 120
+                first_case = False
+            before_stats = get_client_accumulated_stats()
+            if not before_stats.stats_per_method:
+                raise ValueError(
+                    'stats.stats_per_method is None, the interop client stats service does not support this test case'
+                )
+            for i in range(attempt_count):
+                logger.info('%s: attempt %d', testcase_name, i)
+
+                test_runtime_secs = 10
+                time.sleep(test_runtime_secs)
+                after_stats = get_client_accumulated_stats()
+
+                success = True
+                for rpc, status in expected_results.items():
+                    qty = (after_stats.stats_per_method[rpc].result[status] -
+                           before_stats.stats_per_method[rpc].result[status])
+                    want = test_runtime_secs * args.qps
+                    # Allow 10% deviation from expectation to reduce flakiness
+                    if qty < (want * .9) or qty > (want * 1.1):
+                        logger.info('%s: failed due to %s[%s]: got %d want ~%d',
+                                    testcase_name, rpc, status, qty, want)
+                        success = False
+                if success:
+                    logger.info('success')
+                    break
+                logger.info('%s attempt %d failed', testcase_name, i)
+                before_stats = after_stats
+            else:
+                raise Exception(
+                    '%s: timeout waiting for expected results: %s; got %s' %
+                    (testcase_name, expected_results,
+                     after_stats.stats_per_method))
     finally:
         patch_url_map_backend_service(gcp, original_backend_service)
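The restored loop ends in a for/else clause: the else body runs only when the loop completes without hitting break, which is exactly the timeout path here. A minimal illustration (hypothetical names):

    def retry(check, attempts):
        for i in range(attempts):
            if check():
                print('success on attempt %d' % i)
                break  # skips the else clause below
        else:
            # Runs only if no attempt succeeded (the loop never broke).
            raise Exception('failed after %d attempts' % attempts)

    retry(lambda: True, 3)  # prints: success on attempt 0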
@@ -2098,78 +2080,70 @@ def test_fault_injection(gcp, original_backend_service, instance_group):
     ]

     try:
-        deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
-        logger.info(
-            'Attempting for %d seconds until received the expected distribution',
-            _WAIT_FOR_URL_MAP_PATCH_SEC)
-        attempt_counter = 0
-        while True:
-            attempt_counter += 1
-            try:
-                for (testcase_name, client_config,
-                     expected_results) in test_cases:
-                    logger.info('starting case %s: attempt %d', testcase_name,
-                                attempt_counter)
-
-                    client_config['metadata'] = [
-                        (messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
-                         testcase_header, testcase_name)
-                    ]
-                    client_config['rpc_types'] = [
-                        messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
-                    ]
-                    configure_client(**client_config)
-                    # wait a second to help ensure the client stops sending RPCs with
-                    # the old config. We will make multiple attempts if it is failing,
-                    # but this improves confidence that the test is valid if the
-                    # previous client_config would lead to the same results.
-                    time.sleep(1)
-                    # Each attempt takes 10 seconds; 20 attempts is equivalent to 200
-                    # second timeout.
-                    before_stats = get_client_accumulated_stats()
-                    if not before_stats.stats_per_method:
-                        raise ValueError(
-                            'stats.stats_per_method is None, the interop client stats service does not support this test case'
-                        )
-                    logger.info('%s: attempt %d', testcase_name, i)
-                    test_runtime_secs = 10
-                    time.sleep(test_runtime_secs)
-                    after_stats = get_client_accumulated_stats()
-
-                    success = True
-                    for status, pct in expected_results.items():
-                        rpc = 'UNARY_CALL'
-                        qty = (
-                            after_stats.stats_per_method[rpc].result[status] -
-                            before_stats.stats_per_method[rpc].result[status])
-                        want = pct * args.qps * test_runtime_secs
-                        # Allow 10% deviation from expectation to reduce flakiness
-                        VARIANCE_ALLOWED = 0.1
-                        if abs(qty - want) > want * VARIANCE_ALLOWED:
-                            logger.info(
-                                '%s: failed due to %s[%s]: got %d want ~%d',
-                                testcase_name, rpc, status, qty, want)
-                            success = False
-                    if success:
-                        logger.info('success')
-                        break
-                    logger.info('%s attempt %d failed', testcase_name, i)
-                raise RpcDistributionError(
-                    '%s: timeout waiting for expected results: %s; got %s' %
-                    (testcase_name, expected_results,
-                     after_stats.stats_per_method))
-            except RpcDistributionError as e:
-                if time.time() < deadline:
-                    pass
-                else:
-                    raise
+        first_case = True
+        for (testcase_name, client_config, expected_results) in test_cases:
+            logger.info('starting case %s', testcase_name)
+
+            client_config['metadata'] = [
+                (messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
+                 testcase_header, testcase_name)
+            ]
+            client_config['rpc_types'] = [
+                messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
+            ]
+            configure_client(**client_config)
+            # wait a second to help ensure the client stops sending RPCs with
+            # the old config. We will make multiple attempts if it is failing,
+            # but this improves confidence that the test is valid if the
+            # previous client_config would lead to the same results.
+            time.sleep(1)
+            # Each attempt takes 10 seconds; 20 attempts is equivalent to 200
+            # second timeout.
+            attempt_count = 20
+            if first_case:
+                attempt_count = 120
+                first_case = False
+            before_stats = get_client_accumulated_stats()
+            if not before_stats.stats_per_method:
+                raise ValueError(
+                    'stats.stats_per_method is None, the interop client stats service does not support this test case'
+                )
+            for i in range(attempt_count):
+                logger.info('%s: attempt %d', testcase_name, i)
+
+                test_runtime_secs = 10
+                time.sleep(test_runtime_secs)
+                after_stats = get_client_accumulated_stats()
+                success = True
+                for status, pct in expected_results.items():
+                    rpc = 'UNARY_CALL'
+                    qty = (after_stats.stats_per_method[rpc].result[status] -
+                           before_stats.stats_per_method[rpc].result[status])
+                    want = pct * args.qps * test_runtime_secs
+                    # Allow 10% deviation from expectation to reduce flakiness
+                    VARIANCE_ALLOWED = 0.1
+                    if abs(qty - want) > want * VARIANCE_ALLOWED:
+                        logger.info('%s: failed due to %s[%s]: got %d want ~%d',
+                                    testcase_name, rpc, status, qty, want)
+                        success = False
+                if success:
+                    logger.info('success')
+                    break
+                logger.info('%s attempt %d failed', testcase_name, i)
+                before_stats = after_stats
+            else:
+                raise Exception(
+                    '%s: timeout waiting for expected results: %s; got %s' %
+                    (testcase_name, expected_results,
+                     after_stats.stats_per_method))
     finally:
         patch_url_map_backend_service(gcp, original_backend_service)
         set_validate_for_proxyless(gcp, True)


 def test_csds(gcp, original_backend_service, instance_group, server_uri):
+    test_csds_timeout_s = datetime.timedelta(minutes=5).total_seconds()
     sleep_interval_between_attempts_s = datetime.timedelta(
         seconds=2).total_seconds()
     logger.info('Running test_csds')
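The fault-injection assertion in the hunk above is a tolerance band around an expected RPC count. The arithmetic, worked through with hypothetical values (qps=100, a 10-second window, an expected 50% failure rate):

    pct, qps, test_runtime_secs = 0.5, 100, 10
    want = pct * qps * test_runtime_secs          # 500 RPCs expected with the status
    VARIANCE_ALLOWED = 0.1
    low, high = want * (1 - VARIANCE_ALLOWED), want * (1 + VARIANCE_ALLOWED)
    print(low, high)  # 450.0 550.0: any observed count in this band passes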
@@ -2177,9 +2151,10 @@ def test_csds(gcp, original_backend_service, instance_group, server_uri):
     logger.info('waiting for original backends to become healthy')
     wait_for_healthy_backends(gcp, original_backend_service, instance_group)

-    deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
+    # Test case timeout: 5 minutes
+    deadline = time.time() + test_csds_timeout_s
     cnt = 0
-    while time.time() < deadline:
+    while time.time() <= deadline:
         client_config = get_client_xds_config_dump()
         logger.info('test_csds attempt %d: received xDS config %s', cnt,
                     json.dumps(client_config, indent=2))
@@ -2251,7 +2226,7 @@ def test_csds(gcp, original_backend_service, instance_group, server_uri):
         cnt += 1

     raise RuntimeError('failed to receive a valid xDS config in %s seconds' %
-                       _WAIT_FOR_URL_MAP_PATCH_SEC)
+                       test_csds_timeout_s)


 def set_validate_for_proxyless(gcp, validate_for_proxyless):
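The restored test_csds shape is deadline polling with a fixed sleep between attempts. A self-contained sketch of that loop (get_config and is_valid are placeholders for the test's CSDS dump and validation logic):

    import time

    def poll_csds(get_config, is_valid, timeout_s=300, interval_s=2):
        # Poll the client's xDS config dump until it validates or the
        # 5-minute deadline passes.
        deadline = time.time() + timeout_s
        cnt = 0
        while time.time() <= deadline:
            config = get_config()
            if is_valid(config):
                return config
            cnt += 1
            time.sleep(interval_s)
        raise RuntimeError('failed to receive a valid xDS config in %s seconds'
                           % timeout_s)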
