From 6276b66490fb1b28791cdc995520f0dfcbfc65c7 Mon Sep 17 00:00:00 2001
From: Doug Fawley
Date: Tue, 9 Mar 2021 09:02:59 -0800
Subject: [PATCH] run_xds_tests: add fault_injection tests (#25641)

---
 tools/run_tests/run_xds_tests.py | 183 ++++++++++++++++++++++++++++++-
 1 file changed, 182 insertions(+), 1 deletion(-)

diff --git a/tools/run_tests/run_xds_tests.py b/tools/run_tests/run_xds_tests.py
index d8966991892..83a38ec35f4 100755
--- a/tools/run_tests/run_xds_tests.py
+++ b/tools/run_tests/run_xds_tests.py
@@ -69,10 +69,11 @@ _ADDITIONAL_TEST_CASES = [
     'header_matching',
     'circuit_breaking',
     'timeout',
+    'fault_injection',
 ]
 
 # Test cases that require the V3 API. Skipped in older runs.
-_V3_TEST_CASES = frozenset(['timeout'])
+_V3_TEST_CASES = frozenset(['timeout', 'fault_injection'])
 
 # Test cases that require the alpha API. Skipped for stable API runs.
 _ALPHA_TEST_CASES = frozenset(['timeout'])
@@ -1574,6 +1575,7 @@ def test_timeout(gcp, original_backend_service, instance_group):
                                     testcase_name, rpc, status, qty, want)
                         success = False
                 if success:
+                    logger.info('success')
                     break
                 logger.info('%s attempt %d failed', testcase_name, i)
                 before_stats = after_stats
@@ -1586,6 +1588,182 @@ def test_timeout(gcp, original_backend_service, instance_group):
         patch_url_map_backend_service(gcp, original_backend_service)
 
 
+def test_fault_injection(gcp, original_backend_service, instance_group):
+    logger.info('Running test_fault_injection')
+
+    logger.info('waiting for original backends to become healthy')
+    wait_for_healthy_backends(gcp, original_backend_service, instance_group)
+
+    testcase_header = 'fi_testcase'
+
+    def _route(pri, name, fi_policy):
+        return {
+            'priority': pri,
+            'matchRules': [{
+                'prefixMatch':
+                    '/',
+                'headerMatches': [{
+                    'headerName': testcase_header,
+                    'exactMatch': name,
+                }],
+            }],
+            'service': original_backend_service.url,
+            'routeAction': {
+                'faultInjectionPolicy': fi_policy
+            },
+        }
+
+    def _abort(pct):
+        return {
+            'abort': {
+                'httpStatus': 401,
+                'percentage': pct,
+            }
+        }
+
+    def _delay(pct):
+        return {
+            'delay': {
+                'fixedDelay': {
+                    'seconds': '20'
+                },
+                'percentage': pct,
+            }
+        }
+
+    zero_route = _abort(0)
+    zero_route.update(_delay(0))
+    route_rules = [
+        _route(0, 'zero_percent_fault_injection', zero_route),
+        _route(1, 'always_delay', _delay(100)),
+        _route(2, 'always_abort', _abort(100)),
+        _route(3, 'delay_half', _delay(50)),
+        _route(4, 'abort_half', _abort(50)),
+        {
+            'priority': 5,
+            'matchRules': [{
+                'prefixMatch': '/'
+            }],
+            'service': original_backend_service.url,
+        },
+    ]
+    set_validate_for_proxyless(gcp, False)
+    patch_url_map_backend_service(gcp,
+                                  original_backend_service,
+                                  route_rules=route_rules)
+    # A list of tuples (testcase_name, {client_config}, {code: percent}). Each
+    # test case will set the testcase_header with the testcase_name for routing
+    # to the appropriate config for the case, defined above.
+    test_cases = [
+        (
+            'zero_percent_fault_injection',
+            {},
+            {
+                0: 1
+            },  # OK
+        ),
+        (
+            'non_matching_fault_injection',  # Not in route_rules, above.
+            {},
+            {
+                0: 1
+            },  # OK
+        ),
+        (
+            'always_delay',
+            {
+                'timeout_sec': 2
+            },
+            {
+                4: 1
+            },  # DEADLINE_EXCEEDED
+        ),
+        (
+            'always_abort',
+            {},
+            {
+                16: 1
+            },  # UNAUTHENTICATED
+        ),
+        (
+            'delay_half',
+            {
+                'timeout_sec': 2
+            },
+            {
+                4: .5,
+                0: .5
+            },  # DEADLINE_EXCEEDED / OK: 50% / 50%
+        ),
+        (
+            'abort_half',
+            {},
+            {
+                16: .5,
+                0: .5
+            },  # UNAUTHENTICATED / OK: 50% / 50%
+        )
+    ]
+
+    try:
+        for (testcase_name, client_config, expected_results) in test_cases:
+            logger.info('starting case %s', testcase_name)
+
+            client_config['metadata'] = [
+                (messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
+                 testcase_header, testcase_name)
+            ]
+            client_config['rpc_types'] = [
+                messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
+            ]
+            configure_client(**client_config)
+            # wait a second to help ensure the client stops sending RPCs with
+            # the old config. We will make multiple attempts if it is failing,
+            # but this improves confidence that the test is valid if the
+            # previous client_config would lead to the same results.
+            time.sleep(1)
+            # Each attempt takes 10 seconds; 20 attempts is equivalent to 200
+            # second timeout.
+            attempt_count = 20
+            before_stats = get_client_accumulated_stats()
+            if not before_stats.stats_per_method:
+                raise ValueError(
+                    'stats.stats_per_method is None, the interop client stats service does not support this test case'
+                )
+            for i in range(attempt_count):
+                logger.info('%s: attempt %d', testcase_name, i)
+
+                test_runtime_secs = 10
+                time.sleep(test_runtime_secs)
+                after_stats = get_client_accumulated_stats()
+
+                success = True
+                for status, pct in expected_results.items():
+                    rpc = 'UNARY_CALL'
+                    qty = (after_stats.stats_per_method[rpc].result[status] -
+                           before_stats.stats_per_method[rpc].result[status])
+                    want = pct * args.qps * test_runtime_secs
+                    # Allow 10% deviation from expectation to reduce flakiness
+                    VARIANCE_ALLOWED = 0.1
+                    if abs(qty - want) > want * VARIANCE_ALLOWED:
+                        logger.info('%s: failed due to %s[%s]: got %d want ~%d',
+                                    testcase_name, rpc, status, qty, want)
+                        success = False
+                if success:
+                    logger.info('success')
+                    break
+                logger.info('%s attempt %d failed', testcase_name, i)
+                before_stats = after_stats
+            else:
+                raise Exception(
+                    '%s: timeout waiting for expected results: %s; got %s' %
+                    (testcase_name, expected_results,
+                     after_stats.stats_per_method))
+    finally:
+        patch_url_map_backend_service(gcp, original_backend_service)
+        set_validate_for_proxyless(gcp, True)
+
+
 def set_validate_for_proxyless(gcp, validate_for_proxyless):
     if not gcp.alpha_compute:
         logger.debug(
@@ -2417,6 +2595,7 @@ try:
         client_env['GRPC_XDS_BOOTSTRAP'] = bootstrap_path
         client_env['GRPC_XDS_EXPERIMENTAL_CIRCUIT_BREAKING'] = 'true'
         client_env['GRPC_XDS_EXPERIMENTAL_ENABLE_TIMEOUT'] = 'true'
+        client_env['GRPC_XDS_EXPERIMENTAL_FAULT_INJECTION'] = 'true'
         test_results = {}
         failed_tests = []
         for test_case in args.test_case:
@@ -2534,6 +2713,8 @@ try:
                                           same_zone_instance_group)
                 elif test_case == 'timeout':
                     test_timeout(gcp, backend_service, instance_group)
+                elif test_case == 'fault_injection':
+                    test_fault_injection(gcp, backend_service, instance_group)
                 else:
                     logger.error('Unknown test case: %s', test_case)
                     sys.exit(1)