@@ -69,10 +69,11 @@ _ADDITIONAL_TEST_CASES = [
     'header_matching',
     'circuit_breaking',
     'timeout',
+    'fault_injection',
 ]

 # Test cases that require the V3 API. Skipped in older runs.
-_V3_TEST_CASES = frozenset(['timeout'])
+_V3_TEST_CASES = frozenset(['timeout', 'fault_injection'])

 # Test cases that require the alpha API. Skipped for stable API runs.
 _ALPHA_TEST_CASES = frozenset(['timeout'])
@@ -1574,6 +1575,7 @@ def test_timeout(gcp, original_backend_service, instance_group):
                                     testcase_name, rpc, status, qty, want)
                         success = False
                 if success:
+                    logger.info('success')
                     break
                 logger.info('%s attempt %d failed', testcase_name, i)
                 before_stats = after_stats
@@ -1586,6 +1588,182 @@ def test_timeout(gcp, original_backend_service, instance_group):
         patch_url_map_backend_service(gcp, original_backend_service)


+def test_fault_injection(gcp, original_backend_service, instance_group):
+    logger.info('Running test_fault_injection')
+
+    logger.info('waiting for original backends to become healthy')
+    wait_for_healthy_backends(gcp, original_backend_service, instance_group)
+
+    testcase_header = 'fi_testcase'
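+
+    # Each fault-injection variant below gets its own route rule, selected by
+    # an exact match on the fi_testcase request header defined above.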
+    def _route(pri, name, fi_policy):
+        return {
+            'priority': pri,
+            'matchRules': [{
+                'prefixMatch':
+                    '/',
+                'headerMatches': [{
+                    'headerName': testcase_header,
+                    'exactMatch': name,
+                }],
+            }],
+            'service': original_backend_service.url,
+            'routeAction': {
+                'faultInjectionPolicy': fi_policy
+            },
+        }
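+
+    # HTTP status 401 maps to gRPC UNAUTHENTICATED (code 16), which is what the
+    # expected results below check for.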
+    def _abort(pct):
+        return {
+            'abort': {
+                'httpStatus': 401,
+                'percentage': pct,
+            }
+        }
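+
+    # The 20s fixed delay is far longer than the 2s client deadline used by the
+    # delay test cases below, so delayed RPCs finish with DEADLINE_EXCEEDED (4).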
+    def _delay(pct):
+        return {
+            'delay': {
+                'fixedDelay': {
+                    'seconds': '20'
+                },
+                'percentage': pct,
+            }
+        }
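+
+    # zero_percent_fault_injection carries both an abort and a delay policy at
+    # 0%, so a fault config is present on the route but should never trigger.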
+    zero_route = _abort(0)
+    zero_route.update(_delay(0))
+    route_rules = [
+        _route(0, 'zero_percent_fault_injection', zero_route),
+        _route(1, 'always_delay', _delay(100)),
+        _route(2, 'always_abort', _abort(100)),
+        _route(3, 'delay_half', _delay(50)),
+        _route(4, 'abort_half', _abort(50)),
+        {
+            'priority': 5,
+            'matchRules': [{
+                'prefixMatch': '/'
+            }],
+            'service': original_backend_service.url,
+        },
+    ]
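+    # validateForProxyless is relaxed while the fault-injection route rules are
+    # installed and restored in the finally block at the end of this test.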
+    set_validate_for_proxyless(gcp, False)
+    patch_url_map_backend_service(gcp,
+                                  original_backend_service,
+                                  route_rules=route_rules)
+    # A list of tuples (testcase_name, {client_config}, {code: percent}). Each
+    # test case will set the testcase_header with the testcase_name for routing
+    # to the appropriate config for the case, defined above.
+    test_cases = [
+        (
+            'zero_percent_fault_injection',
+            {},
+            {
+                0: 1
+            },  # OK
+        ),
+        (
+            'non_matching_fault_injection',  # Not in route_rules, above.
+            {},
+            {
+                0: 1
+            },  # OK
+        ),
+        (
+            'always_delay',
+            {
+                'timeout_sec': 2
+            },
+            {
+                4: 1
+            },  # DEADLINE_EXCEEDED
+        ),
+        (
+            'always_abort',
+            {},
+            {
+                16: 1
+            },  # UNAUTHENTICATED
+        ),
+        (
+            'delay_half',
+            {
+                'timeout_sec': 2
+            },
+            {
+                4: .5,
+                0: .5
+            },  # DEADLINE_EXCEEDED / OK: 50% / 50%
+        ),
+        (
+            'abort_half',
+            {},
+            {
+                16: .5,
+                0: .5
+            },  # UNAUTHENTICATED / OK: 50% / 50%
+        )
+    ]
+
+    try:
+        for (testcase_name, client_config, expected_results) in test_cases:
+            logger.info('starting case %s', testcase_name)
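+
+            # Attach testcase_header: <testcase_name> to the client's
+            # UNARY_CALLs so they match the corresponding route rule above.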
+            client_config['metadata'] = [
+                (messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
+                 testcase_header, testcase_name)
+            ]
+            client_config['rpc_types'] = [
+                messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
+            ]
+            configure_client(**client_config)
+            # wait a second to help ensure the client stops sending RPCs with
+            # the old config. We will make multiple attempts if it is failing,
+            # but this improves confidence that the test is valid if the
+            # previous client_config would lead to the same results.
+            time.sleep(1)
+            # Each attempt takes 10 seconds; 20 attempts is equivalent to 200
+            # second timeout.
+            attempt_count = 20
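+            # Expected results are checked as deltas between accumulated-stats
+            # snapshots taken before and after each 10 second attempt window.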
+            before_stats = get_client_accumulated_stats()
+            if not before_stats.stats_per_method:
+                raise ValueError(
+                    'stats.stats_per_method is None, the interop client stats service does not support this test case'
+                )
+            for i in range(attempt_count):
+                logger.info('%s: attempt %d', testcase_name, i)
+
+                test_runtime_secs = 10
+                time.sleep(test_runtime_secs)
+                after_stats = get_client_accumulated_stats()
+
+                success = True
+                for status, pct in expected_results.items():
+                    rpc = 'UNARY_CALL'
+                    qty = (after_stats.stats_per_method[rpc].result[status] -
+                           before_stats.stats_per_method[rpc].result[status])
+                    want = pct * args.qps * test_runtime_secs
+                    # Allow 10% deviation from expectation to reduce flakiness
+                    VARIANCE_ALLOWED = 0.1
+                    if abs(qty - want) > want * VARIANCE_ALLOWED:
+                        logger.info('%s: failed due to %s[%s]: got %d want ~%d',
+                                    testcase_name, rpc, status, qty, want)
+                        success = False
+                if success:
+                    logger.info('success')
+                    break
+                logger.info('%s attempt %d failed', testcase_name, i)
+                before_stats = after_stats
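+            # for/else: the else clause runs only if no attempt succeeded and
+            # broke out of the loop above.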
+            else:
+                raise Exception(
+                    '%s: timeout waiting for expected results: %s; got %s' %
+                    (testcase_name, expected_results,
+                     after_stats.stats_per_method))
+    finally:
+        patch_url_map_backend_service(gcp, original_backend_service)
+        set_validate_for_proxyless(gcp, True)
+
+
 def set_validate_for_proxyless(gcp, validate_for_proxyless):
     if not gcp.alpha_compute:
         logger.debug(
@@ -2417,6 +2595,7 @@ try:
     client_env['GRPC_XDS_BOOTSTRAP'] = bootstrap_path
     client_env['GRPC_XDS_EXPERIMENTAL_CIRCUIT_BREAKING'] = 'true'
     client_env['GRPC_XDS_EXPERIMENTAL_ENABLE_TIMEOUT'] = 'true'
+    client_env['GRPC_XDS_EXPERIMENTAL_FAULT_INJECTION'] = 'true'
     test_results = {}
     failed_tests = []
     for test_case in args.test_case:
@@ -2534,6 +2713,8 @@ try:
                                       same_zone_instance_group)
             elif test_case == 'timeout':
                 test_timeout(gcp, backend_service, instance_group)
+            elif test_case == 'fault_injection':
+                test_fault_injection(gcp, backend_service, instance_group)
             else:
                 logger.error('Unknown test case: %s', test_case)
                 sys.exit(1)