Revert "[xDS interop] Updating the config update timeout to 600s (#26090)" (#26197)

This reverts commit 9bc421c6cf.
Branch: pull/26203/head
Author: Eric Gribkoff, committed via GitHub
Parent: 2ee8c49af0
Commit: cf7f5a0b85

1 changed file: tools/run_tests/run_xds_tests.py (321 changes)
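The substance of the revert: #26090 had replaced the per-test fixed retry counts with a single 600-second wall-clock deadline, and this commit restores the retry-count style. A minimal sketch of the two polling shapes this diff swaps between (helper names hypothetical, not the test code itself):

    import time

    def poll_until_deadline(check, timeout_sec=600):
        # Style removed by this revert: retry until a wall-clock deadline.
        deadline = time.time() + timeout_sec
        while time.time() < deadline:
            if check():
                return
        raise Exception('timed out after %d seconds' % timeout_sec)

    def poll_with_attempts(check, max_attempts=40):
        # Style restored by this revert: a fixed number of attempts, where
        # each attempt bounds its own wait.
        for i in range(max_attempts):
            if check():
                return
        raise Exception('failed after %d attempts' % max_attempts)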

@@ -279,23 +279,14 @@ CLIENT_HOSTS = []
 if args.client_hosts:
     CLIENT_HOSTS = args.client_hosts.split(',')

-# Each of the config propagation in the control plane should finish within 600s.
-# Otherwise, it indicates a bug in the control plane. The config propagation
-# includes all kinds of traffic config update, like updating urlMap, creating
-# the resources for the first time, updating BackendService, and changing the
-# status of endpoints in BackendService.
-_WAIT_FOR_URL_MAP_PATCH_SEC = 600
-# In general, fetching load balancing stats only takes ~10s. However, slow
-# config update could lead to empty EDS or similar symptoms causing the
-# connection to hang for a long period of time. So, we want to extend the stats
-# wait time to be the same as urlMap patch time.
-_WAIT_FOR_STATS_SEC = _WAIT_FOR_URL_MAP_PATCH_SEC
 _DEFAULT_SERVICE_PORT = 80
 _WAIT_FOR_BACKEND_SEC = args.wait_for_backend_sec
 _WAIT_FOR_OPERATION_SEC = 1200
 _INSTANCE_GROUP_SIZE = args.instance_group_size
 _NUM_TEST_RPCS = 10 * args.qps
+_WAIT_FOR_STATS_SEC = 360
+_WAIT_FOR_VALID_CONFIG_SEC = 60
+_WAIT_FOR_URL_MAP_PATCH_SEC = 300
 _CONNECTION_TIMEOUT_SEC = 60
 _GCP_API_RETRIES = 5
 _BOOTSTRAP_TEMPLATE = """
@@ -836,11 +827,8 @@ def test_round_robin(gcp, backend_service, instance_group):
     # creating new backend resources for each individual test case.
     # Each attempt takes 10 seconds. Config propagation can take several
     # minutes.
-    deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
-    logger.info(
-        'Attempting for %d seconds until received the expected distribution',
-        _WAIT_FOR_URL_MAP_PATCH_SEC)
-    while time.time() < deadline:
+    max_attempts = 40
+    for i in range(max_attempts):
         stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
         requests_received = [stats.rpcs_by_peer[x] for x in stats.rpcs_by_peer]
         total_requests_received = sum(requests_received)
@@ -855,8 +843,7 @@ def test_round_robin(gcp, backend_service, instance_group):
                     'RPC peer distribution differs from expected by more than %d '
                     'for instance %s (%s)' % (threshold, instance, stats))
         return
-    raise Exception('RPC failures persisted through after %s seconds' %
-                    _WAIT_FOR_URL_MAP_PATCH_SEC)
+    raise Exception('RPC failures persisted through %d retries' % max_attempts)


 def test_secondary_locality_gets_no_requests_on_partial_primary_failure(
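For reference, the distribution check that test_round_robin retries is an even-split comparison. A sketch of the idea under assumed semantics (the helper name and exact tolerance rule are assumptions, not the test's own code):

    def is_evenly_distributed(rpcs_by_peer, instances, threshold):
        # Each backend should receive roughly total/len(instances) RPCs;
        # any deviation beyond `threshold` fails the attempt.
        total = sum(rpcs_by_peer.get(x, 0) for x in instances)
        expected = total / len(instances)
        return all(
            abs(rpcs_by_peer.get(x, 0) - expected) <= threshold
            for x in instances)

    print(is_evenly_distributed({'a': 34, 'b': 33, 'c': 33},
                                ['a', 'b', 'c'], 2))  # True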
@@ -1320,11 +1307,10 @@ def test_traffic_splitting(gcp, original_backend_service, instance_group,
                                    _WAIT_FOR_STATS_SEC)

         # Verify that weights between two services are expected.
-        deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
-        logger.info(
-            'Attempting for %d seconds until received the expected distribution',
-            _WAIT_FOR_URL_MAP_PATCH_SEC)
-        while time.time() < deadline:
+        retry_count = 10
+        # Each attempt takes about 10 seconds, 10 retries is equivalent to 100
+        # seconds timeout.
+        for i in range(retry_count):
             stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
             got_instance_count = [
                 stats.rpcs_by_peer[i] for i in original_backend_instances
@@ -1338,15 +1324,18 @@ def test_traffic_splitting(gcp, original_backend_service, instance_group,
                 compare_distributions(got_instance_percentage,
                                       expected_instance_percentage, 5)
             except Exception as e:
-                logger.info('Got percentage: %s, expected percentage: %s',
-                            got_instance_percentage,
-                            expected_instance_percentage)
+                logger.info('attempt %d', i)
+                logger.info('got percentage: %s', got_instance_percentage)
+                logger.info('expected percentage: %s',
+                            expected_instance_percentage)
                 logger.info(e)
+                if i == retry_count - 1:
+                    raise Exception(
+                        'RPC distribution (%s) differs from expected (%s)' %
+                        (got_instance_percentage, expected_instance_percentage))
             else:
                 logger.info("success")
-                return
-        raise Exception('RPC distribution (%s) differs from expected (%s)' %
-                        (got_instance_percentage, expected_instance_percentage))
+                break
     finally:
         patch_url_map_backend_service(gcp, original_backend_service)
         patch_backend_service(gcp, alternate_backend_service, [])
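The hunk above retries compare_distributions(got, expected, 5). Assumed semantics of that helper, as a sketch (not the file's actual implementation): each observed percentage must be within the given number of points of the expected percentage.

    def compare_distributions(got_percentages, expected_percentages, tolerance):
        # Raise if any observed share deviates more than `tolerance` points.
        for got, expected in zip(got_percentages, expected_percentages):
            if abs(got - expected) > tolerance:
                raise Exception('got %s%%, expected %s%% (tolerance %s)' %
                                (got, expected, tolerance))

    compare_distributions([78, 22], [80, 20], 5)  # passes: within 5 points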
@@ -1470,22 +1459,23 @@ def test_path_matching(gcp, original_backend_service, instance_group,
             original_backend_instances + alternate_backend_instances,
             _WAIT_FOR_STATS_SEC)

-        deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
-        logger.info(
-            'Attempting for %d seconds until received the expected distribution',
-            _WAIT_FOR_URL_MAP_PATCH_SEC)
-        while time.time() < deadline:
+        retry_count = 80
+        # Each attempt takes about 5 seconds, 80 retries is equivalent to 400
+        # seconds timeout.
+        for i in range(retry_count):
             stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
             if not stats.rpcs_by_method:
                 raise ValueError(
                     'stats.rpcs_by_method is None, the interop client stats service does not support this test case'
                 )
+            logger.info('attempt %d', i)
             if compare_expected_instances(stats, expected_instances):
                 logger.info("success")
-                return
-        raise Exception(
-            'timeout waiting for RPCs to the expected instances: %s' %
-            expected_instances)
+                break
+            elif i == retry_count - 1:
+                raise Exception(
+                    'timeout waiting for RPCs to the expected instances: %s'
+                    % expected_instances)
     finally:
         patch_url_map_backend_service(gcp, original_backend_service)
         patch_backend_service(gcp, alternate_backend_service, [])
@@ -1667,22 +1657,23 @@ def test_header_matching(gcp, original_backend_service, instance_group,
             original_backend_instances + alternate_backend_instances,
             _WAIT_FOR_STATS_SEC)

-        deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
-        logger.info(
-            'Attempting for %d seconds until received the expected distribution',
-            _WAIT_FOR_URL_MAP_PATCH_SEC)
-        while time.time() < deadline:
+        retry_count = 80
+        # Each attempt takes about 5 seconds, 80 retries is equivalent to 400
+        # seconds timeout.
+        for i in range(retry_count):
             stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
             if not stats.rpcs_by_method:
                 raise ValueError(
                     'stats.rpcs_by_method is None, the interop client stats service does not support this test case'
                 )
+            logger.info('attempt %d', i)
             if compare_expected_instances(stats, expected_instances):
                 logger.info("success")
-                return
-        raise Exception(
-            'timeout waiting for RPCs to the expected instances: %s' %
-            expected_instances)
+                break
+            elif i == retry_count - 1:
+                raise Exception(
+                    'timeout waiting for RPCs to the expected instances: %s'
+                    % expected_instances)
     finally:
         patch_url_map_backend_service(gcp, original_backend_service)
         patch_backend_service(gcp, alternate_backend_service, [])
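Both the path-matching and header-matching hunks poll compare_expected_instances(stats, expected_instances). An assumed shape of that check, sketched with plain dicts in place of the proto stats object (names and structure are assumptions):

    def compare_expected_instances(rpcs_by_method, expected_instances):
        # For each RPC method, the set of peers that received traffic must
        # match the expected backend set exactly.
        for rpc_type, expected_peers in expected_instances.items():
            got_peers = set(rpcs_by_method.get(rpc_type, {}))
            if got_peers != set(expected_peers):
                return False
        return True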
@@ -1693,7 +1684,7 @@ def test_circuit_breaking(gcp, original_backend_service, instance_group,
     '''
     Since backend service circuit_breakers configuration cannot be unset,
     which causes trouble for restoring validate_for_proxy flag in target
-    proxy/global forwarding rule. This test uses dedicated backend services.
+    proxy/global forwarding rule. This test uses dedicated backend sevices.

     The url_map and backend services undergoes the following state changes:

     Before test:
@@ -1920,62 +1911,53 @@ def test_timeout(gcp, original_backend_service, instance_group):
     ]

     try:
-        deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
-        logger.info(
-            'Attempting for %d seconds until received the expected distribution',
-            _WAIT_FOR_URL_MAP_PATCH_SEC)
-        attempt_counter = 0
-        while True:
-            attempt_counter += 1
-            try:
-                for (testcase_name, client_config,
-                     expected_results) in test_cases:
-                    logger.info('starting case %s: attempt %d', testcase_name,
-                                attempt_counter)
-                    configure_client(**client_config)
-                    # wait a second to help ensure the client stops sending RPCs with
-                    # the old config. We will make multiple attempts if it is failing,
-                    # but this improves confidence that the test is valid if the
-                    # previous client_config would lead to the same results.
-                    time.sleep(1)
-                    before_stats = get_client_accumulated_stats()
-                    if not before_stats.stats_per_method:
-                        raise ValueError(
-                            'stats.stats_per_method is None, the interop client stats service does not support this test case'
-                        )
-                    logger.info('%s: attempt %d', testcase_name,
-                                attempt_counter)
-
-                    test_runtime_secs = 10
-                    time.sleep(test_runtime_secs)
-                    after_stats = get_client_accumulated_stats()
-
-                    success = True
-                    for rpc, status in expected_results.items():
-                        qty = (
-                            after_stats.stats_per_method[rpc].result[status] -
-                            before_stats.stats_per_method[rpc].result[status])
-                        want = test_runtime_secs * args.qps
-                        # Allow 10% deviation from expectation to reduce flakiness
-                        if qty < (want * .9) or qty > (want * 1.1):
-                            logger.info(
-                                '%s: failed due to %s[%s]: got %d want ~%d',
-                                testcase_name, rpc, status, qty, want)
-                            success = False
-                    if success:
-                        logger.info('success')
-                        return
-                    logger.info('%s attempt %d failed', testcase_name,
-                                attempt_counter)
-                raise RpcDistributionError(
-                    '%s: timeout waiting for expected results: %s; got %s' %
-                    (testcase_name, expected_results,
-                     after_stats.stats_per_method))
-            except RpcDistributionError as e:
-                if time.time() < deadline:
-                    pass
-                else:
-                    raise
+        first_case = True
+        for (testcase_name, client_config, expected_results) in test_cases:
+            logger.info('starting case %s', testcase_name)
+            configure_client(**client_config)
+            # wait a second to help ensure the client stops sending RPCs with
+            # the old config. We will make multiple attempts if it is failing,
+            # but this improves confidence that the test is valid if the
+            # previous client_config would lead to the same results.
+            time.sleep(1)
+            # Each attempt takes 10 seconds; 20 attempts is equivalent to 200
+            # second timeout.
+            attempt_count = 20
+            if first_case:
+                attempt_count = 120
+                first_case = False
+            before_stats = get_client_accumulated_stats()
+            if not before_stats.stats_per_method:
+                raise ValueError(
+                    'stats.stats_per_method is None, the interop client stats service does not support this test case'
+                )
+            for i in range(attempt_count):
+                logger.info('%s: attempt %d', testcase_name, i)
+
+                test_runtime_secs = 10
+                time.sleep(test_runtime_secs)
+                after_stats = get_client_accumulated_stats()
+
+                success = True
+                for rpc, status in expected_results.items():
+                    qty = (after_stats.stats_per_method[rpc].result[status] -
+                           before_stats.stats_per_method[rpc].result[status])
+                    want = test_runtime_secs * args.qps
+                    # Allow 10% deviation from expectation to reduce flakiness
+                    if qty < (want * .9) or qty > (want * 1.1):
+                        logger.info('%s: failed due to %s[%s]: got %d want ~%d',
+                                    testcase_name, rpc, status, qty, want)
+                        success = False
+                if success:
+                    logger.info('success')
+                    break
+                logger.info('%s attempt %d failed', testcase_name, i)
+                before_stats = after_stats
+            else:
+                raise Exception(
+                    '%s: timeout waiting for expected results: %s; got %s' %
+                    (testcase_name, expected_results,
+                     after_stats.stats_per_method))
     finally:
         patch_url_map_backend_service(gcp, original_backend_service)
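The restored loop ends in a for/else clause: the else body runs only when the loop completes without hitting break, which is exactly the timeout path here. A minimal illustration (hypothetical names):

    def retry(check, attempts):
        for i in range(attempts):
            if check():
                print('success on attempt %d' % i)
                break  # skips the else clause below
        else:
            # Runs only if no attempt succeeded (the loop never broke).
            raise Exception('failed after %d attempts' % attempts)

    retry(lambda: True, 3)  # prints: success on attempt 0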
@@ -2098,78 +2080,70 @@ def test_fault_injection(gcp, original_backend_service, instance_group):
     ]

     try:
-        deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
-        logger.info(
-            'Attempting for %d seconds until received the expected distribution',
-            _WAIT_FOR_URL_MAP_PATCH_SEC)
-        attempt_counter = 0
-        while True:
-            attempt_counter += 1
-            try:
-                for (testcase_name, client_config,
-                     expected_results) in test_cases:
-                    logger.info('starting case %s: attempt %d', testcase_name,
-                                attempt_counter)
-
-                    client_config['metadata'] = [
-                        (messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
-                         testcase_header, testcase_name)
-                    ]
-                    client_config['rpc_types'] = [
-                        messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
-                    ]
-                    configure_client(**client_config)
-                    # wait a second to help ensure the client stops sending RPCs with
-                    # the old config. We will make multiple attempts if it is failing,
-                    # but this improves confidence that the test is valid if the
-                    # previous client_config would lead to the same results.
-                    time.sleep(1)
-                    # Each attempt takes 10 seconds; 20 attempts is equivalent to 200
-                    # second timeout.
-                    before_stats = get_client_accumulated_stats()
-                    if not before_stats.stats_per_method:
-                        raise ValueError(
-                            'stats.stats_per_method is None, the interop client stats service does not support this test case'
-                        )
-                    logger.info('%s: attempt %d', testcase_name, i)
-                    test_runtime_secs = 10
-                    time.sleep(test_runtime_secs)
-                    after_stats = get_client_accumulated_stats()
-
-                    success = True
-                    for status, pct in expected_results.items():
-                        rpc = 'UNARY_CALL'
-                        qty = (
-                            after_stats.stats_per_method[rpc].result[status] -
-                            before_stats.stats_per_method[rpc].result[status])
-                        want = pct * args.qps * test_runtime_secs
-                        # Allow 10% deviation from expectation to reduce flakiness
-                        VARIANCE_ALLOWED = 0.1
-                        if abs(qty - want) > want * VARIANCE_ALLOWED:
-                            logger.info(
-                                '%s: failed due to %s[%s]: got %d want ~%d',
-                                testcase_name, rpc, status, qty, want)
-                            success = False
-                    if success:
-                        logger.info('success')
-                        break
-                    logger.info('%s attempt %d failed', testcase_name, i)
-                raise RpcDistributionError(
-                    '%s: timeout waiting for expected results: %s; got %s' %
-                    (testcase_name, expected_results,
-                     after_stats.stats_per_method))
-            except RpcDistributionError as e:
-                if time.time() < deadline:
-                    pass
-                else:
-                    raise
+        first_case = True
+        for (testcase_name, client_config, expected_results) in test_cases:
+            logger.info('starting case %s', testcase_name)
+
+            client_config['metadata'] = [
+                (messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
+                 testcase_header, testcase_name)
+            ]
+            client_config['rpc_types'] = [
+                messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
+            ]
+            configure_client(**client_config)
+            # wait a second to help ensure the client stops sending RPCs with
+            # the old config. We will make multiple attempts if it is failing,
+            # but this improves confidence that the test is valid if the
+            # previous client_config would lead to the same results.
+            time.sleep(1)
+            # Each attempt takes 10 seconds; 20 attempts is equivalent to 200
+            # second timeout.
+            attempt_count = 20
+            if first_case:
+                attempt_count = 120
+                first_case = False
+            before_stats = get_client_accumulated_stats()
+            if not before_stats.stats_per_method:
+                raise ValueError(
+                    'stats.stats_per_method is None, the interop client stats service does not support this test case'
+                )
+            for i in range(attempt_count):
+                logger.info('%s: attempt %d', testcase_name, i)
+
+                test_runtime_secs = 10
+                time.sleep(test_runtime_secs)
+                after_stats = get_client_accumulated_stats()
+                success = True
+                for status, pct in expected_results.items():
+                    rpc = 'UNARY_CALL'
+                    qty = (after_stats.stats_per_method[rpc].result[status] -
+                           before_stats.stats_per_method[rpc].result[status])
+                    want = pct * args.qps * test_runtime_secs
+                    # Allow 10% deviation from expectation to reduce flakiness
+                    VARIANCE_ALLOWED = 0.1
+                    if abs(qty - want) > want * VARIANCE_ALLOWED:
+                        logger.info('%s: failed due to %s[%s]: got %d want ~%d',
+                                    testcase_name, rpc, status, qty, want)
+                        success = False
+                if success:
+                    logger.info('success')
+                    break
+                logger.info('%s attempt %d failed', testcase_name, i)
+                before_stats = after_stats
+            else:
+                raise Exception(
+                    '%s: timeout waiting for expected results: %s; got %s' %
+                    (testcase_name, expected_results,
+                     after_stats.stats_per_method))
     finally:
         patch_url_map_backend_service(gcp, original_backend_service)
         set_validate_for_proxyless(gcp, True)


 def test_csds(gcp, original_backend_service, instance_group, server_uri):
+    test_csds_timeout_s = datetime.timedelta(minutes=5).total_seconds()
     sleep_interval_between_attempts_s = datetime.timedelta(
         seconds=2).total_seconds()
     logger.info('Running test_csds')
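The fault-injection assertion in the hunk above is a tolerance band around an expected RPC count. The arithmetic, worked through with hypothetical values (qps=100, a 10-second window, an expected 50% failure rate):

    pct, qps, test_runtime_secs = 0.5, 100, 10
    want = pct * qps * test_runtime_secs          # 500 RPCs expected with the status
    VARIANCE_ALLOWED = 0.1
    low, high = want * (1 - VARIANCE_ALLOWED), want * (1 + VARIANCE_ALLOWED)
    print(low, high)  # 450.0 550.0: any observed count in this band passes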
@@ -2177,9 +2151,10 @@ def test_csds(gcp, original_backend_service, instance_group, server_uri):
     logger.info('waiting for original backends to become healthy')
     wait_for_healthy_backends(gcp, original_backend_service, instance_group)

-    deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
+    # Test case timeout: 5 minutes
+    deadline = time.time() + test_csds_timeout_s
     cnt = 0
-    while time.time() < deadline:
+    while time.time() <= deadline:
         client_config = get_client_xds_config_dump()
         logger.info('test_csds attempt %d: received xDS config %s', cnt,
                     json.dumps(client_config, indent=2))
@@ -2251,7 +2226,7 @@ def test_csds(gcp, original_backend_service, instance_group, server_uri):
         cnt += 1

     raise RuntimeError('failed to receive a valid xDS config in %s seconds' %
-                       _WAIT_FOR_URL_MAP_PATCH_SEC)
+                       test_csds_timeout_s)


 def set_validate_for_proxyless(gcp, validate_for_proxyless):
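The restored test_csds shape is deadline polling with a fixed sleep between attempts. A self-contained sketch of that loop (get_config and is_valid are placeholders for the test's CSDS dump and validation logic):

    import time

    def poll_csds(get_config, is_valid, timeout_s=300, interval_s=2):
        # Poll the client's xDS config dump until it validates or the
        # 5-minute deadline passes.
        deadline = time.time() + timeout_s
        cnt = 0
        while time.time() <= deadline:
            config = get_config()
            if is_valid(config):
                return config
            cnt += 1
            time.sleep(interval_s)
        raise RuntimeError('failed to receive a valid xDS config in %s seconds'
                           % timeout_s)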
