Revert "[xDS interop] Updating the config update timeout to 600s (#26090)" (#26197)

This reverts commit 9bc421c6cf.
Branch: pull/26203/head
Author: Eric Gribkoff (committed 4 years ago via GitHub)
Parent: 2ee8c49af0
Commit: cf7f5a0b85
1 changed file: tools/run_tests/run_xds_tests.py (321 changes)

@@ -279,23 +279,14 @@ CLIENT_HOSTS = []
if args.client_hosts:
CLIENT_HOSTS = args.client_hosts.split(',')
# Each config propagation in the control plane should finish within 600s.
# Otherwise, it indicates a bug in the control plane. The config propagation
# includes all kinds of traffic config update, like updating urlMap, creating
# the resources for the first time, updating BackendService, and changing the
# status of endpoints in BackendService.
_WAIT_FOR_URL_MAP_PATCH_SEC = 600
# In general, fetching load balancing stats only takes ~10s. However, slow
# config update could lead to empty EDS or similar symptoms causing the
# connection to hang for a long period of time. So, we want to extend the stats
# wait time to be the same as urlMap patch time.
_WAIT_FOR_STATS_SEC = _WAIT_FOR_URL_MAP_PATCH_SEC
_DEFAULT_SERVICE_PORT = 80
_WAIT_FOR_BACKEND_SEC = args.wait_for_backend_sec
_WAIT_FOR_OPERATION_SEC = 1200
_INSTANCE_GROUP_SIZE = args.instance_group_size
_NUM_TEST_RPCS = 10 * args.qps
_WAIT_FOR_STATS_SEC = 360
_WAIT_FOR_VALID_CONFIG_SEC = 60
_WAIT_FOR_URL_MAP_PATCH_SEC = 300
_CONNECTION_TIMEOUT_SEC = 60
_GCP_API_RETRIES = 5
_BOOTSTRAP_TEMPLATE = """
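The reverted scheme keyed every wait on a single 600-second budget, while the restored code returns to separate, smaller constants. A minimal sketch of the deadline-based polling idiom the 600s scheme relied on follows; the helper name and poll interval here are illustrative, not helpers from run_xds_tests.py:

import time

_WAIT_FOR_URL_MAP_PATCH_SEC = 600


def wait_until(predicate, timeout_sec, poll_interval_sec=10):
    """Polls predicate() until it returns True or timeout_sec elapses."""
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(poll_interval_sec)
    return False

# Usage, with a hypothetical config check:
# if not wait_until(lambda: config_propagated(), _WAIT_FOR_URL_MAP_PATCH_SEC):
#     raise Exception('config did not propagate within %d seconds' %
#                     _WAIT_FOR_URL_MAP_PATCH_SEC)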
@@ -836,11 +827,8 @@ def test_round_robin(gcp, backend_service, instance_group):
# creating new backend resources for each individual test case.
# Each attempt takes 10 seconds. Config propagation can take several
# minutes.
deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
logger.info(
'Attempting for %d seconds until the expected distribution is received',
_WAIT_FOR_URL_MAP_PATCH_SEC)
while time.time() < deadline:
max_attempts = 40
for i in range(max_attempts):
stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
requests_received = [stats.rpcs_by_peer[x] for x in stats.rpcs_by_peer]
total_requests_received = sum(requests_received)
@@ -855,8 +843,7 @@ def test_round_robin(gcp, backend_service, instance_group):
'RPC peer distribution differs from expected by more than %d '
'for instance %s (%s)' % (threshold, instance, stats))
return
raise Exception('RPC failures persisted after %s seconds' %
_WAIT_FOR_URL_MAP_PATCH_SEC)
raise Exception('RPC failures persisted through %d retries' % max_attempts)
def test_secondary_locality_gets_no_requests_on_partial_primary_failure(
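This hunk shows the two retry styles side by side: the removed wall-clock deadline loop and the restored fixed attempt budget. An illustrative contrast, where the stats shapes and threshold are stand-ins (in the real test each get_stats() call itself blocks for roughly 10 seconds while RPCs accumulate):

import time

def within_threshold(stats_by_peer, expected_per_peer, threshold):
    """True if every peer's request count is within threshold of expected."""
    return all(abs(count - expected_per_peer) <= threshold
               for count in stats_by_peer.values())

def poll_with_deadline(get_stats, expected, threshold, timeout_sec):
    # Removed style: retry until a wall-clock deadline passes.
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        if within_threshold(get_stats(), expected, threshold):
            return
    raise Exception('distribution never converged in %d seconds' % timeout_sec)

def poll_with_attempts(get_stats, expected, threshold, max_attempts=40):
    # Restored style: a fixed attempt budget, ~10 seconds per attempt.
    for _ in range(max_attempts):
        if within_threshold(get_stats(), expected, threshold):
            return
    raise Exception('distribution never converged in %d attempts' %
                    max_attempts)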
@@ -1320,11 +1307,10 @@ def test_traffic_splitting(gcp, original_backend_service, instance_group,
_WAIT_FOR_STATS_SEC)
# Verify that weights between two services are expected.
deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
logger.info(
'Attempting for %d seconds until the expected distribution is received',
_WAIT_FOR_URL_MAP_PATCH_SEC)
while time.time() < deadline:
retry_count = 10
# Each attempt takes about 10 seconds, 10 retries is equivalent to 100
# seconds timeout.
for i in range(retry_count):
stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
got_instance_count = [
stats.rpcs_by_peer[i] for i in original_backend_instances
@@ -1338,15 +1324,18 @@ def test_traffic_splitting(gcp, original_backend_service, instance_group,
compare_distributions(got_instance_percentage,
expected_instance_percentage, 5)
except Exception as e:
logger.info('Got percentage: %s, expected percentage: %s',
got_instance_percentage,
logger.info('attempt %d', i)
logger.info('got percentage: %s', got_instance_percentage)
logger.info('expected percentage: %s',
expected_instance_percentage)
logger.info(e)
if i == retry_count - 1:
raise Exception(
'RPC distribution (%s) differs from expected (%s)' %
(got_instance_percentage, expected_instance_percentage))
else:
logger.info("success")
return
raise Exception('RPC distribution (%s) differs from expected (%s)' %
(got_instance_percentage, expected_instance_percentage))
break
finally:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, alternate_backend_service, [])
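Both sides of this hunk call compare_distributions(got, want, 5), whose body is not shown in the diff. A plausible sketch, assuming a flat per-instance tolerance in percentage points:

def compare_distributions(got_pct, want_pct, tolerance_pct):
    """Raises if any observed share deviates from expected by > tolerance."""
    for got, want in zip(got_pct, want_pct):
        if abs(got - want) > tolerance_pct:
            raise Exception('got %.1f%% want %.1f%% (tolerance %d%%)' %
                            (got, want, tolerance_pct))

# E.g. a 20/80 split observed as 23/77 passes under the 5-point tolerance:
compare_distributions([23.0, 77.0], [20.0, 80.0], 5)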
@@ -1470,22 +1459,23 @@ def test_path_matching(gcp, original_backend_service, instance_group,
original_backend_instances + alternate_backend_instances,
_WAIT_FOR_STATS_SEC)
deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
logger.info(
'Attempting for %d seconds until the expected distribution is received',
_WAIT_FOR_URL_MAP_PATCH_SEC)
while time.time() < deadline:
retry_count = 80
# Each attempt takes about 5 seconds, 80 retries is equivalent to 400
# seconds timeout.
for i in range(retry_count):
stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
if not stats.rpcs_by_method:
raise ValueError(
'stats.rpcs_by_method is None, the interop client stats service does not support this test case'
)
logger.info('attempt %d', i)
if compare_expected_instances(stats, expected_instances):
logger.info("success")
return
raise Exception(
'timeout waiting for RPCs to the expected instances: %s' %
expected_instances)
break
elif i == retry_count - 1:
raise Exception(
'timeout waiting for RPCs to the expected instances: %s'
% expected_instances)
finally:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, alternate_backend_service, [])
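The restored code signals timeout after the last retry in two equivalent ways: the explicit last-iteration check used in this hunk (elif i == retry_count - 1), and the for/else used later in test_timeout, where the else branch runs only if the loop never broke. A runnable demonstration; attempt_ok is a stand-in that succeeds on the fourth try:

retry_count = 80

def attempt_ok(i):
    return i >= 3  # stand-in: succeeds on the fourth attempt

for i in range(retry_count):
    if attempt_ok(i):
        break
    elif i == retry_count - 1:
        raise Exception('timeout waiting for RPCs after %d retries' %
                        retry_count)

for i in range(retry_count):
    if attempt_ok(i):
        break
else:  # no break: every attempt failed
    raise Exception('timeout waiting for RPCs after %d retries' % retry_count)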
@@ -1667,22 +1657,23 @@ def test_header_matching(gcp, original_backend_service, instance_group,
original_backend_instances + alternate_backend_instances,
_WAIT_FOR_STATS_SEC)
deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
logger.info(
'Attempting for %d seconds until the expected distribution is received',
_WAIT_FOR_URL_MAP_PATCH_SEC)
while time.time() < deadline:
retry_count = 80
# Each attempt takes about 5 seconds, 80 retries is equivalent to 400
# seconds timeout.
for i in range(retry_count):
stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
if not stats.rpcs_by_method:
raise ValueError(
'stats.rpcs_by_method is None, the interop client stats service does not support this test case'
)
logger.info('attempt %d', i)
if compare_expected_instances(stats, expected_instances):
logger.info("success")
return
raise Exception(
'timeout waiting for RPCs to the expected instances: %s' %
expected_instances)
break
elif i == retry_count - 1:
raise Exception(
'timeout waiting for RPCs to the expected instances: %s'
% expected_instances)
finally:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, alternate_backend_service, [])
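Both the path-matching and header-matching hunks gate success on compare_expected_instances(stats, expected_instances), which is not shown in this diff. A plausible sketch of what it checks, assuming stats.rpcs_by_method maps an RPC method name to a per-peer request-count mapping (as in the gRPC LoadBalancerStatsResponse proto):

def compare_expected_instances(rpcs_by_method, expected_instances):
    """True if, for each method, all RPCs landed on the expected instances."""
    for method, want_peers in expected_instances.items():
        got_peers = rpcs_by_method.get(method, {})
        if not got_peers:
            return False
        if any(peer not in want_peers for peer in got_peers):
            return False
    return True

# Usage with plain dicts standing in for the proto messages:
stats = {'UNARY_CALL': {'vm-a': 50, 'vm-b': 50}}
assert compare_expected_instances(stats, {'UNARY_CALL': ['vm-a', 'vm-b']})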
@@ -1693,7 +1684,7 @@ def test_circuit_breaking(gcp, original_backend_service, instance_group,
'''
Since the backend service circuit_breakers configuration cannot be unset,
which causes trouble for restoring the validate_for_proxy flag in the target
proxy/global forwarding rule, this test uses dedicated backend services.
The url_map and backend services undergo the following state changes:
Before test:
@@ -1920,62 +1911,53 @@ def test_timeout(gcp, original_backend_service, instance_group):
]
try:
deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
logger.info(
'Attempting for %d seconds until the expected distribution is received',
_WAIT_FOR_URL_MAP_PATCH_SEC)
attempt_counter = 0
while True:
attempt_counter += 1
try:
for (testcase_name, client_config,
expected_results) in test_cases:
logger.info('starting case %s: attempt %d', testcase_name,
attempt_counter)
configure_client(**client_config)
# wait a second to help ensure the client stops sending RPCs with
# the old config. We will make multiple attempts if it is failing,
# but this improves confidence that the test is valid if the
# previous client_config would lead to the same results.
time.sleep(1)
before_stats = get_client_accumulated_stats()
if not before_stats.stats_per_method:
raise ValueError(
'stats.stats_per_method is None, the interop client stats service does not support this test case'
)
logger.info('%s: attempt %d', testcase_name,
attempt_counter)
test_runtime_secs = 10
time.sleep(test_runtime_secs)
after_stats = get_client_accumulated_stats()
success = True
for rpc, status in expected_results.items():
qty = (
after_stats.stats_per_method[rpc].result[status] -
before_stats.stats_per_method[rpc].result[status])
want = test_runtime_secs * args.qps
# Allow 10% deviation from expectation to reduce flakiness
if qty < (want * .9) or qty > (want * 1.1):
logger.info(
'%s: failed due to %s[%s]: got %d want ~%d',
testcase_name, rpc, status, qty, want)
success = False
if success:
logger.info('success')
return
logger.info('%s attempt %d failed', testcase_name,
attempt_counter)
raise RpcDistributionError(
'%s: timeout waiting for expected results: %s; got %s' %
(testcase_name, expected_results,
after_stats.stats_per_method))
except RpcDistributionError as e:
if time.time() < deadline:
pass
else:
raise
first_case = True
for (testcase_name, client_config, expected_results) in test_cases:
logger.info('starting case %s', testcase_name)
configure_client(**client_config)
# wait a second to help ensure the client stops sending RPCs with
# the old config. We will make multiple attempts if it is failing,
# but this improves confidence that the test is valid if the
# previous client_config would lead to the same results.
time.sleep(1)
# Each attempt takes 10 seconds; 20 attempts is equivalent to 200
# second timeout.
attempt_count = 20
if first_case:
attempt_count = 120
first_case = False
before_stats = get_client_accumulated_stats()
if not before_stats.stats_per_method:
raise ValueError(
'stats.stats_per_method is None, the interop client stats service does not support this test case'
)
for i in range(attempt_count):
logger.info('%s: attempt %d', testcase_name, i)
test_runtime_secs = 10
time.sleep(test_runtime_secs)
after_stats = get_client_accumulated_stats()
success = True
for rpc, status in expected_results.items():
qty = (after_stats.stats_per_method[rpc].result[status] -
before_stats.stats_per_method[rpc].result[status])
want = test_runtime_secs * args.qps
# Allow 10% deviation from expectation to reduce flakiness
if qty < (want * .9) or qty > (want * 1.1):
logger.info('%s: failed due to %s[%s]: got %d want ~%d',
testcase_name, rpc, status, qty, want)
success = False
if success:
logger.info('success')
break
logger.info('%s attempt %d failed', testcase_name, i)
before_stats = after_stats
else:
raise Exception(
'%s: timeout waiting for expected results: %s; got %s' %
(testcase_name, expected_results,
after_stats.stats_per_method))
finally:
patch_url_map_backend_service(gcp, original_backend_service)
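Both versions of test_timeout share the same acceptance check: over a 10-second window, each (rpc, status) pair should accumulate roughly qps * 10 results, with 10% slack in either direction. A runnable sketch with illustrative numbers:

qps = 100
test_runtime_secs = 10

def result_count_ok(before_count, after_count, qps, runtime_secs):
    """True if the observed delta is within 10% of the expected RPC count."""
    qty = after_count - before_count
    want = qps * runtime_secs
    return want * 0.9 <= qty <= want * 1.1

assert result_count_ok(0, 1000, qps, test_runtime_secs)      # exactly on target
assert result_count_ok(0, 930, qps, test_runtime_secs)       # within 10%
assert not result_count_ok(0, 850, qps, test_runtime_secs)   # too few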
@@ -2098,78 +2080,70 @@ def test_fault_injection(gcp, original_backend_service, instance_group):
]
try:
deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
logger.info(
'Attempting for %d seconds until the expected distribution is received',
_WAIT_FOR_URL_MAP_PATCH_SEC)
attempt_counter = 0
while True:
attempt_counter += 1
try:
for (testcase_name, client_config,
expected_results) in test_cases:
logger.info('starting case %s: attempt %d', testcase_name,
attempt_counter)
client_config['metadata'] = [
(messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
testcase_header, testcase_name)
]
client_config['rpc_types'] = [
messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
]
configure_client(**client_config)
# wait a second to help ensure the client stops sending RPCs with
# the old config. We will make multiple attempts if it is failing,
# but this improves confidence that the test is valid if the
# previous client_config would lead to the same results.
time.sleep(1)
# Each attempt takes 10 seconds; 20 attempts is equivalent to 200
# second timeout.
before_stats = get_client_accumulated_stats()
if not before_stats.stats_per_method:
raise ValueError(
'stats.stats_per_method is None, the interop client stats service does not support this test case'
)
logger.info('%s: attempt %d', testcase_name, attempt_counter)
test_runtime_secs = 10
time.sleep(test_runtime_secs)
after_stats = get_client_accumulated_stats()
success = True
for status, pct in expected_results.items():
rpc = 'UNARY_CALL'
qty = (
after_stats.stats_per_method[rpc].result[status] -
before_stats.stats_per_method[rpc].result[status])
want = pct * args.qps * test_runtime_secs
# Allow 10% deviation from expectation to reduce flakiness
VARIANCE_ALLOWED = 0.1
if abs(qty - want) > want * VARIANCE_ALLOWED:
logger.info(
'%s: failed due to %s[%s]: got %d want ~%d',
testcase_name, rpc, status, qty, want)
success = False
if success:
logger.info('success')
break
logger.info('%s attempt %d failed', testcase_name, attempt_counter)
raise RpcDistributionError(
'%s: timeout waiting for expected results: %s; got %s' %
(testcase_name, expected_results,
after_stats.stats_per_method))
except RpcDistributionError as e:
if time.time() < deadline:
pass
else:
raise
first_case = True
for (testcase_name, client_config, expected_results) in test_cases:
logger.info('starting case %s', testcase_name)
client_config['metadata'] = [
(messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
testcase_header, testcase_name)
]
client_config['rpc_types'] = [
messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
]
configure_client(**client_config)
# wait a second to help ensure the client stops sending RPCs with
# the old config. We will make multiple attempts if it is failing,
# but this improves confidence that the test is valid if the
# previous client_config would lead to the same results.
time.sleep(1)
# Each attempt takes 10 seconds; 20 attempts is equivalent to 200
# second timeout.
attempt_count = 20
if first_case:
attempt_count = 120
first_case = False
before_stats = get_client_accumulated_stats()
if not before_stats.stats_per_method:
raise ValueError(
'stats.stats_per_method is None, the interop client stats service does not support this test case'
)
for i in range(attempt_count):
logger.info('%s: attempt %d', testcase_name, i)
test_runtime_secs = 10
time.sleep(test_runtime_secs)
after_stats = get_client_accumulated_stats()
success = True
for status, pct in expected_results.items():
rpc = 'UNARY_CALL'
qty = (after_stats.stats_per_method[rpc].result[status] -
before_stats.stats_per_method[rpc].result[status])
want = pct * args.qps * test_runtime_secs
# Allow 10% deviation from expectation to reduce flakiness
VARIANCE_ALLOWED = 0.1
if abs(qty - want) > want * VARIANCE_ALLOWED:
logger.info('%s: failed due to %s[%s]: got %d want ~%d',
testcase_name, rpc, status, qty, want)
success = False
if success:
logger.info('success')
break
logger.info('%s attempt %d failed', testcase_name, i)
before_stats = after_stats
else:
raise Exception(
'%s: timeout waiting for expected results: %s; got %s' %
(testcase_name, expected_results,
after_stats.stats_per_method))
finally:
patch_url_map_backend_service(gcp, original_backend_service)
set_validate_for_proxyless(gcp, True)
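The restored loops in test_timeout and test_fault_injection give the first test case a much larger attempt budget (120 attempts, roughly 1200 seconds, versus 20), presumably because the first case also absorbs the initial xDS config propagation while later cases wait only for an incremental update. A sketch of that budgeting; the case names are stand-ins:

test_cases = ['first_case', 'second_case', 'third_case']

first_case = True
for testcase_name in test_cases:
    # Each attempt takes ~10 seconds, so 20 attempts is a ~200 second budget.
    attempt_count = 20
    if first_case:
        attempt_count = 120  # ~1200 second budget for initial propagation
        first_case = False
    print(testcase_name, attempt_count)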
def test_csds(gcp, original_backend_service, instance_group, server_uri):
test_csds_timeout_s = datetime.timedelta(minutes=5).total_seconds()
sleep_interval_between_attempts_s = datetime.timedelta(
seconds=2).total_seconds()
logger.info('Running test_csds')
@@ -2177,9 +2151,10 @@ def test_csds(gcp, original_backend_service, instance_group, server_uri):
logger.info('waiting for original backends to become healthy')
wait_for_healthy_backends(gcp, original_backend_service, instance_group)
deadline = time.time() + _WAIT_FOR_URL_MAP_PATCH_SEC
# Test case timeout: 5 minutes
deadline = time.time() + test_csds_timeout_s
cnt = 0
while time.time() < deadline:
while time.time() <= deadline:
client_config = get_client_xds_config_dump()
logger.info('test_csds attempt %d: received xDS config %s', cnt,
json.dumps(client_config, indent=2))
@ -2251,7 +2226,7 @@ def test_csds(gcp, original_backend_service, instance_group, server_uri):
cnt += 1
raise RuntimeError('failed to receive a valid xDS config in %s seconds' %
_WAIT_FOR_URL_MAP_PATCH_SEC)
test_csds_timeout_s)
def set_validate_for_proxyless(gcp, validate_for_proxyless):
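The test_csds hunk above restores a self-contained budget: a 5-minute deadline with a 2-second sleep between attempts. A sketch of that polling shape, where fetch_config stands in for get_client_xds_config_dump and looks_valid for the per-resource validation the real loop performs:

import datetime
import time

test_csds_timeout_s = datetime.timedelta(minutes=5).total_seconds()
sleep_interval_between_attempts_s = 2

def poll_csds(fetch_config, looks_valid):
    deadline = time.time() + test_csds_timeout_s
    cnt = 0
    while time.time() <= deadline:
        config = fetch_config()
        if looks_valid(config):
            return config
        cnt += 1  # attempt counter, logged by the real loop
        time.sleep(sleep_interval_between_attempts_s)
    raise RuntimeError('failed to receive a valid xDS config in %s seconds' %
                       test_csds_timeout_s)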
