From cdcedbf401d0885c0ee32d92b06063ff410c8592 Mon Sep 17 00:00:00 2001 From: Eric Gribkoff Date: Tue, 11 Aug 2020 13:47:21 -0700 Subject: [PATCH] Merge pull request #23791 from ericgribkoff/unconditional_wait_for_propagation Deflake fail_on_failed_rpc xDS test cases --- tools/run_tests/run_xds_tests.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/run_tests/run_xds_tests.py b/tools/run_tests/run_xds_tests.py index 690806703b4..95241b55a82 100755 --- a/tools/run_tests/run_xds_tests.py +++ b/tools/run_tests/run_xds_tests.py @@ -1718,6 +1718,21 @@ try: metadata_to_send = '--metadata=""' if test_case in _TESTS_TO_FAIL_ON_RPC_FAILURE: + # TODO(ericgribkoff) Unconditional wait is recommended by TD + # team when reusing backend resources after config changes + # between test cases, as we are doing here. This should address + # flakiness issues with these tests; other attempts to deflake + # (such as waiting for the first successful RPC before failing + # on any subsequent failures) were insufficient because, due to + # propagation delays, we may initially see an RPC succeed to the + # expected backends, but only due to a stale configuration: e.g., test + # A (1) routes traffic to MIG A, then (2) switches to MIG B, + # then (3) back to MIG A. Test B begins running and sees RPCs + # going to MIG A, as expected. However, due to propagation + # delays, Test B is actually seeing the stale config from step + # (1), and then fails when it gets update (2) unexpectedly + # switching to MIG B. + time.sleep(200) fail_on_failed_rpc = '--fail_on_failed_rpc=true' else: fail_on_failed_rpc = '--fail_on_failed_rpc=false'