From cdcedbf401d0885c0ee32d92b06063ff410c8592 Mon Sep 17 00:00:00 2001
From: Eric Gribkoff <ericgribkoff@google.com>
Date: Tue, 11 Aug 2020 13:47:21 -0700
Subject: [PATCH] Merge pull request #23791 from
 ericgribkoff/unconditional_wait_for_propagation

Deflake fail_on_failed_rpc xDS test cases
---
 tools/run_tests/run_xds_tests.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tools/run_tests/run_xds_tests.py b/tools/run_tests/run_xds_tests.py
index 690806703b4..95241b55a82 100755
--- a/tools/run_tests/run_xds_tests.py
+++ b/tools/run_tests/run_xds_tests.py
@@ -1718,6 +1718,21 @@ try:
                 metadata_to_send = '--metadata=""'
 
             if test_case in _TESTS_TO_FAIL_ON_RPC_FAILURE:
+                # TODO(ericgribkoff) Unconditional wait is recommended by TD
+                # team when reusing backend resources after config changes
+                # between test cases, as we are doing here. This should address
+                # flakiness issues with these tests; other attempts to deflake
+                # (such as waiting for the first successful RPC before failing
+                # on any subsequent failures) were insufficient because, due to
+                # propagation delays, we may initially see an RPC succeed to the
+                # expected backends but due to a stale configuration: e.g., test
+                # A (1) routes traffic to MIG A, then (2) switches to MIG B,
+                # then (3) back to MIG A. Test B begins running and sees RPCs
+                # going to MIG A, as expected. However, due to propagation
+                # delays, Test B is actually seeing the stale config from step
+                # (1), and then fails when it gets update (2) unexpectedly
+                # switching to MIG B.
+                time.sleep(200)
                 fail_on_failed_rpc = '--fail_on_failed_rpc=true'
             else:
                 fail_on_failed_rpc = '--fail_on_failed_rpc=false'