test: ServerFailoverOpts can fail on heavily loaded systems due to its reliance on sleep and time. Try to harden it a little bit

4 months ago · 1e8f056b2c
parent 493a2a57f0
commit 1e8f056b2c
5 changed files with 218 additions and 74 deletions
--- a/test/ares-test-mock-ai.cc
+++ b/test/ares-test-mock-ai.cc
@ -723,20 +723,6 @@ class NoRotateMultiMockTestAI : public MockMultiServerChannelTestAI {
  NoRotateMultiMockTestAI() : MockMultiServerChannelTestAI(nullptr, ARES_OPT_NOROTATE) {}
 };
 class ServerFailoverOptsMockTestAI : public MockMultiServerChannelTestAI {
 public:
  ServerFailoverOptsMockTestAI()
    : MockMultiServerChannelTestAI(FillOptions(&opts_),
                                   ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {}
  static struct ares_options* FillOptions(struct ares_options *opts) {
    memset(opts, 0, sizeof(struct ares_options));
    opts->server_failover_opts.retry_chance = 1;
    opts->server_failover_opts.retry_delay = 250;
    return opts;
  }
 private:
  struct ares_options opts_;
 };
 TEST_P(NoRotateMultiMockTestAI, ThirdServer) {
  struct ares_options opts;
@ -789,10 +775,26 @@ TEST_P(NoRotateMultiMockTestAI, ThirdServer) {
  CheckExample();
 }
 #define SERVER_FAILOVER_RETRY_DELAY 750
 class ServerFailoverOptsMockTestAI : public MockMultiServerChannelTestAI {
 public:
  ServerFailoverOptsMockTestAI()
    : MockMultiServerChannelTestAI(FillOptions(&opts_),
                                   ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {}
  static struct ares_options* FillOptions(struct ares_options *opts) {
    memset(opts, 0, sizeof(struct ares_options));
    opts->server_failover_opts.retry_chance = 1;
    opts->server_failover_opts.retry_delay = SERVER_FAILOVER_RETRY_DELAY;
    return opts;
  }
 private:
  struct ares_options opts_;
 };
 // Test case to trigger server failover behavior. We use a retry chance of
-// 100% and a retry delay of 250ms so that we can test behavior reliably.
+// 100% and a retry delay so that we can test behavior reliably.
 TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) {
-  DNSPacket servfailrsp;
+ DNSPacket servfailrsp;
  servfailrsp.set_response().set_aa().set_rcode(SERVFAIL)
    .add_question(new DNSQuestion("www.example.com", T_A));
  DNSPacket okrsp;
@ -800,7 +802,12 @@ TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) {
    .add_question(new DNSQuestion("www.example.com", T_A))
    .add_answer(new DNSARR("www.example.com", 100, {2,3,4,5}));
  auto tv_begin = std::chrono::high_resolution_clock::now();
  auto tv_now   = std::chrono::high_resolution_clock::now();
  unsigned int delay_ms;
  // 1. If all servers are healthy, then the first server should be selected.
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: First server should be selected" << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &okrsp));
  CheckExample();
@ -808,16 +815,23 @@ TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) {
  // 2. Failed servers should be retried after the retry delay.
  //
  // Fail server #0 but leave server #1 as healthy.
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Server0 will fail but leave Server1 as healthy" << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &servfailrsp));
  EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[1].get(), &okrsp));
  CheckExample();
-  // Sleep for the retry delay (actually a little more than 250ms to account
+  // Sleep for the retry delay (actually a little more than the retry delay to account
  // for unreliable timing, e.g. NTP slew) and send in another query. Server #0
  // should be retried.
-  std::this_thread::sleep_for(std::chrono::milliseconds(260));
+  tv_now = std::chrono::high_resolution_clock::now();
  delay_ms = SERVER_FAILOVER_RETRY_DELAY + (SERVER_FAILOVER_RETRY_DELAY / 10);
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl;
  ares_sleep_time(delay_ms);
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Server0 should be past retry delay and should be tried again successfully" << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &okrsp));
  CheckExample();
@ -827,6 +841,8 @@ TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) {
  //
  // Fail all servers for the first round of tries. On the second round server
  // #1 responds successfully.
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: All 3 servers will fail on the first attempt. On second attempt, Server0 will fail, but Server1 will answer correctly." << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &servfailrsp))
    .WillOnce(SetReply(servers_[0].get(), &servfailrsp));
@ -840,20 +856,36 @@ TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) {
  // At this point the sorted servers look like [1] (f0) [2] (f1) [0] (f2).
  // Sleep for the retry delay and send in another query. Server #2 should be
  // retried first, and then server #0.
-  std::this_thread::sleep_for(std::chrono::milliseconds(260));
+  tv_now = std::chrono::high_resolution_clock::now();
  delay_ms = SERVER_FAILOVER_RETRY_DELAY + (SERVER_FAILOVER_RETRY_DELAY / 10);
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl;
  ares_sleep_time(delay_ms);
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Past retry delay, so will choose Server2 and Server0 that are down. Server2 will fail but Server0 will succeed." << std::endl;
  EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[2].get(), &servfailrsp));
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &okrsp));
  CheckExample();
  // Test might take a while to run and the sleep may not be accurate, so we
  // want to track this interval otherwise we may not pass the last test case
  // on slow systems.
  auto elapse_start = tv_now;
  // 4. If there are multiple failed servers, then servers which have not yet
  //    met the retry delay should be skipped.
  //
  // The sorted servers currently look like [0] (f0) [1] (f0) [2] (f2) and
  // server #2 has just been retried.
-  // Sleep for half the retry delay and trigger a failure on server #0.
+  // Sleep for 1/2 the retry delay and trigger a failure on server #0.
-  std::this_thread::sleep_for(std::chrono::milliseconds(130));
+  tv_now = std::chrono::high_resolution_clock::now();
  delay_ms = (SERVER_FAILOVER_RETRY_DELAY/2);
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl;
  ares_sleep_time(delay_ms);
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Retry delay has not been hit yet. Server0 was last successful, so should be tried first (and will fail), Server1 is also healthy so will respond." << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &servfailrsp));
  EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
@ -861,10 +893,22 @@ TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) {
  CheckExample();
  // The sorted servers now look like [1] (f0) [0] (f1) [2] (f2). Server #0
-  // has just failed whilst server #2 is halfway through the retry delay.
+  // has just failed whilst server #2 is somewhere in its retry delay.
-  // Sleep for another half the retry delay and check that server #2 is retried
+  // Sleep until we know server #2s retry delay has elapsed but Server #0 has
-  // whilst server #0 is not.
+  // not.
-  std::this_thread::sleep_for(std::chrono::milliseconds(130));
+  tv_now = std::chrono::high_resolution_clock::now();
  unsigned int elapsed_time = (unsigned int)std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - elapse_start).count();
  delay_ms = (SERVER_FAILOVER_RETRY_DELAY) + (SERVER_FAILOVER_RETRY_DELAY / 10);
  if (elapsed_time > delay_ms) {
    if (verbose) std::cerr << "elapsed duration " << elapsed_time << "ms greater than desired delay of " << delay_ms << "ms, not sleeping" << std::endl;
  } else {
    delay_ms -= elapsed_time; // subtract already elapsed time
    if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl;
    ares_sleep_time(delay_ms);
  }
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Retry delay has expired on Server2 but not Server0, will try on Server2 and fail, then Server1 will answer" << std::endl;
  EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[2].get(), &servfailrsp));
  EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
--- a/test/ares-test-mock-et.cc
+++ b/test/ares-test-mock-et.cc
@ -1250,20 +1250,6 @@ class NoRotateMultiMockEventThreadTest : public MockMultiServerEventThreadTest {
  NoRotateMultiMockEventThreadTest() : MockMultiServerEventThreadTest(nullptr, ARES_OPT_NOROTATE) {}
 };
 class ServerFailoverOptsMockEventThreadTest : public MockMultiServerEventThreadTest {
 public:
  ServerFailoverOptsMockEventThreadTest()
    : MockMultiServerEventThreadTest(FillOptions(&opts_),
                                     ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {}
  static struct ares_options* FillOptions(struct ares_options *opts) {
    memset(opts, 0, sizeof(struct ares_options));
    opts->server_failover_opts.retry_chance = 1;
    opts->server_failover_opts.retry_delay = 250;
    return opts;
  }
 private:
  struct ares_options opts_;
 };
 TEST_P(NoRotateMultiMockEventThreadTest, ThirdServer) {
  struct ares_options opts;
@ -1384,8 +1370,24 @@ TEST_P(NoRotateMultiMockEventThreadTest, ServerNoResponseFailover) {
  EXPECT_EQ("{'www.example.com' aliases=[] addrs=[2.3.4.5]}", ss4.str());
 }
 #define SERVER_FAILOVER_RETRY_DELAY 750
 class ServerFailoverOptsMockEventThreadTest : public MockMultiServerEventThreadTest {
 public:
  ServerFailoverOptsMockEventThreadTest()
    : MockMultiServerEventThreadTest(FillOptions(&opts_),
                                     ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {}
  static struct ares_options* FillOptions(struct ares_options *opts) {
    memset(opts, 0, sizeof(struct ares_options));
    opts->server_failover_opts.retry_chance = 1;
    opts->server_failover_opts.retry_delay = SERVER_FAILOVER_RETRY_DELAY;
    return opts;
  }
 private:
  struct ares_options opts_;
 };
 // Test case to trigger server failover behavior. We use a retry chance of
-// 100% and a retry delay of 250ms so that we can test behavior reliably.
+// 100% and a retry delay so that we can test behavior reliably.
 TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) {
  DNSPacket servfailrsp;
  servfailrsp.set_response().set_aa().set_rcode(SERVFAIL)
@ -1395,7 +1397,12 @@ TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) {
    .add_question(new DNSQuestion("www.example.com", T_A))
    .add_answer(new DNSARR("www.example.com", 100, {2,3,4,5}));
  auto tv_begin = std::chrono::high_resolution_clock::now();
  auto tv_now   = std::chrono::high_resolution_clock::now();
  unsigned int delay_ms;
  // 1. If all servers are healthy, then the first server should be selected.
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: First server should be selected" << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &okrsp));
  CheckExample();
@ -1403,16 +1410,23 @@ TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) {
  // 2. Failed servers should be retried after the retry delay.
  //
  // Fail server #0 but leave server #1 as healthy.
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Server0 will fail but leave Server1 as healthy" << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &servfailrsp));
  EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[1].get(), &okrsp));
  CheckExample();
-  // Sleep for the retry delay (actually a little more than 250ms to account
+  // Sleep for the retry delay (actually a little more than the retry delay to account
  // for unreliable timing, e.g. NTP slew) and send in another query. Server #0
  // should be retried.
-  std::this_thread::sleep_for(std::chrono::milliseconds(260));
+  tv_now = std::chrono::high_resolution_clock::now();
  delay_ms = SERVER_FAILOVER_RETRY_DELAY + (SERVER_FAILOVER_RETRY_DELAY / 10);
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl;
  ares_sleep_time(delay_ms);
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Server0 should be past retry delay and should be tried again successfully" << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &okrsp));
  CheckExample();
@ -1422,6 +1436,8 @@ TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) {
  //
  // Fail all servers for the first round of tries. On the second round server
  // #1 responds successfully.
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: All 3 servers will fail on the first attempt. On second attempt, Server0 will fail, but Server1 will answer correctly." << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &servfailrsp))
    .WillOnce(SetReply(servers_[0].get(), &servfailrsp));
@ -1435,20 +1451,36 @@ TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) {
  // At this point the sorted servers look like [1] (f0) [2] (f1) [0] (f2).
  // Sleep for the retry delay and send in another query. Server #2 should be
  // retried first, and then server #0.
-  std::this_thread::sleep_for(std::chrono::milliseconds(260));
+  tv_now = std::chrono::high_resolution_clock::now();
  delay_ms = SERVER_FAILOVER_RETRY_DELAY + (SERVER_FAILOVER_RETRY_DELAY / 10);
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl;
  ares_sleep_time(delay_ms);
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Past retry delay, so will choose Server2 and Server0 that are down. Server2 will fail but Server0 will succeed." << std::endl;
  EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[2].get(), &servfailrsp));
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &okrsp));
  CheckExample();
  // Test might take a while to run and the sleep may not be accurate, so we
  // want to track this interval otherwise we may not pass the last test case
  // on slow systems.
  auto elapse_start = tv_now;
  // 4. If there are multiple failed servers, then servers which have not yet
  //    met the retry delay should be skipped.
  //
  // The sorted servers currently look like [0] (f0) [1] (f0) [2] (f2) and
  // server #2 has just been retried.
-  // Sleep for half the retry delay and trigger a failure on server #0.
+  // Sleep for 1/2 the retry delay and trigger a failure on server #0.
-  std::this_thread::sleep_for(std::chrono::milliseconds(130));
+  tv_now = std::chrono::high_resolution_clock::now();
  delay_ms = (SERVER_FAILOVER_RETRY_DELAY/2);
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl;
  ares_sleep_time(delay_ms);
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Retry delay has not been hit yet. Server0 was last successful, so should be tried first (and will fail), Server1 is also healthy so will respond." << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &servfailrsp));
  EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
@ -1456,10 +1488,22 @@ TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) {
  CheckExample();
  // The sorted servers now look like [1] (f0) [0] (f1) [2] (f2). Server #0
-  // has just failed whilst server #2 is halfway through the retry delay.
+  // has just failed whilst server #2 is somewhere in its retry delay.
-  // Sleep for another half the retry delay and check that server #2 is retried
+  // Sleep until we know server #2s retry delay has elapsed but Server #0 has
-  // whilst server #0 is not.
+  // not.
-  std::this_thread::sleep_for(std::chrono::milliseconds(130));
+  tv_now = std::chrono::high_resolution_clock::now();
  unsigned int elapsed_time = (unsigned int)std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - elapse_start).count();
  delay_ms = (SERVER_FAILOVER_RETRY_DELAY) + (SERVER_FAILOVER_RETRY_DELAY / 10);
  if (elapsed_time > delay_ms) {
    if (verbose) std::cerr << "elapsed duration " << elapsed_time << "ms greater than desired delay of " << delay_ms << "ms, not sleeping" << std::endl;
  } else {
    delay_ms -= elapsed_time; // subtract already elapsed time
    if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl;
    ares_sleep_time(delay_ms);
  }
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Retry delay has expired on Server2 but not Server0, will try on Server2 and fail, then Server1 will answer" << std::endl;
  EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[2].get(), &servfailrsp));
  EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
--- a/test/ares-test-mock.cc
+++ b/test/ares-test-mock.cc
@ -1559,20 +1559,6 @@ class NoRotateMultiMockTest : public MockMultiServerChannelTest {
  NoRotateMultiMockTest() : MockMultiServerChannelTest(nullptr, ARES_OPT_NOROTATE) {}
 };
 class ServerFailoverOptsMultiMockTest : public MockMultiServerChannelTest {
 public:
  ServerFailoverOptsMultiMockTest()
    : MockMultiServerChannelTest(FillOptions(&opts_),
                                 ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {}
  static struct ares_options* FillOptions(struct ares_options *opts) {
    memset(opts, 0, sizeof(struct ares_options));
    opts->server_failover_opts.retry_chance = 1;
    opts->server_failover_opts.retry_delay = 250;
    return opts;
  }
 private:
  struct ares_options opts_;
 };
 TEST_P(NoRotateMultiMockTest, ThirdServer) {
  struct ares_options opts;
@ -1693,10 +1679,26 @@ TEST_P(NoRotateMultiMockTest, ServerNoResponseFailover) {
  EXPECT_EQ("{'www.example.com' aliases=[] addrs=[2.3.4.5]}", ss4.str());
 }
 #define SERVER_FAILOVER_RETRY_DELAY 750
 class ServerFailoverOptsMultiMockTest : public MockMultiServerChannelTest {
 public:
  ServerFailoverOptsMultiMockTest()
    : MockMultiServerChannelTest(FillOptions(&opts_),
                                 ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {}
  static struct ares_options* FillOptions(struct ares_options *opts) {
    memset(opts, 0, sizeof(struct ares_options));
    opts->server_failover_opts.retry_chance = 1;
    opts->server_failover_opts.retry_delay = SERVER_FAILOVER_RETRY_DELAY;
    return opts;
  }
 private:
  struct ares_options opts_;
 };
 // Test case to trigger server failover behavior. We use a retry chance of
-// 100% and a retry delay of 250ms so that we can test behavior reliably.
+// 100% and a retry delay so that we can test behavior reliably.
 TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) {
-  DNSPacket servfailrsp;
+ DNSPacket servfailrsp;
  servfailrsp.set_response().set_aa().set_rcode(SERVFAIL)
    .add_question(new DNSQuestion("www.example.com", T_A));
  DNSPacket okrsp;
@ -1704,7 +1706,12 @@ TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) {
    .add_question(new DNSQuestion("www.example.com", T_A))
    .add_answer(new DNSARR("www.example.com", 100, {2,3,4,5}));
  auto tv_begin = std::chrono::high_resolution_clock::now();
  auto tv_now   = std::chrono::high_resolution_clock::now();
  unsigned int delay_ms;
  // 1. If all servers are healthy, then the first server should be selected.
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: First server should be selected" << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &okrsp));
  CheckExample();
@ -1712,16 +1719,23 @@ TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) {
  // 2. Failed servers should be retried after the retry delay.
  //
  // Fail server #0 but leave server #1 as healthy.
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Server0 will fail but leave Server1 as healthy" << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &servfailrsp));
  EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[1].get(), &okrsp));
  CheckExample();
-  // Sleep for the retry delay (actually a little more than 250ms to account
+  // Sleep for the retry delay (actually a little more than the retry delay to account
  // for unreliable timing, e.g. NTP slew) and send in another query. Server #0
  // should be retried.
-  std::this_thread::sleep_for(std::chrono::milliseconds(260));
+  tv_now = std::chrono::high_resolution_clock::now();
  delay_ms = SERVER_FAILOVER_RETRY_DELAY + (SERVER_FAILOVER_RETRY_DELAY / 10);
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl;
  ares_sleep_time(delay_ms);
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Server0 should be past retry delay and should be tried again successfully" << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &okrsp));
  CheckExample();
@ -1731,6 +1745,8 @@ TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) {
  //
  // Fail all servers for the first round of tries. On the second round server
  // #1 responds successfully.
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: All 3 servers will fail on the first attempt. On second attempt, Server0 will fail, but Server1 will answer correctly." << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &servfailrsp))
    .WillOnce(SetReply(servers_[0].get(), &servfailrsp));
@ -1744,20 +1760,36 @@ TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) {
  // At this point the sorted servers look like [1] (f0) [2] (f1) [0] (f2).
  // Sleep for the retry delay and send in another query. Server #2 should be
  // retried first, and then server #0.
-  std::this_thread::sleep_for(std::chrono::milliseconds(260));
+  tv_now = std::chrono::high_resolution_clock::now();
  delay_ms = SERVER_FAILOVER_RETRY_DELAY + (SERVER_FAILOVER_RETRY_DELAY / 10);
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl;
  ares_sleep_time(delay_ms);
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Past retry delay, so will choose Server2 and Server0 that are down. Server2 will fail but Server0 will succeed." << std::endl;
  EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[2].get(), &servfailrsp));
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &okrsp));
  CheckExample();
  // Test might take a while to run and the sleep may not be accurate, so we
  // want to track this interval otherwise we may not pass the last test case
  // on slow systems.
  auto elapse_start = tv_now;
  // 4. If there are multiple failed servers, then servers which have not yet
  //    met the retry delay should be skipped.
  //
  // The sorted servers currently look like [0] (f0) [1] (f0) [2] (f2) and
  // server #2 has just been retried.
-  // Sleep for half the retry delay and trigger a failure on server #0.
+  // Sleep for 1/2 the retry delay and trigger a failure on server #0.
-  std::this_thread::sleep_for(std::chrono::milliseconds(130));
+  tv_now = std::chrono::high_resolution_clock::now();
  delay_ms = (SERVER_FAILOVER_RETRY_DELAY/2);
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl;
  ares_sleep_time(delay_ms);
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Retry delay has not been hit yet. Server0 was last successful, so should be tried first (and will fail), Server1 is also healthy so will respond." << std::endl;
  EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[0].get(), &servfailrsp));
  EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
@ -1765,10 +1797,22 @@ TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) {
  CheckExample();
  // The sorted servers now look like [1] (f0) [0] (f1) [2] (f2). Server #0
-  // has just failed whilst server #2 is halfway through the retry delay.
+  // has just failed whilst server #2 is somewhere in its retry delay.
-  // Sleep for another half the retry delay and check that server #2 is retried
+  // Sleep until we know server #2s retry delay has elapsed but Server #0 has
-  // whilst server #0 is not.
+  // not.
-  std::this_thread::sleep_for(std::chrono::milliseconds(130));
+  tv_now = std::chrono::high_resolution_clock::now();
  unsigned int elapsed_time = (unsigned int)std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - elapse_start).count();
  delay_ms = (SERVER_FAILOVER_RETRY_DELAY) + (SERVER_FAILOVER_RETRY_DELAY / 10);
  if (elapsed_time > delay_ms) {
    if (verbose) std::cerr << "elapsed duration " << elapsed_time << "ms greater than desired delay of " << delay_ms << "ms, not sleeping" << std::endl;
  } else {
    delay_ms -= elapsed_time; // subtract already elapsed time
    if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl;
    ares_sleep_time(delay_ms);
  }
  tv_now = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << std::chrono::duration_cast<std::chrono::milliseconds>(tv_now - tv_begin).count() << "ms: Retry delay has expired on Server2 but not Server0, will try on Server2 and fail, then Server1 will answer" << std::endl;
  EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
    .WillOnce(SetReply(servers_[2].get(), &servfailrsp));
  EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
--- a/test/ares-test.cc
+++ b/test/ares-test.cc
@ -249,6 +249,16 @@ unsigned long long LibraryTest::fails_ = 0;
 std::map<size_t, int> LibraryTest::size_fails_;
 std::mutex            LibraryTest::lock_;
 void ares_sleep_time(unsigned int ms)
 {
  auto duration   = std::chrono::milliseconds(ms);
  auto start_time = std::chrono::high_resolution_clock::now();
  auto wake_time  = start_time + duration;
  std::this_thread::sleep_until(wake_time);
  auto end_time   = std::chrono::high_resolution_clock::now();
  if (verbose) std::cerr << "sleep requested " << ms << "ms, slept for " << std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count() << "ms" << std::endl;
 }
 void ProcessWork(ares_channel_t *channel,
                 std::function<std::set<ares_socket_t>()> get_extrafds,
                 std::function<void(ares_socket_t)> process_extra,
--- a/test/ares-test.h
+++ b/test/ares-test.h
@ -88,6 +88,8 @@ extern std::vector<std::tuple<ares_evsys_t, int>>       evsys_families;
 extern std::vector<std::pair<int, bool>>                families_modes;
 extern std::vector<std::tuple<ares_evsys_t, int, bool>> evsys_families_modes;
 // Hopefully a more accurate sleep than sleep_for()
 void ares_sleep_time(unsigned int ms);
 // Process all pending work on ares-owned file descriptors, plus
 // optionally the given set-of-FDs + work function.