From 1e8f056b2c76121cc2650a4ddaaef7f42cdbe5a4 Mon Sep 17 00:00:00 2001 From: Brad House Date: Tue, 23 Jul 2024 13:14:10 -0400 Subject: [PATCH] test: ServerFailoverOpts can fail on heavily loaded systems due to its reliance on sleep and time. Try to harden it a little bit --- test/ares-test-mock-ai.cc | 94 ++++++++++++++++++++++++++++----------- test/ares-test-mock-et.cc | 92 ++++++++++++++++++++++++++++---------- test/ares-test-mock.cc | 94 ++++++++++++++++++++++++++++----------- test/ares-test.cc | 10 +++++ test/ares-test.h | 2 + 5 files changed, 218 insertions(+), 74 deletions(-) diff --git a/test/ares-test-mock-ai.cc b/test/ares-test-mock-ai.cc index b52850b0..d5fd0619 100644 --- a/test/ares-test-mock-ai.cc +++ b/test/ares-test-mock-ai.cc @@ -723,20 +723,6 @@ class NoRotateMultiMockTestAI : public MockMultiServerChannelTestAI { NoRotateMultiMockTestAI() : MockMultiServerChannelTestAI(nullptr, ARES_OPT_NOROTATE) {} }; -class ServerFailoverOptsMockTestAI : public MockMultiServerChannelTestAI { - public: - ServerFailoverOptsMockTestAI() - : MockMultiServerChannelTestAI(FillOptions(&opts_), - ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {} - static struct ares_options* FillOptions(struct ares_options *opts) { - memset(opts, 0, sizeof(struct ares_options)); - opts->server_failover_opts.retry_chance = 1; - opts->server_failover_opts.retry_delay = 250; - return opts; - } - private: - struct ares_options opts_; -}; TEST_P(NoRotateMultiMockTestAI, ThirdServer) { struct ares_options opts; @@ -789,10 +775,26 @@ TEST_P(NoRotateMultiMockTestAI, ThirdServer) { CheckExample(); } +#define SERVER_FAILOVER_RETRY_DELAY 750 +class ServerFailoverOptsMockTestAI : public MockMultiServerChannelTestAI { + public: + ServerFailoverOptsMockTestAI() + : MockMultiServerChannelTestAI(FillOptions(&opts_), + ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {} + static struct ares_options* FillOptions(struct ares_options *opts) { + memset(opts, 0, sizeof(struct ares_options)); + opts->server_failover_opts.retry_chance = 1; + opts->server_failover_opts.retry_delay = SERVER_FAILOVER_RETRY_DELAY; + return opts; + } + private: + struct ares_options opts_; +}; + // Test case to trigger server failover behavior. We use a retry chance of -// 100% and a retry delay of 250ms so that we can test behavior reliably. +// 100% and a retry delay so that we can test behavior reliably. TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) { - DNSPacket servfailrsp; + DNSPacket servfailrsp; servfailrsp.set_response().set_aa().set_rcode(SERVFAIL) .add_question(new DNSQuestion("www.example.com", T_A)); DNSPacket okrsp; @@ -800,7 +802,12 @@ TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) { .add_question(new DNSQuestion("www.example.com", T_A)) .add_answer(new DNSARR("www.example.com", 100, {2,3,4,5})); + auto tv_begin = std::chrono::high_resolution_clock::now(); + auto tv_now = std::chrono::high_resolution_clock::now(); + unsigned int delay_ms; + // 1. If all servers are healthy, then the first server should be selected. + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: First server should be selected" << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &okrsp)); CheckExample(); @@ -808,16 +815,23 @@ TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) { // 2. Failed servers should be retried after the retry delay. // // Fail server #0 but leave server #1 as healthy. + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Server0 will fail but leave Server1 as healthy" << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &servfailrsp)); EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[1].get(), &okrsp)); CheckExample(); - // Sleep for the retry delay (actually a little more than 250ms to account + // Sleep for the retry delay (actually a little more than the retry delay to account // for unreliable timing, e.g. NTP slew) and send in another query. Server #0 // should be retried. - std::this_thread::sleep_for(std::chrono::milliseconds(260)); + tv_now = std::chrono::high_resolution_clock::now(); + delay_ms = SERVER_FAILOVER_RETRY_DELAY + (SERVER_FAILOVER_RETRY_DELAY / 10); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl; + ares_sleep_time(delay_ms); + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Server0 should be past retry delay and should be tried again successfully" << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &okrsp)); CheckExample(); @@ -827,6 +841,8 @@ TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) { // // Fail all servers for the first round of tries. On the second round server // #1 responds successfully. + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: All 3 servers will fail on the first attempt. On second attempt, Server0 will fail, but Server1 will answer correctly." << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &servfailrsp)) .WillOnce(SetReply(servers_[0].get(), &servfailrsp)); @@ -840,20 +856,36 @@ TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) { // At this point the sorted servers look like [1] (f0) [2] (f1) [0] (f2). // Sleep for the retry delay and send in another query. Server #2 should be // retried first, and then server #0. - std::this_thread::sleep_for(std::chrono::milliseconds(260)); + tv_now = std::chrono::high_resolution_clock::now(); + delay_ms = SERVER_FAILOVER_RETRY_DELAY + (SERVER_FAILOVER_RETRY_DELAY / 10); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl; + ares_sleep_time(delay_ms); + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Past retry delay, so will choose Server2 and Server0 that are down. Server2 will fail but Server0 will succeed." << std::endl; EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[2].get(), &servfailrsp)); EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &okrsp)); CheckExample(); + // Test might take a while to run and the sleep may not be accurate, so we + // want to track this interval otherwise we may not pass the last test case + // on slow systems. + auto elapse_start = tv_now; + // 4. If there are multiple failed servers, then servers which have not yet // met the retry delay should be skipped. // // The sorted servers currently look like [0] (f0) [1] (f0) [2] (f2) and // server #2 has just been retried. - // Sleep for half the retry delay and trigger a failure on server #0. - std::this_thread::sleep_for(std::chrono::milliseconds(130)); + // Sleep for 1/2 the retry delay and trigger a failure on server #0. + tv_now = std::chrono::high_resolution_clock::now(); + delay_ms = (SERVER_FAILOVER_RETRY_DELAY/2); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl; + ares_sleep_time(delay_ms); + tv_now = std::chrono::high_resolution_clock::now(); + + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Retry delay has not been hit yet. Server0 was last successful, so should be tried first (and will fail), Server1 is also healthy so will respond." << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &servfailrsp)); EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A)) @@ -861,10 +893,22 @@ TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) { CheckExample(); // The sorted servers now look like [1] (f0) [0] (f1) [2] (f2). Server #0 - // has just failed whilst server #2 is halfway through the retry delay. - // Sleep for another half the retry delay and check that server #2 is retried - // whilst server #0 is not. - std::this_thread::sleep_for(std::chrono::milliseconds(130)); + // has just failed whilst server #2 is somewhere in its retry delay. + // Sleep until we know server #2s retry delay has elapsed but Server #0 has + // not. + tv_now = std::chrono::high_resolution_clock::now(); + + unsigned int elapsed_time = (unsigned int)std::chrono::duration_cast(tv_now - elapse_start).count(); + delay_ms = (SERVER_FAILOVER_RETRY_DELAY) + (SERVER_FAILOVER_RETRY_DELAY / 10); + if (elapsed_time > delay_ms) { + if (verbose) std::cerr << "elapsed duration " << elapsed_time << "ms greater than desired delay of " << delay_ms << "ms, not sleeping" << std::endl; + } else { + delay_ms -= elapsed_time; // subtract already elapsed time + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl; + ares_sleep_time(delay_ms); + } + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Retry delay has expired on Server2 but not Server0, will try on Server2 and fail, then Server1 will answer" << std::endl; EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[2].get(), &servfailrsp)); EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A)) diff --git a/test/ares-test-mock-et.cc b/test/ares-test-mock-et.cc index fdbab032..59843105 100644 --- a/test/ares-test-mock-et.cc +++ b/test/ares-test-mock-et.cc @@ -1250,20 +1250,6 @@ class NoRotateMultiMockEventThreadTest : public MockMultiServerEventThreadTest { NoRotateMultiMockEventThreadTest() : MockMultiServerEventThreadTest(nullptr, ARES_OPT_NOROTATE) {} }; -class ServerFailoverOptsMockEventThreadTest : public MockMultiServerEventThreadTest { - public: - ServerFailoverOptsMockEventThreadTest() - : MockMultiServerEventThreadTest(FillOptions(&opts_), - ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {} - static struct ares_options* FillOptions(struct ares_options *opts) { - memset(opts, 0, sizeof(struct ares_options)); - opts->server_failover_opts.retry_chance = 1; - opts->server_failover_opts.retry_delay = 250; - return opts; - } - private: - struct ares_options opts_; -}; TEST_P(NoRotateMultiMockEventThreadTest, ThirdServer) { struct ares_options opts; @@ -1384,8 +1370,24 @@ TEST_P(NoRotateMultiMockEventThreadTest, ServerNoResponseFailover) { EXPECT_EQ("{'www.example.com' aliases=[] addrs=[2.3.4.5]}", ss4.str()); } +#define SERVER_FAILOVER_RETRY_DELAY 750 +class ServerFailoverOptsMockEventThreadTest : public MockMultiServerEventThreadTest { + public: + ServerFailoverOptsMockEventThreadTest() + : MockMultiServerEventThreadTest(FillOptions(&opts_), + ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {} + static struct ares_options* FillOptions(struct ares_options *opts) { + memset(opts, 0, sizeof(struct ares_options)); + opts->server_failover_opts.retry_chance = 1; + opts->server_failover_opts.retry_delay = SERVER_FAILOVER_RETRY_DELAY; + return opts; + } + private: + struct ares_options opts_; +}; + // Test case to trigger server failover behavior. We use a retry chance of -// 100% and a retry delay of 250ms so that we can test behavior reliably. +// 100% and a retry delay so that we can test behavior reliably. TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) { DNSPacket servfailrsp; servfailrsp.set_response().set_aa().set_rcode(SERVFAIL) @@ -1395,7 +1397,12 @@ TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) { .add_question(new DNSQuestion("www.example.com", T_A)) .add_answer(new DNSARR("www.example.com", 100, {2,3,4,5})); + auto tv_begin = std::chrono::high_resolution_clock::now(); + auto tv_now = std::chrono::high_resolution_clock::now(); + unsigned int delay_ms; + // 1. If all servers are healthy, then the first server should be selected. + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: First server should be selected" << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &okrsp)); CheckExample(); @@ -1403,16 +1410,23 @@ TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) { // 2. Failed servers should be retried after the retry delay. // // Fail server #0 but leave server #1 as healthy. + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Server0 will fail but leave Server1 as healthy" << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &servfailrsp)); EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[1].get(), &okrsp)); CheckExample(); - // Sleep for the retry delay (actually a little more than 250ms to account + // Sleep for the retry delay (actually a little more than the retry delay to account // for unreliable timing, e.g. NTP slew) and send in another query. Server #0 // should be retried. - std::this_thread::sleep_for(std::chrono::milliseconds(260)); + tv_now = std::chrono::high_resolution_clock::now(); + delay_ms = SERVER_FAILOVER_RETRY_DELAY + (SERVER_FAILOVER_RETRY_DELAY / 10); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl; + ares_sleep_time(delay_ms); + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Server0 should be past retry delay and should be tried again successfully" << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &okrsp)); CheckExample(); @@ -1422,6 +1436,8 @@ TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) { // // Fail all servers for the first round of tries. On the second round server // #1 responds successfully. + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: All 3 servers will fail on the first attempt. On second attempt, Server0 will fail, but Server1 will answer correctly." << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &servfailrsp)) .WillOnce(SetReply(servers_[0].get(), &servfailrsp)); @@ -1435,20 +1451,36 @@ TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) { // At this point the sorted servers look like [1] (f0) [2] (f1) [0] (f2). // Sleep for the retry delay and send in another query. Server #2 should be // retried first, and then server #0. - std::this_thread::sleep_for(std::chrono::milliseconds(260)); + tv_now = std::chrono::high_resolution_clock::now(); + delay_ms = SERVER_FAILOVER_RETRY_DELAY + (SERVER_FAILOVER_RETRY_DELAY / 10); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl; + ares_sleep_time(delay_ms); + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Past retry delay, so will choose Server2 and Server0 that are down. Server2 will fail but Server0 will succeed." << std::endl; EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[2].get(), &servfailrsp)); EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &okrsp)); CheckExample(); + // Test might take a while to run and the sleep may not be accurate, so we + // want to track this interval otherwise we may not pass the last test case + // on slow systems. + auto elapse_start = tv_now; + // 4. If there are multiple failed servers, then servers which have not yet // met the retry delay should be skipped. // // The sorted servers currently look like [0] (f0) [1] (f0) [2] (f2) and // server #2 has just been retried. - // Sleep for half the retry delay and trigger a failure on server #0. - std::this_thread::sleep_for(std::chrono::milliseconds(130)); + // Sleep for 1/2 the retry delay and trigger a failure on server #0. + tv_now = std::chrono::high_resolution_clock::now(); + delay_ms = (SERVER_FAILOVER_RETRY_DELAY/2); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl; + ares_sleep_time(delay_ms); + tv_now = std::chrono::high_resolution_clock::now(); + + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Retry delay has not been hit yet. Server0 was last successful, so should be tried first (and will fail), Server1 is also healthy so will respond." << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &servfailrsp)); EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A)) @@ -1456,10 +1488,22 @@ TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) { CheckExample(); // The sorted servers now look like [1] (f0) [0] (f1) [2] (f2). Server #0 - // has just failed whilst server #2 is halfway through the retry delay. - // Sleep for another half the retry delay and check that server #2 is retried - // whilst server #0 is not. - std::this_thread::sleep_for(std::chrono::milliseconds(130)); + // has just failed whilst server #2 is somewhere in its retry delay. + // Sleep until we know server #2s retry delay has elapsed but Server #0 has + // not. + tv_now = std::chrono::high_resolution_clock::now(); + + unsigned int elapsed_time = (unsigned int)std::chrono::duration_cast(tv_now - elapse_start).count(); + delay_ms = (SERVER_FAILOVER_RETRY_DELAY) + (SERVER_FAILOVER_RETRY_DELAY / 10); + if (elapsed_time > delay_ms) { + if (verbose) std::cerr << "elapsed duration " << elapsed_time << "ms greater than desired delay of " << delay_ms << "ms, not sleeping" << std::endl; + } else { + delay_ms -= elapsed_time; // subtract already elapsed time + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl; + ares_sleep_time(delay_ms); + } + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Retry delay has expired on Server2 but not Server0, will try on Server2 and fail, then Server1 will answer" << std::endl; EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[2].get(), &servfailrsp)); EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A)) diff --git a/test/ares-test-mock.cc b/test/ares-test-mock.cc index dba2b860..2b2f5fdb 100644 --- a/test/ares-test-mock.cc +++ b/test/ares-test-mock.cc @@ -1559,20 +1559,6 @@ class NoRotateMultiMockTest : public MockMultiServerChannelTest { NoRotateMultiMockTest() : MockMultiServerChannelTest(nullptr, ARES_OPT_NOROTATE) {} }; -class ServerFailoverOptsMultiMockTest : public MockMultiServerChannelTest { - public: - ServerFailoverOptsMultiMockTest() - : MockMultiServerChannelTest(FillOptions(&opts_), - ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {} - static struct ares_options* FillOptions(struct ares_options *opts) { - memset(opts, 0, sizeof(struct ares_options)); - opts->server_failover_opts.retry_chance = 1; - opts->server_failover_opts.retry_delay = 250; - return opts; - } - private: - struct ares_options opts_; -}; TEST_P(NoRotateMultiMockTest, ThirdServer) { struct ares_options opts; @@ -1693,10 +1679,26 @@ TEST_P(NoRotateMultiMockTest, ServerNoResponseFailover) { EXPECT_EQ("{'www.example.com' aliases=[] addrs=[2.3.4.5]}", ss4.str()); } +#define SERVER_FAILOVER_RETRY_DELAY 750 +class ServerFailoverOptsMultiMockTest : public MockMultiServerChannelTest { + public: + ServerFailoverOptsMultiMockTest() + : MockMultiServerChannelTest(FillOptions(&opts_), + ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {} + static struct ares_options* FillOptions(struct ares_options *opts) { + memset(opts, 0, sizeof(struct ares_options)); + opts->server_failover_opts.retry_chance = 1; + opts->server_failover_opts.retry_delay = SERVER_FAILOVER_RETRY_DELAY; + return opts; + } + private: + struct ares_options opts_; +}; + // Test case to trigger server failover behavior. We use a retry chance of -// 100% and a retry delay of 250ms so that we can test behavior reliably. +// 100% and a retry delay so that we can test behavior reliably. TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) { - DNSPacket servfailrsp; + DNSPacket servfailrsp; servfailrsp.set_response().set_aa().set_rcode(SERVFAIL) .add_question(new DNSQuestion("www.example.com", T_A)); DNSPacket okrsp; @@ -1704,7 +1706,12 @@ TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) { .add_question(new DNSQuestion("www.example.com", T_A)) .add_answer(new DNSARR("www.example.com", 100, {2,3,4,5})); + auto tv_begin = std::chrono::high_resolution_clock::now(); + auto tv_now = std::chrono::high_resolution_clock::now(); + unsigned int delay_ms; + // 1. If all servers are healthy, then the first server should be selected. + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: First server should be selected" << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &okrsp)); CheckExample(); @@ -1712,16 +1719,23 @@ TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) { // 2. Failed servers should be retried after the retry delay. // // Fail server #0 but leave server #1 as healthy. + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Server0 will fail but leave Server1 as healthy" << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &servfailrsp)); EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[1].get(), &okrsp)); CheckExample(); - // Sleep for the retry delay (actually a little more than 250ms to account + // Sleep for the retry delay (actually a little more than the retry delay to account // for unreliable timing, e.g. NTP slew) and send in another query. Server #0 // should be retried. - std::this_thread::sleep_for(std::chrono::milliseconds(260)); + tv_now = std::chrono::high_resolution_clock::now(); + delay_ms = SERVER_FAILOVER_RETRY_DELAY + (SERVER_FAILOVER_RETRY_DELAY / 10); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl; + ares_sleep_time(delay_ms); + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Server0 should be past retry delay and should be tried again successfully" << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &okrsp)); CheckExample(); @@ -1731,6 +1745,8 @@ TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) { // // Fail all servers for the first round of tries. On the second round server // #1 responds successfully. + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: All 3 servers will fail on the first attempt. On second attempt, Server0 will fail, but Server1 will answer correctly." << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &servfailrsp)) .WillOnce(SetReply(servers_[0].get(), &servfailrsp)); @@ -1744,20 +1760,36 @@ TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) { // At this point the sorted servers look like [1] (f0) [2] (f1) [0] (f2). // Sleep for the retry delay and send in another query. Server #2 should be // retried first, and then server #0. - std::this_thread::sleep_for(std::chrono::milliseconds(260)); + tv_now = std::chrono::high_resolution_clock::now(); + delay_ms = SERVER_FAILOVER_RETRY_DELAY + (SERVER_FAILOVER_RETRY_DELAY / 10); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl; + ares_sleep_time(delay_ms); + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Past retry delay, so will choose Server2 and Server0 that are down. Server2 will fail but Server0 will succeed." << std::endl; EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[2].get(), &servfailrsp)); EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &okrsp)); CheckExample(); + // Test might take a while to run and the sleep may not be accurate, so we + // want to track this interval otherwise we may not pass the last test case + // on slow systems. + auto elapse_start = tv_now; + // 4. If there are multiple failed servers, then servers which have not yet // met the retry delay should be skipped. // // The sorted servers currently look like [0] (f0) [1] (f0) [2] (f2) and // server #2 has just been retried. - // Sleep for half the retry delay and trigger a failure on server #0. - std::this_thread::sleep_for(std::chrono::milliseconds(130)); + // Sleep for 1/2 the retry delay and trigger a failure on server #0. + tv_now = std::chrono::high_resolution_clock::now(); + delay_ms = (SERVER_FAILOVER_RETRY_DELAY/2); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl; + ares_sleep_time(delay_ms); + tv_now = std::chrono::high_resolution_clock::now(); + + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Retry delay has not been hit yet. Server0 was last successful, so should be tried first (and will fail), Server1 is also healthy so will respond." << std::endl; EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[0].get(), &servfailrsp)); EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A)) @@ -1765,10 +1797,22 @@ TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) { CheckExample(); // The sorted servers now look like [1] (f0) [0] (f1) [2] (f2). Server #0 - // has just failed whilst server #2 is halfway through the retry delay. - // Sleep for another half the retry delay and check that server #2 is retried - // whilst server #0 is not. - std::this_thread::sleep_for(std::chrono::milliseconds(130)); + // has just failed whilst server #2 is somewhere in its retry delay. + // Sleep until we know server #2s retry delay has elapsed but Server #0 has + // not. + tv_now = std::chrono::high_resolution_clock::now(); + + unsigned int elapsed_time = (unsigned int)std::chrono::duration_cast(tv_now - elapse_start).count(); + delay_ms = (SERVER_FAILOVER_RETRY_DELAY) + (SERVER_FAILOVER_RETRY_DELAY / 10); + if (elapsed_time > delay_ms) { + if (verbose) std::cerr << "elapsed duration " << elapsed_time << "ms greater than desired delay of " << delay_ms << "ms, not sleeping" << std::endl; + } else { + delay_ms -= elapsed_time; // subtract already elapsed time + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: sleep " << delay_ms << "ms" << std::endl; + ares_sleep_time(delay_ms); + } + tv_now = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << std::chrono::duration_cast(tv_now - tv_begin).count() << "ms: Retry delay has expired on Server2 but not Server0, will try on Server2 and fail, then Server1 will answer" << std::endl; EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A)) .WillOnce(SetReply(servers_[2].get(), &servfailrsp)); EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A)) diff --git a/test/ares-test.cc b/test/ares-test.cc index 4636e471..08f48e83 100644 --- a/test/ares-test.cc +++ b/test/ares-test.cc @@ -249,6 +249,16 @@ unsigned long long LibraryTest::fails_ = 0; std::map LibraryTest::size_fails_; std::mutex LibraryTest::lock_; +void ares_sleep_time(unsigned int ms) +{ + auto duration = std::chrono::milliseconds(ms); + auto start_time = std::chrono::high_resolution_clock::now(); + auto wake_time = start_time + duration; + std::this_thread::sleep_until(wake_time); + auto end_time = std::chrono::high_resolution_clock::now(); + if (verbose) std::cerr << "sleep requested " << ms << "ms, slept for " << std::chrono::duration_cast(end_time - start_time).count() << "ms" << std::endl; +} + void ProcessWork(ares_channel_t *channel, std::function()> get_extrafds, std::function process_extra, diff --git a/test/ares-test.h b/test/ares-test.h index d935f46d..2c1569a4 100644 --- a/test/ares-test.h +++ b/test/ares-test.h @@ -88,6 +88,8 @@ extern std::vector> evsys_families; extern std::vector> families_modes; extern std::vector> evsys_families_modes; +// Hopefully a more accurate sleep than sleep_for() +void ares_sleep_time(unsigned int ms); // Process all pending work on ares-owned file descriptors, plus // optionally the given set-of-FDs + work function.