Add server failover retry behavior, where failed servers are retried with small probability after a minimum delay (#731)

**Summary**

By default c-ares will select the server with the least number of
consecutive failures when sending a query. However, this means that if a
server temporarily goes down and hits failures (e.g. a transient network
issue), then that server will never be retried until all other servers
hit the same number of failures.

This is an issue if the failed server is preferred to other servers in
the list. For example if a primary server and a backup server are
configured.

This PR adds new server failover retry behavior, where failed servers
are retried with small probability after a minimum delay has passed. The
probability and minimum delay are configurable via the
`ARES_OPT_SERVER_FAILOVER` option. By default c-ares will use a
probability of 10% and a minimum delay of 5 seconds.

In addition, this PR includes a small change to always close out
connections to servers which have hit failures, even with
`ARES_FLAG_STAYOPEN`. It's possible that resetting the connection can
resolve some server issues (e.g. by resetting the source port).

**Testing**

A new set of regression tests have been added to test the new server
failover retry behavior.

Fixes Issue: #717 
Fix By: Oliver Welsh (@oliverwelsh)
pull/747/head
Oliver Welsh 8 months ago committed by GitHub
parent 1d859b1ece
commit fd81f36d3e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 24
      docs/ares_init_options.3
  2. 14
      include/ares.h
  3. 8
      src/lib/ares__close_sockets.c
  4. 6
      src/lib/ares_init.c
  5. 16
      src/lib/ares_options.c
  6. 18
      src/lib/ares_private.h
  7. 81
      src/lib/ares_process.c
  8. 2
      src/lib/ares_update_servers.c
  9. 105
      test/ares-test-mock-ai.cc
  10. 104
      test/ares-test-mock-et.cc
  11. 105
      test/ares-test-mock.cc
  12. 2
      test/ares-test-ns.cc
  13. 1
      test/ares-test.h

@ -11,6 +11,11 @@ ares_init_options, ares_init \- Initialize a resolver channel
.nf
#include <ares.h>
struct ares_server_failover_options {
unsigned short retry_chance;
size_t retry_delay;
};
struct ares_options {
int flags;
int timeout; /* in seconds or milliseconds, depending on options */
@ -36,6 +41,7 @@ struct ares_options {
int maxtimeout; /* in milliseconds */
unsigned int qcache_max_ttl; /* in seconds */
ares_evsys_t evsys;
struct ares_server_failover_options server_failover_opts;
};
int ares_init_options(ares_channel_t **\fIchannelptr\fP,
@ -316,6 +322,24 @@ Returns \fBARES_ENOTIMP\fP if this option is passed but not available, and
\fBARES_ESERVFAIL\fP if there is a critical failure during initialization of
the event thread.
.br
.TP 18
.B ARES_OPT_SERVER_FAILOVER
.B struct ares_server_failover_options \fIserver_failover_opts\fP;
.br
Configure server failover retry behavior. When a DNS server fails to
respond to a query, c-ares will deprioritize the server. On subsequent
queries, servers with fewer consecutive failures will be selected in
preference. However, in order to detect when such a server has recovered,
c-ares will occasionally retry failed servers. The
\fIares_server_failover_options\fP structure contains options to control this
behavior.
The \fIretry_chance\fP field gives the probability (1/N) of retrying a
failed server on any given query. Setting to a value of 0 disables retries.
The \fIretry_delay\fP field gives the minimum delay in milliseconds that c-ares
will wait before retrying a specific failed server.
If this option is not specificed then c-ares will use a probability of 10%
and a minimum delay of 5 seconds.
.br
.PP
The \fIoptmask\fP parameter also includes options without a corresponding
field in the

@ -255,6 +255,7 @@ typedef enum {
#define ARES_OPT_MAXTIMEOUTMS (1 << 20)
#define ARES_OPT_QUERY_CACHE (1 << 21)
#define ARES_OPT_EVENT_THREAD (1 << 22)
#define ARES_OPT_SERVER_FAILOVER (1 << 23)
/* Nameinfo flag values */
#define ARES_NI_NOFQDN (1 << 0)
@ -326,6 +327,18 @@ typedef void (*ares_sock_state_cb)(void *data, ares_socket_t socket_fd,
struct apattern;
/* Options controlling server failover behavior.
* The retry chance is the probability (1/N) by which we will retry a failed
* server instead of the best server when selecting a server to send queries
* to.
* The retry delay is the minimum time in milliseconds to wait between doing
* such retries (applied per-server).
*/
struct ares_server_failover_options {
unsigned short retry_chance;
size_t retry_delay;
};
/* NOTE about the ares_options struct to users and developers.
This struct will remain looking like this. It will not be extended nor
@ -368,6 +381,7 @@ struct ares_options {
int maxtimeout; /* in milliseconds */
unsigned int qcache_max_ttl; /* Maximum TTL for query cache, 0=disabled */
ares_evsys_t evsys;
struct ares_server_failover_options server_failover_opts;
};
struct hostent;

@ -97,6 +97,14 @@ void ares__check_cleanup_conn(const ares_channel_t *channel,
do_cleanup = ARES_TRUE;
}
/* If the associated server has failures, close it out. Resetting the
* connection (and specifically the source port number) can help resolve
* situations where packets are being dropped.
*/
if (conn->server->consec_failures > 0) {
do_cleanup = ARES_TRUE;
}
/* If the udp connection hit its max queries, always close it */
if (!conn->is_tcp && channel->udp_max_queries > 0 &&
conn->total_queries >= channel->udp_max_queries) {

@ -256,6 +256,12 @@ static ares_status_t init_by_defaults(ares_channel_t *channel)
}
}
/* Set default fields for server failover behavior */
if (!(channel->optmask & ARES_OPT_SERVER_FAILOVER)) {
channel->server_retry_chance = DEFAULT_SERVER_RETRY_CHANCE;
channel->server_retry_delay = DEFAULT_SERVER_RETRY_DELAY;
}
error:
if (hostname) {
ares_free(hostname);

@ -229,6 +229,14 @@ int ares_save_options(ares_channel_t *channel, struct ares_options *options,
options->evsys = channel->evsys;
}
/* Set options for server failover behavior */
if (channel->optmask & ARES_OPT_SERVER_FAILOVER) {
options->server_failover_opts.retry_chance =
channel->server_retry_chance;
options->server_failover_opts.retry_delay =
channel->server_retry_delay;
}
*optmask = (int)channel->optmask;
return ARES_SUCCESS;
@ -474,6 +482,14 @@ ares_status_t ares__init_by_options(ares_channel_t *channel,
}
}
/* Set fields for server failover behavior */
if (optmask & ARES_OPT_SERVER_FAILOVER) {
channel->server_retry_chance =
options->server_failover_opts.retry_chance;
channel->server_retry_delay =
options->server_failover_opts.retry_delay;
}
channel->optmask = (unsigned int)optmask;
return ARES_SUCCESS;

@ -146,6 +146,11 @@ typedef struct ares_rand_state ares_rand_state;
/********* EDNS defines section ******/
/* Default values for server failover behavior. We retry failed servers with
* a 10% probability and a minimum delay of 5 seconds between retries.
*/
#define DEFAULT_SERVER_RETRY_CHANCE 10
#define DEFAULT_SERVER_RETRY_DELAY 5000
struct query;
@ -176,6 +181,9 @@ struct server_state {
ares__llist_t *connections;
struct server_connection *tcp_conn;
/* The next time when we will retry this server if it has hit failures */
struct timeval next_retry_time;
/* TCP buffer since multiple responses can come back in one read, or partial
* in a read */
ares__buf_t *tcp_parser;
@ -315,6 +323,16 @@ struct ares_channeldata {
/* Query Cache */
ares__qcache_t *qcache;
/* Fields controlling server failover behavior.
* The retry chance is the probability (1/N) by which we will retry a failed
* server instead of the best server when selecting a server to send queries
* to.
* The retry delay is the minimum time in milliseconds to wait between doing
* such retries (applied per-server).
*/
unsigned short server_retry_chance;
size_t server_retry_delay;
};
/* Does the domain end in ".onion" or ".onion."? Case-insensitive. */

@ -50,6 +50,7 @@
#include "ares_nameser.h"
#include "ares_dns.h"
static void timeadd(struct timeval *now, size_t millisecs);
static ares_bool_t try_again(int errnum);
static void write_tcp_data(ares_channel_t *channel, fd_set *write_fds,
ares_socket_t write_fd);
@ -74,13 +75,19 @@ static void server_increment_failures(struct server_state *server)
{
ares__slist_node_t *node;
const ares_channel_t *channel = server->channel;
struct timeval next_retry_time;
node = ares__slist_node_find(channel->servers, server);
if (node == NULL) {
return;
}
server->consec_failures++;
ares__slist_node_reinsert(node);
next_retry_time = ares__tvnow();
timeadd(&next_retry_time, channel->server_retry_delay);
server->next_retry_time = next_retry_time;
}
static void server_set_good(struct server_state *server)
@ -88,19 +95,20 @@ static void server_set_good(struct server_state *server)
ares__slist_node_t *node;
const ares_channel_t *channel = server->channel;
if (!server->consec_failures) {
return;
}
node = ares__slist_node_find(channel->servers, server);
if (node == NULL) {
return;
}
if (server->consec_failures > 0) {
server->consec_failures = 0;
ares__slist_node_reinsert(node);
}
server->next_retry_time.tv_sec = 0;
server->next_retry_time.tv_usec = 0;
}
/* return true if now is exactly check time or later */
ares_bool_t ares__timedout(const struct timeval *now,
const struct timeval *check)
@ -816,6 +824,66 @@ static struct server_state *ares__random_server(ares_channel_t *channel)
return NULL;
}
/* Pick a server from the list with failover behavior.
*
* We default to using the first server in the sorted list of servers. That is
* the server with the lowest number of consecutive failures and then the
* highest priority server (by idx) if there is a draw.
*
* However, if a server temporarily goes down and hits some failures, then that
* server will never be retried until all other servers hit the same number of
* failures. This may prevent the server from being retried for a long time.
*
* To resolve this, with some probability we select a failed server to retry
* instead.
*/
static struct server_state *ares__failover_server(ares_channel_t *channel)
{
struct server_state *first_server = ares__slist_first_val(channel->servers);
struct server_state *last_server = ares__slist_last_val(channel->servers);
unsigned short r;
/* Defensive code against no servers being available on the channel. */
if (first_server == NULL) {
return NULL;
}
/* If no servers have failures, then prefer the first server in the list. */
if (last_server != NULL && last_server->consec_failures == 0) {
return first_server;
}
/* If we are not configured with a server retry chance then return the first
* server.
*/
if (channel->server_retry_chance == 0) {
return first_server;
}
/* Generate a random value to decide whether to retry a failed server. The
* probability to use is 1/channel->server_retry_chance, rounded up to a
* precision of 1/2^B where B is the number of bits in the random value.
* We use an unsigned short for the random value for increased precision.
*/
ares__rand_bytes(channel->rand_state, (unsigned char *)&r, sizeof(r));
if (r % channel->server_retry_chance == 0) {
/* Select a suitable failed server to retry. */
struct timeval now = ares__tvnow();
ares__slist_node_t *node;
for (node = ares__slist_node_first(channel->servers); node != NULL;
node = ares__slist_node_next(node)) {
struct server_state *node_val = ares__slist_node_val(node);
if (node_val != NULL && node_val->consec_failures > 0 &&
ares__timedout(&now, &node_val->next_retry_time)) {
return node_val;
}
}
}
/* If we have not returned yet, then return the first server. */
return first_server;
}
static ares_status_t ares__append_tcpbuf(struct server_state *server,
const struct query *query)
{
@ -890,10 +958,11 @@ ares_status_t ares__send_query(struct query *query, struct timeval *now)
/* Choose the server to send the query to */
if (channel->rotate) {
/* Pull random server */
server = ares__random_server(channel);
} else {
/* Pull first */
server = ares__slist_first_val(channel->servers);
/* Pull server with failover behavior */
server = ares__failover_server(channel);
}
if (server == NULL) {

@ -587,6 +587,8 @@ static ares_status_t ares__server_create(ares_channel_t *channel,
server->udp_port = ares__sconfig_get_port(channel, sconfig, ARES_FALSE);
server->tcp_port = ares__sconfig_get_port(channel, sconfig, ARES_TRUE);
server->addr.family = sconfig->addr.family;
server->next_retry_time.tv_sec = 0;
server->next_retry_time.tv_usec = 0;
if (sconfig->addr.family == AF_INET) {
memcpy(&server->addr.addr.addr4, &sconfig->addr.addr.addr4,

@ -702,8 +702,8 @@ class MockMultiServerChannelTestAI
: public MockChannelOptsTest,
public ::testing::WithParamInterface< std::pair<int, bool> > {
public:
MockMultiServerChannelTestAI(bool rotate)
: MockChannelOptsTest(3, GetParam().first, GetParam().second, nullptr, rotate ? ARES_OPT_ROTATE : ARES_OPT_NOROTATE) {}
MockMultiServerChannelTestAI(ares_options *opts, int optmask)
: MockChannelOptsTest(3, GetParam().first, GetParam().second, opts, optmask) {}
void CheckExample() {
AddrInfoResult result;
struct ares_addrinfo_hints hints = {};
@ -720,7 +720,22 @@ class MockMultiServerChannelTestAI
class NoRotateMultiMockTestAI : public MockMultiServerChannelTestAI {
public:
NoRotateMultiMockTestAI() : MockMultiServerChannelTestAI(false) {}
NoRotateMultiMockTestAI() : MockMultiServerChannelTestAI(nullptr, ARES_OPT_NOROTATE) {}
};
class ServerFailoverOptsMockTestAI : public MockMultiServerChannelTestAI {
public:
ServerFailoverOptsMockTestAI()
: MockMultiServerChannelTestAI(FillOptions(&opts_),
ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {}
static struct ares_options* FillOptions(struct ares_options *opts) {
memset(opts, 0, sizeof(struct ares_options));
opts->server_failover_opts.retry_chance = 1;
opts->server_failover_opts.retry_delay = 100;
return opts;
}
private:
struct ares_options opts_;
};
TEST_P(NoRotateMultiMockTestAI, ThirdServer) {
@ -774,6 +789,88 @@ TEST_P(NoRotateMultiMockTestAI, ThirdServer) {
CheckExample();
}
// Test case to trigger server failover behavior. We use a retry chance of
// 100% and a retry delay of 100ms so that we can test behavior reliably.
TEST_P(ServerFailoverOptsMockTestAI, ServerFailoverOpts) {
DNSPacket servfailrsp;
servfailrsp.set_response().set_aa().set_rcode(SERVFAIL)
.add_question(new DNSQuestion("www.example.com", T_A));
DNSPacket okrsp;
okrsp.set_response().set_aa()
.add_question(new DNSQuestion("www.example.com", T_A))
.add_answer(new DNSARR("www.example.com", 100, {2,3,4,5}));
// 1. If all servers are healthy, then the first server should be selected.
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &okrsp));
CheckExample();
// 2. Failed servers should be retried after the retry delay.
//
// Fail server #0 but leave server #1 as healthy.
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &servfailrsp));
EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[1].get(), &okrsp));
CheckExample();
// Sleep for the retry delay and send in another query. Server #0 should be
// retried.
std::this_thread::sleep_for(std::chrono::milliseconds(100));
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &okrsp));
CheckExample();
// 3. If there are multiple failed servers, then the servers should be
// retried in sorted order.
//
// Fail all servers for the first round of tries. On the second round server
// #1 responds successfully.
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &servfailrsp))
.WillOnce(SetReply(servers_[0].get(), &servfailrsp));
EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[1].get(), &servfailrsp))
.WillOnce(SetReply(servers_[1].get(), &okrsp));
EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[2].get(), &servfailrsp));
CheckExample();
// At this point the sorted servers look like [1] (f0) [2] (f1) [0] (f2).
// Sleep for the retry delay and send in another query. Server #2 should be
// retried first, and then server #0.
std::this_thread::sleep_for(std::chrono::milliseconds(100));
EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[2].get(), &servfailrsp));
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &okrsp));
CheckExample();
// 4. If there are multiple failed servers, then servers which have not yet
// met the retry delay should be skipped.
//
// The sorted servers currently look like [0] (f0) [1] (f0) [2] (f2) and
// server #2 has just been retried.
// Sleep for half the retry delay and trigger a failure on server #0.
std::this_thread::sleep_for(std::chrono::milliseconds(50));
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &servfailrsp));
EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[1].get(), &okrsp));
CheckExample();
// The sorted servers now look like [1] (f0) [0] (f1) [2] (f2). Server #0
// has just failed whilst server #2 is halfway through the retry delay.
// Sleep for another half the retry delay and check that server #2 is retried
// whilst server #0 is not.
std::this_thread::sleep_for(std::chrono::milliseconds(50));
EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[2].get(), &servfailrsp));
EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[1].get(), &okrsp));
CheckExample();
}
TEST_P(MockChannelTestAI, FamilyV4ServiceName) {
DNSPacket rsp4;
rsp4.set_response().set_aa()
@ -821,6 +918,8 @@ INSTANTIATE_TEST_SUITE_P(AddressFamiliesAI, MockEDNSChannelTestAI,
INSTANTIATE_TEST_SUITE_P(TransportModesAI, NoRotateMultiMockTestAI,
::testing::ValuesIn(ares::test::families_modes), PrintFamilyMode);
INSTANTIATE_TEST_SUITE_P(TransportModesAI, ServerFailoverOptsMockTestAI,
::testing::ValuesIn(ares::test::families_modes), PrintFamilyMode);
} // namespace test
} // namespace ares

@ -1196,8 +1196,8 @@ class MockMultiServerEventThreadTest
: public MockEventThreadOptsTest,
public ::testing::WithParamInterface< std::tuple<ares_evsys_t, int, bool> > {
public:
MockMultiServerEventThreadTest(bool rotate)
: MockEventThreadOptsTest(3, std::get<0>(GetParam()), std::get<1>(GetParam()), std::get<2>(GetParam()), nullptr, rotate ? ARES_OPT_ROTATE : ARES_OPT_NOROTATE) {}
MockMultiServerEventThreadTest(ares_options *opts, int optmask)
: MockEventThreadOptsTest(3, std::get<0>(GetParam()), std::get<1>(GetParam()), std::get<2>(GetParam()), opts, optmask) {}
void CheckExample() {
HostResult result;
ares_gethostbyname(channel_, "www.example.com.", AF_INET, HostCallback, &result);
@ -1211,7 +1211,22 @@ class MockMultiServerEventThreadTest
class NoRotateMultiMockEventThreadTest : public MockMultiServerEventThreadTest {
public:
NoRotateMultiMockEventThreadTest() : MockMultiServerEventThreadTest(false) {}
NoRotateMultiMockEventThreadTest() : MockMultiServerEventThreadTest(nullptr, ARES_OPT_NOROTATE) {}
};
class ServerFailoverOptsMockEventThreadTest : public MockMultiServerEventThreadTest {
public:
ServerFailoverOptsMockEventThreadTest()
: MockMultiServerEventThreadTest(FillOptions(&opts_),
ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {}
static struct ares_options* FillOptions(struct ares_options *opts) {
memset(opts, 0, sizeof(struct ares_options));
opts->server_failover_opts.retry_chance = 1;
opts->server_failover_opts.retry_delay = 100;
return opts;
}
private:
struct ares_options opts_;
};
TEST_P(NoRotateMultiMockEventThreadTest, ThirdServer) {
@ -1333,6 +1348,87 @@ TEST_P(NoRotateMultiMockEventThreadTest, ServerNoResponseFailover) {
EXPECT_EQ("{'www.example.com' aliases=[] addrs=[2.3.4.5]}", ss4.str());
}
// Test case to trigger server failover behavior. We use a retry chance of
// 100% and a retry delay of 100ms so that we can test behavior reliably.
TEST_P(ServerFailoverOptsMockEventThreadTest, ServerFailoverOpts) {
DNSPacket servfailrsp;
servfailrsp.set_response().set_aa().set_rcode(SERVFAIL)
.add_question(new DNSQuestion("www.example.com", T_A));
DNSPacket okrsp;
okrsp.set_response().set_aa()
.add_question(new DNSQuestion("www.example.com", T_A))
.add_answer(new DNSARR("www.example.com", 100, {2,3,4,5}));
// 1. If all servers are healthy, then the first server should be selected.
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &okrsp));
CheckExample();
// 2. Failed servers should be retried after the retry delay.
//
// Fail server #0 but leave server #1 as healthy.
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &servfailrsp));
EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[1].get(), &okrsp));
CheckExample();
// Sleep for the retry delay and send in another query. Server #0 should be
// retried.
std::this_thread::sleep_for(std::chrono::milliseconds(100));
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &okrsp));
CheckExample();
// 3. If there are multiple failed servers, then the servers should be
// retried in sorted order.
//
// Fail all servers for the first round of tries. On the second round server
// #1 responds successfully.
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &servfailrsp))
.WillOnce(SetReply(servers_[0].get(), &servfailrsp));
EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[1].get(), &servfailrsp))
.WillOnce(SetReply(servers_[1].get(), &okrsp));
EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[2].get(), &servfailrsp));
CheckExample();
// At this point the sorted servers look like [1] (f0) [2] (f1) [0] (f2).
// Sleep for the retry delay and send in another query. Server #2 should be
// retried first, and then server #0.
std::this_thread::sleep_for(std::chrono::milliseconds(100));
EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[2].get(), &servfailrsp));
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &okrsp));
CheckExample();
// 4. If there are multiple failed servers, then servers which have not yet
// met the retry delay should be skipped.
//
// The sorted servers currently look like [0] (f0) [1] (f0) [2] (f2) and
// server #2 has just been retried.
// Sleep for half the retry delay and trigger a failure on server #0.
std::this_thread::sleep_for(std::chrono::milliseconds(50));
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &servfailrsp));
EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[1].get(), &okrsp));
CheckExample();
// The sorted servers now look like [1] (f0) [0] (f1) [2] (f2). Server #0
// has just failed whilst server #2 is halfway through the retry delay.
// Sleep for another half the retry delay and check that server #2 is retried
// whilst server #0 is not.
std::this_thread::sleep_for(std::chrono::milliseconds(50));
EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[2].get(), &servfailrsp));
EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[1].get(), &okrsp));
CheckExample();
}
static const char *evsys_tostr(ares_evsys_t evsys)
{
@ -1396,6 +1492,8 @@ INSTANTIATE_TEST_SUITE_P(AddressFamilies, MockEDNSEventThreadTest, ::testing::Va
INSTANTIATE_TEST_SUITE_P(TransportModes, NoRotateMultiMockEventThreadTest, ::testing::ValuesIn(ares::test::evsys_families_modes), ares::test::PrintEvsysFamilyMode);
INSTANTIATE_TEST_SUITE_P(TransportModes, ServerFailoverOptsMockEventThreadTest, ::testing::ValuesIn(ares::test::evsys_families_modes), ares::test::PrintEvsysFamilyMode);
} // namespace test
} // namespace ares

@ -1403,8 +1403,8 @@ class MockMultiServerChannelTest
: public MockChannelOptsTest,
public ::testing::WithParamInterface< std::pair<int, bool> > {
public:
MockMultiServerChannelTest(bool rotate)
: MockChannelOptsTest(3, GetParam().first, GetParam().second, nullptr, rotate ? ARES_OPT_ROTATE : ARES_OPT_NOROTATE) {}
MockMultiServerChannelTest(ares_options *opts, int optmask)
: MockChannelOptsTest(3, GetParam().first, GetParam().second, opts, optmask) {}
void CheckExample() {
HostResult result;
ares_gethostbyname(channel_, "www.example.com.", AF_INET, HostCallback, &result);
@ -1418,7 +1418,22 @@ class MockMultiServerChannelTest
class NoRotateMultiMockTest : public MockMultiServerChannelTest {
public:
NoRotateMultiMockTest() : MockMultiServerChannelTest(false) {}
NoRotateMultiMockTest() : MockMultiServerChannelTest(nullptr, ARES_OPT_NOROTATE) {}
};
class ServerFailoverOptsMultiMockTest : public MockMultiServerChannelTest {
public:
ServerFailoverOptsMultiMockTest()
: MockMultiServerChannelTest(FillOptions(&opts_),
ARES_OPT_SERVER_FAILOVER | ARES_OPT_NOROTATE) {}
static struct ares_options* FillOptions(struct ares_options *opts) {
memset(opts, 0, sizeof(struct ares_options));
opts->server_failover_opts.retry_chance = 1;
opts->server_failover_opts.retry_delay = 100;
return opts;
}
private:
struct ares_options opts_;
};
TEST_P(NoRotateMultiMockTest, ThirdServer) {
@ -1540,6 +1555,88 @@ TEST_P(NoRotateMultiMockTest, ServerNoResponseFailover) {
EXPECT_EQ("{'www.example.com' aliases=[] addrs=[2.3.4.5]}", ss4.str());
}
// Test case to trigger server failover behavior. We use a retry chance of
// 100% and a retry delay of 100ms so that we can test behavior reliably.
TEST_P(ServerFailoverOptsMultiMockTest, ServerFailoverOpts) {
DNSPacket servfailrsp;
servfailrsp.set_response().set_aa().set_rcode(SERVFAIL)
.add_question(new DNSQuestion("www.example.com", T_A));
DNSPacket okrsp;
okrsp.set_response().set_aa()
.add_question(new DNSQuestion("www.example.com", T_A))
.add_answer(new DNSARR("www.example.com", 100, {2,3,4,5}));
// 1. If all servers are healthy, then the first server should be selected.
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &okrsp));
CheckExample();
// 2. Failed servers should be retried after the retry delay.
//
// Fail server #0 but leave server #1 as healthy.
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &servfailrsp));
EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[1].get(), &okrsp));
CheckExample();
// Sleep for the retry delay and send in another query. Server #0 should be
// retried.
std::this_thread::sleep_for(std::chrono::milliseconds(100));
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &okrsp));
CheckExample();
// 3. If there are multiple failed servers, then the servers should be
// retried in sorted order.
//
// Fail all servers for the first round of tries. On the second round server
// #1 responds successfully.
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &servfailrsp))
.WillOnce(SetReply(servers_[0].get(), &servfailrsp));
EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[1].get(), &servfailrsp))
.WillOnce(SetReply(servers_[1].get(), &okrsp));
EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[2].get(), &servfailrsp));
CheckExample();
// At this point the sorted servers look like [1] (f0) [2] (f1) [0] (f2).
// Sleep for the retry delay and send in another query. Server #2 should be
// retried first, and then server #0.
std::this_thread::sleep_for(std::chrono::milliseconds(100));
EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[2].get(), &servfailrsp));
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &okrsp));
CheckExample();
// 4. If there are multiple failed servers, then servers which have not yet
// met the retry delay should be skipped.
//
// The sorted servers currently look like [0] (f0) [1] (f0) [2] (f2) and
// server #2 has just been retried.
// Sleep for half the retry delay and trigger a failure on server #0.
std::this_thread::sleep_for(std::chrono::milliseconds(50));
EXPECT_CALL(*servers_[0], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[0].get(), &servfailrsp));
EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[1].get(), &okrsp));
CheckExample();
// The sorted servers now look like [1] (f0) [0] (f1) [2] (f2). Server #0
// has just failed whilst server #2 is halfway through the retry delay.
// Sleep for another half the retry delay and check that server #2 is retried
// whilst server #0 is not.
std::this_thread::sleep_for(std::chrono::milliseconds(50));
EXPECT_CALL(*servers_[2], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[2].get(), &servfailrsp));
EXPECT_CALL(*servers_[1], OnRequest("www.example.com", T_A))
.WillOnce(SetReply(servers_[1].get(), &okrsp));
CheckExample();
}
const char *af_tostr(int af)
{
switch (af) {
@ -1592,5 +1689,7 @@ INSTANTIATE_TEST_SUITE_P(AddressFamilies, MockEDNSChannelTest, ::testing::Values
INSTANTIATE_TEST_SUITE_P(TransportModes, NoRotateMultiMockTest, ::testing::ValuesIn(ares::test::families_modes), PrintFamilyMode);
INSTANTIATE_TEST_SUITE_P(TransportModes, ServerFailoverOptsMultiMockTest, ::testing::ValuesIn(ares::test::families_modes), PrintFamilyMode);
} // namespace test
} // namespace ares

@ -63,7 +63,7 @@ int EnterContainer(void *data) {
// Ensure we are apparently root before continuing.
int count = 10;
while (getuid() != 0 && count > 0) {
usleep(100000);
std::this_thread::sleep_for(std::chrono::milliseconds(100));
count--;
}
if (getuid() != 0) {

1
test/ares-test.h vendored

@ -53,6 +53,7 @@
#include <thread>
#include <utility>
#include <vector>
#include <chrono>
namespace ares {

Loading…
Cancel
Save