Add observability into DNS server health via a server state callback, invoked whenever a query finishes (#744)

**Summary**

This PR adds a server state callback that is invoked whenever a query to
a DNS server finishes.

The callback is invoked with the server details (as a string), a boolean
indicating whether the query succeeded or failed, flags describing the
query (currently just indicating whether TCP or UDP was used), and
custom userdata.

This can be used by user applications to gain observability into DNS
server health and usage. For example, alerts when a DNS server
fails/recovers or metrics to track how often a DNS server is used and
responds successfully.

**Testing**

Three new regression tests `MockChannelTest.ServStateCallback*` have
been added to test the new callback in different success/failure
scenarios.

Fix By: Oliver Welsh (@oliverwelsh)
pull/754/head
Oliver Welsh 7 months ago committed by GitHub
parent 751201a047
commit 89a8856cca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 1
      docs/Makefile.inc
  2. 64
      docs/ares_set_server_state_callback.3
  3. 10
      include/ares.h
  4. 14
      src/lib/ares_init.c
  5. 6
      src/lib/ares_private.h
  6. 63
      src/lib/ares_process.c
  7. 111
      src/lib/ares_update_servers.c
  8. 123
      test/ares-test-mock.cc

@ -122,6 +122,7 @@ MANPAGES = ares_cancel.3 \
ares_set_local_dev.3 \
ares_set_local_ip4.3 \
ares_set_local_ip6.3 \
ares_set_server_state_callback.3 \
ares_set_servers.3 \
ares_set_servers_csv.3 \
ares_set_servers_ports.3 \

@ -0,0 +1,64 @@
.\"
.\" SPDX-License-Identifier: MIT
.\"
.TH ARES_SET_SERVER_STATE_CALLBACK 3 "26 Apr 2024"
.SH NAME
ares_set_server_state_callback \- Function for setting a server state callback
.SH SYNOPSIS
.nf
#include <ares.h>
typedef void (*ares_server_state_callback)(const char *\fIserver_string\fP,
ares_bool_t \fIsuccess\fP,
int \fIflags\fP,
void *\fIdata\fP);
void ares_set_server_state_callback(ares_channel_t *\fIchannel\fP,
ares_server_state_callback \fIcallback\fP,
void *\fIuser_data\fP);
.fi
.SH DESCRIPTION
The \fBares_set_server_state_callback(3)\fP function sets a callback function
\fIcallback\fP in the given ares channel handle \fIchannel\fP that is invoked
whenever an attempt to query a DNS server on the channel finishes. This
includes both successful and unsuccessful attempts (including hard errors and
timeouts), so a single lookup that is retried may invoke the callback more
than once. The callback function is invoked with a number of parameters
describing the query, as follows.
The \fIserver_string\fP parameter indicates the DNS server that was used for
the query, given as a string with the same format returned by
\fBares_get_servers_csv(3)\fP.
The \fIsuccess\fP parameter indicates whether the query succeeded or not. It is
set to \fBARES_TRUE\fP on success and \fBARES_FALSE\fP on failure.
The \fIflags\fP parameter is a bitmask of flags describing various aspects of
the query (for example whether the query used UDP or TCP). These are described
below.
The \fIdata\fP parameter is a reference to the custom user data \fIuser_data\fP
that was passed to \fBares_set_server_state_callback(3)\fP when setting the
server state callback.
The server state callback can be used by applications to monitor the state of
the DNS servers used by an ares channel. For example, it can be used to track
metrics about the numbers and types of queries sent to each server or to
detect when a server is uncontactable or unhealthy.
.SH FLAGS
.TP 5
.B ARES_SERV_STATE_UDP
Indicates that the query was tried over UDP.
.TP 5
.B ARES_SERV_STATE_TCP
Indicates that the query was tried over TCP.
.SH AVAILABILITY
This function was first introduced in c-ares version 1.29.0.
.SH SEE ALSO
.BR ares_get_servers_csv (3)
.SH AUTHOR
Copyright (C) 2024 The c-ares project and its members.

@ -306,6 +306,9 @@ typedef enum {
#define ARES_LIB_INIT_WIN32 (1 << 0)
#define ARES_LIB_INIT_ALL (ARES_LIB_INIT_WIN32)
/* Server state callback flag values */
#define ARES_SERV_STATE_UDP (1 << 0) /* Query used UDP */
#define ARES_SERV_STATE_TCP (1 << 1) /* Query used TCP */
/*
* Typedef our socket type
@ -444,6 +447,10 @@ typedef int (*ares_sock_config_callback)(ares_socket_t socket_fd, int type,
typedef void (*ares_addrinfo_callback)(void *arg, int status, int timeouts,
struct ares_addrinfo *res);
typedef void (*ares_server_state_callback)(const char *server_string,
ares_bool_t success, int flags,
void *data);
CARES_EXTERN int ares_library_init(int flags);
CARES_EXTERN int ares_library_init_mem(int flags, void *(*amalloc)(size_t size),
@ -505,6 +512,9 @@ CARES_EXTERN void ares_set_socket_callback(ares_channel_t *ch
CARES_EXTERN void ares_set_socket_configure_callback(
ares_channel_t *channel, ares_sock_config_callback callback, void *user_data);
CARES_EXTERN void ares_set_server_state_callback(
ares_channel_t *channel, ares_server_state_callback callback, void *user_data);
CARES_EXTERN int ares_set_sortlist(ares_channel_t *channel,
const char *sortstr);

@ -456,12 +456,14 @@ int ares_dup(ares_channel_t **dest, ares_channel_t *src)
/* Now clone the options that ares_save_options() doesn't support, but are
* user-provided */
(*dest)->sock_create_cb = src->sock_create_cb;
(*dest)->sock_create_cb_data = src->sock_create_cb_data;
(*dest)->sock_config_cb = src->sock_config_cb;
(*dest)->sock_config_cb_data = src->sock_config_cb_data;
(*dest)->sock_funcs = src->sock_funcs;
(*dest)->sock_func_cb_data = src->sock_func_cb_data;
(*dest)->sock_create_cb = src->sock_create_cb;
(*dest)->sock_create_cb_data = src->sock_create_cb_data;
(*dest)->sock_config_cb = src->sock_config_cb;
(*dest)->sock_config_cb_data = src->sock_config_cb_data;
(*dest)->sock_funcs = src->sock_funcs;
(*dest)->sock_func_cb_data = src->sock_func_cb_data;
(*dest)->server_state_cb = src->server_state_cb;
(*dest)->server_state_cb_data = src->server_state_cb_data;
ares_strcpy((*dest)->local_dev_name, src->local_dev_name,
sizeof((*dest)->local_dev_name));

@ -333,6 +333,10 @@ struct ares_channeldata {
*/
unsigned short server_retry_chance;
size_t server_retry_delay;
/* Callback triggered when a server has a successful or failed response */
ares_server_state_callback server_state_cb;
void *server_state_cb_data;
};
/* Does the domain end in ".onion" or ".onion."? Case-insensitive. */
@ -515,6 +519,8 @@ ares_status_t ares__sconfig_append_fromstr(ares__llist_t **sconfig,
ares_status_t ares_in_addr_to_server_config_llist(const struct in_addr *servers,
size_t nservers,
ares__llist_t **llist);
ares_status_t ares_get_server_addr(const struct server_state *server,
ares__buf_t *buf);
struct ares_hosts_entry;
typedef struct ares_hosts_entry ares_hosts_entry_t;

@ -71,7 +71,44 @@ static ares_bool_t same_address(const struct sockaddr *sa,
static void end_query(ares_channel_t *channel, struct query *query,
ares_status_t status, const ares_dns_record_t *dnsrec);
static void server_increment_failures(struct server_state *server)
/* Notify the application, through the channel's server state callback, that
 * a query attempt against `server` has succeeded or failed.
 *
 *   server  - server the attempt was made against; its channel supplies the
 *             callback and the user data
 *   success - ARES_TRUE if the attempt succeeded, ARES_FALSE otherwise
 *   flags   - ARES_SERV_STATE_* bitmask passed through to the callback
 *
 * This is best-effort: if no callback is registered, or the server string
 * cannot be built, the function returns without invoking anything.
 */
static void invoke_server_state_cb(const struct server_state *server,
                                   ares_bool_t success,
                                   int flags)
{
  const ares_channel_t *channel = server->channel;
  ares__buf_t          *addrbuf;
  char                 *server_string;

  /* No callback registered: nothing to report */
  if (channel->server_state_cb == NULL) {
    return;
  }

  addrbuf = ares__buf_create();
  if (addrbuf == NULL) {
    return;
  }

  /* Render the server as text; on failure the buffer is still ours to free */
  if (ares_get_server_addr(server, addrbuf) != ARES_SUCCESS) {
    ares__buf_destroy(addrbuf);
    return;
  }

  /* The buffer is not touched again after this call (presumably
   * ares__buf_finish_str() consumes it); only the returned string is freed
   * below. */
  server_string = ares__buf_finish_str(addrbuf, NULL);
  if (server_string == NULL) {
    return;
  }

  channel->server_state_cb(server_string, success, flags,
                           channel->server_state_cb_data);

  /* The string is only valid for the duration of the callback */
  ares_free(server_string);
}
static void server_increment_failures(struct server_state *server,
ares_bool_t used_tcp)
{
ares__slist_node_t *node;
const ares_channel_t *channel = server->channel;
@ -88,9 +125,13 @@ static void server_increment_failures(struct server_state *server)
next_retry_time = ares__tvnow();
timeadd(&next_retry_time, channel->server_retry_delay);
server->next_retry_time = next_retry_time;
invoke_server_state_cb(server, ARES_FALSE, used_tcp == ARES_TRUE
? ARES_SERV_STATE_TCP
: ARES_SERV_STATE_UDP);
}
static void server_set_good(struct server_state *server)
static void server_set_good(struct server_state *server, ares_bool_t used_tcp)
{
ares__slist_node_t *node;
const ares_channel_t *channel = server->channel;
@ -107,6 +148,10 @@ static void server_set_good(struct server_state *server)
server->next_retry_time.tv_sec = 0;
server->next_retry_time.tv_usec = 0;
invoke_server_state_cb(server, ARES_TRUE, used_tcp == ARES_TRUE
? ARES_SERV_STATE_TCP
: ARES_SERV_STATE_UDP);
}
/* return true if now is exactly check time or later */
@ -569,7 +614,7 @@ static void process_timeouts(ares_channel_t *channel, struct timeval *now)
query->timeouts++;
conn = query->conn;
server_increment_failures(conn->server);
server_increment_failures(conn->server, query->using_tcp);
ares__requeue_query(query, now);
ares__check_cleanup_conn(channel, conn);
@ -724,7 +769,7 @@ static ares_status_t process_answer(ares_channel_t *channel,
break;
}
server_increment_failures(server);
server_increment_failures(server, query->using_tcp);
ares__requeue_query(query, now);
/* Should any of these cause a connection termination?
@ -740,7 +785,7 @@ static ares_status_t process_answer(ares_channel_t *channel,
is_cached = ARES_TRUE;
}
server_set_good(server);
server_set_good(server, query->using_tcp);
end_query(channel, query, ARES_SUCCESS, rdnsrec);
status = ARES_SUCCESS;
@ -763,7 +808,7 @@ static void handle_conn_error(struct server_connection *conn,
/* Increment failures first before requeue so it is unlikely to requeue
* to the same server */
if (critical_failure) {
server_increment_failures(server);
server_increment_failures(server, conn->is_tcp);
}
/* This will requeue any connections automatically */
@ -987,7 +1032,7 @@ ares_status_t ares__send_query(struct query *query, struct timeval *now)
* error codes */
case ARES_ECONNREFUSED:
case ARES_EBADFAMILY:
server_increment_failures(server);
server_increment_failures(server, query->using_tcp);
query->error_status = status;
return ares__requeue_query(query, now);
@ -1046,7 +1091,7 @@ ares_status_t ares__send_query(struct query *query, struct timeval *now)
* error codes */
case ARES_ECONNREFUSED:
case ARES_EBADFAMILY:
server_increment_failures(server);
server_increment_failures(server, query->using_tcp);
query->error_status = status;
return ares__requeue_query(query, now);
@ -1061,7 +1106,7 @@ ares_status_t ares__send_query(struct query *query, struct timeval *now)
conn = ares__llist_node_val(node);
if (ares__socket_write(channel, conn->fd, query->qbuf, query->qlen) == -1) {
/* FIXME: Handle EAGAIN here since it likely can happen. */
server_increment_failures(server);
server_increment_failures(server, query->using_tcp);
status = ares__requeue_query(query, now);
/* Only safe to kill connection if it was new, otherwise it should be

@ -910,6 +910,62 @@ fail:
return ARES_ENOMEM;
}
/* Write out the details of a server to a buffer.
 *
 * Appends a textual form of `server` to `buf` in one of these shapes:
 *
 *   ipv4addr:port
 *   [ipv6addr]:port
 *
 * followed by "%iface" when the server has a non-empty link-local interface
 * name. The port written is the server's UDP port.
 *
 * Returns ARES_SUCCESS on success, ARES_EBADFAMILY if the address cannot be
 * converted to text, or the status of the first buffer append that fails.
 * When an append fails part-way through, `buf` may contain a partially
 * written entry.
 */
ares_status_t ares_get_server_addr(const struct server_state *server,
                                   ares__buf_t *buf)
{
  ares_status_t status;
  char          addr[INET6_ADDRSTRLEN];

  /* Convert the address before writing anything. If the conversion fails,
   * `addr` is left uninitialized and must not be appended to the buffer. */
  if (ares_inet_ntop(server->addr.family, &server->addr.addr, addr,
                     sizeof(addr)) == NULL) {
    return ARES_EBADFAMILY;
  }

  /* ipv4addr or [ipv6addr] */
  if (server->addr.family == AF_INET6) {
    status = ares__buf_append_byte(buf, '[');
    if (status != ARES_SUCCESS) {
      return status;
    }
  }

  status = ares__buf_append_str(buf, addr);
  if (status != ARES_SUCCESS) {
    return status;
  }

  if (server->addr.family == AF_INET6) {
    status = ares__buf_append_byte(buf, ']');
    if (status != ARES_SUCCESS) {
      return status;
    }
  }

  /* :port */
  status = ares__buf_append_byte(buf, ':');
  if (status != ARES_SUCCESS) {
    return status;
  }

  status = ares__buf_append_num_dec(buf, server->udp_port, 0);
  if (status != ARES_SUCCESS) {
    return status;
  }

  /* %iface */
  if (ares_strlen(server->ll_iface)) {
    status = ares__buf_append_byte(buf, '%');
    if (status != ARES_SUCCESS) {
      return status;
    }

    status = ares__buf_append_str(buf, server->ll_iface);
    if (status != ARES_SUCCESS) {
      return status;
    }
  }

  return ARES_SUCCESS;
}
int ares_get_servers(ares_channel_t *channel, struct ares_addr_node **servers)
{
struct ares_addr_node *srvr_head = NULL;
@ -1129,7 +1185,6 @@ char *ares_get_servers_csv(ares_channel_t *channel)
node = ares__slist_node_next(node)) {
ares_status_t status;
const struct server_state *server = ares__slist_node_val(node);
char addr[64];
if (ares__buf_len(buf)) {
status = ares__buf_append_byte(buf, ',');
@ -1138,51 +1193,10 @@ char *ares_get_servers_csv(ares_channel_t *channel)
}
}
/* ipv4addr or [ipv6addr] */
if (server->addr.family == AF_INET6) {
status = ares__buf_append_byte(buf, '[');
if (status != ARES_SUCCESS) {
goto done;
}
}
ares_inet_ntop(server->addr.family, &server->addr.addr, addr, sizeof(addr));
status = ares__buf_append_str(buf, addr);
if (status != ARES_SUCCESS) {
goto done;
}
if (server->addr.family == AF_INET6) {
status = ares__buf_append_byte(buf, ']');
if (status != ARES_SUCCESS) {
goto done;
}
}
/* :port */
status = ares__buf_append_byte(buf, ':');
status = ares_get_server_addr(server, buf);
if (status != ARES_SUCCESS) {
goto done;
}
status = ares__buf_append_num_dec(buf, server->udp_port, 0);
if (status != ARES_SUCCESS) {
goto done;
}
/* %iface */
if (ares_strlen(server->ll_iface)) {
status = ares__buf_append_byte(buf, '%');
if (status != ARES_SUCCESS) {
goto done;
}
status = ares__buf_append_str(buf, server->ll_iface);
if (status != ARES_SUCCESS) {
goto done;
}
}
}
out = ares__buf_finish_str(buf, NULL);
@ -1193,3 +1207,14 @@ done:
ares__buf_destroy(buf);
return out;
}
/* Register the server state callback for a channel.
 *
 *   channel - channel to configure; a NULL channel is ignored
 *   cb      - callback to store on the channel (replaces any previous value)
 *   data    - opaque user pointer stored alongside the callback
 */
void ares_set_server_state_callback(ares_channel_t *channel,
                                    ares_server_state_callback cb,
                                    void *data)
{
  if (channel != NULL) {
    channel->server_state_cb      = cb;
    channel->server_state_cb_data = data;
  }
}

@ -227,6 +227,129 @@ TEST_P(MockChannelTest, SockConfigureFailCallback) {
EXPECT_EQ(ARES_ECONNREFUSED, result.status_);
}
// Define a server state callback for testing. The custom userdata should be
// the expected server string that the callback is invoked with.
static int server_state_cb_success_count = 0;
static int server_state_cb_failure_count = 0;
static void ServerStateCallback(const char *server_string,
ares_bool_t success, int flags, void *data) {
// Increment overall success/failure counts appropriately.
if (verbose) std::cerr << "ServerStateCallback("
<< server_string << ", "
<< success << ", "
<< flags << ") invoked" << std::endl;
if (success == ARES_TRUE) server_state_cb_success_count++;
else server_state_cb_failure_count++;
// Check that the server string is as expected.
char *exp_server_string = *(char **)(data);
EXPECT_STREQ(exp_server_string, server_string);
// The callback should be invoked with either the UDP flag or the TCP flag,
// but not both.
ares_bool_t udp = (flags & ARES_SERV_STATE_UDP) ? ARES_TRUE: ARES_FALSE;
ares_bool_t tcp = (flags & ARES_SERV_STATE_TCP) ? ARES_TRUE: ARES_FALSE;
EXPECT_NE(udp, tcp);
}
// Verifies that a lookup answered successfully on the first attempt reports
// exactly one success, and no failures, through the server state callback.
TEST_P(MockChannelTest, ServStateCallbackSuccess) {
// Set up the server response. The server returns successfully with an answer
// to the query.
DNSPacket rsp;
rsp.set_response().set_aa()
.add_question(new DNSQuestion("www.google.com", T_A))
.add_answer(new DNSARR("www.google.com", 100, {2, 3, 4, 5}));
EXPECT_CALL(server_, OnRequest("www.google.com", T_A))
.WillOnce(SetReply(&server_, &rsp));
// Set up the server state callback. The channel used for this test has a
// single server configured.
// The callback reads the expected string through the pointer passed as
// userdata, so exp_server_string must stay alive until the lookup is done.
char *exp_server_string = ares_get_servers_csv(channel_);
ares_set_server_state_callback(channel_, ServerStateCallback,
&exp_server_string);
// Perform the hostname lookup. Expect 1 successful query to the server.
HostResult result;
// The counters are file-level statics shared by all ServStateCallback*
// tests, so reset them before issuing the query.
server_state_cb_success_count = 0;
server_state_cb_failure_count = 0;
ares_gethostbyname(channel_, "www.google.com.", AF_INET, HostCallback,
&result);
Process();
EXPECT_EQ(1, server_state_cb_success_count);
EXPECT_EQ(0, server_state_cb_failure_count);
EXPECT_TRUE(result.done_);
std::stringstream ss;
ss << result.host_;
EXPECT_EQ("{'www.google.com' aliases=[] addrs=[2.3.4.5]}", ss.str());
// Release the string obtained from ares_get_servers_csv().
ares_free_string(exp_server_string);
}
// Verifies that when the server answers every attempt with SERVFAIL, each
// attempt is reported as a failure through the server state callback and no
// success is ever reported.
TEST_P(MockChannelTest, ServStateCallbackFailure) {
// Set up the server response. The server always returns SERVFAIL.
DNSPacket rsp;
rsp.set_response().set_aa()
.add_question(new DNSQuestion("www.google.com", T_A));
rsp.set_rcode(SERVFAIL);
// ON_CALL/WillByDefault (rather than EXPECT_CALL/WillOnce) lets the same
// reply serve every retry.
ON_CALL(server_, OnRequest("www.google.com", T_A))
.WillByDefault(SetReply(&server_, &rsp));
// Set up the server state callback. The channel used for this test has a
// single server configured.
// The callback reads the expected string through the pointer passed as
// userdata, so exp_server_string must stay alive until the lookup is done.
char *exp_server_string = ares_get_servers_csv(channel_);
ares_set_server_state_callback(channel_, ServerStateCallback,
&exp_server_string);
// Perform the hostname lookup. Expect 3 failed queries to the server (due to
// retries).
HostResult result;
// The counters are file-level statics shared by all ServStateCallback*
// tests, so reset them before issuing the query.
server_state_cb_success_count = 0;
server_state_cb_failure_count = 0;
ares_gethostbyname(channel_, "www.google.com.", AF_INET, HostCallback,
&result);
Process();
EXPECT_EQ(0, server_state_cb_success_count);
EXPECT_EQ(3, server_state_cb_failure_count);
EXPECT_TRUE(result.done_);
EXPECT_EQ(ARES_ESERVFAIL, result.status_);
// Release the string obtained from ares_get_servers_csv().
ares_free_string(exp_server_string);
}
// Verifies that a server which fails the first attempt and answers the retry
// is reported once as failed and once as successful through the server state
// callback.
TEST_P(MockChannelTest, ServStateCallbackRecover) {
// Set up the server response. The server initially times out, but then
// returns successfully (with NXDOMAIN) on the first retry.
// An empty reply payload is used for the first request to provoke the
// timeout.
std::vector<byte> nothing;
DNSPacket rsp;
rsp.set_response().set_aa()
.add_question(new DNSQuestion("www.google.com", T_A));
rsp.set_rcode(NXDOMAIN);
EXPECT_CALL(server_, OnRequest("www.google.com", T_A))
.WillOnce(SetReplyData(&server_, nothing))
.WillOnce(SetReply(&server_, &rsp));
// Set up the server state callback. The channel used for this test has a
// single server configured.
// The callback reads the expected string through the pointer passed as
// userdata, so exp_server_string must stay alive until the lookup is done.
char *exp_server_string = ares_get_servers_csv(channel_);
ares_set_server_state_callback(channel_, ServerStateCallback,
&exp_server_string);
// Perform the hostname lookup. Expect 1 failed query and 1 successful query
// to the server.
HostResult result;
// The counters are file-level statics shared by all ServStateCallback*
// tests, so reset them before issuing the query.
server_state_cb_success_count = 0;
server_state_cb_failure_count = 0;
ares_gethostbyname(channel_, "www.google.com.", AF_INET, HostCallback,
&result);
Process();
EXPECT_EQ(1, server_state_cb_success_count);
EXPECT_EQ(1, server_state_cb_failure_count);
EXPECT_TRUE(result.done_);
// An NXDOMAIN answer counts as a successful server response for the
// callback, while the lookup itself completes with ARES_ENOTFOUND.
EXPECT_EQ(ARES_ENOTFOUND, result.status_);
// Release the string obtained from ares_get_servers_csv().
ares_free_string(exp_server_string);
}
TEST_P(MockChannelTest, ReInit) {
DNSPacket rsp;
rsp.set_response().set_aa()

Loading…
Cancel
Save