From 4e9b6f7806dc15d8c5695576cb04ee22aec2058c Mon Sep 17 00:00:00 2001 From: Brad House Date: Fri, 19 Jul 2024 10:55:49 -0400 Subject: [PATCH] Prevent complex recursion during query requeing and connection cleanup c-ares utilizes recursion for some operations, and some of these processes can have unintended side effects, such as if a callback is called that then recurses into the same function. This can cause strange cleanup conditions that lead to crashes. Try to disassociate queries with connections as early as possible and move cleaning up unneeded connections to its own scan rather than trying to detect each time a query is disassociated from a connection. Fix By: Brad House (@bradh352) --- src/lib/ares__close_sockets.c | 80 +++++++++++++++++++++-------------- src/lib/ares_cancel.c | 8 ++-- src/lib/ares_private.h | 3 +- src/lib/ares_process.c | 41 ++++++++++-------- test/ares-test-mock-et.cc | 2 +- 5 files changed, 77 insertions(+), 57 deletions(-) diff --git a/src/lib/ares__close_sockets.c b/src/lib/ares__close_sockets.c index 39f40823..a12606c3 100644 --- a/src/lib/ares__close_sockets.c +++ b/src/lib/ares__close_sockets.c @@ -81,41 +81,59 @@ void ares__close_sockets(struct server_state *server) } } -void ares__check_cleanup_conn(const ares_channel_t *channel, - struct server_connection *conn) +void ares__check_cleanup_conns(const ares_channel_t *channel) { - ares_bool_t do_cleanup = ARES_FALSE; + ares__slist_node_t *snode; - if (channel == NULL || conn == NULL) { + if (channel == NULL) { return; } - if (ares__llist_len(conn->queries_to_conn)) { - return; - } - - /* If we are configured not to stay open, close it out */ - if (!(channel->flags & ARES_FLAG_STAYOPEN)) { - do_cleanup = ARES_TRUE; + /* Iterate across each server */ + for (snode = ares__slist_node_first(channel->servers); snode != NULL; + snode = ares__slist_node_next(snode)) { + + struct server_state *server = ares__slist_node_val(snode); + ares__llist_node_t *cnode; + + /* Iterate across each connection */ + cnode = ares__llist_node_first(server->connections); + while (cnode != NULL) { + ares__llist_node_t *next = ares__llist_node_next(cnode); + struct server_connection *conn = ares__llist_node_val(cnode); + ares_bool_t do_cleanup = ARES_FALSE; + cnode = next; + + /* Has connections, not eligible */ + if (ares__llist_len(conn->queries_to_conn)) { + continue; + } + + /* If we are configured not to stay open, close it out */ + if (!(channel->flags & ARES_FLAG_STAYOPEN)) { + do_cleanup = ARES_TRUE; + } + + /* If the associated server has failures, close it out. Resetting the + * connection (and specifically the source port number) can help resolve + * situations where packets are being dropped. + */ + if (conn->server->consec_failures > 0) { + do_cleanup = ARES_TRUE; + } + + /* If the udp connection hit its max queries, always close it */ + if (!conn->is_tcp && channel->udp_max_queries > 0 && + conn->total_queries >= channel->udp_max_queries) { + do_cleanup = ARES_TRUE; + } + + if (!do_cleanup) { + continue; + } + + /* Clean it up */ + ares__close_connection(conn, ARES_SUCCESS); + } } - - /* If the associated server has failures, close it out. Resetting the - * connection (and specifically the source port number) can help resolve - * situations where packets are being dropped. - */ - if (conn->server->consec_failures > 0) { - do_cleanup = ARES_TRUE; - } - - /* If the udp connection hit its max queries, always close it */ - if (!conn->is_tcp && channel->udp_max_queries > 0 && - conn->total_queries >= channel->udp_max_queries) { - do_cleanup = ARES_TRUE; - } - - if (!do_cleanup) { - return; - } - - ares__close_connection(conn, ARES_SUCCESS); } diff --git a/src/lib/ares_cancel.c b/src/lib/ares_cancel.c index 5a9fb722..464a547f 100644 --- a/src/lib/ares_cancel.c +++ b/src/lib/ares_cancel.c @@ -64,28 +64,26 @@ void ares_cancel(ares_channel_t *channel) node = ares__llist_node_first(list_copy); while (node != NULL) { struct query *query; - struct server_connection *conn; /* Cache next since this node is being deleted */ next = ares__llist_node_next(node); query = ares__llist_node_claim(node); - conn = query->conn; query->node_all_queries = NULL; /* NOTE: its possible this may enqueue new queries */ query->callback(query->arg, ARES_ECANCELLED, 0, NULL); ares__free_query(query); - /* See if the connection should be cleaned up */ - ares__check_cleanup_conn(channel, conn); - node = next; } ares__llist_destroy(list_copy); } + /* See if the connections should be cleaned up */ + ares__check_cleanup_conns(channel); + ares_queue_notify_empty(channel); done: diff --git a/src/lib/ares_private.h b/src/lib/ares_private.h index 01b2f812..85c0c39d 100644 --- a/src/lib/ares_private.h +++ b/src/lib/ares_private.h @@ -401,8 +401,7 @@ void ares__dnsrec_convert_cb(void *arg, ares_status_t status, size_t timeouts, void ares__close_connection(struct server_connection *conn, ares_status_t requeue_status); void ares__close_sockets(struct server_state *server); -void ares__check_cleanup_conn(const ares_channel_t *channel, - struct server_connection *conn); +void ares__check_cleanup_conns(const ares_channel_t *channel); void ares__free_query(struct query *query); ares_rand_state *ares__init_rand_state(void); diff --git a/src/lib/ares_process.c b/src/lib/ares_process.c index 316666dd..f6e22b35 100644 --- a/src/lib/ares_process.c +++ b/src/lib/ares_process.c @@ -72,6 +72,17 @@ static ares_bool_t same_address(const struct sockaddr *sa, static void end_query(ares_channel_t *channel, struct query *query, ares_status_t status, const ares_dns_record_t *dnsrec); + +static void ares__query_disassociate_from_conn(struct query *query) +{ + /* If its not part of a connection, it can't be tracked for timeouts either */ + ares__slist_node_destroy(query->node_queries_by_timeout); + ares__llist_node_destroy(query->node_queries_to_conn); + query->node_queries_by_timeout = NULL; + query->node_queries_to_conn = NULL; + query->conn = NULL; +} + /* Invoke the server state callback after a success or failure */ static void invoke_server_state_cb(const struct server_state *server, ares_bool_t success, int flags) @@ -206,6 +217,8 @@ static void processfds(ares_channel_t *channel, fd_set *read_fds, /* Write last as the other 2 operations might have triggered writes */ write_tcp_data(channel, write_fds, write_fd); + /* See if any connections should be cleaned up */ + ares__check_cleanup_conns(channel); ares__channel_unlock(channel); } @@ -398,8 +411,6 @@ static void read_tcp_data(ares_channel_t *channel, /* Since we processed the answer, clear the tag so space can be reclaimed */ ares__buf_tag_clear(server->tcp_parser); } - - ares__check_cleanup_conn(channel, conn); } static int socket_list_append(ares_socket_t **socketlist, ares_socket_t fd, @@ -523,8 +534,6 @@ static void read_udp_packets_fd(ares_channel_t *channel, /* Try to read again only if *we* set up the socket, otherwise it may be * a blocking socket and would cause recvfrom to hang. */ } while (read_len >= 0 && channel->sock_funcs == NULL); - - ares__check_cleanup_conn(channel, conn); } static void read_packets(ares_channel_t *channel, fd_set *read_fds, @@ -597,13 +606,15 @@ static void read_packets(ares_channel_t *channel, fd_set *read_fds, /* If any queries have timed out, note the timeout and move them on. */ static void process_timeouts(ares_channel_t *channel, struct timeval *now) { - ares__slist_node_t *node = - ares__slist_node_first(channel->queries_by_timeout); - while (node != NULL) { + ares__slist_node_t *node; + + /* Just keep popping off the first as this list will re-sort as things come + * and go. We don't want to try to rely on 'next' as some operation might + * cause a cleanup of that pointer and would become invalid */ + while ((node = ares__slist_node_first(channel->queries_by_timeout)) != NULL) { struct query *query = ares__slist_node_val(node); - /* Node might be removed, cache next */ - ares__slist_node_t *next = ares__slist_node_next(node); struct server_connection *conn; + /* Since this is sorted, as soon as we hit a query that isn't timed out, * break */ if (!ares__timedout(now, &query->timeout)) { @@ -615,9 +626,6 @@ static void process_timeouts(ares_channel_t *channel, struct timeval *now) conn = query->conn; server_increment_failures(conn->server, query->using_tcp); ares__requeue_query(query, now, ARES_ETIMEOUT); - ares__check_cleanup_conn(channel, conn); - - node = next; } } @@ -821,6 +829,8 @@ ares_status_t ares__requeue_query(struct query *query, struct timeval *now, ares_channel_t *channel = query->channel; size_t max_tries = ares__slist_len(channel->servers) * channel->tries; + ares__query_disassociate_from_conn(query); + if (status != ARES_SUCCESS) { query->error_status = status; } @@ -1003,8 +1013,6 @@ ares_status_t ares__send_query(struct query *query, struct timeval *now) ares_status_t status; ares_bool_t new_connection = ARES_FALSE; - query->conn = NULL; - /* Choose the server to send the query to */ if (channel->rotate) { /* Pull random server */ @@ -1235,12 +1243,9 @@ static ares_bool_t same_address(const struct sockaddr *sa, static void ares_detach_query(struct query *query) { /* Remove the query from all the lists in which it is linked */ + ares__query_disassociate_from_conn(query); ares__htable_szvp_remove(query->channel->queries_by_qid, query->qid); - ares__slist_node_destroy(query->node_queries_by_timeout); - ares__llist_node_destroy(query->node_queries_to_conn); ares__llist_node_destroy(query->node_all_queries); - query->node_queries_by_timeout = NULL; - query->node_queries_to_conn = NULL; query->node_all_queries = NULL; } diff --git a/test/ares-test-mock-et.cc b/test/ares-test-mock-et.cc index 5117d9f8..c51fc52b 100644 --- a/test/ares-test-mock-et.cc +++ b/test/ares-test-mock-et.cc @@ -133,7 +133,7 @@ TEST_P(MockUDPEventThreadTest, BadLoopbackServerNoTimeouts) { EXPECT_EQ(ARES_ECONNREFUSED, result[i].status_); EXPECT_EQ(0, result[i].timeouts_); #else - EXPECT_TRUE(result[i].status_ == ARES_ECONNREFUSED || result[i].status_ == ARES_ETIMEOUT); + EXPECT_TRUE(result[i].status_ == ARES_ECONNREFUSED || result[i].status_ == ARES_ETIMEOUT || result[i].status_ == ARES_ESERVFAIL); #endif } }