Rework WinAFD event code (#811)

We've had reports of user-after-free type crashes in Windows cleanup
code for the Event Thread. In evaluating the code, it appeared there
were some memory leaks on per-connection handles that may have remained
open during shutdown, while trying to resolve that it became apparent
the methodology chosen may not have been the right one for interfacing
with the Windows AFD system as stability issues were seen during this
debugging process.

Since this system is completely undocumented, there was no clear
resolution path other than to switch to the *other* methodology which
involves directly opening `\Device\Afd`, rather than spawning a "peer
socket" to use to queue AFD operations.

The original methodology chosen more closely resembled what is employed
by [libuv](https://github.com/libuv/libuv) and given its widespread use
was the reason it was used. The new methodology more closely resembles
[wepoll](https://github.com/piscisaureus/wepoll).

Its not clear if there are any scalability or performance advantages or
disadvantages for either method. They both seem like different ways to
do the same thing, but this current way does seem more stable.

Fixes #798 
Fix By: Brad House (@bradh352)
pull/812/head
Brad House 4 months ago committed by GitHub
parent 986bef771c
commit b19c186ce7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 25
      .github/workflows/msys2.yml
  2. 2
      appveyor.yml
  3. 2
      ci/test.sh
  4. 59
      src/lib/ares_event_thread.c
  5. 781
      src/lib/ares_event_win32.c
  6. 58
      src/lib/ares_event_win32.h
  7. 5
      src/lib/ares_sysconfig_win.c
  8. 58
      test/ares-test-mock-et.cc
  9. 4
      test/ares-test.cc

@ -9,6 +9,9 @@ concurrency:
group: ${{ github.ref }}
cancel-in-progress: true
env:
GTEST_ARGS: "-4 --gtest_filter=-*LiveSearchTXT*:*LiveSearchANY*:*LiveGetLocalhostByAddr*"
jobs:
build:
runs-on: windows-latest
@ -42,6 +45,7 @@ jobs:
mingw-w64-${{ matrix.env }}-cmake
mingw-w64-${{ matrix.env }}-ninja
mingw-w64-${{ matrix.env }}-gtest
mingw-w64-${{ matrix.env }}-lldb
${{ matrix.extra_packages }}
- name: Checkout c-ares
uses: actions/checkout@v4
@ -51,7 +55,11 @@ jobs:
cmake --build build_cmake
./build_cmake/bin/adig.exe www.google.com
./build_cmake/bin/ahost.exe www.google.com
GTEST_INSTALL_FAILURE_SIGNAL_HANDLER=1 ./build_cmake/bin/arestest.exe -4 -v --gtest_filter=-*LiveSearchTXT*:*LiveSearchANY*:*LiveGetLocalhostByAddr*
if [ "${{ matrix.msystem }}" != "MINGW32" ] ; then
lldb --batch -o 'run ${{ env.GTEST_ARGS }}' -k 'thread backtrace all' -k 'quit 1' ./build_cmake/bin/arestest.exe
else
./build_cmake/bin/arestest.exe ${{ env.GTEST_ARGS }}
fi
- name: "Autotools: build and test c-ares"
run: |
autoreconf -fi
@ -61,9 +69,14 @@ jobs:
make -j3
./src/tools/adig.exe www.google.com
./src/tools/ahost.exe www.google.com
GTEST_INSTALL_FAILURE_SIGNAL_HANDLER=1 ./test/arestest.exe -4 -v --gtest_filter=-*LiveSearchTXT*:*LiveSearchANY*:*LiveGetLocalhostByAddr*
if [ "${{ matrix.msystem }}" != "MINGW32" ] ; then
lldb --batch -o 'run ${{ env.GTEST_ARGS }}' -k 'thread backtrace all' -k 'quit 1' ./test/arestest.exe
else
./test/arestest.exe ${{ env.GTEST_ARGS }}
fi
- name: "CMake: UBSAN: build and test c-ares"
if: ${{ matrix.env == 'clang-x86_64' || matrix.env == 'clang-i686' }}
# Bogus alignment errors on i686, so lets not run UBSAN on i686.
if: ${{ matrix.env == 'clang-x86_64' }}
env:
CMAKE_OPTS: "-DCMAKE_CXX_FLAGS=-fsanitize=undefined -DCMAKE_C_FLAGS=-fsanitize=undefined -DCMAKE_SHARED_LINKER_FLAGS=-fsanitize=undefined -DCMAKE_EXE_LINKER_FLAGS=-fsanitize=undefined"
run: |
@ -71,7 +84,8 @@ jobs:
cmake --build build_ubsan
./build_ubsan/bin/adig.exe www.google.com
./build_ubsan/bin/ahost.exe www.google.com
GTEST_INSTALL_FAILURE_SIGNAL_HANDLER=1 ./build_ubsan/bin/arestest.exe -4 -v --gtest_filter=-*LiveSearchTXT*:*LiveSearchANY*:*LiveGetLocalhostByAddr*
lldb --batch -o 'run ${{ env.GTEST_ARGS }}' -k 'thread backtrace all' -k 'quit 1' ./build_ubsan/bin/arestest.exe
# ./build_ubsan/bin/arestest.exe ${{ env.GTEST_ARGS }}
- name: "CMake: ASAN: build and test c-ares"
if: ${{ matrix.env == 'clang-x86_64' || matrix.env == 'clang-i686' }}
env:
@ -81,7 +95,8 @@ jobs:
cmake --build build_asan
./build_asan/bin/adig.exe www.google.com
./build_asan/bin/ahost.exe www.google.com
GTEST_INSTALL_FAILURE_SIGNAL_HANDLER=1 ./build_asan/bin/arestest.exe -4 -v --gtest_filter=-*LiveSearchTXT*:*LiveSearchANY*:*LiveGetLocalhostByAddr*
lldb --batch -o 'run ${{ env.GTEST_ARGS }}' -k 'thread backtrace all' -k 'quit 1' ./build_asan/bin/arestest.exe
# ./build_asan/bin/arestest.exe ${{ env.GTEST_ARGS }}
- name: "Autotools: Static Analyzer: build c-ares"
if: ${{ matrix.env == 'clang-x86_64' || matrix.env == 'clang-i686' }}
# Cmake won't work because it somehow mangles linker args and it can't find core windows libraries

@ -138,7 +138,7 @@ test_script:
- if "%SYSTEM%" == "CONSOLE" if "%CONFTOOL%" == "CMAKE" if not "%SKIP_TESTS%" == "yes" cd %TESTDIR%
- if "%SYSTEM%" == "CONSOLE" if "%CONFTOOL%" == "CMAKE" if not "%SKIP_TESTS%" == "yes" .\adig.exe www.google.com
- if "%SYSTEM%" == "CONSOLE" if "%CONFTOOL%" == "CMAKE" if not "%SKIP_TESTS%" == "yes" .\ahost.exe www.google.com
- if "%SYSTEM%" == "CONSOLE" if "%CONFTOOL%" == "CMAKE" if not "%SKIP_TESTS%" == "yes" .\arestest.exe -4 -v --gtest_filter=-*LiveSearchTXT*:*LiveSearchANY*:*LiveGetLocalhostByAddr*
- if "%SYSTEM%" == "CONSOLE" if "%CONFTOOL%" == "CMAKE" if not "%SKIP_TESTS%" == "yes" .\arestest.exe --gtest_filter=-*LiveSearchTXT*:*LiveSearchANY*:*LiveGetLocalhostByAddr*
- if "%SYSTEM%" == "CONSOLE" if "%CONFTOOL%" == "CMAKE" if not "%SKIP_TESTS%" == "yes" .\dnsdump.exe "%APPVEYOR_BUILD_FOLDER%\test\fuzzinput\answer_a" "%APPVEYOR_BUILD_FOLDER%\test\fuzzinput\answer_aaaa"
#on_finish:

@ -36,7 +36,7 @@ export GTEST_INSTALL_FAILURE_SIGNAL_HANDLER
$TEST_WRAP "${TOOLSBIN}/adig" www.google.com
$TEST_WRAP "${TOOLSBIN}/ahost" www.google.com
cd "${TESTSBIN}"
$TEST_WRAP ./arestest -4 -v $TEST_FILTER
$TEST_WRAP ./arestest -4 $TEST_FILTER
./aresfuzz ${TESTDIR}/fuzzinput/*
./aresfuzzname ${TESTDIR}/fuzznames/*
./dnsdump "${TESTDIR}/fuzzinput/answer_a" "${TESTDIR}/fuzzinput/answer_aaaa"

@ -262,6 +262,35 @@ static void ares_event_process_updates(ares_event_thread_t *e)
}
}
static void ares_event_thread_cleanup(ares_event_thread_t *e)
{
/* Manually free any updates that weren't processed */
if (e->ev_updates != NULL) {
ares__llist_node_t *node;
while ((node = ares__llist_node_first(e->ev_updates)) != NULL) {
ares_event_destroy_cb(ares__llist_node_claim(node));
}
ares__llist_destroy(e->ev_updates);
e->ev_updates = NULL;
}
if (e->ev_sock_handles != NULL) {
ares__htable_asvp_destroy(e->ev_sock_handles);
e->ev_sock_handles = NULL;
}
if (e->ev_cust_handles != NULL) {
ares__htable_vpvp_destroy(e->ev_cust_handles);
e->ev_cust_handles = NULL;
}
if (e->ev_sys != NULL && e->ev_sys->destroy != NULL) {
e->ev_sys->destroy(e);
e->ev_sys = NULL;
}
}
static void *ares_event_thread(void *arg)
{
ares_event_thread_t *e = arg;
@ -296,14 +325,16 @@ static void *ares_event_thread(void *arg)
ares__thread_mutex_lock(e->mutex);
}
/* Lets cleanup while we're in the thread itself */
ares_event_thread_cleanup(e);
ares__thread_mutex_unlock(e->mutex);
return NULL;
}
static void ares_event_thread_destroy_int(ares_event_thread_t *e)
{
ares__llist_node_t *node;
/* Wake thread and tell it to shutdown if it exists */
ares__thread_mutex_lock(e->mutex);
if (e->isup) {
@ -314,26 +345,14 @@ static void ares_event_thread_destroy_int(ares_event_thread_t *e)
/* Wait for thread to shutdown */
if (e->thread) {
ares__thread_join(e->thread, NULL);
void *rv = NULL;
ares__thread_join(e->thread, &rv);
e->thread = NULL;
}
/* Manually free any updates that weren't processed */
while ((node = ares__llist_node_first(e->ev_updates)) != NULL) {
ares_event_destroy_cb(ares__llist_node_claim(node));
}
ares__llist_destroy(e->ev_updates);
e->ev_updates = NULL;
ares__htable_asvp_destroy(e->ev_sock_handles);
e->ev_sock_handles = NULL;
ares__htable_vpvp_destroy(e->ev_cust_handles);
e->ev_cust_handles = NULL;
if (e->ev_sys && e->ev_sys->destroy) {
e->ev_sys->destroy(e);
}
/* If the event thread ever got to the point of starting, this is a no-op
* as it runs this same cleanup when it shuts down */
ares_event_thread_cleanup(e);
ares__thread_mutex_destroy(e->mutex);
e->mutex = NULL;
@ -350,6 +369,8 @@ void ares_event_thread_destroy(ares_channel_t *channel)
}
ares_event_thread_destroy_int(e);
channel->sock_state_cb_data = NULL;
channel->sock_state_cb = NULL;
}
static const ares_event_sys_t *ares_event_fetch_sys(ares_evsys_t evsys)

File diff suppressed because it is too large Load Diff

@ -67,17 +67,53 @@ typedef VOID(NTAPI *PIO_APC_ROUTINE)(PVOID ApcContext,
# define STATUS_CANCELLED ((NTSTATUS)0xC0000120L)
# define STATUS_NOT_FOUND ((NTSTATUS)0xC0000225L)
typedef struct _UNICODE_STRING {
USHORT Length;
USHORT MaximumLength;
LPCWSTR Buffer;
} UNICODE_STRING, *PUNICODE_STRING;
typedef struct _OBJECT_ATTRIBUTES {
ULONG Length;
HANDLE RootDirectory;
PUNICODE_STRING ObjectName;
ULONG Attributes;
PVOID SecurityDescriptor;
PVOID SecurityQualityOfService;
} OBJECT_ATTRIBUTES, *POBJECT_ATTRIBUTES;
# ifndef FILE_OPEN
# define FILE_OPEN 0x00000001UL
# endif
/* Not sure what headers might have these */
# define IOCTL_AFD_POLL 0x00012024
# define AFD_POLL_RECEIVE 0x0001
# define AFD_POLL_RECEIVE_EXPEDITED 0x0002
# define AFD_POLL_SEND 0x0004
# define AFD_POLL_DISCONNECT 0x0008
# define AFD_POLL_ABORT 0x0010
# define AFD_POLL_LOCAL_CLOSE 0x0020
# define AFD_POLL_ACCEPT 0x0080
# define AFD_POLL_CONNECT_FAIL 0x0100
# define AFD_POLL_RECEIVE_BIT 0
# define AFD_POLL_RECEIVE (1 << AFD_POLL_RECEIVE_BIT)
# define AFD_POLL_RECEIVE_EXPEDITED_BIT 1
# define AFD_POLL_RECEIVE_EXPEDITED (1 << AFD_POLL_RECEIVE_EXPEDITED_BIT)
# define AFD_POLL_SEND_BIT 2
# define AFD_POLL_SEND (1 << AFD_POLL_SEND_BIT)
# define AFD_POLL_DISCONNECT_BIT 3
# define AFD_POLL_DISCONNECT (1 << AFD_POLL_DISCONNECT_BIT)
# define AFD_POLL_ABORT_BIT 4
# define AFD_POLL_ABORT (1 << AFD_POLL_ABORT_BIT)
# define AFD_POLL_LOCAL_CLOSE_BIT 5
# define AFD_POLL_LOCAL_CLOSE (1 << AFD_POLL_LOCAL_CLOSE_BIT)
# define AFD_POLL_CONNECT_BIT 6
# define AFD_POLL_CONNECT (1 << AFD_POLL_CONNECT_BIT)
# define AFD_POLL_ACCEPT_BIT 7
# define AFD_POLL_ACCEPT (1 << AFD_POLL_ACCEPT_BIT)
# define AFD_POLL_CONNECT_FAIL_BIT 8
# define AFD_POLL_CONNECT_FAIL (1 << AFD_POLL_CONNECT_FAIL_BIT)
# define AFD_POLL_QOS_BIT 9
# define AFD_POLL_QOS (1 << AFD_POLL_QOS_BIT)
# define AFD_POLL_GROUP_QOS_BIT 10
# define AFD_POLL_GROUP_QOS (1 << AFD_POLL_GROUP_QOS_BIT)
# define AFD_NUM_POLL_EVENTS 11
# define AFD_POLL_ALL ((1 << AFD_NUM_POLL_EVENTS) - 1)
typedef struct _AFD_POLL_HANDLE_INFO {
HANDLE Handle;
@ -101,6 +137,12 @@ typedef NTSTATUS(NTAPI *NtDeviceIoControlFile_t)(
PIO_STATUS_BLOCK IoStatusBlock, ULONG IoControlCode, PVOID InputBuffer,
ULONG InputBufferLength, PVOID OutputBuffer, ULONG OutputBufferLength);
typedef NTSTATUS(NTAPI *NtCreateFile_t)(
PHANDLE FileHandle, ACCESS_MASK DesiredAccess,
POBJECT_ATTRIBUTES ObjectAttributes, PIO_STATUS_BLOCK IoStatusBlock,
PLARGE_INTEGER AllocationSize, ULONG FileAttributes, ULONG ShareAccess,
ULONG CreateDisposition, ULONG CreateOptions, PVOID EaBuffer, ULONG EaLength);
/* On UWP/Windows Store, these definitions aren't there for some reason */
# ifndef SIO_BSP_HANDLE_POLL
# define SIO_BSP_HANDLE_POLL 0x4800001D

@ -356,10 +356,11 @@ static ares_bool_t get_DNS_Windows(char **outptr)
* compute the resulting total metric, just as Windows routing will do.
* Then, sort all the addresses found by the metric.
*/
for (ipaDNSAddr = ipaaEntry->FirstDnsServerAddress; ipaDNSAddr;
for (ipaDNSAddr = ipaaEntry->FirstDnsServerAddress; ipaDNSAddr != NULL;
ipaDNSAddr = ipaDNSAddr->Next) {
char ipaddr[INET6_ADDRSTRLEN] = "";
namesrvr.sa = ipaDNSAddr->Address.lpSockaddr;
namesrvr.sa = ipaDNSAddr->Address.lpSockaddr;
if (namesrvr.sa->sa_family == AF_INET) {
if ((namesrvr.sa4->sin_addr.S_un.S_addr == INADDR_ANY) ||

@ -103,8 +103,8 @@ static int sock_cb_count = 0;
static int SocketConnectCallback(ares_socket_t fd, int type, void *data) {
int rc = *(int*)data;
(void)type;
if (verbose) std::cerr << "SocketConnectCallback(" << fd << ") invoked" << std::endl;
sock_cb_count++;
if (verbose) std::cerr << "SocketConnectCallback(fd: " << fd << ", cnt: " << sock_cb_count << ") invoked" << std::endl;
return rc;
}
@ -215,6 +215,55 @@ TEST_P(MockUDPEventThreadMaxQueriesTest, GetHostByNameParallelLookups) {
}
}
class MockUDPEventThreadSingleQueryPerConnTest
: public MockEventThreadOptsTest,
public ::testing::WithParamInterface<std::tuple<ares_evsys_t,int>> {
public:
MockUDPEventThreadSingleQueryPerConnTest()
: MockEventThreadOptsTest(1, std::get<0>(GetParam()), std::get<1>(GetParam()), false,
FillOptions(&opts_),
ARES_OPT_UDP_MAX_QUERIES) {}
static struct ares_options* FillOptions(struct ares_options * opts) {
memset(opts, 0, sizeof(struct ares_options));
opts->udp_max_queries = 1;
return opts;
}
private:
struct ares_options opts_;
};
#define LOTSOFCONNECTIONS_CNT 64
TEST_P(MockUDPEventThreadSingleQueryPerConnTest, LotsOfConnections) {
DNSPacket rsp;
rsp.set_response().set_aa()
.add_question(new DNSQuestion("www.google.com", T_A))
.add_answer(new DNSARR("www.google.com", 100, {2, 3, 4, 5}));
ON_CALL(server_, OnRequest("www.google.com", T_A))
.WillByDefault(SetReply(&server_, &rsp));
// Get notified of new sockets so we can validate how many are created
int rc = ARES_SUCCESS;
ares_set_socket_callback(channel_, SocketConnectCallback, &rc);
sock_cb_count = 0;
HostResult result[LOTSOFCONNECTIONS_CNT];
for (size_t i=0; i<LOTSOFCONNECTIONS_CNT; i++) {
ares_gethostbyname(channel_, "www.google.com.", AF_INET, HostCallback, &result[i]);
}
Process();
EXPECT_EQ(LOTSOFCONNECTIONS_CNT, sock_cb_count);
for (size_t i=0; i<LOTSOFCONNECTIONS_CNT; i++) {
std::stringstream ss;
EXPECT_TRUE(result[i].done_);
ss << result[i].host_;
EXPECT_EQ("{'www.google.com' aliases=[] addrs=[2.3.4.5]}", ss.str());
}
}
class CacheQueriesEventThreadTest
: public MockEventThreadOptsTest,
public ::testing::WithParamInterface<std::tuple<ares_evsys_t,int>> {
@ -817,9 +866,9 @@ TEST_P(MockEventThreadTest, BulkCancel) {
for (size_t i = 0; i<BULKCANCEL_CNT; i++) {
EXPECT_TRUE(result[i].done_);
EXPECT_TRUE(result[i].status_ == ARES_ECANCELLED || result[i].status_ == ARES_SUCCESS);
if (result[i].status_ == ARES_SUCCESS)
if (result[i].done_ && result[i].status_ == ARES_SUCCESS)
success_cnt++;
if (result[i].status_ == ARES_ECANCELLED)
if (result[i].done_ && result[i].status_ == ARES_ECANCELLED)
cancel_cnt++;
}
if (verbose) std::cerr << "success: " << success_cnt << ", cancel: " << cancel_cnt << std::endl;
@ -1491,6 +1540,9 @@ INSTANTIATE_TEST_SUITE_P(TransportModes, NoRotateMultiMockEventThreadTest, ::tes
INSTANTIATE_TEST_SUITE_P(TransportModes, ServerFailoverOptsMockEventThreadTest, ::testing::ValuesIn(ares::test::evsys_families_modes), ares::test::PrintEvsysFamilyMode);
INSTANTIATE_TEST_SUITE_P(AddressFamilies, MockUDPEventThreadSingleQueryPerConnTest, ::testing::ValuesIn(ares::test::evsys_families), ares::test::PrintEvsysFamily);
} // namespace test
} // namespace ares

4
test/ares-test.cc vendored

@ -865,6 +865,8 @@ void MockEventThreadOptsTest::Process(unsigned int cancel_ms) {
}
while (ares_queue_active_queries(channel_)) {
//if (verbose) std::cerr << "pending queries: " << ares_queue_active_queries(channel_) << std::endl;
int nfds = 0;
fd_set readers;
@ -912,6 +914,8 @@ void MockEventThreadOptsTest::Process(unsigned int cancel_ms) {
}
}
}
//if (verbose) std::cerr << "pending queries at process end: " << ares_queue_active_queries(channel_) << std::endl;
}
std::ostream& operator<<(std::ostream& os, const HostResult& result) {

Loading…
Cancel
Save