Keep idle reused outgoing TCP connections under read

A reused TCP/TLS dispatch with no outstanding responses was left in the
reuse pool with no read pending, so a peer closing the idle connection
went unnoticed: the socket lingered in CLOSE-WAIT and the dead dispatch
was later handed to a new query, which failed and the fetch timed out.
Keep a read pending on an idle connected dispatch, bounded by an idle
timeout, so the close is seen promptly and the connection is dropped
from the pool instead of reused.

The idle read may only be (re)armed while the dispatch is still
connected; arming it on a dispatch that is already shutting down
re-reads a dying handle and double-schedules a netmgr job.

On shutdown, close the connection as soon as the dispatch reaches its
terminal state instead of waiting for the last reference to drop, so an
unexpected read (or a peer-side close) cannot leave the socket in
CLOSE-WAIT while a reference still lingers.
This commit is contained in:
Ondřej Surý 2026-06-21 09:24:14 +02:00
parent a65ece3648
commit febeac215d
3 changed files with 252 additions and 42 deletions

View file

@ -127,6 +127,7 @@ extern unsigned int dns_zone_mkey_month;
extern unsigned int dns_adb_entrywindow;
extern unsigned int dns_adb_cachemin;
extern size_t dns_dispatch_tcppipelining;
extern unsigned int dns_dispatch_tcp_idle_timeout;
static bool want_stats = false;
static char absolute_conffile[PATH_MAX];
@ -747,6 +748,8 @@ parse_T_opt(char *option) {
"least 1");
}
dns_dispatch_tcppipelining = pipelining;
} else if (!strncmp(option, "tcpidletimeout=", 15)) {
dns_dispatch_tcp_idle_timeout = atoi(option + 15);
} else {
fprintf(stderr, "unknown -T flag '%s'\n", option);
}

View file

@ -56,6 +56,15 @@
*/
size_t dns_dispatch_tcppipelining = 256;
/*
* Idle timeout (in milliseconds) for a reused outgoing TCP connection. While a
* dispatch has no outstanding responses we keep a read pending so a
* peer-initiated close is noticed promptly; this bounds how long such an idle
* connection is kept open for reuse. Can be overridden via
* 'named -T tcpidletimeout=N'.
*/
unsigned int dns_dispatch_tcp_idle_timeout = 5000;
typedef ISC_LIST(dns_dispentry_t) dns_displist_t;
struct dns_dispatchmgr {
@ -740,6 +749,11 @@ tcp_recv_shutdown(dns_dispatch_t *disp, dns_displist_t *resps,
tcp_recv_add(resps, resp, result);
}
disp->state = DNS_DISPATCHSTATE_CANCELED;
/* Close now so the socket doesn't linger in CLOSE-WAIT. */
if (disp->handle != NULL) {
isc_nmhandle_close(disp->handle);
}
}
static void
@ -754,6 +768,26 @@ tcp_recv_processall(dns_displist_t *resps, isc_region_t *region) {
}
}
static void
tcp_startrecv_idle(dns_dispatch_t *disp) {
REQUIRE(dns_dispatch_tcp_idle_timeout > 0);
/*
* No outstanding responses, but the connection is still up and
* reusable. Keep a read pending (bounded by the idle timeout)
* so that a peer-initiated close is detected promptly;
* otherwise the socket would linger in CLOSE-WAIT and be handed
* out again as a dead reused connection.
*/
isc_nmhandle_cleartimeout(disp->handle);
isc_nmhandle_settimeout(disp->handle, dns_dispatch_tcp_idle_timeout);
if (!disp->reading) {
dispatch_log(disp, ISC_LOG_DEBUG(90), "keeping idle read on %p",
disp->handle);
tcp_startrecv(disp, NULL);
}
}
/*
* General flow:
*
@ -768,7 +802,7 @@ tcp_recv_processall(dns_displist_t *resps, isc_region_t *region) {
* restart.
*/
static void
tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region,
tcp_recv(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region,
void *arg) {
dns_dispatch_t *disp = (dns_dispatch_t *)arg;
dns_dispentry_t *resp = NULL;
@ -777,6 +811,7 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region,
dns_displist_t resps = ISC_LIST_INITIALIZER;
isc_time_t now;
int timeout = 0;
isc_result_t result = eresult;
REQUIRE(VALID_DISPATCH(disp));
@ -786,7 +821,7 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region,
dispatch_log(disp, ISC_LOG_DEBUG(90),
"TCP read:%s:requests %" PRIuFAST32,
isc_result_totext(result), disp->requests);
isc_result_totext(eresult), disp->requests);
peer = isc_nmhandle_peeraddr(handle);
@ -794,7 +829,7 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region,
/*
* Phase 1: Process timeout and success.
*/
switch (result) {
switch (eresult) {
case ISC_R_TIMEDOUT:
/*
* Time out the oldest response in the active queue.
@ -819,7 +854,13 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region,
*/
if (result == ISC_R_NOTFOUND) {
if (disp->timedout > 0) {
if (eresult == ISC_R_TIMEDOUT) {
/*
* An idle keep-alive read timed out with no outstanding
* responses.
*/
result = ISC_R_CANCELED;
} else if (disp->timedout > 0) {
/* There was active query that timed-out before */
disp->timedout--;
} else {
@ -857,15 +898,16 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region,
case ISC_R_TIMEDOUT:
case ISC_R_NOTFOUND:
break;
case ISC_R_SHUTTINGDOWN:
case ISC_R_CANCELED:
case ISC_R_EOF:
case ISC_R_CONNECTIONRESET:
isc_sockaddr_format(&peer, buf, sizeof(buf));
dispatch_log(disp, ISC_LOG_DEBUG(90),
"shutting down TCP: %s: %s", buf,
isc_result_totext(result));
if (isc_log_wouldlog(ISC_LOG_DEBUG(90))) {
isc_sockaddr_format(&peer, buf, sizeof(buf));
dispatch_log(disp, ISC_LOG_DEBUG(90),
"shutting down TCP: %s: %s", buf,
isc_result_totext(result));
}
tcp_recv_shutdown(disp, &resps, result);
break;
default:
@ -900,6 +942,16 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region,
*/
tcp_recv_processall(&resps, region);
/*
* Phase 7: Restart reading on idle connections.
*/
if (ISC_LIST_EMPTY(disp->active) &&
disp->state == DNS_DISPATCHSTATE_CONNECTED &&
dns_dispatch_tcp_idle_timeout > 0)
{
tcp_startrecv_idle(disp);
}
dns_dispatch_detach(&disp); /* DISPATCH002 */
}
@ -1732,43 +1784,29 @@ tcp_dispentry_cancel(dns_dispentry_t *resp, isc_result_t result) {
INSIST(!ISC_LINK_LINKED(resp, alink));
if (ISC_LIST_EMPTY(disp->active)) {
if (ISC_LIST_EMPTY(disp->active) &&
disp->state == DNS_DISPATCHSTATE_CONNECTED)
{
INSIST(disp->handle != NULL);
#if DISPATCH_TCP_KEEPALIVE
/*
* This is an experimental code that keeps the TCP
* connection open for 1 second before it is finally
* closed. By keeping the TCP connection open, it can
* be reused by dns_request that uses
* dns_dispatch_gettcp() to join existing TCP
* connections.
*
* It is disabled for now, because it changes the
* behaviour, but I am keeping the code here for future
* reference when we improve the dns_dispatch to reuse
* the TCP connections also in the resolver.
*
* The TCP connection reuse should be seamless and not
* require any extra handling on the client side though.
* This was the last outstanding response on a still
* connected dispatch. Keep the connection open with a
* read pending (bounded by the idle timeout) so that it
* can be reused by a subsequent query and so that a
* peer-initiated close is noticed promptly instead of
* leaving the socket stuck in CLOSE-WAIT. If the
* dispatch is already shutting down, leave it alone so
* it can be torn down.
*/
isc_nmhandle_cleartimeout(disp->handle);
isc_nmhandle_settimeout(disp->handle, 1000);
if (!disp->reading) {
dispentry_log(resp, ISC_LOG_DEBUG(90),
"final 1 second timeout on %p",
disp->handle);
tcp_startrecv(disp, NULL);
}
#else
if (disp->reading) {
if (dns_dispatch_tcp_idle_timeout > 0) {
tcp_startrecv_idle(disp);
} else if (disp->reading) {
dispentry_log(resp, ISC_LOG_DEBUG(90),
"canceling read on %p",
disp->handle);
isc_nm_cancelread(disp->handle);
}
#endif
}
break;
@ -1846,6 +1884,10 @@ tcp_startrecv(dns_dispatch_t *disp, dns_dispentry_t *resp) {
REQUIRE(VALID_DISPATCH(disp));
REQUIRE(disp->socktype == isc_socktype_tcp);
if (disp->reading) {
return;
}
dns_dispatch_ref(disp); /* DISPATCH002 */
if (resp != NULL) {
dispentry_log(resp, ISC_LOG_DEBUG(90), "reading from %p",
@ -2102,13 +2144,19 @@ tcp_dispatch_connect(dns_dispatch_t *disp, dns_dispentry_t *resp) {
"already connected; attaching");
resp->reading = true;
/*
* Replace any idle timeout with this query's timeout. The
* connection may have been kept reading while idle (with the
* idle timeout applied), so update the handle timeout even when
* a read is already pending.
*/
isc_nmhandle_cleartimeout(disp->handle);
if (resp->timeout != 0) {
isc_nmhandle_settimeout(disp->handle, resp->timeout);
}
if (!disp->reading) {
/* Restart the reading */
isc_nmhandle_cleartimeout(disp->handle);
if (resp->timeout != 0) {
isc_nmhandle_settimeout(disp->handle,
resp->timeout);
}
tcp_startrecv(disp, resp);
}

View file

@ -28,6 +28,8 @@
#include <isc/lib.h>
#include <isc/managers.h>
#include <isc/refcount.h>
#include <isc/time.h>
#include <isc/timer.h>
#include <isc/tls.h>
#include <isc/util.h>
#include <isc/uv.h>
@ -872,8 +874,165 @@ ISC_LOOP_TEST_IMPL(dispatch_sharedtcp) {
dns_dispatch_connect(test->dispentry);
}
/*
* Regression test for a reused outgoing TCP connection that the peer closes
* while it is idle. The connection must be kept under read while idle so the
* close is noticed and the dead dispatch is dropped from the reuse pool;
* otherwise the next query would be handed the dead connection and fail.
*/
static test_dispatch_t *reuse_test1 = NULL;
static isc_timer_t *reuse_timer = NULL;
static void
server_senddone_close(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) {
isc_nmhandle_t *sendhandle = arg;
UNUSED(eresult);
/*
* Drop the server side of the connection right after answering, to
* mimic a forwarder (e.g. a DoT resolver) that closes idle
* connections.
*/
isc_nmhandle_close(handle);
isc_nmhandle_detach(&sendhandle);
}
static void
nameserver_close(isc_nmhandle_t *handle, isc_result_t eresult,
isc_region_t *region, void *arg ISC_ATTR_UNUSED) {
isc_region_t response;
static unsigned char buf[16];
isc_nmhandle_t *sendhandle = NULL;
if (eresult != ISC_R_SUCCESS) {
return;
}
memmove(buf, region->base, 12);
memset(buf + 12, 0, 4);
buf[2] |= 0x80; /* qr=1 */
response.base = buf;
response.length = sizeof(buf);
isc_nmhandle_attach(handle, &sendhandle);
isc_nm_send(sendhandle, &response, server_senddone_close, sendhandle);
}
static void
reuse_query2(void *arg ISC_ATTR_UNUSED) {
isc_result_t result;
test_dispatch_t *test2 = isc_mem_get(isc_g_mctx, sizeof(*test2));
*test2 = (test_dispatch_t){
.dispatchmgr = dns_dispatchmgr_ref(reuse_test1->dispatchmgr),
};
isc_timer_destroy(&reuse_timer);
testdata.region.base = testdata.message;
testdata.region.length = sizeof(testdata.message);
/*
* Create the dispatch for the second query. This inspects the reuse
* pool: the dispatch from the first query must no longer be offered
* (it was either removed or marked canceled when the peer closed it),
* so this gets a fresh connection.
*/
result = dns_dispatch_createtcp(
test2->dispatchmgr, &tcp_connect_addr, &tcp_server_addr, NULL,
DNS_DISPATCHTYPE_RESOLVER, 0, &test2->dispatch);
assert_int_equal(result, ISC_R_SUCCESS);
result = dns_dispatch_add(test2->dispatch, isc_loop_main(), 0,
T_CLIENT_CONNECT, T_CLIENT_INIT,
&tcp_server_addr, NULL, NULL, connected,
client_senddone, response_shutdown, test2,
&test2->id, &test2->dispentry);
assert_int_equal(result, ISC_R_SUCCESS);
testdata.message[0] = (test2->id >> 8) & 0xff;
testdata.message[1] = test2->id & 0xff;
dns_dispatch_connect(test2->dispentry);
/*
* Now release the lingering reference from the first query. Keeping
* it until after the createtcp above ensures the dead dispatch is
* still around to be (wrongly) reused if the fix were absent.
*/
dns_dispatch_detach(&reuse_test1->dispatch);
dns_dispatchmgr_detach(&reuse_test1->dispatchmgr);
isc_mem_put(isc_g_mctx, reuse_test1, sizeof(*reuse_test1));
}
static void
response_reuse(isc_result_t eresult, isc_region_t *region ISC_ATTR_UNUSED,
void *arg) {
test_dispatch_t *test = arg;
isc_interval_t interval;
assert_int_equal(eresult, ISC_R_SUCCESS);
/*
* Finish the first query but keep a reference on its dispatch, as an
* overlapping query would, so the connection lingers in the reuse pool
* after going idle.
*/
dns_dispatch_done(&test->dispentry);
reuse_test1 = test;
/*
* Defer the second query so the server's close of the idle connection
* is processed first.
*/
isc_interval_set(&interval, 1, 0);
isc_timer_create(isc_loop_main(), reuse_query2, NULL, &reuse_timer);
isc_timer_start(reuse_timer, isc_timertype_once, &interval);
}
ISC_LOOP_TEST_IMPL(dispatch_tcp_reuse_after_close) {
isc_result_t result;
test_dispatch_t *test = isc_mem_get(isc_g_mctx, sizeof(*test));
*test = (test_dispatch_t){ 0 };
/* Server: answer one query per connection, then close it. */
result = isc_nm_listenstreamdns(
ISC_NM_LISTEN_ONE, &tcp_server_addr, nameserver_close, NULL,
accept_cb, NULL, 0, NULL, NULL, ISC_NM_PROXY_NONE, &sock);
assert_int_equal(result, ISC_R_SUCCESS);
isc_loop_teardown(isc_loop_main(), stop_listening, sock);
/* Client */
testdata.region.base = testdata.message;
testdata.region.length = sizeof(testdata.message);
result = dns_dispatchmgr_create(isc_g_mctx, &test->dispatchmgr);
assert_int_equal(result, ISC_R_SUCCESS);
result = dns_dispatch_createtcp(
test->dispatchmgr, &tcp_connect_addr, &tcp_server_addr, NULL,
DNS_DISPATCHTYPE_RESOLVER, 0, &test->dispatch);
assert_int_equal(result, ISC_R_SUCCESS);
result = dns_dispatch_add(test->dispatch, isc_loop_main(), 0,
T_CLIENT_CONNECT, T_CLIENT_INIT,
&tcp_server_addr, NULL, NULL, connected,
client_senddone, response_reuse, test,
&test->id, &test->dispentry);
assert_int_equal(result, ISC_R_SUCCESS);
testdata.message[0] = (test->id >> 8) & 0xff;
testdata.message[1] = test->id & 0xff;
dns_dispatch_connect(test->dispentry);
}
ISC_TEST_LIST_START
ISC_TEST_ENTRY_CUSTOM(dispatch_sharedtcp, setup_test, teardown_test)
ISC_TEST_ENTRY_CUSTOM(dispatch_tcp_reuse_after_close, setup_test, teardown_test)
ISC_TEST_ENTRY_CUSTOM(dispatch_timeout_udp_response, setup_test, teardown_test)
ISC_TEST_ENTRY_CUSTOM(dispatchset_create, setup_test, teardown_test)
ISC_TEST_ENTRY_CUSTOM(dispatchset_get, setup_test, teardown_test)