diff --git a/bin/named/main.c b/bin/named/main.c index 37b77bc2ed..f7f7afb422 100644 --- a/bin/named/main.c +++ b/bin/named/main.c @@ -127,6 +127,7 @@ extern unsigned int dns_zone_mkey_month; extern unsigned int dns_adb_entrywindow; extern unsigned int dns_adb_cachemin; extern size_t dns_dispatch_tcppipelining; +extern unsigned int dns_dispatch_tcp_idle_timeout; static bool want_stats = false; static char absolute_conffile[PATH_MAX]; @@ -747,6 +748,8 @@ parse_T_opt(char *option) { "least 1"); } dns_dispatch_tcppipelining = pipelining; + } else if (!strncmp(option, "tcpidletimeout=", 15)) { + dns_dispatch_tcp_idle_timeout = atoi(option + 15); } else { fprintf(stderr, "unknown -T flag '%s'\n", option); } diff --git a/lib/dns/dispatch.c b/lib/dns/dispatch.c index a27d186329..dc31532290 100644 --- a/lib/dns/dispatch.c +++ b/lib/dns/dispatch.c @@ -56,6 +56,15 @@ */ size_t dns_dispatch_tcppipelining = 256; +/* + * Idle timeout (in milliseconds) for a reused outgoing TCP connection. While a + * dispatch has no outstanding responses we keep a read pending so a + * peer-initiated close is noticed promptly; this bounds how long such an idle + * connection is kept open for reuse. Can be overridden via + * 'named -T tcpidletimeout=N'. + */ +unsigned int dns_dispatch_tcp_idle_timeout = 5000; + typedef ISC_LIST(dns_dispentry_t) dns_displist_t; struct dns_dispatchmgr { @@ -740,6 +749,11 @@ tcp_recv_shutdown(dns_dispatch_t *disp, dns_displist_t *resps, tcp_recv_add(resps, resp, result); } disp->state = DNS_DISPATCHSTATE_CANCELED; + + /* Close now so the socket doesn't linger in CLOSE-WAIT. */ + if (disp->handle != NULL) { + isc_nmhandle_close(disp->handle); + } } static void @@ -754,6 +768,26 @@ tcp_recv_processall(dns_displist_t *resps, isc_region_t *region) { } } +static void +tcp_startrecv_idle(dns_dispatch_t *disp) { + REQUIRE(dns_dispatch_tcp_idle_timeout > 0); + + /* + * No outstanding responses, but the connection is still up and + * reusable. Keep a read pending (bounded by the idle timeout) + * so that a peer-initiated close is detected promptly; + * otherwise the socket would linger in CLOSE-WAIT and be handed + * out again as a dead reused connection. + */ + isc_nmhandle_cleartimeout(disp->handle); + isc_nmhandle_settimeout(disp->handle, dns_dispatch_tcp_idle_timeout); + if (!disp->reading) { + dispatch_log(disp, ISC_LOG_DEBUG(90), "keeping idle read on %p", + disp->handle); + tcp_startrecv(disp, NULL); + } +} + /* * General flow: * @@ -768,7 +802,7 @@ tcp_recv_processall(dns_displist_t *resps, isc_region_t *region) { * restart. */ static void -tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region, +tcp_recv(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region, void *arg) { dns_dispatch_t *disp = (dns_dispatch_t *)arg; dns_dispentry_t *resp = NULL; @@ -777,6 +811,7 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region, dns_displist_t resps = ISC_LIST_INITIALIZER; isc_time_t now; int timeout = 0; + isc_result_t result = eresult; REQUIRE(VALID_DISPATCH(disp)); @@ -786,7 +821,7 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region, dispatch_log(disp, ISC_LOG_DEBUG(90), "TCP read:%s:requests %" PRIuFAST32, - isc_result_totext(result), disp->requests); + isc_result_totext(eresult), disp->requests); peer = isc_nmhandle_peeraddr(handle); @@ -794,7 +829,7 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region, /* * Phase 1: Process timeout and success. */ - switch (result) { + switch (eresult) { case ISC_R_TIMEDOUT: /* * Time out the oldest response in the active queue. @@ -819,7 +854,13 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region, */ if (result == ISC_R_NOTFOUND) { - if (disp->timedout > 0) { + if (eresult == ISC_R_TIMEDOUT) { + /* + * An idle keep-alive read timed out with no outstanding + * responses. + */ + result = ISC_R_CANCELED; + } else if (disp->timedout > 0) { /* There was active query that timed-out before */ disp->timedout--; } else { @@ -857,15 +898,16 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region, case ISC_R_TIMEDOUT: case ISC_R_NOTFOUND: break; - case ISC_R_SHUTTINGDOWN: case ISC_R_CANCELED: case ISC_R_EOF: case ISC_R_CONNECTIONRESET: - isc_sockaddr_format(&peer, buf, sizeof(buf)); - dispatch_log(disp, ISC_LOG_DEBUG(90), - "shutting down TCP: %s: %s", buf, - isc_result_totext(result)); + if (isc_log_wouldlog(ISC_LOG_DEBUG(90))) { + isc_sockaddr_format(&peer, buf, sizeof(buf)); + dispatch_log(disp, ISC_LOG_DEBUG(90), + "shutting down TCP: %s: %s", buf, + isc_result_totext(result)); + } tcp_recv_shutdown(disp, &resps, result); break; default: @@ -900,6 +942,16 @@ tcp_recv(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region, */ tcp_recv_processall(&resps, region); + /* + * Phase 7: Restart reading on idle connections. + */ + if (ISC_LIST_EMPTY(disp->active) && + disp->state == DNS_DISPATCHSTATE_CONNECTED && + dns_dispatch_tcp_idle_timeout > 0) + { + tcp_startrecv_idle(disp); + } + dns_dispatch_detach(&disp); /* DISPATCH002 */ } @@ -1732,43 +1784,29 @@ tcp_dispentry_cancel(dns_dispentry_t *resp, isc_result_t result) { INSIST(!ISC_LINK_LINKED(resp, alink)); - if (ISC_LIST_EMPTY(disp->active)) { + if (ISC_LIST_EMPTY(disp->active) && + disp->state == DNS_DISPATCHSTATE_CONNECTED) + { INSIST(disp->handle != NULL); -#if DISPATCH_TCP_KEEPALIVE /* - * This is an experimental code that keeps the TCP - * connection open for 1 second before it is finally - * closed. By keeping the TCP connection open, it can - * be reused by dns_request that uses - * dns_dispatch_gettcp() to join existing TCP - * connections. - * - * It is disabled for now, because it changes the - * behaviour, but I am keeping the code here for future - * reference when we improve the dns_dispatch to reuse - * the TCP connections also in the resolver. - * - * The TCP connection reuse should be seamless and not - * require any extra handling on the client side though. + * This was the last outstanding response on a still + * connected dispatch. Keep the connection open with a + * read pending (bounded by the idle timeout) so that it + * can be reused by a subsequent query and so that a + * peer-initiated close is noticed promptly instead of + * leaving the socket stuck in CLOSE-WAIT. If the + * dispatch is already shutting down, leave it alone so + * it can be torn down. */ - isc_nmhandle_cleartimeout(disp->handle); - isc_nmhandle_settimeout(disp->handle, 1000); - - if (!disp->reading) { - dispentry_log(resp, ISC_LOG_DEBUG(90), - "final 1 second timeout on %p", - disp->handle); - tcp_startrecv(disp, NULL); - } -#else - if (disp->reading) { + if (dns_dispatch_tcp_idle_timeout > 0) { + tcp_startrecv_idle(disp); + } else if (disp->reading) { dispentry_log(resp, ISC_LOG_DEBUG(90), "canceling read on %p", disp->handle); isc_nm_cancelread(disp->handle); } -#endif } break; @@ -1846,6 +1884,10 @@ tcp_startrecv(dns_dispatch_t *disp, dns_dispentry_t *resp) { REQUIRE(VALID_DISPATCH(disp)); REQUIRE(disp->socktype == isc_socktype_tcp); + if (disp->reading) { + return; + } + dns_dispatch_ref(disp); /* DISPATCH002 */ if (resp != NULL) { dispentry_log(resp, ISC_LOG_DEBUG(90), "reading from %p", @@ -2102,13 +2144,19 @@ tcp_dispatch_connect(dns_dispatch_t *disp, dns_dispentry_t *resp) { "already connected; attaching"); resp->reading = true; + /* + * Replace any idle timeout with this query's timeout. The + * connection may have been kept reading while idle (with the + * idle timeout applied), so update the handle timeout even when + * a read is already pending. + */ + isc_nmhandle_cleartimeout(disp->handle); + if (resp->timeout != 0) { + isc_nmhandle_settimeout(disp->handle, resp->timeout); + } + if (!disp->reading) { /* Restart the reading */ - isc_nmhandle_cleartimeout(disp->handle); - if (resp->timeout != 0) { - isc_nmhandle_settimeout(disp->handle, - resp->timeout); - } tcp_startrecv(disp, resp); } diff --git a/tests/dns/dispatch_test.c b/tests/dns/dispatch_test.c index 5d024fe1df..ce4eee2850 100644 --- a/tests/dns/dispatch_test.c +++ b/tests/dns/dispatch_test.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include #include @@ -872,8 +874,165 @@ ISC_LOOP_TEST_IMPL(dispatch_sharedtcp) { dns_dispatch_connect(test->dispentry); } +/* + * Regression test for a reused outgoing TCP connection that the peer closes + * while it is idle. The connection must be kept under read while idle so the + * close is noticed and the dead dispatch is dropped from the reuse pool; + * otherwise the next query would be handed the dead connection and fail. + */ +static test_dispatch_t *reuse_test1 = NULL; +static isc_timer_t *reuse_timer = NULL; + +static void +server_senddone_close(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) { + isc_nmhandle_t *sendhandle = arg; + + UNUSED(eresult); + + /* + * Drop the server side of the connection right after answering, to + * mimic a forwarder (e.g. a DoT resolver) that closes idle + * connections. + */ + isc_nmhandle_close(handle); + isc_nmhandle_detach(&sendhandle); +} + +static void +nameserver_close(isc_nmhandle_t *handle, isc_result_t eresult, + isc_region_t *region, void *arg ISC_ATTR_UNUSED) { + isc_region_t response; + static unsigned char buf[16]; + isc_nmhandle_t *sendhandle = NULL; + + if (eresult != ISC_R_SUCCESS) { + return; + } + + memmove(buf, region->base, 12); + memset(buf + 12, 0, 4); + buf[2] |= 0x80; /* qr=1 */ + + response.base = buf; + response.length = sizeof(buf); + + isc_nmhandle_attach(handle, &sendhandle); + isc_nm_send(sendhandle, &response, server_senddone_close, sendhandle); +} + +static void +reuse_query2(void *arg ISC_ATTR_UNUSED) { + isc_result_t result; + test_dispatch_t *test2 = isc_mem_get(isc_g_mctx, sizeof(*test2)); + + *test2 = (test_dispatch_t){ + .dispatchmgr = dns_dispatchmgr_ref(reuse_test1->dispatchmgr), + }; + + isc_timer_destroy(&reuse_timer); + + testdata.region.base = testdata.message; + testdata.region.length = sizeof(testdata.message); + + /* + * Create the dispatch for the second query. This inspects the reuse + * pool: the dispatch from the first query must no longer be offered + * (it was either removed or marked canceled when the peer closed it), + * so this gets a fresh connection. + */ + result = dns_dispatch_createtcp( + test2->dispatchmgr, &tcp_connect_addr, &tcp_server_addr, NULL, + DNS_DISPATCHTYPE_RESOLVER, 0, &test2->dispatch); + assert_int_equal(result, ISC_R_SUCCESS); + + result = dns_dispatch_add(test2->dispatch, isc_loop_main(), 0, + T_CLIENT_CONNECT, T_CLIENT_INIT, + &tcp_server_addr, NULL, NULL, connected, + client_senddone, response_shutdown, test2, + &test2->id, &test2->dispentry); + assert_int_equal(result, ISC_R_SUCCESS); + + testdata.message[0] = (test2->id >> 8) & 0xff; + testdata.message[1] = test2->id & 0xff; + + dns_dispatch_connect(test2->dispentry); + + /* + * Now release the lingering reference from the first query. Keeping + * it until after the createtcp above ensures the dead dispatch is + * still around to be (wrongly) reused if the fix were absent. + */ + dns_dispatch_detach(&reuse_test1->dispatch); + dns_dispatchmgr_detach(&reuse_test1->dispatchmgr); + isc_mem_put(isc_g_mctx, reuse_test1, sizeof(*reuse_test1)); +} + +static void +response_reuse(isc_result_t eresult, isc_region_t *region ISC_ATTR_UNUSED, + void *arg) { + test_dispatch_t *test = arg; + isc_interval_t interval; + + assert_int_equal(eresult, ISC_R_SUCCESS); + + /* + * Finish the first query but keep a reference on its dispatch, as an + * overlapping query would, so the connection lingers in the reuse pool + * after going idle. + */ + dns_dispatch_done(&test->dispentry); + reuse_test1 = test; + + /* + * Defer the second query so the server's close of the idle connection + * is processed first. + */ + isc_interval_set(&interval, 1, 0); + isc_timer_create(isc_loop_main(), reuse_query2, NULL, &reuse_timer); + isc_timer_start(reuse_timer, isc_timertype_once, &interval); +} + +ISC_LOOP_TEST_IMPL(dispatch_tcp_reuse_after_close) { + isc_result_t result; + test_dispatch_t *test = isc_mem_get(isc_g_mctx, sizeof(*test)); + *test = (test_dispatch_t){ 0 }; + + /* Server: answer one query per connection, then close it. */ + result = isc_nm_listenstreamdns( + ISC_NM_LISTEN_ONE, &tcp_server_addr, nameserver_close, NULL, + accept_cb, NULL, 0, NULL, NULL, ISC_NM_PROXY_NONE, &sock); + assert_int_equal(result, ISC_R_SUCCESS); + + isc_loop_teardown(isc_loop_main(), stop_listening, sock); + + /* Client */ + testdata.region.base = testdata.message; + testdata.region.length = sizeof(testdata.message); + + result = dns_dispatchmgr_create(isc_g_mctx, &test->dispatchmgr); + assert_int_equal(result, ISC_R_SUCCESS); + + result = dns_dispatch_createtcp( + test->dispatchmgr, &tcp_connect_addr, &tcp_server_addr, NULL, + DNS_DISPATCHTYPE_RESOLVER, 0, &test->dispatch); + assert_int_equal(result, ISC_R_SUCCESS); + + result = dns_dispatch_add(test->dispatch, isc_loop_main(), 0, + T_CLIENT_CONNECT, T_CLIENT_INIT, + &tcp_server_addr, NULL, NULL, connected, + client_senddone, response_reuse, test, + &test->id, &test->dispentry); + assert_int_equal(result, ISC_R_SUCCESS); + + testdata.message[0] = (test->id >> 8) & 0xff; + testdata.message[1] = test->id & 0xff; + + dns_dispatch_connect(test->dispentry); +} + ISC_TEST_LIST_START ISC_TEST_ENTRY_CUSTOM(dispatch_sharedtcp, setup_test, teardown_test) +ISC_TEST_ENTRY_CUSTOM(dispatch_tcp_reuse_after_close, setup_test, teardown_test) ISC_TEST_ENTRY_CUSTOM(dispatch_timeout_udp_response, setup_test, teardown_test) ISC_TEST_ENTRY_CUSTOM(dispatchset_create, setup_test, teardown_test) ISC_TEST_ENTRY_CUSTOM(dispatchset_get, setup_test, teardown_test)