mirror of
https://github.com/isc-projects/bind9.git
synced 2026-05-25 02:47:54 -04:00
Implement IP_LOCAL_PORT_RANGE socket option for Linux
For Linux >= 6.8: In 2023, Linux introduced a change to the IP_LOCAL_PORT_RANGE socket option that eliminates the need for random window shifting (implemented as a fallback in the next commit). By setting the IP_LOCAL_PORT_RANGE option, we tell the kernel to use a better approach to source port selection. For Linux < 6.8: This implements port selection by randomly shifting a range, leveraging the IP_LOCAL_PORT_RANGE socket option. The network manager is initialized with the ephemeral port range (on startup and on reconfig), and then for every outgoing TCP connection we define a custom port range (1000 ports) and randomly shift that custom range within the system range. This helps the kernel reduce the search space to the custom window <random_offset, random_offset + 1000>. Reference: https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel
This commit is contained in:
parent
2c48fcaeed
commit
04c81b55d2
11 changed files with 185 additions and 37 deletions
|
|
@ -7704,7 +7704,7 @@ apply_configuration(cfg_obj_t *effectiveconfig, cfg_obj_t *bindkeys,
|
|||
dns_kasplist_t tmpkasplist, kasplist;
|
||||
dns_keystorelist_t tmpkeystorelist, keystorelist;
|
||||
dns_viewlist_t viewlist;
|
||||
in_port_t listen_port, udpport_low, udpport_high;
|
||||
in_port_t listen_port, port_low, port_high;
|
||||
int i, backlog;
|
||||
isc_interval_t interval;
|
||||
isc_logconfig_t *logc = NULL;
|
||||
|
|
@ -8048,39 +8048,26 @@ apply_configuration(cfg_obj_t *effectiveconfig, cfg_obj_t *bindkeys,
|
|||
isc_portset_create(isc_g_mctx, &v4portset);
|
||||
isc_portset_create(isc_g_mctx, &v6portset);
|
||||
|
||||
result = isc_net_getudpportrange(AF_INET, &udpport_low, &udpport_high);
|
||||
if (result != ISC_R_SUCCESS) {
|
||||
isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER,
|
||||
ISC_LOG_ERROR,
|
||||
"get the default UDP/IPv4 port range: %s",
|
||||
isc_result_totext(result));
|
||||
goto cleanup_portsets;
|
||||
}
|
||||
|
||||
isc_portset_addrange(v4portset, udpport_low, udpport_high);
|
||||
isc_net_getudpportrange(AF_INET, &port_low, &port_high);
|
||||
isc_netmgr_portrange(AF_INET, port_low, port_high);
|
||||
isc_portset_addrange(v4portset, port_low, port_high);
|
||||
if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE4)) {
|
||||
isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER,
|
||||
ISC_LOG_INFO,
|
||||
"using default UDP/IPv4 port range: "
|
||||
"[%d, %d]",
|
||||
udpport_low, udpport_high);
|
||||
port_low, port_high);
|
||||
}
|
||||
|
||||
result = isc_net_getudpportrange(AF_INET6, &udpport_low, &udpport_high);
|
||||
if (result != ISC_R_SUCCESS) {
|
||||
isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER,
|
||||
ISC_LOG_ERROR,
|
||||
"get the default UDP/IPv6 port range: %s",
|
||||
isc_result_totext(result));
|
||||
goto cleanup_portsets;
|
||||
}
|
||||
isc_portset_addrange(v6portset, udpport_low, udpport_high);
|
||||
isc_net_getudpportrange(AF_INET6, &port_low, &port_high);
|
||||
isc_netmgr_portrange(AF_INET6, port_low, port_high);
|
||||
isc_portset_addrange(v6portset, port_low, port_high);
|
||||
if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE6)) {
|
||||
isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER,
|
||||
ISC_LOG_INFO,
|
||||
"using default UDP/IPv6 port range: "
|
||||
"[%d, %d]",
|
||||
udpport_low, udpport_high);
|
||||
port_low, port_high);
|
||||
}
|
||||
|
||||
dns_dispatchmgr_setavailports(named_g_dispatchmgr, v4portset,
|
||||
|
|
|
|||
|
|
@ -216,7 +216,7 @@ isc_net_enableipv4(void);
|
|||
void
|
||||
isc_net_enableipv6(void);
|
||||
|
||||
isc_result_t
|
||||
void
|
||||
isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high);
|
||||
/*%<
|
||||
* Returns system's default range of ephemeral UDP ports, if defined.
|
||||
|
|
|
|||
|
|
@ -946,3 +946,9 @@ isc_nmsocket_getaddr(isc_nmsocket_t *sock);
|
|||
/*%<
|
||||
* Return the local address of 'sock'.
|
||||
*/
|
||||
|
||||
void
|
||||
isc_netmgr_portrange(sa_family_t af, in_port_t low, in_port_t high);
|
||||
/*%<
|
||||
* Set the ephemeral port range <low, high> for 'af' family.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -45,3 +45,10 @@ isc_os_umask(void);
|
|||
/*%<
|
||||
* Return umask of the current process as initialized at the program start
|
||||
*/
|
||||
|
||||
void
|
||||
isc_os_kernel(char **name, int *major, int *minor, int *patch);
|
||||
/*%<
|
||||
* Store the running kernel's name in 'name' and its version in 'major', 'minor' and 'patch'.
|
||||
* If any of the version numbers is not available, it is set to -1.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -19,9 +19,17 @@
|
|||
|
||||
void
|
||||
isc_managers_create(uint32_t workers) {
|
||||
in_port_t port_low, port_high;
|
||||
|
||||
isc_loopmgr_create(isc_g_mctx, workers);
|
||||
isc_netmgr_create(isc_g_mctx);
|
||||
isc_rwlock_setworkers(workers);
|
||||
|
||||
isc_net_getudpportrange(AF_INET, &port_low, &port_high);
|
||||
isc_netmgr_portrange(AF_INET, port_low, port_high);
|
||||
|
||||
isc_net_getudpportrange(AF_INET6, &port_low, &port_high);
|
||||
isc_netmgr_portrange(AF_INET6, port_low, port_high);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
|||
|
|
@ -175,7 +175,7 @@ getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
|
|||
#endif /* HAVE_SYSCTLBYNAME */
|
||||
#endif /* USE_SYSCTL_PORTRANGE */
|
||||
|
||||
isc_result_t
|
||||
void
|
||||
isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) {
|
||||
int result = ISC_R_FAILURE;
|
||||
#if !defined(USE_SYSCTL_PORTRANGE) && defined(__linux)
|
||||
|
|
@ -214,8 +214,6 @@ isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) {
|
|||
*low = ISC_NET_PORTRANGELOW;
|
||||
*high = ISC_NET_PORTRANGEHIGH;
|
||||
}
|
||||
|
||||
return ISC_R_SUCCESS; /* we currently never fail in this function */
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
|||
|
|
@ -358,6 +358,12 @@ typedef struct isc__netmgr {
|
|||
atomic_int_fast32_t send_udp_buffer_size;
|
||||
atomic_int_fast32_t recv_tcp_buffer_size;
|
||||
atomic_int_fast32_t send_tcp_buffer_size;
|
||||
|
||||
_Atomic(in_port_t) port_low4;
|
||||
_Atomic(in_port_t) port_high4;
|
||||
_Atomic(in_port_t) port_low6;
|
||||
_Atomic(in_port_t) port_high6;
|
||||
|
||||
} isc__netmgr_t;
|
||||
|
||||
extern isc__netmgr_t *isc__netmgr;
|
||||
|
|
@ -1387,9 +1393,11 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family);
|
|||
*/
|
||||
|
||||
isc_result_t
|
||||
isc__nm_tcp_bind_no_port(uv_tcp_t *handle);
|
||||
isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED,
|
||||
sa_family_t sa_family ISC_ATTR_UNUSED);
|
||||
/*%<
|
||||
* Set IP_BIND_ADDRESS_NO_PORT on the socket (Linux only).
|
||||
* Set IP_BIND_ADDRESS_NO_PORT and IP_LOCAL_PORT_RANGE on the socket
|
||||
* (Linux only).
|
||||
*/
|
||||
|
||||
void
|
||||
|
|
|
|||
|
|
@ -155,6 +155,7 @@ netmgr_teardown(void *arg ISC_ATTR_UNUSED) {
|
|||
void
|
||||
isc_netmgr_create(isc_mem_t *mctx) {
|
||||
isc__netmgr_t *netmgr = NULL;
|
||||
in_port_t port_low, port_high;
|
||||
|
||||
#ifdef MAXIMAL_UV_VERSION
|
||||
if (uv_version() > MAXIMAL_UV_VERSION) {
|
||||
|
|
@ -185,6 +186,11 @@ isc_netmgr_create(isc_mem_t *mctx) {
|
|||
atomic_init(&netmgr->send_tcp_buffer_size, 0);
|
||||
atomic_init(&netmgr->recv_udp_buffer_size, 0);
|
||||
atomic_init(&netmgr->send_udp_buffer_size, 0);
|
||||
atomic_init(&netmgr->port_low4, 0);
|
||||
atomic_init(&netmgr->port_high4, 65535);
|
||||
atomic_init(&netmgr->port_low6, 0);
|
||||
atomic_init(&netmgr->port_high6, 65535);
|
||||
|
||||
#if HAVE_SO_REUSEPORT_LB
|
||||
netmgr->load_balance_sockets = true;
|
||||
#else
|
||||
|
|
@ -237,6 +243,15 @@ isc_netmgr_create(isc_mem_t *mctx) {
|
|||
}
|
||||
|
||||
isc__netmgr = netmgr;
|
||||
|
||||
/*
|
||||
* Set the initial port range for IP_LOCAL_PORT_RANGE.
|
||||
*/
|
||||
isc_net_getudpportrange(AF_INET, &port_low, &port_high);
|
||||
isc_netmgr_portrange(AF_INET, port_low, port_high);
|
||||
|
||||
isc_net_getudpportrange(AF_INET6, &port_low, &port_high);
|
||||
isc_netmgr_portrange(AF_INET6, port_low, port_high);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -2898,6 +2913,23 @@ isc__networker_get(uint32_t tid) {
|
|||
return &isc__netmgr->workers[tid];
|
||||
}
|
||||
|
||||
void
|
||||
isc_netmgr_portrange(sa_family_t af, in_port_t low, in_port_t high) {
|
||||
REQUIRE(VALID_NM(isc__netmgr));
|
||||
switch (af) {
|
||||
case AF_INET:
|
||||
atomic_store_relaxed(&isc__netmgr->port_low4, low);
|
||||
atomic_store_relaxed(&isc__netmgr->port_high4, high);
|
||||
break;
|
||||
case AF_INET6:
|
||||
atomic_store_relaxed(&isc__netmgr->port_low6, low);
|
||||
atomic_store_relaxed(&isc__netmgr->port_high6, high);
|
||||
break;
|
||||
default:
|
||||
INSIST(0);
|
||||
}
|
||||
}
|
||||
|
||||
#if ISC_NETMGR_TRACE
|
||||
/*
|
||||
* Dump all active sockets in netmgr. We output to stderr
|
||||
|
|
|
|||
|
|
@ -11,7 +11,10 @@
|
|||
* information regarding copyright ownership.
|
||||
*/
|
||||
|
||||
#include <netinet/in.h>
|
||||
|
||||
#include <isc/errno.h>
|
||||
#include <isc/result.h>
|
||||
#include <isc/uv.h>
|
||||
|
||||
#include "netmgr-int.h"
|
||||
|
|
@ -370,17 +373,81 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) {
|
|||
return ISC_R_SUCCESS;
|
||||
}
|
||||
|
||||
isc_result_t
|
||||
isc__nm_tcp_bind_no_port(uv_tcp_t *handle ISC_ATTR_UNUSED) {
|
||||
#ifdef IP_BIND_ADDRESS_NO_PORT
|
||||
uv_os_sock_t fd = -1;
|
||||
/*
|
||||
* See
|
||||
* https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel
|
||||
* for rationale.
|
||||
*/
|
||||
#define PORT_RANGE 1000
|
||||
|
||||
int r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd);
|
||||
if (r < 0) {
|
||||
isc_result_t
|
||||
isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED,
|
||||
sa_family_t af ISC_ATTR_UNUSED) {
|
||||
#ifdef IP_BIND_ADDRESS_NO_PORT
|
||||
if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) {
|
||||
return ISC_R_FAILURE;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) {
|
||||
#if defined(IP_LOCAL_PORT_RANGE) && defined(__linux__)
|
||||
/*
|
||||
* The option takes an uint32_t value with the high 16 bits
|
||||
* set to the upper range bound, and the low 16 bits set to
|
||||
* the lower range bound. Range bounds are inclusive. The
|
||||
* 16-bit values should be in host byte order.
|
||||
*/
|
||||
uint32_t port_range;
|
||||
int major, minor;
|
||||
isc_os_kernel(NULL, &major, &minor, NULL);
|
||||
|
||||
in_port_t port_low, port_high;
|
||||
switch (af) {
|
||||
case AF_INET:
|
||||
port_low = isc__netmgr->port_low4;
|
||||
port_high = isc__netmgr->port_high4;
|
||||
break;
|
||||
case AF_INET6:
|
||||
port_low = isc__netmgr->port_low6;
|
||||
port_high = isc__netmgr->port_high6;
|
||||
break;
|
||||
default:
|
||||
INSIST(0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Linux 6.8 introduced the following change:
|
||||
*
|
||||
* If IP_LOCAL_PORT_RANGE is set on a socket before accept(),
|
||||
* port selection no longer favors even ports.
|
||||
*
|
||||
* This means that connect() can find a suitable source port
|
||||
* faster, and applications can use a different split between
|
||||
* connect() and bind() users.
|
||||
*/
|
||||
if (major < 6 || (major == 6 && minor < 8)) {
|
||||
/*
|
||||
* On Linux < 6.8, use IP_LOCAL_PORT_RANGE to
|
||||
* partition ephemeral port range randomly to help
|
||||
* with the port selection.
|
||||
*/
|
||||
if (port_high - port_low <= PORT_RANGE) {
|
||||
return ISC_R_RANGE;
|
||||
}
|
||||
|
||||
/*
|
||||
* port_low <= N < port_high - PORT_RANGE
|
||||
*/
|
||||
port_high -= PORT_RANGE;
|
||||
port_low += isc_random_uniform(port_high - port_low);
|
||||
port_high = port_low + PORT_RANGE;
|
||||
}
|
||||
INSIST(port_low > 0);
|
||||
INSIST(port_low < port_high);
|
||||
|
||||
port_range = (uint32_t)port_low | ((uint32_t)port_high << 16);
|
||||
if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &port_range,
|
||||
sizeof(port_range)) == -1)
|
||||
{
|
||||
return ISC_R_FAILURE;
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@
|
|||
*/
|
||||
|
||||
#include <libgen.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <isc/async.h>
|
||||
|
|
@ -141,8 +142,6 @@ tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
|
|||
}
|
||||
isc__nm_incstats(sock, STATID_OPEN);
|
||||
|
||||
isc__nm_tcp_bind_no_port(&sock->uv_handle.tcp);
|
||||
|
||||
if (req->local.length != 0) {
|
||||
r = uv_tcp_bind(&sock->uv_handle.tcp, &req->local.type.sa, 0);
|
||||
if (r != 0) {
|
||||
|
|
@ -291,6 +290,15 @@ isc_nm_tcpconnect(isc_sockaddr_t *local, isc_sockaddr_t *peer,
|
|||
|
||||
(void)isc__nm_socket_min_mtu(sock->fd, sa_family);
|
||||
(void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG);
|
||||
result = isc__nm_socket_max_port_range(sock->fd, sa_family);
|
||||
if (result != ISC_R_SUCCESS) {
|
||||
isc__nmsocket_log(sock, ISC_LOG_DEBUG(99),
|
||||
"setting up IP_BIND_ADDRESS_NO_PORT or "
|
||||
"IP_LOCAL_PORT_RANGE failed: %s\n",
|
||||
result == ISC_R_RANGE
|
||||
? isc_result_totext(result)
|
||||
: strerror(errno));
|
||||
}
|
||||
|
||||
sock->active = true;
|
||||
|
||||
|
|
|
|||
27
lib/isc/os.c
27
lib/isc/os.c
|
|
@ -11,10 +11,13 @@
|
|||
* information regarding copyright ownership.
|
||||
*/
|
||||
|
||||
#include <ctype.h>
|
||||
#include <inttypes.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/utsname.h>
|
||||
|
||||
#include <isc/os.h>
|
||||
#include <isc/string.h>
|
||||
#include <isc/types.h>
|
||||
#include <isc/util.h>
|
||||
#include <isc/uv.h>
|
||||
|
|
@ -25,6 +28,8 @@
|
|||
static unsigned int isc__os_ncpus = 0;
|
||||
static unsigned long isc__os_cacheline = ISC_OS_CACHELINE_SIZE;
|
||||
static mode_t isc__os_umask = 0;
|
||||
static int kernel_major = -1, kernel_minor = -1, kernel_patch = -1;
|
||||
static char kernel_name[64];
|
||||
|
||||
/*
|
||||
* The affinity support for non-Linux is in the review in the upstream
|
||||
|
|
@ -177,6 +182,19 @@ umask_initialize(void) {
|
|||
(void)umask(isc__os_umask);
|
||||
}
|
||||
|
||||
static void
|
||||
kernel_initialize(void) {
|
||||
struct utsname buffer;
|
||||
|
||||
if (uname(&buffer) == -1) {
|
||||
return;
|
||||
}
|
||||
|
||||
(void)sscanf(buffer.release, "%d.%d.%d", &kernel_major, &kernel_minor,
|
||||
&kernel_patch);
|
||||
(void)strlcpy(kernel_name, buffer.sysname, sizeof(kernel_name));
|
||||
}
|
||||
|
||||
unsigned int
|
||||
isc_os_ncpus(void) {
|
||||
return isc__os_ncpus;
|
||||
|
|
@ -192,10 +210,19 @@ isc_os_umask(void) {
|
|||
return isc__os_umask;
|
||||
}
|
||||
|
||||
void
|
||||
isc_os_kernel(char **name, int *major, int *minor, int *patch) {
|
||||
SET_IF_NOT_NULL(name, kernel_name)
|
||||
SET_IF_NOT_NULL(major, kernel_major);
|
||||
SET_IF_NOT_NULL(minor, kernel_minor);
|
||||
SET_IF_NOT_NULL(patch, kernel_patch);
|
||||
}
|
||||
|
||||
void
|
||||
isc__os_initialize(void) {
|
||||
umask_initialize();
|
||||
ncpus_initialize();
|
||||
kernel_initialize();
|
||||
#if defined(_SC_LEVEL1_DCACHE_LINESIZE)
|
||||
long s = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
|
||||
if (s > 0 && (unsigned long)s > isc__os_cacheline) {
|
||||
|
|
|
|||
Loading…
Reference in a new issue