Implement IP_LOCAL_PORT_RANGE socket option for Linux

For Linux >= 6.8:

Since 2023, Linux has introduced a change to the IP_LOCAL_PORT_RANGE
socket option that eliminates the need for the random window
shifting (implemented as a fallback in the next commit).

By setting the IP_LOCAL_PORT_RANGE option, we tell the kernel to use a better
approach to source port selection.

For Linux < 6.8:

This implements port selection by randomly shifting a port window, leveraging
the IP_LOCAL_PORT_RANGE socket option.  The network manager is initialized
with the ephemeral port range (on startup and on reconfig) and then, for
every outgoing TCP connection, we define a custom port range (1000
ports) and randomly shift that custom range within the system range.

This helps the kernel to reduce the search space to the custom window
between <random_offset, random_offset + 1000>.

Reference:
https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel
This commit is contained in:
Ondřej Surý 2025-07-24 11:43:14 +02:00
parent 2c48fcaeed
commit 04c81b55d2
No known key found for this signature in database
GPG key ID: 2820F37E873DEA41
11 changed files with 185 additions and 37 deletions

View file

@ -7704,7 +7704,7 @@ apply_configuration(cfg_obj_t *effectiveconfig, cfg_obj_t *bindkeys,
dns_kasplist_t tmpkasplist, kasplist;
dns_keystorelist_t tmpkeystorelist, keystorelist;
dns_viewlist_t viewlist;
in_port_t listen_port, udpport_low, udpport_high;
in_port_t listen_port, port_low, port_high;
int i, backlog;
isc_interval_t interval;
isc_logconfig_t *logc = NULL;
@ -8048,39 +8048,26 @@ apply_configuration(cfg_obj_t *effectiveconfig, cfg_obj_t *bindkeys,
isc_portset_create(isc_g_mctx, &v4portset);
isc_portset_create(isc_g_mctx, &v6portset);
result = isc_net_getudpportrange(AF_INET, &udpport_low, &udpport_high);
if (result != ISC_R_SUCCESS) {
isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER,
ISC_LOG_ERROR,
"get the default UDP/IPv4 port range: %s",
isc_result_totext(result));
goto cleanup_portsets;
}
isc_portset_addrange(v4portset, udpport_low, udpport_high);
isc_net_getudpportrange(AF_INET, &port_low, &port_high);
isc_netmgr_portrange(AF_INET, port_low, port_high);
isc_portset_addrange(v4portset, port_low, port_high);
if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE4)) {
isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER,
ISC_LOG_INFO,
"using default UDP/IPv4 port range: "
"[%d, %d]",
udpport_low, udpport_high);
port_low, port_high);
}
result = isc_net_getudpportrange(AF_INET6, &udpport_low, &udpport_high);
if (result != ISC_R_SUCCESS) {
isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER,
ISC_LOG_ERROR,
"get the default UDP/IPv6 port range: %s",
isc_result_totext(result));
goto cleanup_portsets;
}
isc_portset_addrange(v6portset, udpport_low, udpport_high);
isc_net_getudpportrange(AF_INET6, &port_low, &port_high);
isc_netmgr_portrange(AF_INET6, port_low, port_high);
isc_portset_addrange(v6portset, port_low, port_high);
if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE6)) {
isc_log_write(NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER,
ISC_LOG_INFO,
"using default UDP/IPv6 port range: "
"[%d, %d]",
udpport_low, udpport_high);
port_low, port_high);
}
dns_dispatchmgr_setavailports(named_g_dispatchmgr, v4portset,

View file

@ -216,7 +216,7 @@ isc_net_enableipv4(void);
void
isc_net_enableipv6(void);
isc_result_t
void
isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high);
/*%<
* Returns system's default range of ephemeral UDP ports, if defined.

View file

@ -946,3 +946,9 @@ isc_nmsocket_getaddr(isc_nmsocket_t *sock);
/*%<
* Return the local address of 'sock'.
*/
void
isc_netmgr_portrange(sa_family_t af, in_port_t low, in_port_t high);
/*%<
 * Set the ephemeral port range <low, high> for 'af' family.
 *
 * The stored range is what the network manager consults when it sets
 * IP_LOCAL_PORT_RANGE on outgoing TCP sockets.
 *
 * Requires:
 *\li	The network manager has been created.
 *\li	'af' is AF_INET or AF_INET6.
 */

View file

@ -45,3 +45,10 @@ isc_os_umask(void);
/*%<
* Return umask of the current process as initialized at the program start
*/
void
isc_os_kernel(char **name, int *major, int *minor, int *patch);
/*%<
 * Report the name and version of the running kernel, as obtained from
 * uname() when the library was initialized.
 *
 * Any of 'name', 'major', 'minor' and 'patch' may be NULL, in which case
 * the corresponding value is not reported.
 *
 * '*name' is set to point to storage owned by the library (the system
 * name); the caller must not modify or free it.
 *
 * This function returns nothing; a version component that could not be
 * determined is stored as -1.
 */

View file

@ -19,9 +19,17 @@
/*
 * Bring up the global managers for 'workers' worker threads by calling
 * isc_loopmgr_create(), isc_netmgr_create() and isc_rwlock_setworkers(),
 * then pass the system ephemeral port range of each address family to
 * the network manager.
 *
 * NOTE(review): isc_netmgr_create() appears to seed the same port ranges
 * itself; confirm whether repeating it here is still required.
 */
void
isc_managers_create(uint32_t workers) {
	static const sa_family_t families[] = { AF_INET, AF_INET6 };

	isc_loopmgr_create(isc_g_mctx, workers);
	isc_netmgr_create(isc_g_mctx);
	isc_rwlock_setworkers(workers);

	/* IPv4 first, then IPv6. */
	for (size_t i = 0; i < sizeof(families) / sizeof(families[0]); i++) {
		in_port_t low, high;

		isc_net_getudpportrange(families[i], &low, &high);
		isc_netmgr_portrange(families[i], low, high);
	}
}
void

View file

@ -175,7 +175,7 @@ getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
#endif /* HAVE_SYSCTLBYNAME */
#endif /* USE_SYSCTL_PORTRANGE */
isc_result_t
void
isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) {
int result = ISC_R_FAILURE;
#if !defined(USE_SYSCTL_PORTRANGE) && defined(__linux)
@ -214,8 +214,6 @@ isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) {
*low = ISC_NET_PORTRANGELOW;
*high = ISC_NET_PORTRANGEHIGH;
}
return ISC_R_SUCCESS; /* we currently never fail in this function */
}
void

View file

@ -358,6 +358,12 @@ typedef struct isc__netmgr {
atomic_int_fast32_t send_udp_buffer_size;
atomic_int_fast32_t recv_tcp_buffer_size;
atomic_int_fast32_t send_tcp_buffer_size;
_Atomic(in_port_t) port_low4;
_Atomic(in_port_t) port_high4;
_Atomic(in_port_t) port_low6;
_Atomic(in_port_t) port_high6;
} isc__netmgr_t;
extern isc__netmgr_t *isc__netmgr;
@ -1387,9 +1393,11 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family);
*/
isc_result_t
isc__nm_tcp_bind_no_port(uv_tcp_t *handle);
isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED,
sa_family_t sa_family ISC_ATTR_UNUSED);
/*%<
* Set IP_BIND_ADDRESS_NO_PORT on the socket (Linux only).
* Set IP_BIND_ADDRESS_NO_PORT and IP_LOCAL_PORT_RANGE on the socket
* (Linux only).
*/
void

View file

@ -155,6 +155,7 @@ netmgr_teardown(void *arg ISC_ATTR_UNUSED) {
void
isc_netmgr_create(isc_mem_t *mctx) {
isc__netmgr_t *netmgr = NULL;
in_port_t port_low, port_high;
#ifdef MAXIMAL_UV_VERSION
if (uv_version() > MAXIMAL_UV_VERSION) {
@ -185,6 +186,11 @@ isc_netmgr_create(isc_mem_t *mctx) {
atomic_init(&netmgr->send_tcp_buffer_size, 0);
atomic_init(&netmgr->recv_udp_buffer_size, 0);
atomic_init(&netmgr->send_udp_buffer_size, 0);
atomic_init(&netmgr->port_low4, 0);
atomic_init(&netmgr->port_high4, 65535);
atomic_init(&netmgr->port_low6, 0);
atomic_init(&netmgr->port_high6, 65535);
#if HAVE_SO_REUSEPORT_LB
netmgr->load_balance_sockets = true;
#else
@ -237,6 +243,15 @@ isc_netmgr_create(isc_mem_t *mctx) {
}
isc__netmgr = netmgr;
/*
* Set the initial port range for IP_LOCAL_PORT_RANGE.
*/
isc_net_getudpportrange(AF_INET, &port_low, &port_high);
isc_netmgr_portrange(AF_INET, port_low, port_high);
isc_net_getudpportrange(AF_INET6, &port_low, &port_high);
isc_netmgr_portrange(AF_INET6, port_low, port_high);
}
/*
@ -2898,6 +2913,23 @@ isc__networker_get(uint32_t tid) {
return &isc__netmgr->workers[tid];
}
/*
 * Record <low, high> as the ephemeral port range of address family 'af'
 * in the global network manager.
 *
 * The network manager must already exist, and 'af' must be AF_INET or
 * AF_INET6; any other family trips an assertion.
 *
 * NOTE(review): the two bounds are written with two independent relaxed
 * stores, so a concurrent reader may briefly observe a new 'low' paired
 * with the previous 'high'.
 */
void
isc_netmgr_portrange(sa_family_t af, in_port_t low, in_port_t high) {
	REQUIRE(VALID_NM(isc__netmgr));

	if (af == AF_INET) {
		atomic_store_relaxed(&isc__netmgr->port_low4, low);
		atomic_store_relaxed(&isc__netmgr->port_high4, high);
		return;
	}

	if (af == AF_INET6) {
		atomic_store_relaxed(&isc__netmgr->port_low6, low);
		atomic_store_relaxed(&isc__netmgr->port_high6, high);
		return;
	}

	/* Only the IPv4 and IPv6 families have a stored range. */
	INSIST(0);
}
#if ISC_NETMGR_TRACE
/*
* Dump all active sockets in netmgr. We output to stderr

View file

@ -11,7 +11,10 @@
* information regarding copyright ownership.
*/
#include <netinet/in.h>
#include <isc/errno.h>
#include <isc/result.h>
#include <isc/uv.h>
#include "netmgr-int.h"
@ -370,17 +373,81 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) {
return ISC_R_SUCCESS;
}
isc_result_t
isc__nm_tcp_bind_no_port(uv_tcp_t *handle ISC_ATTR_UNUSED) {
#ifdef IP_BIND_ADDRESS_NO_PORT
uv_os_sock_t fd = -1;
/*
* See
* https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel
* for rationale.
*/
#define PORT_RANGE 1000
int r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd);
if (r < 0) {
isc_result_t
isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED,
sa_family_t af ISC_ATTR_UNUSED) {
#ifdef IP_BIND_ADDRESS_NO_PORT
if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) {
return ISC_R_FAILURE;
}
#endif
if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) {
#if defined(IP_LOCAL_PORT_RANGE) && defined(__linux__)
/*
* The option takes an uint32_t value with the high 16 bits
* set to the upper range bound, and the low 16 bits set to
* the lower range bound. Range bounds are inclusive. The
* 16-bit values should be in host byte order.
*/
uint32_t port_range;
int major, minor;
isc_os_kernel(NULL, &major, &minor, NULL);
in_port_t port_low, port_high;
switch (af) {
case AF_INET:
port_low = isc__netmgr->port_low4;
port_high = isc__netmgr->port_high4;
break;
case AF_INET6:
port_low = isc__netmgr->port_low6;
port_high = isc__netmgr->port_high6;
break;
default:
INSIST(0);
}
/*
* Linux 6.8 implemented the following patch:
*
* If IP_LOCAL_PORT_RANGE is set on a socket before accept(),
* port selection no longer favors even ports.
*
* This means that connect() can find a suitable source port
* faster, and applications can use a different split between
* connect() and bind() users.
*/
if (major < 6 || (major == 6 && minor < 8)) {
/*
* On Linux << 6.8, use IP_LOCAL_PORT_RANGE to
* partition ephemeral port range randomly to help
* with the port selection.
*/
if (port_high - port_low <= PORT_RANGE) {
return ISC_R_RANGE;
}
/*
* port_low <= N < port_high - PORT_RANGE
*/
port_high -= PORT_RANGE;
port_low += isc_random_uniform(port_high - port_low);
port_high = port_low + PORT_RANGE;
}
INSIST(port_low > 0);
INSIST(port_low < port_high);
port_range = (uint32_t)port_low | ((uint32_t)port_high << 16);
if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &port_range,
sizeof(port_range)) == -1)
{
return ISC_R_FAILURE;
}
#endif

View file

@ -12,6 +12,7 @@
*/
#include <libgen.h>
#include <string.h>
#include <unistd.h>
#include <isc/async.h>
@ -141,8 +142,6 @@ tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) {
}
isc__nm_incstats(sock, STATID_OPEN);
isc__nm_tcp_bind_no_port(&sock->uv_handle.tcp);
if (req->local.length != 0) {
r = uv_tcp_bind(&sock->uv_handle.tcp, &req->local.type.sa, 0);
if (r != 0) {
@ -291,6 +290,15 @@ isc_nm_tcpconnect(isc_sockaddr_t *local, isc_sockaddr_t *peer,
(void)isc__nm_socket_min_mtu(sock->fd, sa_family);
(void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG);
result = isc__nm_socket_max_port_range(sock->fd, sa_family);
if (result != ISC_R_SUCCESS) {
isc__nmsocket_log(sock, ISC_LOG_DEBUG(99),
"setting up IP_BIND_ADDRESS_NO_PORT or "
"IP_LOCAL_PORT_RANGE failed: %s\n",
result == ISC_R_RANGE
? isc_result_totext(result)
: strerror(errno));
}
sock->active = true;

View file

@ -11,10 +11,13 @@
* information regarding copyright ownership.
*/
#include <ctype.h>
#include <inttypes.h>
#include <sys/stat.h>
#include <sys/utsname.h>
#include <isc/os.h>
#include <isc/string.h>
#include <isc/types.h>
#include <isc/util.h>
#include <isc/uv.h>
@ -25,6 +28,8 @@
static unsigned int isc__os_ncpus = 0;
static unsigned long isc__os_cacheline = ISC_OS_CACHELINE_SIZE;
static mode_t isc__os_umask = 0;
static int kernel_major = -1, kernel_minor = -1, kernel_patch = -1;
static char kernel_name[64];
/*
* The affinity support for non-Linux is in the review in the upstream
@ -177,6 +182,19 @@ umask_initialize(void) {
(void)umask(isc__os_umask);
}
/*
 * Cache the kernel name and version reported by uname().
 *
 * The release string is parsed as "major.minor.patch"; any component
 * that does not parse keeps its initial value.  If uname() itself fails,
 * nothing is touched.
 */
static void
kernel_initialize(void) {
	struct utsname uts;

	if (uname(&uts) != -1) {
		(void)strlcpy(kernel_name, uts.sysname, sizeof(kernel_name));
		(void)sscanf(uts.release, "%d.%d.%d", &kernel_major,
			     &kernel_minor, &kernel_patch);
	}
}
unsigned int
isc_os_ncpus(void) {
return isc__os_ncpus;
@ -192,10 +210,19 @@ isc_os_umask(void) {
return isc__os_umask;
}
void
isc_os_kernel(char **name, int *major, int *minor, int *patch) {
SET_IF_NOT_NULL(name, kernel_name)
SET_IF_NOT_NULL(major, kernel_major);
SET_IF_NOT_NULL(minor, kernel_minor);
SET_IF_NOT_NULL(patch, kernel_patch);
}
void
isc__os_initialize(void) {
umask_initialize();
ncpus_initialize();
kernel_initialize();
#if defined(_SC_LEVEL1_DCACHE_LINESIZE)
long s = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
if (s > 0 && (unsigned long)s > isc__os_cacheline) {