mirror of
https://github.com/haproxy/haproxy.git
synced 2026-05-19 08:32:34 -04:00
The QUIC code can only handle IPv4 or IPv6 addresses, so using two sockaddr_storage structs wastes a lot of space in the quic_dgram struct. This is a very large overhead since this structure is written in the MPSC ring buffers before every datagram, while many of those datagrams are only 50 bytes or less. Using an union instead saves 200 bytes per datagram, increasing the capacity of the buffers significantly.
1116 lines
30 KiB
C
1116 lines
30 KiB
C
/*
|
|
* QUIC socket management.
|
|
*
|
|
* Copyright 2020 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*
|
|
*/
|
|
|
|
#define _GNU_SOURCE /* required for struct in6_pktinfo */
|
|
#include <errno.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include <sys/socket.h>
|
|
#include <sys/types.h>
|
|
#include <netinet/in.h>
|
|
#include <netinet/udp.h>
|
|
|
|
#include <haproxy/api.h>
|
|
#include <haproxy/buf.h>
|
|
#include <haproxy/connection.h>
|
|
#include <haproxy/counters.h>
|
|
#include <haproxy/dynbuf.h>
|
|
#include <haproxy/fd.h>
|
|
#include <haproxy/global-t.h>
|
|
#include <haproxy/list.h>
|
|
#include <haproxy/listener.h>
|
|
#include <haproxy/log.h>
|
|
#include <haproxy/obj_type.h>
|
|
#include <haproxy/pool.h>
|
|
#include <haproxy/protocol-t.h>
|
|
#include <haproxy/proto_quic.h>
|
|
#include <haproxy/proxy-t.h>
|
|
#include <haproxy/quic_cid.h>
|
|
#include <haproxy/quic_conn.h>
|
|
#include <haproxy/quic_rx.h>
|
|
#include <haproxy/quic_sock.h>
|
|
#include <haproxy/quic_stats.h>
|
|
#include <haproxy/quic_tp-t.h>
|
|
#include <haproxy/quic_trace.h>
|
|
#include <haproxy/session.h>
|
|
#include <haproxy/task.h>
|
|
#include <haproxy/trace.h>
|
|
#include <haproxy/tools.h>
|
|
#include <haproxy/trace.h>
|
|
|
|
/* Log only first EACCES bind() error runtime occurrence. */
|
|
static volatile char quic_bind_eacces_warn = 0;
|
|
|
|
/* Retrieve a connection's source address. Returns -1 on failure. */
|
|
int quic_sock_get_src(struct connection *conn, struct sockaddr *addr, socklen_t len)
|
|
{
|
|
struct quic_conn *qc;
|
|
|
|
if (!conn || !conn->handle.qc)
|
|
return -1;
|
|
|
|
qc = conn->handle.qc;
|
|
if (conn_is_back(conn)) {
|
|
/* source address should be known since connect() */
|
|
if (!is_addr(&qc->local_addr))
|
|
return -1;
|
|
|
|
if (len > sizeof(qc->local_addr))
|
|
len = sizeof(qc->local_addr);
|
|
memcpy(addr, &qc->local_addr, len);
|
|
return 0;
|
|
} else {
|
|
/* front connection, return the peer's address */
|
|
if (len > sizeof(qc->peer_addr))
|
|
len = sizeof(qc->peer_addr);
|
|
memcpy(addr, &qc->peer_addr, len);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/* Retrieve a connection's destination address. Returns -1 on failure. */
|
|
int quic_sock_get_dst(struct connection *conn, struct sockaddr *addr, socklen_t len)
|
|
{
|
|
struct quic_conn *qc;
|
|
|
|
if (!conn || !conn->handle.qc)
|
|
return -1;
|
|
|
|
qc = conn->handle.qc;
|
|
if (conn_is_back(conn)) {
|
|
/* back connection, return the peer's address */
|
|
if (len > sizeof(qc->peer_addr))
|
|
len = sizeof(qc->peer_addr);
|
|
memcpy(addr, &qc->peer_addr, len);
|
|
} else {
|
|
struct sockaddr_storage *from;
|
|
struct listener *l = qc->li;
|
|
|
|
/* Return listener address if IP_PKTINFO or friends are not
|
|
* supported by the socket.
|
|
*/
|
|
BUG_ON(!l);
|
|
from = is_addr(&qc->local_addr) ? &qc->local_addr : &l->rx.addr;
|
|
if (len > sizeof(*from))
|
|
len = sizeof(*from);
|
|
memcpy(addr, from, len);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Inspired from session_accept_fd().
|
|
* Instantiate a new connection (connection struct) to be attached to <qc>
|
|
* QUIC connection of <l> listener.
|
|
* Returns 1 if succeeded, 0 if not.
|
|
*/
|
|
static int new_quic_cli_conn(struct quic_conn *qc, struct listener *l,
|
|
struct sockaddr_storage *saddr)
|
|
{
|
|
struct connection *cli_conn;
|
|
|
|
if (unlikely((cli_conn = conn_new(&l->obj_type)) == NULL))
|
|
goto out;
|
|
|
|
if (!sockaddr_alloc(&cli_conn->src, saddr, sizeof *saddr))
|
|
goto out_free_conn;
|
|
|
|
cli_conn->flags |= CO_FL_FDLESS;
|
|
qc->conn = cli_conn;
|
|
cli_conn->handle.qc = qc;
|
|
|
|
cli_conn->target = &l->obj_type;
|
|
|
|
return 1;
|
|
|
|
out_free_conn:
|
|
qc->conn = NULL;
|
|
conn_stop_tracking(cli_conn);
|
|
conn_xprt_close(cli_conn);
|
|
conn_free(cli_conn);
|
|
out:
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Tests if the receiver supports accepting connections. Returns positive on
|
|
* success, 0 if not possible
|
|
*/
|
|
int quic_sock_accepting_conn(const struct receiver *rx)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
/* Accept an incoming connection from listener <l>, and return it, as well as
|
|
* a CO_AC_* status code into <status> if not null. Null is returned on error.
|
|
* <l> must be a valid listener with a valid frontend.
|
|
*/
|
|
struct connection *quic_sock_accept_conn(struct listener *l, int *status)
|
|
{
|
|
struct quic_conn *qc;
|
|
struct li_per_thread *lthr = &l->per_thr[ti->ltid];
|
|
|
|
qc = MT_LIST_POP(<hr->quic_accept.conns, struct quic_conn *, accept_list);
|
|
if (!qc || qc->flags & (QUIC_FL_CONN_CLOSING|QUIC_FL_CONN_DRAINING))
|
|
goto done;
|
|
|
|
if (!new_quic_cli_conn(qc, l, &qc->peer_addr))
|
|
goto err;
|
|
|
|
done:
|
|
*status = CO_AC_DONE;
|
|
|
|
if (qc) {
|
|
BUG_ON(l->rx.quic_curr_accept <= 0);
|
|
HA_ATOMIC_DEC(&l->rx.quic_curr_accept);
|
|
return qc->conn;
|
|
}
|
|
else {
|
|
return NULL;
|
|
}
|
|
|
|
err:
|
|
/* in case of error reinsert the element to process it later. */
|
|
MT_LIST_INSERT(<hr->quic_accept.conns, &qc->accept_list);
|
|
|
|
*status = CO_AC_PAUSE;
|
|
return NULL;
|
|
}
|
|
|
|
/* QUIC datagrams handler task. */
|
|
struct task *quic_lstnr_dghdlr(struct task *t, void *ctx, unsigned int state)
|
|
{
|
|
struct quic_dghdlr *dghdlr = ctx;
|
|
struct quic_dgram *dgram;
|
|
size_t len;
|
|
int max_dgrams = global.tune.maxpollevents;
|
|
|
|
TRACE_ENTER(QUIC_EV_CONN_LPKT);
|
|
|
|
while ((dgram = mpring_read_begin(&dghdlr->buf, &len))) {
|
|
/* We ignore the return value of quic_dgram_parse() because
|
|
* whether it was successful or not, we still need to empty the
|
|
* ring buffer. Exiting early would leave us with data left to
|
|
* process, and no guarantee we would get woken up again.
|
|
*/
|
|
quic_dgram_parse(dgram, NULL, dgram->owner);
|
|
mpring_read_end(&dghdlr->buf, len);
|
|
|
|
if (--max_dgrams <= 0) {
|
|
/* too much work done at once, come back here later */
|
|
tasklet_wakeup((struct tasklet *)t);
|
|
break;
|
|
}
|
|
}
|
|
|
|
TRACE_LEAVE(QUIC_EV_CONN_LPKT);
|
|
return t;
|
|
}
|
|
|
|
/* Retrieve the DCID from a QUIC datagram or packet at <pos> position,
|
|
* <end> being at one byte past the end of this datagram.
|
|
* Returns 1 if succeeded, 0 if not.
|
|
*/
|
|
static int quic_get_dgram_dcid(unsigned char *start, const unsigned char *end,
|
|
size_t *dcid_off, size_t *dcid_len)
|
|
{
|
|
unsigned char *pos;
|
|
int ret = 0, long_header;
|
|
size_t minlen, skip;
|
|
|
|
TRACE_ENTER(QUIC_EV_CONN_RXPKT);
|
|
|
|
pos = start;
|
|
if (!(*pos & QUIC_PACKET_FIXED_BIT)) {
|
|
TRACE_PROTO("fixed bit not set", QUIC_EV_CONN_RXPKT);
|
|
goto err;
|
|
}
|
|
|
|
long_header = *pos & QUIC_PACKET_LONG_HEADER_BIT;
|
|
minlen = long_header ? QUIC_LONG_PACKET_MINLEN :
|
|
QUIC_SHORT_PACKET_MINLEN + QUIC_HAP_CID_LEN + QUIC_TLS_TAG_LEN;
|
|
skip = long_header ? QUIC_LONG_PACKET_DCID_OFF : QUIC_SHORT_PACKET_DCID_OFF;
|
|
if (end - pos < minlen)
|
|
goto err;
|
|
|
|
pos += skip;
|
|
*dcid_len = long_header ? *pos++ : QUIC_HAP_CID_LEN;
|
|
if (*dcid_len > QUIC_CID_MAXLEN || end - pos <= *dcid_len)
|
|
goto err;
|
|
|
|
*dcid_off = pos - start;
|
|
|
|
ret = 1;
|
|
leave:
|
|
TRACE_LEAVE(QUIC_EV_CONN_RXPKT);
|
|
return ret;
|
|
|
|
err:
|
|
TRACE_PROTO("wrong datagram", QUIC_EV_CONN_RXPKT);
|
|
goto leave;
|
|
}
|
|
|
|
/* Initialize a QUIC datagram. */
|
|
static void quic_dgram_init(struct quic_dgram *dgram,
|
|
unsigned char *pos, size_t len, void *owner,
|
|
size_t dcid_off, size_t dcid_len,
|
|
struct sockaddr_storage *saddr,
|
|
struct sockaddr_storage *daddr)
|
|
{
|
|
BUG_ON_HOT(!is_inet_addr(saddr) || !is_inet_addr(daddr));
|
|
|
|
dgram->obj_type = OBJ_TYPE_DGRAM;
|
|
dgram->owner = owner;
|
|
dgram->buf = pos;
|
|
dgram->len = len;
|
|
dgram->dcid_off = dcid_off;
|
|
dgram->dcid_len = dcid_len;
|
|
memcpy(&dgram->saddr, saddr, sizeof(dgram->saddr));
|
|
memcpy(&dgram->daddr, daddr, sizeof(dgram->daddr));
|
|
dgram->qc = NULL;
|
|
dgram->flags = 0;
|
|
}
|
|
|
|
/* Receive a single message from datagram socket <fd>. Data are placed in <out>
|
|
* buffer of length <len>.
|
|
*
|
|
* Datagram addresses will be returned via the next arguments. <from> will be
|
|
* the peer address and <to> the reception one. Note that <to> can only be
|
|
* retrieved if the socket supports IP_PKTINFO or affiliated options. If not,
|
|
* <to> will be set as AF_UNSPEC. The caller must specify <to_port> to ensure
|
|
* that <to> address is completely filled.
|
|
*
|
|
* Set <check_port> if communication with privileged port should be handle with
|
|
* constraint, which is desirable when interacting with QUIC clients on
|
|
* frontend side.
|
|
*
|
|
* Returns value from recvmsg syscall.
|
|
*/
|
|
static ssize_t quic_recv(int fd, void *out, size_t len,
|
|
struct sockaddr *from, socklen_t from_len,
|
|
struct sockaddr *to, socklen_t to_len,
|
|
uint16_t dst_port, int check_port)
|
|
{
|
|
union pktinfo {
|
|
#ifdef IP_PKTINFO
|
|
struct in_pktinfo in;
|
|
#else /* !IP_PKTINFO */
|
|
struct in_addr addr;
|
|
#endif
|
|
#ifdef IPV6_RECVPKTINFO
|
|
struct in6_pktinfo in6;
|
|
#endif
|
|
};
|
|
char cdata[CMSG_SPACE(sizeof(union pktinfo))];
|
|
struct msghdr msg;
|
|
struct iovec vec;
|
|
struct cmsghdr *cmsg;
|
|
ssize_t ret;
|
|
|
|
vec.iov_base = out;
|
|
vec.iov_len = len;
|
|
|
|
memset(&msg, 0, sizeof(msg));
|
|
msg.msg_name = from;
|
|
msg.msg_namelen = from_len;
|
|
msg.msg_iov = &vec;
|
|
msg.msg_iovlen = 1;
|
|
msg.msg_control = &cdata;
|
|
msg.msg_controllen = sizeof(cdata);
|
|
|
|
clear_addr((struct sockaddr_storage *)to);
|
|
|
|
do {
|
|
ret = recvmsg(fd, &msg, 0);
|
|
} while (ret < 0 && errno == EINTR);
|
|
|
|
/* TODO handle errno. On EAGAIN/EWOULDBLOCK use fd_cant_recv() if
|
|
* using dedicated connection socket.
|
|
*/
|
|
|
|
if (ret < 0)
|
|
goto end;
|
|
|
|
if (check_port && unlikely(port_is_restricted((struct sockaddr_storage *)from, HA_PROTO_QUIC))) {
|
|
ret = -1;
|
|
goto end;
|
|
}
|
|
|
|
for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
|
|
switch (cmsg->cmsg_level) {
|
|
case IPPROTO_IP:
|
|
#if defined(IP_PKTINFO)
|
|
if (cmsg->cmsg_type == IP_PKTINFO) {
|
|
struct sockaddr_in *in = (struct sockaddr_in *)to;
|
|
struct in_pktinfo *info = (struct in_pktinfo *)CMSG_DATA(cmsg);
|
|
|
|
if (to_len >= sizeof(struct sockaddr_in)) {
|
|
in->sin_family = AF_INET;
|
|
in->sin_addr = info->ipi_addr;
|
|
in->sin_port = dst_port;
|
|
}
|
|
}
|
|
#elif defined(IP_RECVDSTADDR)
|
|
if (cmsg->cmsg_type == IP_RECVDSTADDR) {
|
|
struct sockaddr_in *in = (struct sockaddr_in *)to;
|
|
struct in_addr *info = (struct in_addr *)CMSG_DATA(cmsg);
|
|
|
|
if (to_len >= sizeof(struct sockaddr_in)) {
|
|
in->sin_family = AF_INET;
|
|
in->sin_addr.s_addr = info->s_addr;
|
|
in->sin_port = dst_port;
|
|
}
|
|
}
|
|
#endif /* IP_PKTINFO || IP_RECVDSTADDR */
|
|
break;
|
|
|
|
case IPPROTO_IPV6:
|
|
#ifdef IPV6_RECVPKTINFO
|
|
if (cmsg->cmsg_type == IPV6_PKTINFO) {
|
|
struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)to;
|
|
struct in6_pktinfo *info6 = (struct in6_pktinfo *)CMSG_DATA(cmsg);
|
|
|
|
if (to_len >= sizeof(struct sockaddr_in6)) {
|
|
in6->sin6_family = AF_INET6;
|
|
memcpy(&in6->sin6_addr, &info6->ipi6_addr, sizeof(in6->sin6_addr));
|
|
in6->sin6_port = dst_port;
|
|
}
|
|
}
|
|
#endif
|
|
break;
|
|
}
|
|
}
|
|
|
|
end:
|
|
return ret;
|
|
}
|
|
|
|
/* Low-level function to write a datagram to the buffer of the handler thread. */
|
|
static int quic_dgram_write(unsigned char *pos, size_t len, void *owner,
|
|
size_t dcid_off, size_t dcid_len,
|
|
struct sockaddr_storage *saddr,
|
|
struct sockaddr_storage *daddr,
|
|
unsigned int cid_tid)
|
|
{
|
|
struct mpring *dst;
|
|
struct quic_dgram *dgram;
|
|
unsigned char *data;
|
|
size_t bring_len;
|
|
void *buf;
|
|
|
|
dst = &quic_dghdlrs[cid_tid].buf;
|
|
|
|
bring_len = sizeof(struct quic_dgram) + len;
|
|
buf = mpring_write_reserve(dst, bring_len);
|
|
if (!buf)
|
|
return 0;
|
|
|
|
dgram = buf;
|
|
quic_dgram_init(dgram, pos, len, owner, dcid_off, dcid_len, saddr, daddr);
|
|
|
|
data = (unsigned char *)(dgram + 1);
|
|
memcpy(data, pos, len);
|
|
|
|
dgram->buf = data;
|
|
mpring_write_commit(dst, buf, bring_len);
|
|
|
|
/* typically quic_lstnr_dghdlr() */
|
|
tasklet_wakeup(quic_dghdlrs[cid_tid].task);
|
|
|
|
return 1;
|
|
}
|
|
|
|
int quic_dgram_requeue(struct quic_dgram *dgram, int cid_tid)
|
|
{
|
|
return quic_dgram_write(dgram->buf, dgram->len, dgram->owner,
|
|
dgram->dcid_off, dgram->dcid_len,
|
|
(struct sockaddr_storage *)&dgram->saddr,
|
|
(struct sockaddr_storage *)&dgram->daddr, cid_tid);
|
|
}
|
|
|
|
/* Attempt to push a datagram to its handler thread.
|
|
*
|
|
* Returns 1 if successful, or 0 if the handler thread's buffer is full. If
|
|
* the datagram could not be pushed, it will be put on the appropriate pending
|
|
* lists. In all cases, take complete ownership of dgram - it won't be valid
|
|
* anymore after calling this function.
|
|
*/
|
|
static int quic_lstnr_dgram_dispatch(unsigned char *pos, size_t len, struct listener *l,
|
|
size_t dcid_off, size_t dcid_len,
|
|
struct sockaddr_storage *saddr,
|
|
struct sockaddr_storage *daddr)
|
|
{
|
|
struct proxy *px;
|
|
struct quic_counters *prx_counters;
|
|
int cid_tid;
|
|
|
|
if (!len)
|
|
goto err;
|
|
|
|
if ((cid_tid = quic_get_cid_tid(pos + dcid_off, dcid_len, saddr, pos, len)) < 0) {
|
|
/* Use the current thread if CID not found. If a clients opens
|
|
* a connection with multiple packets, it is possible that
|
|
* several threads will deal with datagrams sharing the same
|
|
* CID. For this reason, the CID tree insertion will be
|
|
* conducted as an atomic operation and the datagram ultimately
|
|
* redispatch by the late thread.
|
|
*/
|
|
cid_tid = tid;
|
|
}
|
|
|
|
if (!quic_dgram_write(pos, len, l, dcid_off, dcid_len, saddr, daddr, cid_tid))
|
|
goto err;
|
|
|
|
return 1;
|
|
|
|
err:
|
|
px = l->bind_conf->frontend;
|
|
prx_counters = EXTRA_COUNTERS_GET(px->extra_counters_fe, &quic_stats_module);
|
|
HA_ATOMIC_INC(&prx_counters->rxbuf_full);
|
|
return 0;
|
|
}
|
|
|
|
/* Function called on a read event from a listening socket. It tries
|
|
* to handle as many connections as possible.
|
|
*/
|
|
void quic_lstnr_sock_fd_iocb(int fd)
|
|
{
|
|
ssize_t ret;
|
|
unsigned char buf[QUIC_MAX_UDP_PAYLOAD_SIZE];
|
|
struct listener *l = objt_listener(fdtab[fd].owner);
|
|
struct quic_transport_params *params;
|
|
/* Source address */
|
|
struct sockaddr_storage saddr = {0}, daddr = {0};
|
|
size_t dcid_off, dcid_len, max_sz;
|
|
int max_dgrams;
|
|
|
|
BUG_ON(!l);
|
|
|
|
if (!(fdtab[fd].state & FD_POLL_IN) || !fd_recv_ready(fd))
|
|
return;
|
|
|
|
max_dgrams = global.tune.maxpollevents;
|
|
params = &l->bind_conf->quic_params;
|
|
max_sz = params->max_udp_payload_size;
|
|
BUG_ON(max_sz > sizeof(buf));
|
|
|
|
start:
|
|
ret = quic_recv(fd, buf, sizeof(buf),
|
|
(struct sockaddr *)&saddr, sizeof(saddr),
|
|
(struct sockaddr *)&daddr, sizeof(daddr),
|
|
get_net_port(&l->rx.addr), 1);
|
|
if (ret <= 0)
|
|
return;
|
|
|
|
if (quic_get_dgram_dcid(buf, buf + ret, &dcid_off, &dcid_len))
|
|
quic_lstnr_dgram_dispatch(buf, ret, l, dcid_off, dcid_len, &saddr, &daddr);
|
|
|
|
if (--max_dgrams > 0)
|
|
goto start;
|
|
}
|
|
|
|
/* FD-owned quic-conn socket callback. */
|
|
void quic_conn_sock_fd_iocb(int fd)
|
|
{
|
|
struct quic_conn *qc = fdtab[fd].owner;
|
|
|
|
TRACE_ENTER(QUIC_EV_CONN_RCV, qc);
|
|
|
|
if (fd_send_active(fd) && fd_send_ready(fd)) {
|
|
TRACE_DEVEL("send ready", QUIC_EV_CONN_RCV, qc);
|
|
fd_stop_send(fd);
|
|
tasklet_wakeup_after(NULL, qc->wait_event.tasklet);
|
|
qc_notify_send(qc);
|
|
}
|
|
|
|
if (fd_recv_ready(fd)) {
|
|
TRACE_DEVEL("recv ready", QUIC_EV_CONN_RCV, qc);
|
|
tasklet_wakeup_after(NULL, qc->wait_event.tasklet);
|
|
fd_stop_recv(fd);
|
|
}
|
|
|
|
TRACE_LEAVE(QUIC_EV_CONN_RCV, qc);
|
|
}
|
|
|
|
/* quic_conn_closed FD handler (only used on backend side) */
|
|
void quic_conn_closed_sock_fd_iocb(int fd)
|
|
{
|
|
struct quic_conn_closed *cc_qc = fdtab[fd].owner;
|
|
|
|
TRACE_ENTER(QUIC_EV_CONN_RCV, cc_qc);
|
|
|
|
if (fd_send_active(fd) && fd_send_ready(fd)) {
|
|
TRACE_DEVEL("send ready", QUIC_EV_CONN_RCV, cc_qc);
|
|
fd_stop_send(fd);
|
|
tasklet_wakeup_after(NULL, cc_qc->wait_event.tasklet);
|
|
}
|
|
|
|
if (fd_recv_ready(fd)) {
|
|
TRACE_DEVEL("recv ready", QUIC_EV_CONN_RCV, cc_qc);
|
|
tasklet_wakeup_after(NULL, cc_qc->wait_event.tasklet);
|
|
fd_stop_recv(fd);
|
|
}
|
|
|
|
TRACE_LEAVE(QUIC_EV_CONN_RCV, cc_qc);
|
|
}
|
|
|
|
static void cmsg_set_saddr(struct msghdr *msg, struct cmsghdr **cmsg,
|
|
struct sockaddr_storage *saddr)
|
|
{
|
|
struct cmsghdr *c;
|
|
#ifdef IP_PKTINFO
|
|
struct in_pktinfo *in;
|
|
#endif /* IP_PKTINFO */
|
|
#ifdef IPV6_RECVPKTINFO
|
|
struct in6_pktinfo *in6;
|
|
#endif /* IPV6_RECVPKTINFO */
|
|
size_t sz = 0;
|
|
|
|
/* First determine size of ancillary data depending on the system support. */
|
|
switch (saddr->ss_family) {
|
|
case AF_INET:
|
|
#if defined(IP_PKTINFO)
|
|
sz = sizeof(struct in_pktinfo);
|
|
#elif defined(IP_RECVDSTADDR)
|
|
sz = sizeof(struct in_addr);
|
|
#endif /* IP_PKTINFO || IP_RECVDSTADDR */
|
|
break;
|
|
case AF_INET6:
|
|
#ifdef IPV6_RECVPKTINFO
|
|
sz = sizeof(struct in6_pktinfo);
|
|
#endif /* IPV6_RECVPKTINFO */
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
/* Size is null if system does not support send source address setting. */
|
|
if (!sz)
|
|
return;
|
|
|
|
/* Set first msg_controllen to be able to use CMSG_* macros. */
|
|
msg->msg_controllen += CMSG_SPACE(sz);
|
|
|
|
/* seems necessary to please gcc-13 */
|
|
ASSUME_NONNULL(CMSG_FIRSTHDR(msg));
|
|
|
|
*cmsg = !(*cmsg) ? CMSG_FIRSTHDR(msg) : CMSG_NXTHDR(msg, *cmsg);
|
|
ASSUME_NONNULL(*cmsg);
|
|
c = *cmsg;
|
|
c->cmsg_len = CMSG_LEN(sz);
|
|
|
|
switch (saddr->ss_family) {
|
|
case AF_INET:
|
|
c->cmsg_level = IPPROTO_IP;
|
|
#if defined(IP_PKTINFO)
|
|
c->cmsg_type = IP_PKTINFO;
|
|
in = (struct in_pktinfo *)CMSG_DATA(c);
|
|
in->ipi_ifindex = 0;
|
|
in->ipi_addr.s_addr = 0;
|
|
memcpy(&in->ipi_spec_dst,
|
|
&((struct sockaddr_in *)saddr)->sin_addr,
|
|
sizeof(struct in_addr));
|
|
#elif defined(IP_RECVDSTADDR)
|
|
c->cmsg_type = IP_SENDSRCADDR;
|
|
memcpy(CMSG_DATA(c),
|
|
&((struct sockaddr_in *)saddr)->sin_addr,
|
|
sizeof(struct in_addr));
|
|
#endif /* IP_PKTINFO || IP_RECVDSTADDR */
|
|
|
|
break;
|
|
|
|
case AF_INET6:
|
|
#ifdef IPV6_RECVPKTINFO
|
|
c->cmsg_level = IPPROTO_IPV6;
|
|
c->cmsg_type = IPV6_PKTINFO;
|
|
in6 = (struct in6_pktinfo *)CMSG_DATA(c);
|
|
in6->ipi6_ifindex = 0;
|
|
memcpy(&in6->ipi6_addr,
|
|
&((struct sockaddr_in6 *)saddr)->sin6_addr,
|
|
sizeof(struct in6_addr));
|
|
#endif /* IPV6_RECVPKTINFO */
|
|
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
static void cmsg_set_gso(struct msghdr *msg, struct cmsghdr **cmsg,
|
|
uint16_t gso_size)
|
|
{
|
|
#ifdef UDP_SEGMENT
|
|
struct cmsghdr *c;
|
|
size_t sz = sizeof(gso_size);
|
|
|
|
/* Set first msg_controllen to be able to use CMSG_* macros. */
|
|
msg->msg_controllen += CMSG_SPACE(sz);
|
|
|
|
/* seems necessary to please gcc-13 */
|
|
ASSUME_NONNULL(CMSG_FIRSTHDR(msg));
|
|
|
|
*cmsg = !(*cmsg) ? CMSG_FIRSTHDR(msg) : CMSG_NXTHDR(msg, *cmsg);
|
|
ASSUME_NONNULL(*cmsg);
|
|
c = *cmsg;
|
|
c->cmsg_len = CMSG_LEN(sz);
|
|
|
|
c->cmsg_level = SOL_UDP;
|
|
c->cmsg_type = UDP_SEGMENT;
|
|
c->cmsg_len = CMSG_LEN(sz);
|
|
write_u16(CMSG_DATA(c), gso_size);
|
|
#endif
|
|
}
|
|
|
|
/* Return 1 if the source address may be used, 0 if not. */
|
|
static int qc_may_use_saddr(struct quic_conn *qc)
|
|
{
|
|
/* For QUIC backends, the connection fd is always initialized */
|
|
/* Not useful for connection using the listener socket */
|
|
if (qc_test_fd(qc))
|
|
return 0;
|
|
|
|
/* Connection to a listener from here.
|
|
* The source address may be used when using listener socket (fd=-1) if
|
|
* possible. This is not useful if the listening socket is bound to
|
|
* a specific address. It is even prohibited on FreeBSD.
|
|
*/
|
|
return (!is_addr(&qc->li->rx.addr) &&
|
|
is_addr(&qc->local_addr));
|
|
}
|
|
|
|
/* Send a datagram stored into <buf> buffer with <sz> as size. The caller must
|
|
* ensure there is at least <sz> bytes in this buffer.
|
|
*
|
|
* If <gso_size> is non null, it will be used as value for UDP_SEGMENT option.
|
|
* This allows to transmit multiple datagrams in a single syscall.
|
|
*
|
|
* Returns the total bytes sent over the socket. 0 is returned if a transient
|
|
* error is encountered which allows send to be retry later. A negative value
|
|
* is used for a fatal error which guarantee that all future send operation for
|
|
* this connection will fail.
|
|
*
|
|
* TODO standardize this function for a generic UDP sendto wrapper. This can be
|
|
* done by removing the <qc> arg and replace it with address/port.
|
|
*/
|
|
int qc_snd_buf(struct quic_conn *qc, const struct buffer *buf, size_t sz,
|
|
int flags, uint16_t gso_size)
|
|
{
|
|
ssize_t ret;
|
|
struct msghdr msg;
|
|
struct iovec vec;
|
|
struct cmsghdr *cmsg __maybe_unused = NULL;
|
|
|
|
union {
|
|
#ifdef IP_PKTINFO
|
|
char buf[CMSG_SPACE(sizeof(struct in_pktinfo)) + CMSG_SPACE(sizeof(gso_size))];
|
|
#endif /* IP_PKTINFO */
|
|
#ifdef IPV6_RECVPKTINFO
|
|
char buf6[CMSG_SPACE(sizeof(struct in6_pktinfo)) + CMSG_SPACE(sizeof(gso_size))];
|
|
#endif /* IPV6_RECVPKTINFO */
|
|
char bufaddr[CMSG_SPACE(sizeof(struct in_addr)) + CMSG_SPACE(sizeof(gso_size))];
|
|
struct cmsghdr align;
|
|
} ancillary_data;
|
|
|
|
/* man 3 cmsg
|
|
*
|
|
* When initializing a buffer that will contain a
|
|
* series of cmsghdr structures (e.g., to be sent with
|
|
* sendmsg(2)), that buffer should first be
|
|
* zero-initialized to ensure the correct operation of
|
|
* CMSG_NXTHDR().
|
|
*/
|
|
memset(&ancillary_data, 0, sizeof(ancillary_data));
|
|
|
|
vec.iov_base = b_peek(buf, b_head_ofs(buf));
|
|
vec.iov_len = sz;
|
|
|
|
/* man 2 sendmsg
|
|
*
|
|
* The msg_name field is used on an unconnected socket to specify the
|
|
* target address for a datagram. It points to a buffer containing the
|
|
* address; the msg_namelen field should be set to the size of the
|
|
* address. For a connected socket, these fields should be specified
|
|
* as NULL and 0, respectively.
|
|
*/
|
|
if (!qc_test_fd(qc)) {
|
|
msg.msg_name = &qc->peer_addr;
|
|
msg.msg_namelen = get_addr_len(&qc->peer_addr);
|
|
}
|
|
else {
|
|
msg.msg_name = NULL;
|
|
msg.msg_namelen = 0;
|
|
}
|
|
|
|
msg.msg_iov = &vec;
|
|
msg.msg_iovlen = 1;
|
|
msg.msg_control = NULL;
|
|
msg.msg_controllen = 0;
|
|
msg.msg_flags = 0;
|
|
|
|
if (qc_test_fd(qc) && !fd_send_ready(qc->fd))
|
|
return 0;
|
|
|
|
if (qc_may_use_saddr(qc)) {
|
|
msg.msg_control = ancillary_data.bufaddr;
|
|
cmsg_set_saddr(&msg, &cmsg, &qc->local_addr);
|
|
}
|
|
|
|
/* Set GSO parameter if datagram size is bigger than MTU. */
|
|
if (gso_size) {
|
|
/* GSO size must be less than total data to sent for multiple datagrams. */
|
|
BUG_ON_HOT(b_data(buf) <= gso_size);
|
|
|
|
if (!msg.msg_control)
|
|
msg.msg_control = ancillary_data.bufaddr;
|
|
cmsg_set_gso(&msg, &cmsg, gso_size);
|
|
}
|
|
|
|
do {
|
|
ret = sendmsg(qc_fd(qc), &msg, MSG_DONTWAIT|MSG_NOSIGNAL);
|
|
} while (ret < 0 && errno == EINTR);
|
|
|
|
if (ret < 0) {
|
|
if (errno == EAGAIN || errno == EWOULDBLOCK ||
|
|
errno == ENOTCONN || errno == EINPROGRESS) {
|
|
/* transient error */
|
|
if (errno == EAGAIN || errno == EWOULDBLOCK)
|
|
qc->cntrs.socket_full++;
|
|
else
|
|
qc->cntrs.sendto_err++;
|
|
|
|
if (qc_test_fd(qc)) {
|
|
fd_want_send(qc->fd);
|
|
fd_cant_send(qc->fd);
|
|
}
|
|
TRACE_PRINTF(TRACE_LEVEL_USER, QUIC_EV_CONN_SPPKTS, qc, 0, 0, 0,
|
|
"UDP send failure errno=%d (%s)", errno, strerror(errno));
|
|
return 0;
|
|
}
|
|
else {
|
|
/* unrecoverable error */
|
|
qc->cntrs.sendto_err_unknown++;
|
|
TRACE_PRINTF(TRACE_LEVEL_USER, QUIC_EV_CONN_SPPKTS, qc, 0, 0, 0,
|
|
"UDP send failure errno=%d (%s)", errno, strerror(errno));
|
|
return -errno;
|
|
}
|
|
}
|
|
|
|
if (ret != sz)
|
|
return 0;
|
|
|
|
return ret;
|
|
}
|
|
|
|
/* Receive datagram on <qc> FD-owned socket.
|
|
*
|
|
* Returns the total number of bytes read or a negative value on error.
|
|
*/
|
|
int qc_rcv_buf(struct quic_conn *qc)
|
|
{
|
|
struct sockaddr_storage saddr = {0}, daddr = {0};
|
|
struct quic_dgram dgram, *new_dgram;
|
|
struct buffer buf = BUF_NULL;
|
|
unsigned char *dgram_buf;
|
|
size_t dcid_off, dcid_len;
|
|
ssize_t ret = 0;
|
|
struct listener *l = qc->li;
|
|
|
|
/* Do not call this if quic-conn FD is uninitialized. */
|
|
BUG_ON(qc->fd < 0);
|
|
|
|
TRACE_ENTER(QUIC_EV_CONN_RCV, qc);
|
|
|
|
new_dgram = &dgram;
|
|
do {
|
|
if (!b_alloc(&buf, DB_MUX_RX))
|
|
break; /* TODO subscribe for memory again available. */
|
|
|
|
b_reset(&buf);
|
|
BUG_ON(b_contig_space(&buf) < qc->max_udp_payload);
|
|
|
|
dgram_buf = (unsigned char *)b_tail(&buf);
|
|
ret = quic_recv(qc->fd, dgram_buf, qc->max_udp_payload,
|
|
(struct sockaddr *)&saddr, sizeof(saddr),
|
|
(struct sockaddr *)&daddr, sizeof(daddr),
|
|
get_net_port(&qc->local_addr), !!l);
|
|
if (ret <= 0) {
|
|
/* Subscribe FD for future reception. */
|
|
if (errno == EAGAIN || errno == EWOULDBLOCK || errno == ENOTCONN)
|
|
fd_want_recv(qc->fd);
|
|
/* TODO handle other error codes as fatal on the connection. */
|
|
break;
|
|
}
|
|
|
|
b_add(&buf, ret);
|
|
|
|
TRACE_DEVEL("read datagram", QUIC_EV_CONN_RCV, qc, new_dgram);
|
|
|
|
if (!quic_get_dgram_dcid(dgram_buf, dgram_buf + ret, &dcid_off, &dcid_len))
|
|
continue;
|
|
|
|
if (l && !qc_check_dcid(qc, dgram_buf + dcid_off, dcid_len)) {
|
|
/* Datagram received by error on the connection FD, dispatch it
|
|
* to its associated quic-conn.
|
|
*
|
|
* TODO count redispatch datagrams.
|
|
*/
|
|
TRACE_STATE("datagram for other connection on quic-conn socket, requeue it", QUIC_EV_CONN_RCV, qc);
|
|
|
|
quic_lstnr_dgram_dispatch(dgram_buf, ret, l, dcid_off, dcid_len, &saddr, &daddr);
|
|
continue;
|
|
}
|
|
|
|
quic_dgram_init(new_dgram, dgram_buf, ret, NULL, dcid_off, dcid_len, &saddr, &daddr);
|
|
quic_dgram_parse(new_dgram, qc, l ? &l->obj_type :
|
|
(qc->conn ? &__objt_server(qc->conn->target)->obj_type : NULL));
|
|
} while (ret > 0);
|
|
|
|
if (b_size(&buf)) {
|
|
b_free(&buf);
|
|
offer_buffers(NULL, 1);
|
|
}
|
|
|
|
TRACE_LEAVE(QUIC_EV_CONN_RCV, qc);
|
|
return ret;
|
|
}
|
|
|
|
/* Allocate a socket file-descriptor specific for QUIC connection <qc>.
|
|
* Endpoint addresses are specified by the two following arguments : <src> is
|
|
* the local address and <dst> is the remote one.
|
|
*
|
|
* Return the socket FD or a negative error code. On error, socket is marked as
|
|
* uninitialized.
|
|
* Note: This function must not be run for backends connection.
|
|
*/
|
|
void qc_alloc_fd(struct quic_conn *qc, const struct sockaddr_storage *src,
|
|
const struct sockaddr_storage *dst)
|
|
{
|
|
struct listener *l = qc->li;
|
|
struct bind_conf *bc = l->bind_conf;
|
|
struct proxy *p = bc->frontend;
|
|
int fd = -1;
|
|
int ret;
|
|
|
|
/* Must not happen. */
|
|
BUG_ON(src->ss_family != dst->ss_family);
|
|
|
|
qc_init_fd(qc);
|
|
|
|
fd = socket(src->ss_family, SOCK_DGRAM, 0);
|
|
if (fd < 0)
|
|
goto err;
|
|
|
|
if (fd >= global.maxsock) {
|
|
send_log(p, LOG_EMERG,
|
|
"Proxy %s reached the configured maximum connection limit. Please check the global 'maxconn' value.\n",
|
|
p->id);
|
|
goto err;
|
|
}
|
|
|
|
ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
|
|
if (ret < 0)
|
|
goto err;
|
|
|
|
switch (src->ss_family) {
|
|
case AF_INET:
|
|
#if defined(IP_PKTINFO)
|
|
ret = setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &one, sizeof(one));
|
|
#elif defined(IP_RECVDSTADDR)
|
|
ret = setsockopt(fd, IPPROTO_IP, IP_RECVDSTADDR, &one, sizeof(one));
|
|
#endif /* IP_PKTINFO || IP_RECVDSTADDR */
|
|
break;
|
|
case AF_INET6:
|
|
#ifdef IPV6_RECVPKTINFO
|
|
ret = setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one));
|
|
#endif
|
|
break;
|
|
}
|
|
if (ret < 0)
|
|
goto err;
|
|
|
|
ret = bind(fd, (struct sockaddr *)src, get_addr_len(src));
|
|
if (ret < 0) {
|
|
if (errno == EACCES) {
|
|
if (!quic_bind_eacces_warn) {
|
|
send_log(p, LOG_WARNING,
|
|
"Permission error on QUIC socket binding for proxy %s. Consider using setcap cap_net_bind_service (Linux only) or running as root.\n",
|
|
p->id);
|
|
quic_bind_eacces_warn = 1;
|
|
}
|
|
|
|
/* Fallback to listener socket for this receiver instance. */
|
|
HA_ATOMIC_STORE(&l->rx.quic_mode, QUIC_SOCK_MODE_LSTNR);
|
|
}
|
|
goto err;
|
|
}
|
|
|
|
ret = connect(fd, (struct sockaddr *)dst, get_addr_len(dst));
|
|
if (ret < 0)
|
|
goto err;
|
|
|
|
qc->fd = fd;
|
|
fd_set_nonblock(fd);
|
|
fd_insert(fd, qc, quic_conn_sock_fd_iocb, tgid, ti->ltid_bit);
|
|
fd_want_recv(fd);
|
|
|
|
return;
|
|
|
|
err:
|
|
if (fd >= 0)
|
|
close(fd);
|
|
}
|
|
|
|
/* Release socket file-descriptor specific for QUIC connection <qc>. Set
|
|
* <reinit> if socket should be reinitialized after address migration.
|
|
*/
|
|
void qc_release_fd(struct quic_conn *qc, int reinit)
|
|
{
|
|
if (qc_test_fd(qc)) {
|
|
fd_delete(qc->fd);
|
|
qc->fd = DEAD_FD_MAGIC;
|
|
|
|
if (reinit)
|
|
qc_init_fd(qc);
|
|
}
|
|
}
|
|
|
|
/* Wrapper for fd_want_recv(). Safe even if connection does not used its owned
|
|
* socket.
|
|
*/
|
|
void qc_want_recv(struct quic_conn *qc)
|
|
{
|
|
if (qc_test_fd(qc))
|
|
fd_want_recv(qc->fd);
|
|
}
|
|
|
|
/*********************** QUIC accept queue management ***********************/
|
|
/* per-thread accept queues */
|
|
struct quic_accept_queue *quic_accept_queues;
|
|
|
|
/* Install <qc> on the queue ready to be accepted. The queue task is then woken
|
|
* up.
|
|
*/
|
|
void quic_accept_push_qc(struct quic_conn *qc)
|
|
{
|
|
struct quic_accept_queue *queue = &quic_accept_queues[tid];
|
|
struct listener *l = qc->li;
|
|
struct li_per_thread *lthr = &l->per_thr[ti->ltid];
|
|
|
|
/* A connection must only be accepted once per instance. */
|
|
BUG_ON(qc->flags & QUIC_FL_CONN_ACCEPT_REGISTERED);
|
|
|
|
BUG_ON(MT_LIST_INLIST(&qc->accept_list));
|
|
HA_ATOMIC_INC(&l->rx.quic_curr_accept);
|
|
|
|
qc->flags |= QUIC_FL_CONN_ACCEPT_REGISTERED;
|
|
/* 1. insert the listener in the accept queue
|
|
*
|
|
* Use TRY_APPEND as there is a possible race even with INLIST if
|
|
* multiple threads try to add the same listener instance from several
|
|
* quic_conn.
|
|
*/
|
|
if (!MT_LIST_INLIST(&(lthr->quic_accept.list)))
|
|
MT_LIST_TRY_APPEND(&queue->listeners, &(lthr->quic_accept.list));
|
|
|
|
/* 2. insert the quic_conn in the listener per-thread queue. */
|
|
MT_LIST_APPEND(<hr->quic_accept.conns, &qc->accept_list);
|
|
|
|
/* 3. wake up the queue tasklet */
|
|
tasklet_wakeup(quic_accept_queues[tid].tasklet);
|
|
}
|
|
|
|
/* Tasklet handler to accept QUIC connections. Call listener_accept on every
|
|
* listener instances registered in the accept queue.
|
|
*/
|
|
struct task *quic_accept_run(struct task *t, void *ctx, unsigned int i)
|
|
{
|
|
struct li_per_thread *lthr;
|
|
struct mt_list back;
|
|
struct quic_accept_queue *queue = &quic_accept_queues[tid];
|
|
|
|
MT_LIST_FOR_EACH_ENTRY_LOCKED(lthr, &queue->listeners, quic_accept.list, back) {
|
|
listener_accept(lthr->li);
|
|
if (!MT_LIST_ISEMPTY(<hr->quic_accept.conns)) {
|
|
/* entry is left in queue */
|
|
tasklet_wakeup((struct tasklet*)t);
|
|
}
|
|
else {
|
|
mt_list_unlock_self(<hr->quic_accept.list);
|
|
lthr = NULL; /* delete it */
|
|
}
|
|
}
|
|
|
|
return t;
|
|
}
|
|
|
|
/* Returns the maximum number of QUIC connections waiting for handshake to
|
|
* complete in parallel on listener <l> instance. This is directly based on
|
|
* listener backlog value.
|
|
*/
|
|
int quic_listener_max_handshake(const struct listener *l)
|
|
{
|
|
return listener_backlog(l) / 2;
|
|
}
|
|
|
|
/* Returns the value which is considered as the maximum number of QUIC
|
|
* connections waiting to be accepted for listener <l> instance. This is
|
|
* directly based on listener backlog value.
|
|
*/
|
|
int quic_listener_max_accept(const struct listener *l)
|
|
{
|
|
return listener_backlog(l) / 2;
|
|
}
|
|
|
|
static int quic_alloc_accept_queues(void)
|
|
{
|
|
int i;
|
|
|
|
quic_accept_queues = calloc(global.nbthread,
|
|
sizeof(*quic_accept_queues));
|
|
if (!quic_accept_queues) {
|
|
ha_alert("Failed to allocate the quic accept queues.\n");
|
|
return 0;
|
|
}
|
|
|
|
for (i = 0; i < global.nbthread; ++i) {
|
|
struct tasklet *task;
|
|
if (!(task = tasklet_new())) {
|
|
ha_alert("Failed to allocate the quic accept queue on thread %d.\n", i);
|
|
return 0;
|
|
}
|
|
|
|
tasklet_set_tid(task, i);
|
|
task->process = quic_accept_run;
|
|
quic_accept_queues[i].tasklet = task;
|
|
|
|
MT_LIST_INIT(&quic_accept_queues[i].listeners);
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
REGISTER_POST_CHECK(quic_alloc_accept_queues);
|
|
|
|
static int quic_deallocate_accept_queues(void)
|
|
{
|
|
int i;
|
|
|
|
if (quic_accept_queues) {
|
|
for (i = 0; i < global.nbthread; ++i)
|
|
tasklet_free(quic_accept_queues[i].tasklet);
|
|
free(quic_accept_queues);
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
REGISTER_POST_DEINIT(quic_deallocate_accept_queues);
|