mirror of
https://github.com/haproxy/haproxy.git
synced 2026-05-27 11:52:34 -04:00
Recently, datagram reception architecture has been completely reworked to improve performance. A regression has been introduced when using traces in qc_rcv_buf() : datagram argument is uninitialized after recv syscall. This may cause a crash as CIDs buffer is dereferenced. Fix this by removing dgram argument from the affected trace. A new trace is added after quic_dgram_init() to keep the ability to display the received content. This issue has caused failure of all QUIC interop testing. No need to backport.
1118 lines
30 KiB
C
1118 lines
30 KiB
C
/*
|
|
* QUIC socket management.
|
|
*
|
|
* Copyright 2020 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*
|
|
*/
|
|
|
|
#define _GNU_SOURCE /* required for struct in6_pktinfo */
|
|
#include <errno.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include <sys/socket.h>
|
|
#include <sys/types.h>
|
|
#include <netinet/in.h>
|
|
#include <netinet/udp.h>
|
|
|
|
#include <haproxy/api.h>
|
|
#include <haproxy/buf.h>
|
|
#include <haproxy/connection.h>
|
|
#include <haproxy/counters.h>
|
|
#include <haproxy/dynbuf.h>
|
|
#include <haproxy/fd.h>
|
|
#include <haproxy/global-t.h>
|
|
#include <haproxy/list.h>
|
|
#include <haproxy/listener.h>
|
|
#include <haproxy/log.h>
|
|
#include <haproxy/obj_type.h>
|
|
#include <haproxy/pool.h>
|
|
#include <haproxy/protocol-t.h>
|
|
#include <haproxy/proto_quic.h>
|
|
#include <haproxy/proxy-t.h>
|
|
#include <haproxy/quic_cid.h>
|
|
#include <haproxy/quic_conn.h>
|
|
#include <haproxy/quic_rx.h>
|
|
#include <haproxy/quic_sock.h>
|
|
#include <haproxy/quic_stats.h>
|
|
#include <haproxy/quic_tp-t.h>
|
|
#include <haproxy/quic_trace.h>
|
|
#include <haproxy/session.h>
|
|
#include <haproxy/task.h>
|
|
#include <haproxy/trace.h>
|
|
#include <haproxy/tools.h>
|
|
#include <haproxy/trace.h>
|
|
|
|
/* Log only first EACCES bind() error runtime occurrence. */
|
|
static volatile char quic_bind_eacces_warn = 0;
|
|
|
|
/* Retrieve a connection's source address. Returns -1 on failure. */
|
|
int quic_sock_get_src(struct connection *conn, struct sockaddr *addr, socklen_t len)
|
|
{
|
|
struct quic_conn *qc;
|
|
|
|
if (!conn || !conn->handle.qc)
|
|
return -1;
|
|
|
|
qc = conn->handle.qc;
|
|
if (conn_is_back(conn)) {
|
|
/* source address should be known since connect() */
|
|
if (!is_addr(&qc->local_addr))
|
|
return -1;
|
|
|
|
if (len > sizeof(qc->local_addr))
|
|
len = sizeof(qc->local_addr);
|
|
memcpy(addr, &qc->local_addr, len);
|
|
return 0;
|
|
} else {
|
|
/* front connection, return the peer's address */
|
|
if (len > sizeof(qc->peer_addr))
|
|
len = sizeof(qc->peer_addr);
|
|
memcpy(addr, &qc->peer_addr, len);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/* Retrieve a connection's destination address. Returns -1 on failure. */
|
|
int quic_sock_get_dst(struct connection *conn, struct sockaddr *addr, socklen_t len)
|
|
{
|
|
struct quic_conn *qc;
|
|
|
|
if (!conn || !conn->handle.qc)
|
|
return -1;
|
|
|
|
qc = conn->handle.qc;
|
|
if (conn_is_back(conn)) {
|
|
/* back connection, return the peer's address */
|
|
if (len > sizeof(qc->peer_addr))
|
|
len = sizeof(qc->peer_addr);
|
|
memcpy(addr, &qc->peer_addr, len);
|
|
} else {
|
|
struct sockaddr_storage *from;
|
|
struct listener *l = qc->li;
|
|
|
|
/* Return listener address if IP_PKTINFO or friends are not
|
|
* supported by the socket.
|
|
*/
|
|
BUG_ON(!l);
|
|
from = is_addr(&qc->local_addr) ? &qc->local_addr : &l->rx.addr;
|
|
if (len > sizeof(*from))
|
|
len = sizeof(*from);
|
|
memcpy(addr, from, len);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Inspired from session_accept_fd().
|
|
* Instantiate a new connection (connection struct) to be attached to <qc>
|
|
* QUIC connection of <l> listener.
|
|
* Returns 1 if succeeded, 0 if not.
|
|
*/
|
|
static int new_quic_cli_conn(struct quic_conn *qc, struct listener *l,
|
|
struct sockaddr_storage *saddr)
|
|
{
|
|
struct connection *cli_conn;
|
|
|
|
if (unlikely((cli_conn = conn_new(&l->obj_type)) == NULL))
|
|
goto out;
|
|
|
|
if (!sockaddr_alloc(&cli_conn->src, saddr, sizeof *saddr))
|
|
goto out_free_conn;
|
|
|
|
cli_conn->flags |= CO_FL_FDLESS;
|
|
qc->conn = cli_conn;
|
|
cli_conn->handle.qc = qc;
|
|
|
|
cli_conn->target = &l->obj_type;
|
|
|
|
return 1;
|
|
|
|
out_free_conn:
|
|
qc->conn = NULL;
|
|
conn_stop_tracking(cli_conn);
|
|
conn_xprt_close(cli_conn);
|
|
conn_free(cli_conn);
|
|
out:
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Tests if the receiver supports accepting connections. Returns positive on
|
|
* success, 0 if not possible
|
|
*/
|
|
int quic_sock_accepting_conn(const struct receiver *rx)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
/* Accept an incoming connection from listener <l>, and return it, as well as
|
|
* a CO_AC_* status code into <status> if not null. Null is returned on error.
|
|
* <l> must be a valid listener with a valid frontend.
|
|
*/
|
|
struct connection *quic_sock_accept_conn(struct listener *l, int *status)
|
|
{
|
|
struct quic_conn *qc;
|
|
struct li_per_thread *lthr = &l->per_thr[ti->ltid];
|
|
|
|
qc = MT_LIST_POP(<hr->quic_accept.conns, struct quic_conn *, accept_list);
|
|
if (!qc || qc->flags & (QUIC_FL_CONN_CLOSING|QUIC_FL_CONN_DRAINING))
|
|
goto done;
|
|
|
|
if (!new_quic_cli_conn(qc, l, &qc->peer_addr))
|
|
goto err;
|
|
|
|
done:
|
|
*status = CO_AC_DONE;
|
|
|
|
if (qc) {
|
|
BUG_ON(l->rx.quic_curr_accept <= 0);
|
|
HA_ATOMIC_DEC(&l->rx.quic_curr_accept);
|
|
return qc->conn;
|
|
}
|
|
else {
|
|
return NULL;
|
|
}
|
|
|
|
err:
|
|
/* in case of error reinsert the element to process it later. */
|
|
MT_LIST_INSERT(<hr->quic_accept.conns, &qc->accept_list);
|
|
|
|
*status = CO_AC_PAUSE;
|
|
return NULL;
|
|
}
|
|
|
|
/* QUIC datagrams handler task. */
|
|
struct task *quic_lstnr_dghdlr(struct task *t, void *ctx, unsigned int state)
|
|
{
|
|
struct quic_dghdlr *dghdlr = ctx;
|
|
struct quic_dgram *dgram;
|
|
size_t len;
|
|
int max_dgrams = global.tune.maxpollevents;
|
|
|
|
TRACE_ENTER(QUIC_EV_CONN_LPKT);
|
|
|
|
while ((dgram = mpring_read_begin(&dghdlr->buf, &len))) {
|
|
/* We ignore the return value of quic_dgram_parse() because
|
|
* whether it was successful or not, we still need to empty the
|
|
* ring buffer. Exiting early would leave us with data left to
|
|
* process, and no guarantee we would get woken up again.
|
|
*/
|
|
quic_dgram_parse(dgram, NULL, dgram->owner);
|
|
mpring_read_end(&dghdlr->buf, len);
|
|
|
|
if (--max_dgrams <= 0) {
|
|
/* too much work done at once, come back here later */
|
|
tasklet_wakeup((struct tasklet *)t);
|
|
break;
|
|
}
|
|
}
|
|
|
|
TRACE_LEAVE(QUIC_EV_CONN_LPKT);
|
|
return t;
|
|
}
|
|
|
|
/* Retrieve the DCID from a QUIC datagram or packet at <start> position,
|
|
* <end> being at one byte past the end of this datagram.
|
|
* Returns 1 if succeeded, 0 if not.
|
|
*/
|
|
static int quic_get_dgram_dcid(unsigned char *start, const unsigned char *end,
|
|
size_t *dcid_off, size_t *dcid_len)
|
|
{
|
|
unsigned char *pos;
|
|
int ret = 0, long_header;
|
|
size_t minlen, skip;
|
|
|
|
TRACE_ENTER(QUIC_EV_CONN_RXPKT);
|
|
|
|
pos = start;
|
|
if (!(*pos & QUIC_PACKET_FIXED_BIT)) {
|
|
TRACE_PROTO("fixed bit not set", QUIC_EV_CONN_RXPKT);
|
|
goto err;
|
|
}
|
|
|
|
long_header = *pos & QUIC_PACKET_LONG_HEADER_BIT;
|
|
minlen = long_header ? QUIC_LONG_PACKET_MINLEN :
|
|
QUIC_SHORT_PACKET_MINLEN + QUIC_HAP_CID_LEN + QUIC_TLS_TAG_LEN;
|
|
skip = long_header ? QUIC_LONG_PACKET_DCID_OFF : QUIC_SHORT_PACKET_DCID_OFF;
|
|
if (end - pos < minlen)
|
|
goto err;
|
|
|
|
pos += skip;
|
|
*dcid_len = long_header ? *pos++ : QUIC_HAP_CID_LEN;
|
|
if (*dcid_len > QUIC_CID_MAXLEN || end - pos <= *dcid_len)
|
|
goto err;
|
|
|
|
*dcid_off = pos - start;
|
|
|
|
ret = 1;
|
|
leave:
|
|
TRACE_LEAVE(QUIC_EV_CONN_RXPKT);
|
|
return ret;
|
|
|
|
err:
|
|
TRACE_PROTO("wrong datagram", QUIC_EV_CONN_RXPKT);
|
|
goto leave;
|
|
}
|
|
|
|
/* Initialize a QUIC datagram. */
|
|
static void quic_dgram_init(struct quic_dgram *dgram,
|
|
unsigned char *pos, size_t len, void *owner,
|
|
size_t dcid_off, size_t dcid_len,
|
|
struct sockaddr_storage *saddr,
|
|
struct sockaddr_storage *daddr)
|
|
{
|
|
BUG_ON_HOT(!is_inet_addr(saddr) ||
|
|
(daddr->ss_family != AF_UNSPEC && !is_inet_addr(daddr)));
|
|
|
|
dgram->obj_type = OBJ_TYPE_DGRAM;
|
|
dgram->owner = owner;
|
|
dgram->buf = pos;
|
|
dgram->len = len;
|
|
dgram->dcid_off = dcid_off;
|
|
dgram->dcid_len = dcid_len;
|
|
memcpy(&dgram->saddr, saddr, sizeof(dgram->saddr));
|
|
memcpy(&dgram->daddr, daddr, sizeof(dgram->daddr));
|
|
dgram->qc = NULL;
|
|
dgram->flags = 0;
|
|
}
|
|
|
|
/* Receive a single message from datagram socket <fd>. Data are placed in <out>
|
|
* buffer of length <len>.
|
|
*
|
|
* Datagram addresses will be returned via the next arguments. <from> will be
|
|
* the peer address and <to> the reception one. Note that <to> can only be
|
|
* retrieved if the socket supports IP_PKTINFO or affiliated options. If not,
|
|
* <to> will be set as AF_UNSPEC. The caller must specify <to_port> to ensure
|
|
* that <to> address is completely filled.
|
|
*
|
|
* Set <check_port> if communication with privileged port should be handle with
|
|
* constraint, which is desirable when interacting with QUIC clients on
|
|
* frontend side.
|
|
*
|
|
* Returns value from recvmsg syscall.
|
|
*/
|
|
static ssize_t quic_recv(int fd, void *out, size_t len,
|
|
struct sockaddr *from, socklen_t from_len,
|
|
struct sockaddr *to, socklen_t to_len,
|
|
uint16_t dst_port, int check_port)
|
|
{
|
|
union pktinfo {
|
|
#ifdef IP_PKTINFO
|
|
struct in_pktinfo in;
|
|
#else /* !IP_PKTINFO */
|
|
struct in_addr addr;
|
|
#endif
|
|
#ifdef IPV6_RECVPKTINFO
|
|
struct in6_pktinfo in6;
|
|
#endif
|
|
};
|
|
char cdata[CMSG_SPACE(sizeof(union pktinfo))];
|
|
struct msghdr msg;
|
|
struct iovec vec;
|
|
struct cmsghdr *cmsg;
|
|
ssize_t ret;
|
|
|
|
vec.iov_base = out;
|
|
vec.iov_len = len;
|
|
|
|
memset(&msg, 0, sizeof(msg));
|
|
msg.msg_name = from;
|
|
msg.msg_namelen = from_len;
|
|
msg.msg_iov = &vec;
|
|
msg.msg_iovlen = 1;
|
|
msg.msg_control = &cdata;
|
|
msg.msg_controllen = sizeof(cdata);
|
|
|
|
clear_addr((struct sockaddr_storage *)to);
|
|
|
|
do {
|
|
ret = recvmsg(fd, &msg, 0);
|
|
} while (ret < 0 && errno == EINTR);
|
|
|
|
/* TODO handle errno. On EAGAIN/EWOULDBLOCK use fd_cant_recv() if
|
|
* using dedicated connection socket.
|
|
*/
|
|
|
|
if (ret < 0)
|
|
goto end;
|
|
|
|
if (check_port && unlikely(port_is_restricted((struct sockaddr_storage *)from, HA_PROTO_QUIC))) {
|
|
ret = -1;
|
|
goto end;
|
|
}
|
|
|
|
for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
|
|
switch (cmsg->cmsg_level) {
|
|
case IPPROTO_IP:
|
|
#if defined(IP_PKTINFO)
|
|
if (cmsg->cmsg_type == IP_PKTINFO) {
|
|
struct sockaddr_in *in = (struct sockaddr_in *)to;
|
|
struct in_pktinfo *info = (struct in_pktinfo *)CMSG_DATA(cmsg);
|
|
|
|
if (to_len >= sizeof(struct sockaddr_in)) {
|
|
in->sin_family = AF_INET;
|
|
in->sin_addr = info->ipi_addr;
|
|
in->sin_port = dst_port;
|
|
}
|
|
}
|
|
#elif defined(IP_RECVDSTADDR)
|
|
if (cmsg->cmsg_type == IP_RECVDSTADDR) {
|
|
struct sockaddr_in *in = (struct sockaddr_in *)to;
|
|
struct in_addr *info = (struct in_addr *)CMSG_DATA(cmsg);
|
|
|
|
if (to_len >= sizeof(struct sockaddr_in)) {
|
|
in->sin_family = AF_INET;
|
|
in->sin_addr.s_addr = info->s_addr;
|
|
in->sin_port = dst_port;
|
|
}
|
|
}
|
|
#endif /* IP_PKTINFO || IP_RECVDSTADDR */
|
|
break;
|
|
|
|
case IPPROTO_IPV6:
|
|
#ifdef IPV6_RECVPKTINFO
|
|
if (cmsg->cmsg_type == IPV6_PKTINFO) {
|
|
struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)to;
|
|
struct in6_pktinfo *info6 = (struct in6_pktinfo *)CMSG_DATA(cmsg);
|
|
|
|
if (to_len >= sizeof(struct sockaddr_in6)) {
|
|
in6->sin6_family = AF_INET6;
|
|
memcpy(&in6->sin6_addr, &info6->ipi6_addr, sizeof(in6->sin6_addr));
|
|
in6->sin6_port = dst_port;
|
|
}
|
|
}
|
|
#endif
|
|
break;
|
|
}
|
|
}
|
|
|
|
end:
|
|
return ret;
|
|
}
|
|
|
|
/* Low-level function to write a datagram to the buffer of the handler thread. */
|
|
static int quic_dgram_write(unsigned char *pos, size_t len, void *owner,
|
|
size_t dcid_off, size_t dcid_len,
|
|
struct sockaddr_storage *saddr,
|
|
struct sockaddr_storage *daddr,
|
|
unsigned int cid_tid)
|
|
{
|
|
struct mpring *dst;
|
|
struct quic_dgram *dgram;
|
|
unsigned char *data;
|
|
size_t bring_len;
|
|
void *buf;
|
|
|
|
dst = &quic_dghdlrs[cid_tid].buf;
|
|
|
|
bring_len = sizeof(struct quic_dgram) + len;
|
|
buf = mpring_write_reserve(dst, bring_len);
|
|
if (!buf)
|
|
return 0;
|
|
|
|
dgram = buf;
|
|
quic_dgram_init(dgram, pos, len, owner, dcid_off, dcid_len, saddr, daddr);
|
|
|
|
data = (unsigned char *)(dgram + 1);
|
|
memcpy(data, pos, len);
|
|
|
|
dgram->buf = data;
|
|
mpring_write_commit(dst, buf, bring_len);
|
|
|
|
/* typically quic_lstnr_dghdlr() */
|
|
tasklet_wakeup(quic_dghdlrs[cid_tid].task);
|
|
|
|
return 1;
|
|
}
|
|
|
|
int quic_dgram_requeue(struct quic_dgram *dgram, int cid_tid)
|
|
{
|
|
return quic_dgram_write(dgram->buf, dgram->len, dgram->owner,
|
|
dgram->dcid_off, dgram->dcid_len,
|
|
(struct sockaddr_storage *)&dgram->saddr,
|
|
(struct sockaddr_storage *)&dgram->daddr, cid_tid);
|
|
}
|
|
|
|
/* Attempt to push a datagram to its handler thread.
|
|
*
|
|
* Returns 1 if successful, or 0 if the handler thread's buffer is full. If
|
|
* the datagram could not be pushed, it will be put on the appropriate pending
|
|
* lists. In all cases, take complete ownership of dgram - it won't be valid
|
|
* anymore after calling this function.
|
|
*/
|
|
static int quic_lstnr_dgram_dispatch(unsigned char *pos, size_t len, struct listener *l,
|
|
size_t dcid_off, size_t dcid_len,
|
|
struct sockaddr_storage *saddr,
|
|
struct sockaddr_storage *daddr)
|
|
{
|
|
struct proxy *px;
|
|
struct quic_counters *prx_counters;
|
|
int cid_tid;
|
|
|
|
if (!len)
|
|
goto err;
|
|
|
|
if ((cid_tid = quic_get_cid_tid(pos + dcid_off, dcid_len, saddr, pos, len)) < 0) {
|
|
/* Use the current thread if CID not found. If a clients opens
|
|
* a connection with multiple packets, it is possible that
|
|
* several threads will deal with datagrams sharing the same
|
|
* CID. For this reason, the CID tree insertion will be
|
|
* conducted as an atomic operation and the datagram ultimately
|
|
* redispatch by the late thread.
|
|
*/
|
|
cid_tid = tid;
|
|
}
|
|
|
|
if (!quic_dgram_write(pos, len, l, dcid_off, dcid_len, saddr, daddr, cid_tid))
|
|
goto err;
|
|
|
|
return 1;
|
|
|
|
err:
|
|
px = l->bind_conf->frontend;
|
|
prx_counters = EXTRA_COUNTERS_GET(px->extra_counters_fe, &quic_stats_module);
|
|
HA_ATOMIC_INC(&prx_counters->rxbuf_full);
|
|
return 0;
|
|
}
|
|
|
|
/* Function called on a read event from a listening socket. It tries
|
|
* to handle as many connections as possible.
|
|
*/
|
|
void quic_lstnr_sock_fd_iocb(int fd)
|
|
{
|
|
ssize_t ret;
|
|
unsigned char buf[QUIC_MAX_UDP_PAYLOAD_SIZE];
|
|
struct listener *l = objt_listener(fdtab[fd].owner);
|
|
struct quic_transport_params *params;
|
|
/* Source address */
|
|
struct sockaddr_storage saddr = {0}, daddr = {0};
|
|
size_t dcid_off, dcid_len, max_sz;
|
|
int max_dgrams;
|
|
|
|
BUG_ON(!l);
|
|
|
|
if (!(fdtab[fd].state & FD_POLL_IN) || !fd_recv_ready(fd))
|
|
return;
|
|
|
|
max_dgrams = global.tune.maxpollevents;
|
|
params = &l->bind_conf->quic_params;
|
|
max_sz = params->max_udp_payload_size;
|
|
BUG_ON(max_sz > sizeof(buf));
|
|
|
|
start:
|
|
ret = quic_recv(fd, buf, sizeof(buf),
|
|
(struct sockaddr *)&saddr, sizeof(saddr),
|
|
(struct sockaddr *)&daddr, sizeof(daddr),
|
|
get_net_port(&l->rx.addr), 1);
|
|
if (ret <= 0)
|
|
return;
|
|
|
|
if (quic_get_dgram_dcid(buf, buf + ret, &dcid_off, &dcid_len))
|
|
quic_lstnr_dgram_dispatch(buf, ret, l, dcid_off, dcid_len, &saddr, &daddr);
|
|
|
|
if (--max_dgrams > 0)
|
|
goto start;
|
|
}
|
|
|
|
/* FD-owned quic-conn socket callback. */
|
|
void quic_conn_sock_fd_iocb(int fd)
|
|
{
|
|
struct quic_conn *qc = fdtab[fd].owner;
|
|
|
|
TRACE_ENTER(QUIC_EV_CONN_RCV, qc);
|
|
|
|
if (fd_send_active(fd) && fd_send_ready(fd)) {
|
|
TRACE_DEVEL("send ready", QUIC_EV_CONN_RCV, qc);
|
|
fd_stop_send(fd);
|
|
tasklet_wakeup_after(NULL, qc->wait_event.tasklet);
|
|
qc_notify_send(qc);
|
|
}
|
|
|
|
if (fd_recv_ready(fd)) {
|
|
TRACE_DEVEL("recv ready", QUIC_EV_CONN_RCV, qc);
|
|
tasklet_wakeup_after(NULL, qc->wait_event.tasklet);
|
|
fd_stop_recv(fd);
|
|
}
|
|
|
|
TRACE_LEAVE(QUIC_EV_CONN_RCV, qc);
|
|
}
|
|
|
|
/* quic_conn_closed FD handler (only used on backend side) */
|
|
void quic_conn_closed_sock_fd_iocb(int fd)
|
|
{
|
|
struct quic_conn_closed *cc_qc = fdtab[fd].owner;
|
|
|
|
TRACE_ENTER(QUIC_EV_CONN_RCV, cc_qc);
|
|
|
|
if (fd_send_active(fd) && fd_send_ready(fd)) {
|
|
TRACE_DEVEL("send ready", QUIC_EV_CONN_RCV, cc_qc);
|
|
fd_stop_send(fd);
|
|
tasklet_wakeup_after(NULL, cc_qc->wait_event.tasklet);
|
|
}
|
|
|
|
if (fd_recv_ready(fd)) {
|
|
TRACE_DEVEL("recv ready", QUIC_EV_CONN_RCV, cc_qc);
|
|
tasklet_wakeup_after(NULL, cc_qc->wait_event.tasklet);
|
|
fd_stop_recv(fd);
|
|
}
|
|
|
|
TRACE_LEAVE(QUIC_EV_CONN_RCV, cc_qc);
|
|
}
|
|
|
|
static void cmsg_set_saddr(struct msghdr *msg, struct cmsghdr **cmsg,
|
|
struct sockaddr_storage *saddr)
|
|
{
|
|
struct cmsghdr *c;
|
|
#ifdef IP_PKTINFO
|
|
struct in_pktinfo *in;
|
|
#endif /* IP_PKTINFO */
|
|
#ifdef IPV6_RECVPKTINFO
|
|
struct in6_pktinfo *in6;
|
|
#endif /* IPV6_RECVPKTINFO */
|
|
size_t sz = 0;
|
|
|
|
/* First determine size of ancillary data depending on the system support. */
|
|
switch (saddr->ss_family) {
|
|
case AF_INET:
|
|
#if defined(IP_PKTINFO)
|
|
sz = sizeof(struct in_pktinfo);
|
|
#elif defined(IP_RECVDSTADDR)
|
|
sz = sizeof(struct in_addr);
|
|
#endif /* IP_PKTINFO || IP_RECVDSTADDR */
|
|
break;
|
|
case AF_INET6:
|
|
#ifdef IPV6_RECVPKTINFO
|
|
sz = sizeof(struct in6_pktinfo);
|
|
#endif /* IPV6_RECVPKTINFO */
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
/* Size is null if system does not support send source address setting. */
|
|
if (!sz)
|
|
return;
|
|
|
|
/* Set first msg_controllen to be able to use CMSG_* macros. */
|
|
msg->msg_controllen += CMSG_SPACE(sz);
|
|
|
|
/* seems necessary to please gcc-13 */
|
|
ASSUME_NONNULL(CMSG_FIRSTHDR(msg));
|
|
|
|
*cmsg = !(*cmsg) ? CMSG_FIRSTHDR(msg) : CMSG_NXTHDR(msg, *cmsg);
|
|
ASSUME_NONNULL(*cmsg);
|
|
c = *cmsg;
|
|
c->cmsg_len = CMSG_LEN(sz);
|
|
|
|
switch (saddr->ss_family) {
|
|
case AF_INET:
|
|
c->cmsg_level = IPPROTO_IP;
|
|
#if defined(IP_PKTINFO)
|
|
c->cmsg_type = IP_PKTINFO;
|
|
in = (struct in_pktinfo *)CMSG_DATA(c);
|
|
in->ipi_ifindex = 0;
|
|
in->ipi_addr.s_addr = 0;
|
|
memcpy(&in->ipi_spec_dst,
|
|
&((struct sockaddr_in *)saddr)->sin_addr,
|
|
sizeof(struct in_addr));
|
|
#elif defined(IP_RECVDSTADDR)
|
|
c->cmsg_type = IP_SENDSRCADDR;
|
|
memcpy(CMSG_DATA(c),
|
|
&((struct sockaddr_in *)saddr)->sin_addr,
|
|
sizeof(struct in_addr));
|
|
#endif /* IP_PKTINFO || IP_RECVDSTADDR */
|
|
|
|
break;
|
|
|
|
case AF_INET6:
|
|
#ifdef IPV6_RECVPKTINFO
|
|
c->cmsg_level = IPPROTO_IPV6;
|
|
c->cmsg_type = IPV6_PKTINFO;
|
|
in6 = (struct in6_pktinfo *)CMSG_DATA(c);
|
|
in6->ipi6_ifindex = 0;
|
|
memcpy(&in6->ipi6_addr,
|
|
&((struct sockaddr_in6 *)saddr)->sin6_addr,
|
|
sizeof(struct in6_addr));
|
|
#endif /* IPV6_RECVPKTINFO */
|
|
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
static void cmsg_set_gso(struct msghdr *msg, struct cmsghdr **cmsg,
|
|
uint16_t gso_size)
|
|
{
|
|
#ifdef UDP_SEGMENT
|
|
struct cmsghdr *c;
|
|
size_t sz = sizeof(gso_size);
|
|
|
|
/* Set first msg_controllen to be able to use CMSG_* macros. */
|
|
msg->msg_controllen += CMSG_SPACE(sz);
|
|
|
|
/* seems necessary to please gcc-13 */
|
|
ASSUME_NONNULL(CMSG_FIRSTHDR(msg));
|
|
|
|
*cmsg = !(*cmsg) ? CMSG_FIRSTHDR(msg) : CMSG_NXTHDR(msg, *cmsg);
|
|
ASSUME_NONNULL(*cmsg);
|
|
c = *cmsg;
|
|
c->cmsg_len = CMSG_LEN(sz);
|
|
|
|
c->cmsg_level = SOL_UDP;
|
|
c->cmsg_type = UDP_SEGMENT;
|
|
c->cmsg_len = CMSG_LEN(sz);
|
|
write_u16(CMSG_DATA(c), gso_size);
|
|
#endif
|
|
}
|
|
|
|
/* Return 1 if the source address may be used, 0 if not. */
|
|
static int qc_may_use_saddr(struct quic_conn *qc)
|
|
{
|
|
/* For QUIC backends, the connection fd is always initialized */
|
|
/* Not useful for connection using the listener socket */
|
|
if (qc_test_fd(qc))
|
|
return 0;
|
|
|
|
/* Connection to a listener from here.
|
|
* The source address may be used when using listener socket (fd=-1) if
|
|
* possible. This is not useful if the listening socket is bound to
|
|
* a specific address. It is even prohibited on FreeBSD.
|
|
*/
|
|
return (!is_addr(&qc->li->rx.addr) &&
|
|
is_addr(&qc->local_addr));
|
|
}
|
|
|
|
/* Send a datagram stored into <buf> buffer with <sz> as size. The caller must
|
|
* ensure there is at least <sz> bytes in this buffer.
|
|
*
|
|
* If <gso_size> is non null, it will be used as value for UDP_SEGMENT option.
|
|
* This allows to transmit multiple datagrams in a single syscall.
|
|
*
|
|
* Returns the total bytes sent over the socket. 0 is returned if a transient
|
|
* error is encountered which allows send to be retry later. A negative value
|
|
* is used for a fatal error which guarantee that all future send operation for
|
|
* this connection will fail.
|
|
*
|
|
* TODO standardize this function for a generic UDP sendto wrapper. This can be
|
|
* done by removing the <qc> arg and replace it with address/port.
|
|
*/
|
|
int qc_snd_buf(struct quic_conn *qc, const struct buffer *buf, size_t sz,
|
|
int flags, uint16_t gso_size)
|
|
{
|
|
ssize_t ret;
|
|
struct msghdr msg;
|
|
struct iovec vec;
|
|
struct cmsghdr *cmsg __maybe_unused = NULL;
|
|
|
|
union {
|
|
#ifdef IP_PKTINFO
|
|
char buf[CMSG_SPACE(sizeof(struct in_pktinfo)) + CMSG_SPACE(sizeof(gso_size))];
|
|
#endif /* IP_PKTINFO */
|
|
#ifdef IPV6_RECVPKTINFO
|
|
char buf6[CMSG_SPACE(sizeof(struct in6_pktinfo)) + CMSG_SPACE(sizeof(gso_size))];
|
|
#endif /* IPV6_RECVPKTINFO */
|
|
char bufaddr[CMSG_SPACE(sizeof(struct in_addr)) + CMSG_SPACE(sizeof(gso_size))];
|
|
struct cmsghdr align;
|
|
} ancillary_data;
|
|
|
|
/* man 3 cmsg
|
|
*
|
|
* When initializing a buffer that will contain a
|
|
* series of cmsghdr structures (e.g., to be sent with
|
|
* sendmsg(2)), that buffer should first be
|
|
* zero-initialized to ensure the correct operation of
|
|
* CMSG_NXTHDR().
|
|
*/
|
|
memset(&ancillary_data, 0, sizeof(ancillary_data));
|
|
|
|
vec.iov_base = b_peek(buf, b_head_ofs(buf));
|
|
vec.iov_len = sz;
|
|
|
|
/* man 2 sendmsg
|
|
*
|
|
* The msg_name field is used on an unconnected socket to specify the
|
|
* target address for a datagram. It points to a buffer containing the
|
|
* address; the msg_namelen field should be set to the size of the
|
|
* address. For a connected socket, these fields should be specified
|
|
* as NULL and 0, respectively.
|
|
*/
|
|
if (!qc_test_fd(qc)) {
|
|
msg.msg_name = &qc->peer_addr;
|
|
msg.msg_namelen = get_addr_len(&qc->peer_addr);
|
|
}
|
|
else {
|
|
msg.msg_name = NULL;
|
|
msg.msg_namelen = 0;
|
|
}
|
|
|
|
msg.msg_iov = &vec;
|
|
msg.msg_iovlen = 1;
|
|
msg.msg_control = NULL;
|
|
msg.msg_controllen = 0;
|
|
msg.msg_flags = 0;
|
|
|
|
if (qc_test_fd(qc) && !fd_send_ready(qc->fd))
|
|
return 0;
|
|
|
|
if (qc_may_use_saddr(qc)) {
|
|
msg.msg_control = ancillary_data.bufaddr;
|
|
cmsg_set_saddr(&msg, &cmsg, &qc->local_addr);
|
|
}
|
|
|
|
/* Set GSO parameter if datagram size is bigger than MTU. */
|
|
if (gso_size) {
|
|
/* GSO size must be less than total data to sent for multiple datagrams. */
|
|
BUG_ON_HOT(b_data(buf) <= gso_size);
|
|
|
|
if (!msg.msg_control)
|
|
msg.msg_control = ancillary_data.bufaddr;
|
|
cmsg_set_gso(&msg, &cmsg, gso_size);
|
|
}
|
|
|
|
do {
|
|
ret = sendmsg(qc_fd(qc), &msg, MSG_DONTWAIT|MSG_NOSIGNAL);
|
|
} while (ret < 0 && errno == EINTR);
|
|
|
|
if (ret < 0) {
|
|
if (errno == EAGAIN || errno == EWOULDBLOCK ||
|
|
errno == ENOTCONN || errno == EINPROGRESS) {
|
|
/* transient error */
|
|
if (errno == EAGAIN || errno == EWOULDBLOCK)
|
|
qc->cntrs.socket_full++;
|
|
else
|
|
qc->cntrs.sendto_err++;
|
|
|
|
if (qc_test_fd(qc)) {
|
|
fd_want_send(qc->fd);
|
|
fd_cant_send(qc->fd);
|
|
}
|
|
TRACE_PRINTF(TRACE_LEVEL_USER, QUIC_EV_CONN_SPPKTS, qc, 0, 0, 0,
|
|
"UDP send failure errno=%d (%s)", errno, strerror(errno));
|
|
return 0;
|
|
}
|
|
else {
|
|
/* unrecoverable error */
|
|
qc->cntrs.sendto_err_unknown++;
|
|
TRACE_PRINTF(TRACE_LEVEL_USER, QUIC_EV_CONN_SPPKTS, qc, 0, 0, 0,
|
|
"UDP send failure errno=%d (%s)", errno, strerror(errno));
|
|
return -errno;
|
|
}
|
|
}
|
|
|
|
if (ret != sz)
|
|
return 0;
|
|
|
|
return ret;
|
|
}
|
|
|
|
/* Receive datagram on <qc> FD-owned socket.
|
|
*
|
|
* Returns the total number of bytes read or a negative value on error.
|
|
*/
|
|
int qc_rcv_buf(struct quic_conn *qc)
|
|
{
|
|
struct sockaddr_storage saddr = {0}, daddr = {0};
|
|
struct quic_dgram dgram, *new_dgram;
|
|
struct buffer buf = BUF_NULL;
|
|
unsigned char *dgram_buf;
|
|
size_t dcid_off, dcid_len;
|
|
ssize_t ret = 0;
|
|
struct listener *l = qc->li;
|
|
|
|
/* Do not call this if quic-conn FD is uninitialized. */
|
|
BUG_ON(qc->fd < 0);
|
|
|
|
TRACE_ENTER(QUIC_EV_CONN_RCV, qc);
|
|
|
|
new_dgram = &dgram;
|
|
do {
|
|
if (!b_alloc(&buf, DB_MUX_RX))
|
|
break; /* TODO subscribe for memory again available. */
|
|
|
|
b_reset(&buf);
|
|
BUG_ON(b_contig_space(&buf) < qc->max_udp_payload);
|
|
|
|
dgram_buf = (unsigned char *)b_tail(&buf);
|
|
ret = quic_recv(qc->fd, dgram_buf, qc->max_udp_payload,
|
|
(struct sockaddr *)&saddr, sizeof(saddr),
|
|
(struct sockaddr *)&daddr, sizeof(daddr),
|
|
get_net_port(&qc->local_addr), !!l);
|
|
if (ret <= 0) {
|
|
/* Subscribe FD for future reception. */
|
|
if (errno == EAGAIN || errno == EWOULDBLOCK || errno == ENOTCONN)
|
|
fd_want_recv(qc->fd);
|
|
/* TODO handle other error codes as fatal on the connection. */
|
|
break;
|
|
}
|
|
|
|
b_add(&buf, ret);
|
|
|
|
TRACE_DEVEL("read datagram", QUIC_EV_CONN_RCV, qc);
|
|
|
|
if (!quic_get_dgram_dcid(dgram_buf, dgram_buf + ret, &dcid_off, &dcid_len))
|
|
continue;
|
|
|
|
if (l && !qc_check_dcid(qc, dgram_buf + dcid_off, dcid_len)) {
|
|
/* Datagram received by error on the connection FD, dispatch it
|
|
* to its associated quic-conn.
|
|
*
|
|
* TODO count redispatch datagrams.
|
|
*/
|
|
TRACE_STATE("datagram for other connection on quic-conn socket, requeue it", QUIC_EV_CONN_RCV, qc);
|
|
|
|
quic_lstnr_dgram_dispatch(dgram_buf, ret, l, dcid_off, dcid_len, &saddr, &daddr);
|
|
continue;
|
|
}
|
|
|
|
quic_dgram_init(new_dgram, dgram_buf, ret, NULL, dcid_off, dcid_len, &saddr, &daddr);
|
|
TRACE_DEVEL("datagram ready for parsing", QUIC_EV_CONN_RCV, qc, new_dgram);
|
|
quic_dgram_parse(new_dgram, qc, l ? &l->obj_type :
|
|
(qc->conn ? &__objt_server(qc->conn->target)->obj_type : NULL));
|
|
} while (ret > 0);
|
|
|
|
if (b_size(&buf)) {
|
|
b_free(&buf);
|
|
offer_buffers(NULL, 1);
|
|
}
|
|
|
|
TRACE_LEAVE(QUIC_EV_CONN_RCV, qc);
|
|
return ret;
|
|
}
|
|
|
|
/* Allocate a socket file-descriptor specific for QUIC connection <qc>.
|
|
* Endpoint addresses are specified by the two following arguments : <src> is
|
|
* the local address and <dst> is the remote one.
|
|
*
|
|
* Return the socket FD or a negative error code. On error, socket is marked as
|
|
* uninitialized.
|
|
* Note: This function must not be run for backends connection.
|
|
*/
|
|
void qc_alloc_fd(struct quic_conn *qc, const struct sockaddr_storage *src,
|
|
const struct sockaddr_storage *dst)
|
|
{
|
|
struct listener *l = qc->li;
|
|
struct bind_conf *bc = l->bind_conf;
|
|
struct proxy *p = bc->frontend;
|
|
int fd = -1;
|
|
int ret;
|
|
|
|
/* Must not happen. */
|
|
BUG_ON(src->ss_family != dst->ss_family);
|
|
|
|
qc_init_fd(qc);
|
|
|
|
fd = socket(src->ss_family, SOCK_DGRAM, 0);
|
|
if (fd < 0)
|
|
goto err;
|
|
|
|
if (fd >= global.maxsock) {
|
|
send_log(p, LOG_EMERG,
|
|
"Proxy %s reached the configured maximum connection limit. Please check the global 'maxconn' value.\n",
|
|
p->id);
|
|
goto err;
|
|
}
|
|
|
|
ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
|
|
if (ret < 0)
|
|
goto err;
|
|
|
|
switch (src->ss_family) {
|
|
case AF_INET:
|
|
#if defined(IP_PKTINFO)
|
|
ret = setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &one, sizeof(one));
|
|
#elif defined(IP_RECVDSTADDR)
|
|
ret = setsockopt(fd, IPPROTO_IP, IP_RECVDSTADDR, &one, sizeof(one));
|
|
#endif /* IP_PKTINFO || IP_RECVDSTADDR */
|
|
break;
|
|
case AF_INET6:
|
|
#ifdef IPV6_RECVPKTINFO
|
|
ret = setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one));
|
|
#endif
|
|
break;
|
|
}
|
|
if (ret < 0)
|
|
goto err;
|
|
|
|
ret = bind(fd, (struct sockaddr *)src, get_addr_len(src));
|
|
if (ret < 0) {
|
|
if (errno == EACCES) {
|
|
if (!quic_bind_eacces_warn) {
|
|
send_log(p, LOG_WARNING,
|
|
"Permission error on QUIC socket binding for proxy %s. Consider using setcap cap_net_bind_service (Linux only) or running as root.\n",
|
|
p->id);
|
|
quic_bind_eacces_warn = 1;
|
|
}
|
|
|
|
/* Fallback to listener socket for this receiver instance. */
|
|
HA_ATOMIC_STORE(&l->rx.quic_mode, QUIC_SOCK_MODE_LSTNR);
|
|
}
|
|
goto err;
|
|
}
|
|
|
|
ret = connect(fd, (struct sockaddr *)dst, get_addr_len(dst));
|
|
if (ret < 0)
|
|
goto err;
|
|
|
|
qc->fd = fd;
|
|
fd_set_nonblock(fd);
|
|
fd_insert(fd, qc, quic_conn_sock_fd_iocb, tgid, ti->ltid_bit);
|
|
fd_want_recv(fd);
|
|
|
|
return;
|
|
|
|
err:
|
|
if (fd >= 0)
|
|
close(fd);
|
|
}
|
|
|
|
/* Release socket file-descriptor specific for QUIC connection <qc>. Set
|
|
* <reinit> if socket should be reinitialized after address migration.
|
|
*/
|
|
void qc_release_fd(struct quic_conn *qc, int reinit)
|
|
{
|
|
if (qc_test_fd(qc)) {
|
|
fd_delete(qc->fd);
|
|
qc->fd = DEAD_FD_MAGIC;
|
|
|
|
if (reinit)
|
|
qc_init_fd(qc);
|
|
}
|
|
}
|
|
|
|
/* Wrapper for fd_want_recv(). Safe even if connection does not used its owned
|
|
* socket.
|
|
*/
|
|
void qc_want_recv(struct quic_conn *qc)
|
|
{
|
|
if (qc_test_fd(qc))
|
|
fd_want_recv(qc->fd);
|
|
}
|
|
|
|
/*********************** QUIC accept queue management ***********************/
|
|
/* per-thread accept queues */
|
|
struct quic_accept_queue *quic_accept_queues;
|
|
|
|
/* Install <qc> on the queue ready to be accepted. The queue task is then woken
|
|
* up.
|
|
*/
|
|
void quic_accept_push_qc(struct quic_conn *qc)
|
|
{
|
|
struct quic_accept_queue *queue = &quic_accept_queues[tid];
|
|
struct listener *l = qc->li;
|
|
struct li_per_thread *lthr = &l->per_thr[ti->ltid];
|
|
|
|
/* A connection must only be accepted once per instance. */
|
|
BUG_ON(qc->flags & QUIC_FL_CONN_ACCEPT_REGISTERED);
|
|
|
|
BUG_ON(MT_LIST_INLIST(&qc->accept_list));
|
|
HA_ATOMIC_INC(&l->rx.quic_curr_accept);
|
|
|
|
qc->flags |= QUIC_FL_CONN_ACCEPT_REGISTERED;
|
|
/* 1. insert the listener in the accept queue
|
|
*
|
|
* Use TRY_APPEND as there is a possible race even with INLIST if
|
|
* multiple threads try to add the same listener instance from several
|
|
* quic_conn.
|
|
*/
|
|
if (!MT_LIST_INLIST(&(lthr->quic_accept.list)))
|
|
MT_LIST_TRY_APPEND(&queue->listeners, &(lthr->quic_accept.list));
|
|
|
|
/* 2. insert the quic_conn in the listener per-thread queue. */
|
|
MT_LIST_APPEND(<hr->quic_accept.conns, &qc->accept_list);
|
|
|
|
/* 3. wake up the queue tasklet */
|
|
tasklet_wakeup(quic_accept_queues[tid].tasklet);
|
|
}
|
|
|
|
/* Tasklet handler to accept QUIC connections. Call listener_accept on every
|
|
* listener instances registered in the accept queue.
|
|
*/
|
|
struct task *quic_accept_run(struct task *t, void *ctx, unsigned int i)
|
|
{
|
|
struct li_per_thread *lthr;
|
|
struct mt_list back;
|
|
struct quic_accept_queue *queue = &quic_accept_queues[tid];
|
|
|
|
MT_LIST_FOR_EACH_ENTRY_LOCKED(lthr, &queue->listeners, quic_accept.list, back) {
|
|
listener_accept(lthr->li);
|
|
if (!MT_LIST_ISEMPTY(<hr->quic_accept.conns)) {
|
|
/* entry is left in queue */
|
|
tasklet_wakeup((struct tasklet*)t);
|
|
}
|
|
else {
|
|
mt_list_unlock_self(<hr->quic_accept.list);
|
|
lthr = NULL; /* delete it */
|
|
}
|
|
}
|
|
|
|
return t;
|
|
}
|
|
|
|
/* Returns the maximum number of QUIC connections waiting for handshake to
|
|
* complete in parallel on listener <l> instance. This is directly based on
|
|
* listener backlog value.
|
|
*/
|
|
int quic_listener_max_handshake(const struct listener *l)
|
|
{
|
|
return listener_backlog(l) / 2;
|
|
}
|
|
|
|
/* Returns the value which is considered as the maximum number of QUIC
|
|
* connections waiting to be accepted for listener <l> instance. This is
|
|
* directly based on listener backlog value.
|
|
*/
|
|
int quic_listener_max_accept(const struct listener *l)
|
|
{
|
|
return listener_backlog(l) / 2;
|
|
}
|
|
|
|
static int quic_alloc_accept_queues(void)
|
|
{
|
|
int i;
|
|
|
|
quic_accept_queues = calloc(global.nbthread,
|
|
sizeof(*quic_accept_queues));
|
|
if (!quic_accept_queues) {
|
|
ha_alert("Failed to allocate the quic accept queues.\n");
|
|
return 0;
|
|
}
|
|
|
|
for (i = 0; i < global.nbthread; ++i) {
|
|
struct tasklet *task;
|
|
if (!(task = tasklet_new())) {
|
|
ha_alert("Failed to allocate the quic accept queue on thread %d.\n", i);
|
|
return 0;
|
|
}
|
|
|
|
tasklet_set_tid(task, i);
|
|
task->process = quic_accept_run;
|
|
quic_accept_queues[i].tasklet = task;
|
|
|
|
MT_LIST_INIT(&quic_accept_queues[i].listeners);
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
REGISTER_POST_CHECK(quic_alloc_accept_queues);
|
|
|
|
static int quic_deallocate_accept_queues(void)
|
|
{
|
|
int i;
|
|
|
|
if (quic_accept_queues) {
|
|
for (i = 0; i < global.nbthread; ++i)
|
|
tasklet_free(quic_accept_queues[i].tasklet);
|
|
free(quic_accept_queues);
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
REGISTER_POST_DEINIT(quic_deallocate_accept_queues);
|