haproxy/include/types/fd.h

196 lines
7.6 KiB
C
Raw Normal View History

/*
* include/types/fd.h
MAJOR: polling: rework the whole polling system This commit heavily changes the polling system in order to definitely fix the frequent breakage of SSL which needs to remember the last EAGAIN before deciding whether to poll or not. Now we have a state per direction for each FD, as opposed to a previous and current state previously. An FD can have up to 8 different states for each direction, each of which being the result of a 3-bit combination. These 3 bits indicate a wish to access the FD, the readiness of the FD and the subscription of the FD to the polling system. This means that it will now be possible to remember the state of a file descriptor across disable/enable sequences that generally happen during forwarding, where enabling reading on a previously disabled FD would result in forgetting the EAGAIN flag it met last time. Several new state manipulation functions have been introduced or adapted : - fd_want_{recv,send} : enable receiving/sending on the FD regardless of its state (sets the ACTIVE flag) ; - fd_stop_{recv,send} : stop receiving/sending on the FD regardless of its state (clears the ACTIVE flag) ; - fd_cant_{recv,send} : report a failure to receive/send on the FD corresponding to EAGAIN (clears the READY flag) ; - fd_may_{recv,send} : report the ability to receive/send on the FD as reported by poll() (sets the READY flag) ; Some functions are used to report the current FD status : - fd_{recv,send}_active - fd_{recv,send}_ready - fd_{recv,send}_polled Some functions were removed : - fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai() The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers knows it can try to access the file descriptor to get this information. In order to simplify the conditions to add/remove cache entries, a new function fd_alloc_or_release_cache_entry() was created to be used from pollers while scanning for updates. The following pollers have been updated : ev_select() : done, built, tested on Linux 3.10 ev_poll() : done, built, tested on Linux 3.10 ev_epoll() : done, built, tested on Linux 3.10 & 3.13 ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
* File descriptors states - check src/fd.c for explanations.
*
MAJOR: polling: rework the whole polling system This commit heavily changes the polling system in order to definitely fix the frequent breakage of SSL which needs to remember the last EAGAIN before deciding whether to poll or not. Now we have a state per direction for each FD, as opposed to a previous and current state previously. An FD can have up to 8 different states for each direction, each of which being the result of a 3-bit combination. These 3 bits indicate a wish to access the FD, the readiness of the FD and the subscription of the FD to the polling system. This means that it will now be possible to remember the state of a file descriptor across disable/enable sequences that generally happen during forwarding, where enabling reading on a previously disabled FD would result in forgetting the EAGAIN flag it met last time. Several new state manipulation functions have been introduced or adapted : - fd_want_{recv,send} : enable receiving/sending on the FD regardless of its state (sets the ACTIVE flag) ; - fd_stop_{recv,send} : stop receiving/sending on the FD regardless of its state (clears the ACTIVE flag) ; - fd_cant_{recv,send} : report a failure to receive/send on the FD corresponding to EAGAIN (clears the READY flag) ; - fd_may_{recv,send} : report the ability to receive/send on the FD as reported by poll() (sets the READY flag) ; Some functions are used to report the current FD status : - fd_{recv,send}_active - fd_{recv,send}_ready - fd_{recv,send}_polled Some functions were removed : - fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai() The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers knows it can try to access the file descriptor to get this information. In order to simplify the conditions to add/remove cache entries, a new function fd_alloc_or_release_cache_entry() was created to be used from pollers while scanning for updates. The following pollers have been updated : ev_select() : done, built, tested on Linux 3.10 ev_poll() : done, built, tested on Linux 3.10 ev_epoll() : done, built, tested on Linux 3.10 & 3.13 ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
* Copyright (C) 2000-2014 Willy Tarreau - w@1wt.eu
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, version 2.1
* exclusively.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef _TYPES_FD_H
#define _TYPES_FD_H
#include <common/config.h>
#include <common/hathreads.h>
#include <common/ist.h>
#include <types/port_range.h>
/* Direction for each FD event update */
enum {
DIR_RD=0,
DIR_WR=1,
};
/* Polling status flags returned in fdtab[].ev :
* FD_POLL_IN remains set as long as some data is pending for read.
* FD_POLL_OUT remains set as long as the fd accepts to write data.
* FD_POLL_ERR and FD_POLL_ERR remain set forever (until processed).
*/
#define FD_POLL_IN 0x01
#define FD_POLL_PRI 0x02
#define FD_POLL_OUT 0x04
#define FD_POLL_ERR 0x08
#define FD_POLL_HUP 0x10
#define FD_POLL_DATA (FD_POLL_IN | FD_POLL_OUT)
#define FD_POLL_STICKY (FD_POLL_ERR | FD_POLL_HUP)
/* FD_EV_* are the values used in fdtab[].state to define the polling states in
* each direction. Most of them are manipulated using test-and-set operations
* which require the bit position in the mask, which is given in the _BIT
* variant.
*/
MAJOR: polling: rework the whole polling system This commit heavily changes the polling system in order to definitely fix the frequent breakage of SSL which needs to remember the last EAGAIN before deciding whether to poll or not. Now we have a state per direction for each FD, as opposed to a previous and current state previously. An FD can have up to 8 different states for each direction, each of which being the result of a 3-bit combination. These 3 bits indicate a wish to access the FD, the readiness of the FD and the subscription of the FD to the polling system. This means that it will now be possible to remember the state of a file descriptor across disable/enable sequences that generally happen during forwarding, where enabling reading on a previously disabled FD would result in forgetting the EAGAIN flag it met last time. Several new state manipulation functions have been introduced or adapted : - fd_want_{recv,send} : enable receiving/sending on the FD regardless of its state (sets the ACTIVE flag) ; - fd_stop_{recv,send} : stop receiving/sending on the FD regardless of its state (clears the ACTIVE flag) ; - fd_cant_{recv,send} : report a failure to receive/send on the FD corresponding to EAGAIN (clears the READY flag) ; - fd_may_{recv,send} : report the ability to receive/send on the FD as reported by poll() (sets the READY flag) ; Some functions are used to report the current FD status : - fd_{recv,send}_active - fd_{recv,send}_ready - fd_{recv,send}_polled Some functions were removed : - fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai() The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers knows it can try to access the file descriptor to get this information. In order to simplify the conditions to add/remove cache entries, a new function fd_alloc_or_release_cache_entry() was created to be used from pollers while scanning for updates. The following pollers have been updated : ev_select() : done, built, tested on Linux 3.10 ev_poll() : done, built, tested on Linux 3.10 ev_epoll() : done, built, tested on Linux 3.10 & 3.13 ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
BUG/MEDIUM: fd/threads: fix excessive CPU usage on multi-thread accept While experimenting with potentially improved fairness and latency using ticket locks on a Ryzen 16-thread/8-core, a very strange situation happened a lot for some levels of traffic. Around 300k connections per second, no more connections would be accepted on the multi-threaded listener but all others would continue to work fine. All attempts to trace showed that the threads were all in the trylock in the fd cache, or in the spinlock of fd_update_events(), or in the one of fd_may_recv(). But as indicated this was not a deadlock since the process continues to work fine. After quite some investigation it appeared that the issue is caused by a lack of fairness between the fdcache's trylock and these functions' spin locks above. In fact, regardless of the success or failure of the fdcache's attempt at grabbing the lock, the poller was calling fd_update_events() which locks the FD once for something that can be done with a CAS, and then calls fd_may_recv() with another lock for something that most often didn't change. The high contention on these spinlocks leaves no chance to any other thread to grab the lock using trylock(), and once this happens, there is no thread left to process incoming connection events nor to stop polling on the FD, leaving all threads at 100% CPU but partially operational. This patch addresses the issue by using bit-test-and-set instead of the OR in fd_may_recv() / fd_may_send() so that nothing is done if the FD was already configured as expected. It does the same in fd_update_events() using a CAS to check if the FD's events need to be changed at all or not. With this patch applied, it became impossible to reproduce the issue, and now there's no way to saturate all 16 CPUs with the load used for testing, as no more than 1350-1400 were noticed at 300+kcps vs 1600. Ideally this patch should go further and try to remove the remaining incarnations of the fdlock as this seems possible, but it's difficult enough to be done in a distinct patch that will not have to be backported. It is possible that workloads involving a high connection rate may slightly benefit from this patch and observe a slightly lower CPU usage even when the service doesn't misbehave. This patch must be backported to 2.0 and 1.9.
2019-07-08 17:09:03 -04:00
/* bits positions for a few flags */
#define FD_EV_ACTIVE_R_BIT 0
#define FD_EV_READY_R_BIT 1
#define FD_EV_SHUT_R_BIT 2
/* unused: 3 */
#define FD_EV_ACTIVE_W_BIT 4
#define FD_EV_READY_W_BIT 5
#define FD_EV_SHUT_W_BIT 6
#define FD_EV_ERR_RW_BIT 7
BUG/MEDIUM: fd/threads: fix excessive CPU usage on multi-thread accept While experimenting with potentially improved fairness and latency using ticket locks on a Ryzen 16-thread/8-core, a very strange situation happened a lot for some levels of traffic. Around 300k connections per second, no more connections would be accepted on the multi-threaded listener but all others would continue to work fine. All attempts to trace showed that the threads were all in the trylock in the fd cache, or in the spinlock of fd_update_events(), or in the one of fd_may_recv(). But as indicated this was not a deadlock since the process continues to work fine. After quite some investigation it appeared that the issue is caused by a lack of fairness between the fdcache's trylock and these functions' spin locks above. In fact, regardless of the success or failure of the fdcache's attempt at grabbing the lock, the poller was calling fd_update_events() which locks the FD once for something that can be done with a CAS, and then calls fd_may_recv() with another lock for something that most often didn't change. The high contention on these spinlocks leaves no chance to any other thread to grab the lock using trylock(), and once this happens, there is no thread left to process incoming connection events nor to stop polling on the FD, leaving all threads at 100% CPU but partially operational. This patch addresses the issue by using bit-test-and-set instead of the OR in fd_may_recv() / fd_may_send() so that nothing is done if the FD was already configured as expected. It does the same in fd_update_events() using a CAS to check if the FD's events need to be changed at all or not. With this patch applied, it became impossible to reproduce the issue, and now there's no way to saturate all 16 CPUs with the load used for testing, as no more than 1350-1400 were noticed at 300+kcps vs 1600. Ideally this patch should go further and try to remove the remaining incarnations of the fdlock as this seems possible, but it's difficult enough to be done in a distinct patch that will not have to be backported. It is possible that workloads involving a high connection rate may slightly benefit from this patch and observe a slightly lower CPU usage even when the service doesn't misbehave. This patch must be backported to 2.0 and 1.9.
2019-07-08 17:09:03 -04:00
/* and flag values */
#define FD_EV_ACTIVE_R (1U << FD_EV_ACTIVE_R_BIT)
#define FD_EV_ACTIVE_W (1U << FD_EV_ACTIVE_W_BIT)
#define FD_EV_ACTIVE_RW (FD_EV_ACTIVE_R | FD_EV_ACTIVE_W)
#define FD_EV_READY_R (1U << FD_EV_READY_R_BIT)
#define FD_EV_READY_W (1U << FD_EV_READY_W_BIT)
MAJOR: polling: rework the whole polling system This commit heavily changes the polling system in order to definitely fix the frequent breakage of SSL which needs to remember the last EAGAIN before deciding whether to poll or not. Now we have a state per direction for each FD, as opposed to a previous and current state previously. An FD can have up to 8 different states for each direction, each of which being the result of a 3-bit combination. These 3 bits indicate a wish to access the FD, the readiness of the FD and the subscription of the FD to the polling system. This means that it will now be possible to remember the state of a file descriptor across disable/enable sequences that generally happen during forwarding, where enabling reading on a previously disabled FD would result in forgetting the EAGAIN flag it met last time. Several new state manipulation functions have been introduced or adapted : - fd_want_{recv,send} : enable receiving/sending on the FD regardless of its state (sets the ACTIVE flag) ; - fd_stop_{recv,send} : stop receiving/sending on the FD regardless of its state (clears the ACTIVE flag) ; - fd_cant_{recv,send} : report a failure to receive/send on the FD corresponding to EAGAIN (clears the READY flag) ; - fd_may_{recv,send} : report the ability to receive/send on the FD as reported by poll() (sets the READY flag) ; Some functions are used to report the current FD status : - fd_{recv,send}_active - fd_{recv,send}_ready - fd_{recv,send}_polled Some functions were removed : - fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai() The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers knows it can try to access the file descriptor to get this information. In order to simplify the conditions to add/remove cache entries, a new function fd_alloc_or_release_cache_entry() was created to be used from pollers while scanning for updates. The following pollers have been updated : ev_select() : done, built, tested on Linux 3.10 ev_poll() : done, built, tested on Linux 3.10 ev_epoll() : done, built, tested on Linux 3.10 & 3.13 ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
#define FD_EV_READY_RW (FD_EV_READY_R | FD_EV_READY_W)
/* note that when FD_EV_SHUT is set, ACTIVE and READY are cleared */
#define FD_EV_SHUT_R (1U << FD_EV_SHUT_R_BIT)
#define FD_EV_SHUT_W (1U << FD_EV_SHUT_W_BIT)
#define FD_EV_SHUT_RW (FD_EV_SHUT_R | FD_EV_SHUT_W)
/* note that when FD_EV_ERR is set, SHUT is also set. Also, ERR is for both
* directions at once (write error, socket dead, etc).
*/
#define FD_EV_ERR_RW (1U << FD_EV_ERR_RW_BIT)
/* This is the value used to mark a file descriptor as dead. This value is
* negative, this is important so that tests on fd < 0 properly match. It
* also has the nice property of being highly negative but neither overflowing
* nor changing sign on 32-bit machines when multiplied by sizeof(fdtab).
* This ensures that any unexpected dereference of such an uninitialized
* file descriptor will lead to so large a dereference that it will crash
* the process at the exact location of the bug with a clean stack trace
* instead of causing silent manipulation of other FDs. And it's readable
* when found in a dump.
*/
#define DEAD_FD_MAGIC 0xFDDEADFD
/* fdlist_entry: entry used by the fd cache.
* >= 0 means we're in the cache and gives the FD of the next in the cache,
* -1 means we're in the cache and the last element,
* -2 means the entry is locked,
* <= -3 means not in the cache, and next element is -4-fd
*
* It must remain 8-aligned so that aligned CAS operations may be done on both
* entries at once.
*/
struct fdlist_entry {
int next;
int prev;
} __attribute__ ((aligned(8)));
/* head of the fd cache */
struct fdlist {
int first;
int last;
} __attribute__ ((aligned(8)));
/* info about one given fd */
struct fdtab {
unsigned long running_mask; /* mask of thread IDs currently using the fd */
unsigned long thread_mask; /* mask of thread IDs authorized to process the fd */
unsigned long update_mask; /* mask of thread IDs having an update for fd */
struct fdlist_entry update; /* Entry in the global update list */
void (*iocb)(int fd); /* I/O handler */
void *owner; /* the connection or listener associated with this fd, NULL if closed */
unsigned char state; /* FD state for read and write directions (FD_EV_*) */
unsigned char ev; /* event seen in return of poll() : FD_POLL_* */
unsigned char linger_risk:1; /* 1 if we must kill lingering before closing */
unsigned char cloned:1; /* 1 if a cloned socket, requires EPOLL_CTL_DEL on close */
unsigned char initialized:1; /* 1 if init phase was done on this fd (e.g. set non-blocking) */
}
#ifdef USE_THREAD
/* only align on cache lines when using threads; 32-bit small archs
* can put everything in 32-bytes when threads are disabled.
*/
__attribute__((aligned(64)))
#endif
;
/* less often used information */
struct fdinfo {
struct port_range *port_range; /* optional port range to bind to */
int local_port; /* optional local port */
};
/*
* Poller descriptors.
* - <name> is initialized by the poller's register() function, and should not
* be allocated, just linked to.
* - <pref> is initialized by the poller's register() function. It is set to 0
* by default, meaning the poller is disabled. init() should set it to 0 in
* case of failure. term() must set it to 0. A generic unoptimized select()
* poller should set it to 100.
* - <private> is initialized by the poller's init() function, and cleaned by
* the term() function.
* - clo() should be used to do indicate the poller that fd will be closed.
* - poll() calls the poller, expiring at <exp>, or immediately if <wake> is set
* - flags indicate what the poller supports (HAP_POLL_F_*)
*/
#define HAP_POLL_F_RDHUP 0x00000001 /* the poller notifies of HUP with reads */
#define HAP_POLL_F_ERRHUP 0x00000002 /* the poller reports ERR and HUP */
struct poller {
void *private; /* any private data for the poller */
void (*clo)(const int fd); /* mark <fd> as closed */
void (*poll)(struct poller *p, int exp, int wake); /* the poller itself */
int (*init)(struct poller *p); /* poller initialization */
void (*term)(struct poller *p); /* termination of this poller */
int (*test)(struct poller *p); /* pre-init check of the poller */
int (*fork)(struct poller *p); /* post-fork re-opening */
const char *name; /* poller name */
unsigned int flags; /* HAP_POLL_F_* */
int pref; /* try pollers with higher preference first */
};
extern struct poller cur_poller; /* the current poller */
extern int nbpollers;
#define MAX_POLLERS 10
extern struct poller pollers[MAX_POLLERS]; /* all registered pollers */
extern struct fdtab *fdtab; /* array of all the file descriptors */
extern struct fdinfo *fdinfo; /* less-often used infos for file descriptors */
extern int totalconn; /* total # of terminated sessions */
extern int actconn; /* # of active sessions */
#endif /* _TYPES_FD_H */
/*
* Local variables:
* c-indent-level: 8
* c-basic-offset: 8
* End:
*/