/*
* include/haproxy/task.h
* Functions for task management.
*
* Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, version 2.1
* exclusively.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef _HAPROXY_TASK_H
#define _HAPROXY_TASK_H
#include <sys/time.h>
#include <import/eb32sctree.h>
#include <import/eb32tree.h>
#include <haproxy/api.h>
#include <haproxy/fd.h>
#include <haproxy/global.h>
#include <haproxy/intops.h>
#include <haproxy/list.h>
#include <haproxy/pool.h>
#include <haproxy/task-t.h>
#include <haproxy/thread.h>
#include <haproxy/ticks.h>
/* Principle of the wait queue.
*
* We want to be able to tell whether an expiration date is before or after the
* current time <now>. We KNOW that expiration dates are never too far apart,
* because they are measured in ticks (milliseconds). We also know that almost
* all dates will be in the future, and that a very small part of them will be
* in the past, they are the ones which have expired since last time we checked
* them. Using ticks, we know if a date is in the future or in the past, but we
* cannot use that to store sorted information because that reference changes
* all the time.
*
* We'll use the fact that the time wraps to sort timers. Timers above <now>
* are in the future, timers below <now> are in the past. Here, "above" and
* "below" are to be considered modulo 2^31.
*
* Timers are stored sorted in an ebtree. We use the new ability for ebtrees to
* lookup values starting from X to only expire tasks between <now> - 2^31 and
* <now>. If the end of the tree is reached while walking over it, we simply
* loop back to the beginning. That way, we have no problem keeping sorted
* wrapping timers in a tree, between (now - 24 days) and (now + 24 days). The
* keys in the tree always reflect their real position, none can be infinite.
* This reduces the number of checks to be performed.
*
* Another nice optimisation is to allow a timer to stay at an old place in the
* queue as long as it's not further than the real expiration date. That way,
* we use the tree as a place holder for a minorant of the real expiration
* date. Since we have a very low chance of hitting a timeout anyway, we can
* bounce the nodes to their right place when we scan the tree if we encounter
* a misplaced node once in a while. This even allows us not to remove the
* infinite timers from the wait queue.
*
* So, to summarize, we have :
* - node->key always defines current position in the wait queue
* - timer is the real expiration date (possibly infinite)
* - node->key is always before or equal to timer
*
* The run queue works similarly to the wait queue except that the current date
* is replaced by an insertion counter which can also wrap without any problem.
*/
/* The farthest we can look back in a timer tree */
#define TIMER_LOOK_BACK (1U << 31)
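/* Illustrative sketch only (not part of this header): a scan over a timer
* tree generally starts TIMER_LOOK_BACK ticks before <now_ms> so that
* recently expired entries are visited first, and wraps back to the start of
* the tree when the end is reached. Assuming <wq> points to such a tree,
* this would look like:
*
*   struct eb32_node *node = eb32_lookup_ge(wq, now_ms - TIMER_LOOK_BACK);
*   if (!node)
*       node = eb32_first(wq);  // wrapped past the end, restart from the beginning
*/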
/* tasklets are recognized with nice==-32768 */
#define TASK_IS_TASKLET(t) ((t)->nice == -32768)
/* a few exported variables */
extern unsigned int nb_tasks; /* total number of tasks */
extern volatile unsigned long global_tasks_mask; /* Mask of threads with tasks in the global runqueue */
extern unsigned int tasks_run_queue; /* run queue size */
extern unsigned int tasks_run_queue_cur;
extern unsigned int nb_tasks_cur;
extern unsigned int niced_tasks; /* number of niced tasks in the run queue */
extern struct pool_head *pool_head_task;
extern struct pool_head *pool_head_tasklet;
extern struct pool_head *pool_head_notification;
extern THREAD_LOCAL struct task_per_thread *sched; /* the current thread's scheduler context */
#ifdef USE_THREAD
extern struct eb_root timers; /* sorted timers tree, global */
extern struct eb_root rqueue; /* tree constituting the run queue */
#endif
extern struct task_per_thread task_per_thread[MAX_THREADS];
__decl_thread(extern HA_SPINLOCK_T rq_lock); /* spin lock related to run queue */
__decl_thread(extern HA_RWLOCK_T wq_lock); /* RW lock related to the wait queue */
void task_kill(struct task *t);
void __task_wakeup(struct task *t, struct eb_root *);
void __task_queue(struct task *task, struct eb_root *wq);
struct work_list *work_list_create(int nbthread,
struct task *(*fct)(struct task *, void *, unsigned short),
void *arg);
void work_list_destroy(struct work_list *work, int nbthread);
unsigned int run_tasks_from_lists(unsigned int budgets[]);
/*
* This does 3 things :
* - wake up all expired tasks
* - call all runnable tasks
* - return the date of next event in <next> or eternity.
*/
void process_runnable_tasks();
/*
* Extracts all expired timers from the timer queue and wakes up all
* associated tasks.
*/
void wake_expired_tasks();
/* Checks the next timer for the current thread by looking into its own timer
* list and the global one. It may return TICK_ETERNITY if no timer is present.
* Note that the next timer might very well be slightly in the past.
*/
int next_timer_expiry();
/*
* Deletes every task before running the master polling loop
*/
void mworker_cleantasks();
/* return 0 if task is in run queue, otherwise non-zero */
static inline int task_in_rq(struct task *t)
{
/* Check if leaf_p is NULL, in which case the task is not in the run queue, and if
* it's not 0x1, which would mean it's in the tasklet list.
*/
return t->rq.node.leaf_p != NULL;
}
/* return 0 if task is in wait queue, otherwise non-zero */
static inline int task_in_wq(struct task *t)
{
return t->wq.node.leaf_p != NULL;
}
/* returns true if the current thread has some work to do */
static inline int thread_has_tasks(void)
{
return (!!(global_tasks_mask & tid_bit) |
(sched->rqueue_size > 0) |
!!sched->tl_class_mask |
!MT_LIST_ISEMPTY(&sched->shared_tasklet_list));
}
/* puts the task <t> in run queue with reason flags <f>, and returns <t> */
/* This will put the task in the local runqueue if the task is only runnable
* by the current thread, and in the global run queue otherwise.
*/
static inline void task_wakeup(struct task *t, unsigned int f)
{
unsigned short state;
#ifdef USE_THREAD
struct eb_root *root;
if (t->thread_mask == tid_bit || global.nbthread == 1)
root = &sched->rqueue;
else
root = &rqueue;
#else
struct eb_root *root = &sched->rqueue;
#endif
state = _HA_ATOMIC_OR(&t->state, f);
while (!(state & (TASK_RUNNING | TASK_QUEUED))) {
if (_HA_ATOMIC_CAS(&t->state, &state, state | TASK_QUEUED)) {
__task_wakeup(t, root);
break;
}
}
}
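/* Usage sketch (illustrative): a task that was previously created and
* initialized is typically woken up with one of the TASK_WOKEN_* reason
* flags, e.g. after receiving a message:
*
*   task_wakeup(t, TASK_WOKEN_MSG);
*
* The reason flags are ORed into t->state and the task's ->process()
* callback later sees them in its <state> argument.
*/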
/*
* Unlink the task from the wait queue, and possibly update the last_timer
* pointer. A pointer to the task itself is returned. The task *must* already
* be in the wait queue before calling this function. If unsure, use the safer
* task_unlink_wq() function.
*/
static inline struct task *__task_unlink_wq(struct task *t)
{
eb32_delete(&t->wq);
return t;
}
/* remove a task from its wait queue. It may either be the local wait queue if
* the task is bound to a single thread or the global queue. If the task uses a
* shared wait queue, the global wait queue lock is used.
*/
static inline struct task *task_unlink_wq(struct task *t)
{
unsigned long locked;
if (likely(task_in_wq(t))) {
locked = t->state & TASK_SHARED_WQ;
BUG_ON(!locked && t->thread_mask != tid_bit);
if (locked)
HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
__task_unlink_wq(t);
if (locked)
HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
}
return t;
}
/* Place <task> into the wait queue, where it may already be. If the expiration
* timer is infinite, do nothing and rely on wake_expired_tasks() to clean up.
* If the task uses a shared wait queue, it's queued into the global wait queue,
* protected by the global wq_lock, otherwise it necessarily belongs to the
* current thread's wait queue and is queued without locking.
*/
static inline void task_queue(struct task *task)
{
/* If we already have a place in the wait queue no later than the
* timeout we're trying to set, we'll stay there, because it is very
* unlikely that we will reach the timeout anyway. If the timeout
* has been disabled, it's useless to leave the queue as well. We'll
* rely on wake_expired_tasks() to catch the node and move it to the
* proper place should it ever happen. Finally we only add the task
* to the queue if it was not there or if it was further than what
* we want.
*/
if (!tick_isset(task->expire))
return;
#ifdef USE_THREAD
if (task->state & TASK_SHARED_WQ) {
HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key))
__task_queue(task, &timers);
HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
} else
#endif
{
BUG_ON(task->thread_mask != tid_bit); // should have TASK_SHARED_WQ
if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key))
__task_queue(task, &sched->timers);
}
}
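/* Usage sketch (illustrative): callers normally refresh <expire> from the
* current date before queuing, e.g. to arm a 5 second timeout on an already
* initialized task <t>:
*
*   t->expire = tick_add(now_ms, 5000);
*   task_queue(t);
*
* If <expire> is TICK_ETERNITY, the call is a no-op as described above.
*/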
/* change the thread affinity of a task to <thread_mask>.
* This may only be done from within the running task itself or during its
* initialization. It will unqueue and requeue the task from the wait queue
* if it was in it. This is safe against a concurrent task_queue() call because
* task_queue() itself will unlink again if needed after taking into account
* the new thread_mask.
*/
static inline void task_set_affinity(struct task *t, unsigned long thread_mask)
{
if (unlikely(task_in_wq(t))) {
task_unlink_wq(t);
t->thread_mask = thread_mask;
task_queue(t);
}
else
t->thread_mask = thread_mask;
}
/*
* Unlink the task from the run queue. The tasks_run_queue size and number of
* niced tasks are updated too. A pointer to the task itself is returned. The
* task *must* already be in the run queue before calling this function. If
* unsure, use the safer task_unlink_rq() function. Note that the pointer to the
* next run queue entry is neither checked nor updated.
*/
static inline struct task *__task_unlink_rq(struct task *t)
{
_HA_ATOMIC_SUB(&tasks_run_queue, 1);
#ifdef USE_THREAD
if (t->state & TASK_GLOBAL)
_HA_ATOMIC_AND(&t->state, ~TASK_GLOBAL);
else
#endif
sched->rqueue_size--;
eb32sc_delete(&t->rq);
if (likely(t->nice))
_HA_ATOMIC_SUB(&niced_tasks, 1);
return t;
}
/* This function unlinks task <t> from the run queue if it is in it. It also
* takes care of updating the next run queue task if it was this task.
*/
static inline struct task *task_unlink_rq(struct task *t)
{
int is_global = t->state & TASK_GLOBAL;
if (is_global)
HA_SPIN_LOCK(TASK_RQ_LOCK, &rq_lock);
if (likely(task_in_rq(t)))
__task_unlink_rq(t);
if (is_global)
HA_SPIN_UNLOCK(TASK_RQ_LOCK, &rq_lock);
return t;
}
/* schedules tasklet <tl> to run onto thread <thr> or the current thread if
* <thr> is negative. Note that it is illegal to wakeup a foreign tasklet if
* its tid is negative and it is illegal to self-assign a tasklet that was
* at least once scheduled on a specific thread.
*/
static inline void tasklet_wakeup_on(struct tasklet *tl, int thr)
{
unsigned short state = tl->state;
do {
/* do nothing if someone else already added it */
if (state & TASK_IN_LIST)
return;
} while (!_HA_ATOMIC_CAS(&tl->state, &state, state | TASK_IN_LIST));
/* at this point we're the first ones to add this task to the list */
if (likely(thr < 0)) {
/* this tasklet runs on the caller thread */
if (tl->state & TASK_SELF_WAKING) {
LIST_ADDQ(&sched->tasklets[TL_BULK], &tl->list);
sched->tl_class_mask |= 1 << TL_BULK;
}
else if ((struct task *)tl == sched->current) {
_HA_ATOMIC_OR(&tl->state, TASK_SELF_WAKING);
LIST_ADDQ(&sched->tasklets[TL_BULK], &tl->list);
sched->tl_class_mask |= 1 << TL_BULK;
}
else if (sched->current_queue < 0) {
LIST_ADDQ(&sched->tasklets[TL_URGENT], &tl->list);
sched->tl_class_mask |= 1 << TL_URGENT;
}
else {
LIST_ADDQ(&sched->tasklets[sched->current_queue], &tl->list);
sched->tl_class_mask |= 1 << sched->current_queue;
}
} else {
/* this tasklet runs on a specific thread. */
MT_LIST_ADDQ(&task_per_thread[thr].shared_tasklet_list, (struct mt_list *)&tl->list);
if (sleeping_thread_mask & (1UL << thr)) {
_HA_ATOMIC_AND(&sleeping_thread_mask, ~(1UL << thr));
wake_thread(thr);
}
}
_HA_ATOMIC_ADD(&tasks_run_queue, 1);
}
/* schedules tasklet <tl> to run onto the thread designated by tl->tid, which
* is either its owner thread if >= 0 or the current thread if < 0.
*/
static inline void tasklet_wakeup(struct tasklet *tl)
{
tasklet_wakeup_on(tl, tl->tid);
}
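/* Usage sketch (illustrative, my_io_cb and my_ctx are hypothetical names): a
* tasklet is usually allocated once, given a callback and a context, then
* woken up each time some work becomes available:
*
*   struct tasklet *tl = tasklet_new();
*   if (tl) {
*       tl->process = my_io_cb;   // struct task *(*)(struct task *, void *, unsigned short)
*       tl->context = my_ctx;
*       tasklet_wakeup(tl);       // runs on the calling thread unless tl->tid was set
*   }
*/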
/* Try to remove a tasklet from the list. This call is inherently racy and may
* only be performed on the thread that was supposed to dequeue this tasklet.
* This way it is safe to call MT_LIST_DEL without first removing the
* TASK_IN_LIST bit, which must absolutely be removed afterwards in case
* another thread would want to wake this tasklet up in parallel.
*/
static inline void tasklet_remove_from_tasklet_list(struct tasklet *t)
{
if (MT_LIST_DEL((struct mt_list *)&t->list)) {
_HA_ATOMIC_AND(&t->state, ~TASK_IN_LIST);
_HA_ATOMIC_SUB(&tasks_run_queue, 1);
}
}
/*
* Initialize a new task. The bare minimum is performed (queue pointers and
* state). The task is returned. This function should not be used outside of
* task_new(). If the thread mask contains more than one thread, TASK_SHARED_WQ
* is set.
*/
static inline struct task *task_init(struct task *t, unsigned long thread_mask)
{
t->wq.node.leaf_p = NULL;
t->rq.node.leaf_p = NULL;
t->state = TASK_SLEEPING;
t->thread_mask = thread_mask;
if (atleast2(thread_mask))
t->state |= TASK_SHARED_WQ;
t->nice = 0;
t->calls = 0;
t->call_date = 0;
t->cpu_time = 0;
t->lat_time = 0;
t->expire = TICK_ETERNITY;
return t;
}
/* Initialize a new tasklet. It's identified as a tasklet by ->nice=-32768. It
* is expected to run on the calling thread by default, it's up to the caller
* to change ->tid if it wants to own it.
*/
static inline void tasklet_init(struct tasklet *t)
{
t->nice = -32768;
t->calls = 0;
t->state = 0;
t->process = NULL;
t->tid = -1;
LIST_INIT(&t->list);
}
/* Allocate and initialize a new tasklet, local to the thread by default. The
* caller may assign its tid if it wants to own the tasklet.
*/
static inline struct tasklet *tasklet_new(void)
{
struct tasklet *t = pool_alloc(pool_head_tasklet);
if (t) {
tasklet_init(t);
}
return t;
}
/*
* Allocate and initialise a new task. The new task is returned, or NULL in
* case of lack of memory. The task count is incremented. Tasks should only
* be allocated this way, and must be freed using task_destroy().
*/
static inline struct task *task_new(unsigned long thread_mask)
{
struct task *t = pool_alloc(pool_head_task);
if (t) {
_HA_ATOMIC_ADD(&nb_tasks, 1);
task_init(t, thread_mask);
}
return t;
}
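/* Usage sketch (illustrative, my_handler and my_ctx are hypothetical names):
* a periodic task bound to the current thread could be set up like this:
*
*   struct task *t = task_new(tid_bit);
*   if (t) {
*       t->process = my_handler;              // scheduler callback
*       t->context = my_ctx;
*       t->expire  = tick_add(now_ms, 1000);  // first wakeup in about one second
*       task_queue(t);
*   }
*/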
/*
* Free a task. Its context must have been freed since it will be lost. The
* task count is decremented. If it is the current task, this one is reset.
*/
static inline void __task_free(struct task *t)
{
if (t == sched->current) {
sched->current = NULL;
__ha_barrier_store();
}
BUG_ON(task_in_wq(t) || task_in_rq(t));
pool_free(pool_head_task, t);
if (unlikely(stopping))
pool_flush(pool_head_task);
_HA_ATOMIC_SUB(&nb_tasks, 1);
}
/* Destroys a task : it's unlinked from the wait queues and is freed if it's
* the current task or not queued, otherwise it's marked to be freed by the
* scheduler. It does nothing if <t> is NULL.
*/
static inline void task_destroy(struct task *t)
{
if (!t)
return;
task_unlink_wq(t);
/* We don't have to explicitly remove from the run queue.
* If we are in the runqueue, the test below will set t->process
* to NULL, and the task will be freed when its turn comes
* to run.
*/
/* There's no need to protect t->state with a lock, as the task
* has to run on the current thread.
*/
if (t == sched->current || !(t->state & (TASK_QUEUED | TASK_RUNNING)))
__task_free(t);
else
t->process = NULL;
}
/* Should only be called by the thread responsible for the tasklet */
static inline void tasklet_free(struct tasklet *tl)
{
if (MT_LIST_DEL((struct mt_list *)&tl->list))
_HA_ATOMIC_SUB(&tasks_run_queue, 1);
pool_free(pool_head_tasklet, tl);
if (unlikely(stopping))
pool_flush(pool_head_tasklet);
}
static inline void tasklet_set_tid(struct tasklet *tl, int tid)
{
tl->tid = tid;
}
/* Ensure <task> will be woken up at most at <when>. If the task is already in
* the run queue (but not running), nothing is done. It may be used that way
* with a delay : task_schedule(task, tick_add(now_ms, delay));
*/
static inline void task_schedule(struct task *task, int when)
{
/* TODO: mthread, check if there is no risk with this test */
if (task_in_rq(task))
return;
#ifdef USE_THREAD
if (task->state & TASK_SHARED_WQ) {
/* FIXME: is it really needed to lock the WQ during the check ? */
HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
if (task_in_wq(task))
when = tick_first(when, task->expire);
task->expire = when;
if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key))
__task_queue(task, &timers);
HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
} else
#endif
{
BUG_ON((task->thread_mask & tid_bit) == 0); // should have TASK_SHARED_WQ
if (task_in_wq(task))
when = tick_first(when, task->expire);
task->expire = when;
if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key))
__task_queue(task, &sched->timers);
}
}
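
/* Usage sketch (illustrative only, not part of the API): assuming the code
 * above is the tail of the task_schedule()-style helper taking (<task>,
 * <when>), a caller on another thread may shorten the timeout of a shared
 * task as below; the TASK_SHARED_WQ branch then protects the shared wait
 * queue with its lock, while a purely local task goes through the lockless
 * per-thread branch:
 *
 *	// <t> is a struct task * flagged with TASK_SHARED_WQ (hypothetical)
 *	task_schedule(t, tick_add(now_ms, MS_TO_TICKS(100)));
 *
 * Note that an expiration date already present in the queue is never pushed
 * back, only advanced, because <when> is merged with tick_first().
 */
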
/* This function registers a new signal. <purge> is the list of signals
 * owned by the current Lua execution context, which also provides the
 * associated task <wakeup>. <event> is a list head attached to another
 * task that must wake the Lua task when an event occurs. This is useful
 * with external events like TCP I/O or sleep functions. The function
 * allocates memory for the signal and returns NULL on allocation failure.
 */
static inline struct notification *notification_new(struct list *purge, struct list *event, struct task *wakeup)
{
struct notification *com = pool_alloc(pool_head_notification);
if (!com)
return NULL;
LIST_ADDQ(purge, &com->purge_me);
LIST_ADDQ(event, &com->wake_me);
HA_SPIN_INIT(&com->lock);
com->task = wakeup;
return com;
}
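
/* Usage sketch (illustrative; <my_ctx>, <emitter> and their fields are
 * hypothetical names, not HAProxy structures): the waiting side allocates a
 * notification bound to its own task, chained both on its private purge list
 * and on the event emitter's wake list:
 *
 *	struct notification *com;
 *
 *	com = notification_new(&my_ctx->purge_list, &emitter->wake_list, my_ctx->task);
 *	if (!com)
 *		return -1; // allocation failure
 */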

/* This function purges all the pending signals when the Lua execution
 * is finished. It prevents a coprocess from trying to wake a deleted
 * task. Signals whose task was already detached have their memory
 * released here; the others are only detached and will be freed by
 * the waking side. The purge list is not locked because it is owned
 * by only one process; before browsing this list, the caller must
 * ensure it is the only one browsing it.
 */
static inline void notification_purge(struct list *purge)
{
struct notification *com, *back;
/* Delete all pending communication signals. */
list_for_each_entry_safe(com, back, purge, purge_me) {
HA_SPIN_LOCK(NOTIF_LOCK, &com->lock);
LIST_DEL(&com->purge_me);
if (!com->task) {
HA_SPIN_UNLOCK(NOTIF_LOCK, &com->lock);
pool_free(pool_head_notification, com);
continue;
}
com->task = NULL;
HA_SPIN_UNLOCK(NOTIF_LOCK, &com->lock);
}
}

/* In some cases, the disconnected notifications must be cleared.
 * This function just releases the memory blocks of notifications
 * whose task was already detached. The purge list is not locked
 * because it is owned by only one process; before browsing this
 * list, the caller must ensure it is the only one browsing it.
 * <com> itself is not locked because once com->task is NULL, the
 * notification is no longer used.
 */
static inline void notification_gc(struct list *purge)
{
struct notification *com, *back;
/* Delete all pending communication signals. */
list_for_each_entry_safe (com, back, purge, purge_me) {
if (com->task)
continue;
LIST_DEL(&com->purge_me);
pool_free(pool_head_notification, com);
}
}

/* This function sends signals. It wakes up all the tasks attached
 * to a list head and removes each signal from that list. Signals
 * already abandoned by their owner are freed here; the others are
 * left on the owner's purge list to be released later. The wake
 * list is not locked because it is owned by only one process;
 * before browsing this list, the caller must ensure it is the only
 * one browsing it.
 */
static inline void notification_wake(struct list *wake)
{
struct notification *com, *back;
/* Wake task and delete all pending communication signals. */
list_for_each_entry_safe(com, back, wake, wake_me) {
HA_SPIN_LOCK(NOTIF_LOCK, &com->lock);
LIST_DEL(&com->wake_me);
if (!com->task) {
HA_SPIN_UNLOCK(NOTIF_LOCK, &com->lock);
pool_free(pool_head_notification, com);
continue;
}
task_wakeup(com->task, TASK_WOKEN_MSG);
com->task = NULL;
HA_SPIN_UNLOCK(NOTIF_LOCK, &com->lock);
}
}

/* This function returns true if some notifications are pending.
 */
static inline int notification_registered(struct list *wake)
{
return !LIST_ISEMPTY(wake);
}
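
/* Usage sketch (illustrative; <emitter> and <my_ctx> are hypothetical): the
 * event side wakes every subscriber once the event happens, possibly checking
 * first whether anyone subscribed, while the waiting side purges its own list
 * when its execution context ends so that a late emitter only finds
 * com->task == NULL:
 *
 *	// emitter side, when the event (I/O completion, timer, ...) fires
 *	if (notification_registered(&emitter->wake_list))
 *		notification_wake(&emitter->wake_list);
 *
 *	// waiting side, when tearing down its context
 *	notification_purge(&my_ctx->purge_list);
 */
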
/* adds list item <item> to work list <work> and wakes up the associated task */
static inline void work_list_add(struct work_list *work, struct mt_list *item)
{
MT_LIST_TRY_ADDQ(&work->head, item);
task_wakeup(work->task, TASK_WOKEN_OTHER);
}
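
/* Usage sketch (illustrative; <wl> and <it> are hypothetical): a producer
 * thread queues an item on the work list owned by another thread and wakes
 * the consumer task bound to it. MT_LIST_TRY_ADDQ() only links the item if
 * it is not already queued, so submitting the same item twice is harmless:
 *
 *	work_list_add(&wl[target_tid], &it->list);
 */
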
#endif /* _HAPROXY_TASK_H */
/*
* Local variables:
* c-indent-level: 8
* c-basic-offset: 8
* End:
*/