haproxy/include/types/task.h
Willy Tarreau dd0e89a084 BUG/MAJOR: task: add a new TASK_SHARED_WQ flag to fix foreing requeuing
Since 1.9 with commit b20aa9eef3 ("MAJOR: tasks: create per-thread wait
queues") a task bound to a single thread will not use locks when being
queued or dequeued because the wait queue is assumed to be the owner
thread's.

But there exists a rare situation where this is not true: the health
check tasks may be running on one thread waiting for a response, and
may in parallel be requeued by another thread calling health_adjust()
after a detecting a response error in traffic when "observe l7" is set,
and "fastinter" is lower than "inter", requiring to shorten the running
check's timeout. In this case, the task being requeued was present in
another thread's wait queue, thus opening a race during task_unlink_wq(),
and gets requeued into the calling thread's wait queue instead of the
running one's, opening a second race here.

This patch aims at protecting against the risk of calling task_unlink_wq()
from one thread while the task is queued on another thread, hence unlocked,
by introducing a new TASK_SHARED_WQ flag.

This new flag indicates that a task's position in the wait queue may be
adjusted by other threads than then one currently executing it. This means
that such WQ manipulations must be performed under a lock. There are two
types of such tasks:
  - the global ones, using the global wait queue (technically speaking,
    those whose thread_mask has at least 2 bits set).
  - some local ones, which for now will be placed into the global wait
    queue as well in order to benefit from its lock.

The flag is automatically set on initialization if the task's thread mask
indicates more than one thread. The caller must also set it if it intends
to let other threads update the task's expiration delay (e.g. delegated
I/Os), or if it intends to change the task's affinity over time as this
could lead to the same situation.

Right now only the situation described above seems to be affected by this
issue, and it is very difficult to trigger, and even then, will often have
no visible effect beyond stopping the checks for example once the race is
met. On my laptop it is feasible with the following config, chained to
httpterm:

    global
        maxconn 400 # provoke FD errors, calling health_adjust()

    defaults
        mode http
        timeout client 10s
        timeout server 10s
        timeout connect 10s

    listen px
        bind :8001
        option httpchk /?t=50
        server sback 127.0.0.1:8000 backup
        server-template s 0-999 127.0.0.1:8000 check port 8001 inter 100 fastinter 10 observe layer7

This patch will automatically address the case for the checks because
check tasks are created with multiple threads bound and will get the
TASK_SHARED_WQ flag set.

If in the future more tasks need to rely on this (multi-threaded muxes
for example) and the use of the global wait queue becomes a bottleneck
again, then it should not be too difficult to place locks on the local
wait queues and queue the task on its bound thread.

This patch needs to be backported to 2.1, 2.0 and 1.9. It depends on
previous patch "MINOR: task: only check TASK_WOKEN_ANY to decide to
requeue a task".

Many thanks to William Dauchy for providing detailed traces allowing to
spot the problem.
2019-12-19 14:42:22 +01:00

138 lines
5.8 KiB
C

/*
* include/types/task.h
* Macros, variables and structures for task management.
*
* Copyright (C) 2000-2010 Willy Tarreau - w@1wt.eu
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, version 2.1
* exclusively.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef _TYPES_TASK_H
#define _TYPES_TASK_H
#include <sys/time.h>
#include <common/config.h>
#include <common/mini-clist.h>
#include <eb32sctree.h>
#include <eb32tree.h>
/* values for task->state */
#define TASK_SLEEPING 0x0000 /* task sleeping */
#define TASK_RUNNING 0x0001 /* the task is currently running */
#define TASK_GLOBAL 0x0002 /* The task is currently in the global runqueue */
#define TASK_QUEUED 0x0004 /* The task has been (re-)added to the run queue */
#define TASK_SHARED_WQ 0x0008 /* The task's expiration may be updated by other
* threads, must be set before first queue/wakeup */
#define TASK_WOKEN_INIT 0x0100 /* woken up for initialisation purposes */
#define TASK_WOKEN_TIMER 0x0200 /* woken up because of expired timer */
#define TASK_WOKEN_IO 0x0400 /* woken up because of completed I/O */
#define TASK_WOKEN_SIGNAL 0x0800 /* woken up by a system signal */
#define TASK_WOKEN_MSG 0x1000 /* woken up by another task's message */
#define TASK_WOKEN_RES 0x2000 /* woken up because of available resource */
#define TASK_WOKEN_OTHER 0x4000 /* woken up for an unspecified reason */
/* use this to check a task state or to clean it up before queueing */
#define TASK_WOKEN_ANY (TASK_WOKEN_OTHER|TASK_WOKEN_INIT|TASK_WOKEN_TIMER| \
TASK_WOKEN_IO|TASK_WOKEN_SIGNAL|TASK_WOKEN_MSG| \
TASK_WOKEN_RES)
struct notification {
struct list purge_me; /* Part of the list of signals to be purged in the
case of the LUA execution stack crash. */
struct list wake_me; /* Part of list of signals to be targeted if an
event occurs. */
struct task *task; /* The task to be wake if an event occurs. */
__decl_hathreads(HA_SPINLOCK_T lock);
};
/* force to split per-thread stuff into separate cache lines */
struct task_per_thread {
struct eb_root timers; /* tree constituting the per-thread wait queue */
struct eb_root rqueue; /* tree constituting the per-thread run queue */
struct list task_list; /* List of tasks to be run, mixing tasks and tasklets */
struct mt_list shared_tasklet_list; /* Tasklet to be run, woken up by other threads */
int task_list_size; /* Number of tasks in the task_list */
int rqueue_size; /* Number of elements in the per-thread run queue */
struct task *current; /* current task (not tasklet) */
__attribute__((aligned(64))) char end[0];
};
/* This part is common between struct task and struct tasklet so that tasks
* can be used as-is as tasklets.
*/
#define TASK_COMMON \
struct { \
unsigned short state; /* task state : bitfield of TASK_ */ \
short nice; /* task prio from -1024 to +1024, or -32768 for tasklets */ \
unsigned int calls; /* number of times process was called */ \
struct task *(*process)(struct task *t, void *ctx, unsigned short state); /* the function which processes the task */ \
void *context; /* the task's context */ \
}
/* The base for all tasks */
struct task {
TASK_COMMON; /* must be at the beginning! */
struct eb32sc_node rq; /* ebtree node used to hold the task in the run queue */
struct eb32_node wq; /* ebtree node used to hold the task in the wait queue */
int expire; /* next expiration date for this task, in ticks */
unsigned long thread_mask; /* mask of thread IDs authorized to process the task */
uint64_t call_date; /* date of the last task wakeup or call */
uint64_t lat_time; /* total latency time experienced */
uint64_t cpu_time; /* total CPU time consumed */
};
/* lightweight tasks, without priority, mainly used for I/Os */
struct tasklet {
TASK_COMMON; /* must be at the beginning! */
struct list list;
int tid; /* TID of the tasklet owner, <0 if local */
};
#define TASK_IS_TASKLET(t) ((t)->nice == -32768)
/*
* The task callback (->process) is responsible for updating ->expire. It must
* return a pointer to the task itself, except if the task has been deleted, in
* which case it returns NULL so that the scheduler knows it must not check the
* expire timer. The scheduler will requeue the task at the proper location.
*/
/* A work_list is a thread-safe way to enqueue some work to be run on another
* thread. It consists of a list, a task and a general-purpose argument.
* A work is appended to the list by atomically adding a list element to the
* list and waking up the associated task, which is done using work_add(). The
* caller must be careful about how operations are run as it will definitely
* happen that the element being enqueued is processed by the other thread
* before the call returns. Some locking conventions between the caller and the
* callee might sometimes be necessary. The task is always woken up with reason
* TASK_WOKEN_OTHER and a context pointing to the work_list entry.
*/
struct work_list {
struct mt_list head;
struct task *task;
void *arg;
};
#endif /* _TYPES_TASK_H */
/*
* Local variables:
* c-indent-level: 8
* c-basic-offset: 8
* End:
*/