haproxy/src/task.c
Willy Tarreau 4726f53794 [OPTIM] task: don't unlink a task from a wait queue when waking it up
In many situations, we wake a task on an I/O event, then queue it
exactly where it was. This is a real waste because we delete/insert
tasks into the wait queue for nothing. The only reason for this is
that there was only one tree node in the task struct.

By adding another tree node, we can have one tree for the timers
(wait queue) and one tree for the priority (run queue). That way,
we can have a task both in the run queue and wait queue at the
same time. The wait queue now really holds timers, which is what
it was designed for.

The net gain is at least 1 delete/insert cycle per session, and up
to 2-3 depending on the workload, since we save one cycle each time
the expiration date is not changed during a wake up.
2009-03-08 07:59:18 +01:00

320 lines
11 KiB
C

/*
* Task management functions.
*
* Copyright 2000-2009 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <common/config.h>
#include <common/eb32tree.h>
#include <common/memory.h>
#include <common/mini-clist.h>
#include <common/standard.h>
#include <common/time.h>
#include <proto/proxy.h>
#include <proto/task.h>
struct pool_head *pool2_task;
unsigned int run_queue = 0;
unsigned int niced_tasks = 0; /* number of niced tasks in the run queue */
struct task *last_timer = NULL; /* optimization: last queued timer */
/* Principle of the wait queue.
*
* We want to be able to tell whether an expiration date is before of after the
* current time <now>. We KNOW that expiration dates are never too far apart,
* because they are already computed by adding integer numbers of milliseconds
* to the current date.
* We also know that almost all dates will be in the future, and that a very
* small part of them will be in the past, they are the ones which have expired
* since last time we checked them.
*
* The current implementation uses a wrapping time cut into 3 ranges :
* - previous : those ones are expired by definition
* - current : some are expired, some are not
* - next : none are expired
*
* We use the higher two bits of the timers expressed in ticks (milliseconds)
* to determine which range a timer is in, compared to <now> :
*
* now previous current next0 next1
* [31:30] [31:30] [31:30] [31:30] [31:30]
* 00 11 00 01 10
* 01 00 01 10 11
* 10 01 10 11 00
* 11 10 11 00 01
*
* By definition, <current> is the range containing <now> as well as all timers
* which have the same 2 high bits as <now>, <previous> is the range just
* before, which contains all timers whose high bits equal those of <now> minus
* 1. Last, <next> is composed of the two remaining ranges.
*
* For ease of implementation, the timers will then be stored into 4 queues 0-3
* determined by the 2 higher bits of the timer. The expiration algorithm is
* very simple :
* - expire everything in <previous>=queue[((now>>30)-1)&3]
* - expire from <current>=queue[(now>>30)&3] everything where timer >= now
*
* With this algorithm, it's possible to queue tasks meant to expire 24.8 days
* in the future, and still be able to detect events remaining unprocessed for
* the last 12.4 days! Note that the principle might be extended to any number
* of higher bits as long as there is only one range for expired tasks. For
* instance, using the 8 higher bits to index the range, we would have one past
* range of 4.6 hours (24 bits in ms), and 254 ranges in the future totalizing
* 49.3 days. This would eat more memory for a very little added benefit.
*
* Also, in order to maintain the ability to perform time comparisons, it is
* recommended to avoid using the <next1> range above, as values in this range
* may not easily be compared to <now> outside of these functions as it is the
* opposite of the <current> range, and <timer>-<now> may randomly be positive
* or negative. That means we're left with +/- 12 days timers.
*
* To keep timers ordered, we use 4 ebtrees [0..3]. To keep computation low, we
* may use (seconds*1024)+milliseconds, which preserves ordering eventhough we
* can't do real computations on it. Future evolutions could make use of 1024th
* of seconds instead of milliseconds, with the special value 0 avoided (and
* replaced with 1), so that zero indicates the timer is not set.
*/
#define TIMER_TICK_BITS 32
#define TIMER_TREE_BITS 2
#define TIMER_TREES (1 << TIMER_TREE_BITS)
#define TIMER_TREE_SHIFT (TIMER_TICK_BITS - TIMER_TREE_BITS)
#define TIMER_TREE_MASK (TIMER_TREES - 1)
#define TIMER_TICK_MASK ((1U << (TIMER_TICK_BITS-1)) * 2 - 1)
#define TIMER_SIGN_BIT (1 << (TIMER_TICK_BITS - 1))
static struct eb_root timers[TIMER_TREES]; /* trees with MSB 00, 01, 10 and 11 */
static struct eb_root rqueue[TIMER_TREES]; /* trees constituting the run queue */
static unsigned int rqueue_ticks; /* insertion count */
/* returns an ordered key based on an expiration date. */
static inline unsigned int timeval_to_ticks(const struct timeval *t)
{
unsigned int key;
key = ((unsigned int)t->tv_sec * 1000) + ((unsigned int)t->tv_usec / 1000);
key &= TIMER_TICK_MASK;
return key;
}
/* returns a tree number based on a ticks value */
static inline unsigned int ticks_to_tree(unsigned int ticks)
{
return (ticks >> TIMER_TREE_SHIFT) & TIMER_TREE_MASK;
}
/* returns a tree number based on an expiration date. */
static inline unsigned int timeval_to_tree(const struct timeval *t)
{
return ticks_to_tree(timeval_to_ticks(t));
}
/* Puts the task <t> in run queue at a position depending on t->nice. <t> is
* returned. The nice value assigns boosts in 32th of the run queue size. A
* nice value of -1024 sets the task to -run_queue*32, while a nice value of
* 1024 sets the task to run_queue*32. The state flags are cleared, so the
* caller will have to set its flags after this call.
* The task must not already be in the run queue. If unsure, use the safer
* task_wakeup() function.
*/
struct task *__task_wakeup(struct task *t)
{
run_queue++;
t->rq.key = ++rqueue_ticks;
if (likely(t->nice)) {
int offset;
niced_tasks++;
if (likely(t->nice > 0))
offset = (unsigned)((run_queue * (unsigned int)t->nice) / 32U);
else
offset = -(unsigned)((run_queue * (unsigned int)-t->nice) / 32U);
t->rq.key += offset;
}
/* clear state flags at the same time */
t->state &= ~TASK_WOKEN_ANY;
eb32_insert(&rqueue[ticks_to_tree(t->rq.key)], &t->rq);
return t;
}
/*
* task_queue()
*
* Inserts a task into the wait queue at the position given by its expiration
* date. It does not matter if the task was already in the wait queue or not,
* and it may even help if its position has not changed because we'll be able
* to return without doing anything. Tasks queued with an eternity expiration
* are just unlinked from the WQ. Last, tasks must not be queued further than
* the end of the next tree, which is between <now_ms> and <now_ms> +
* TIMER_SIGN_BIT ms (now+12days..24days in 32bit).
*/
void task_queue(struct task *task)
{
/* if the task is already in the wait queue, we may reuse its position
* or we will at least have to unlink it first.
*/
if (task_in_wq(task)) {
if (task->wq.key == task->expire)
return;
__task_unlink_wq(task);
}
/* the task is not in the queue now */
if (unlikely(!task->expire))
return;
task->wq.key = task->expire;
#ifdef DEBUG_CHECK_INVALID_EXPIRATION_DATES
if ((task->wq.key - now_ms) & TIMER_SIGN_BIT)
/* we're queuing too far away or in the past (most likely) */
return;
#endif
if (likely(last_timer &&
last_timer->wq.key == task->wq.key &&
last_timer->wq.node.node_p &&
last_timer->wq.node.bit == -1)) {
/* Most often, last queued timer has the same expiration date, so
* if it's not queued at the root, let's queue a dup directly there.
* Note that we can only use dups at the dup tree's root (bit==-1).
*/
eb_insert_dup(&last_timer->wq.node, &task->wq.node);
return;
}
eb32_insert(&timers[ticks_to_tree(task->wq.key)], &task->wq);
if (task->wq.node.bit == -1)
last_timer = task; /* we only want dup a tree's root */
return;
}
/*
* Extract all expired timers from the timer queue, and wakes up all
* associated tasks. Returns the date of next event (or eternity).
*/
void wake_expired_tasks(int *next)
{
struct task *task;
struct eb32_node *eb;
unsigned int now_tree;
unsigned int tree;
/* In theory, we should :
* - wake all tasks from the <previous> tree
* - wake all expired tasks from the <current> tree
* - scan <next> trees for next expiration date if not found earlier.
* But we can do all this more easily : we scan all 3 trees before we
* wrap, and wake everything expired from there, then stop on the first
* non-expired entry.
*/
now_tree = ticks_to_tree(now_ms);
tree = (now_tree - 1) & TIMER_TREE_MASK;
do {
eb = eb32_first(&timers[tree]);
while (eb) {
task = eb32_entry(eb, struct task, wq);
if ((now_ms - eb->key) & TIMER_SIGN_BIT) {
/* note that we don't need this check for the <previous>
* tree, but it's cheaper than duplicating the code.
*/
*next = task->expire;
return;
}
/* detach the task from the queue and add the task to the run queue */
eb = eb32_next(eb);
__task_unlink_wq(task);
task_wakeup(task, TASK_WOKEN_TIMER);
}
tree = (tree + 1) & TIMER_TREE_MASK;
} while (((tree - now_tree) & TIMER_TREE_MASK) < TIMER_TREES/2);
/* We have found no task to expire in any tree */
*next = TICK_ETERNITY;
return;
}
/* The run queue is chronologically sorted in a tree. An insertion counter is
* used to assign a position to each task. This counter may be combined with
* other variables (eg: nice value) to set the final position in the tree. The
* counter may wrap without a problem, of course. We then limit the number of
* tasks processed at once to 1/4 of the number of tasks in the queue, and to
* 200 max in any case, so that general latency remains low and so that task
* positions have a chance to be considered. It also reduces the number of
* trees to be evaluated when no task remains.
*
* Just like with timers, we start with tree[(current - 1)], which holds past
* values, and stop when we reach the middle of the list. In practise, we visit
* 3 out of 4 trees.
*
* The function adjusts <next> if a new event is closer.
*/
void process_runnable_tasks(int *next)
{
int temp;
struct task *t;
struct eb32_node *eb;
unsigned int tree, stop;
unsigned int max_processed;
if (!run_queue)
return;
max_processed = run_queue;
if (max_processed > 200)
max_processed = 200;
if (likely(niced_tasks))
max_processed /= 4;
tree = ticks_to_tree(rqueue_ticks);
stop = (tree + TIMER_TREES / 2) & TIMER_TREE_MASK;
tree = (tree - 1) & TIMER_TREE_MASK;
do {
eb = eb32_first(&rqueue[tree]);
while (eb) {
t = eb32_entry(eb, struct task, rq);
/* detach the task from the queue and add the task to the run queue */
eb = eb32_next(eb);
__task_unlink_rq(t);
t->state |= TASK_RUNNING;
t->process(t, &temp);
t->state &= ~TASK_RUNNING;
*next = tick_first(*next, temp);
if (!--max_processed)
return;
}
tree = (tree + 1) & TIMER_TREE_MASK;
} while (tree != stop);
}
/* perform minimal intializations, report 0 in case of error, 1 if OK. */
int init_task()
{
memset(&timers, 0, sizeof(timers));
memset(&rqueue, 0, sizeof(rqueue));
pool2_task = create_pool("task", sizeof(struct task), MEM_F_SHARED);
return pool2_task != NULL;
}
/*
* Local variables:
* c-indent-level: 8
* c-basic-offset: 8
* End:
*/