/*
* include/haproxy/thread.h
* definitions, macros and inline functions used by threads.
*
* Copyright (C) 2017 Christopher Faulet - cfaulet@haproxy.com
* Copyright (C) 2020 Willy Tarreau - w@1wt.eu
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, version 2.1
* exclusively.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef _HAPROXY_THREAD_H
#define _HAPROXY_THREAD_H
#include <haproxy/api.h>
#include <haproxy/thread-t.h>
#include <haproxy/tinfo.h>
/* Note: this file mainly contains 5 sections:
* - a small common part, which also corresponds to the common API
* - one used solely when USE_THREAD is *not* set
* - one used solely when USE_THREAD is set
* - one used solely when USE_THREAD is set WITHOUT debugging
* - one used solely when USE_THREAD is set WITH debugging
*
*/
/* Generic exports */
int parse_nbthread(const char *arg, char **err);
void ha_tkill(unsigned int thr, int sig);
void ha_tkillall(int sig);
void ha_thread_relax(void);
int thread_detect_binding_discrepancies(void);
int thread_detect_more_than_cpus(void);
int thread_map_to_groups();
int thread_resolve_group_mask(struct thread_set *ts, int defgrp, char **err);
void thread_detect_count(void);
int parse_thread_set(const char *arg, struct thread_set *ts, char **err);
const char *lock_label(enum lock_label label);
extern int thread_cpus_enabled_at_boot;
#ifndef USE_THREAD
/********************** THREADS DISABLED ************************/
/* Only way found to replace variables with constants that are optimized away
 * at build time.
 */
enum { tid_bit = 1UL }; /* mask of the only thread: bit 0 */
enum { tid = 0 };       /* the only thread id is 0 */
enum { tgid = 1 };      /* thread group ids start at 1 */
/* All locking primitives compile out to nothing in single-threaded builds.
 * The TRY* variants evaluate to 0, meaning "lock acquired", so callers that
 * test the return value keep working unchanged.
 */
#define HA_SPIN_INIT(l) do { /* do nothing */ } while(0)
#define HA_SPIN_DESTROY(l) do { /* do nothing */ } while(0)
#define HA_SPIN_LOCK(lbl, l) do { /* do nothing */ } while(0)
#define HA_SPIN_TRYLOCK(lbl, l) ({ 0; })
#define HA_SPIN_UNLOCK(lbl, l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_INIT(l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_DESTROY(l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_WRLOCK(lbl, l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_TRYWRLOCK(lbl, l) ({ 0; })
#define HA_RWLOCK_WRUNLOCK(lbl, l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_RDLOCK(lbl, l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_TRYRDLOCK(lbl, l) ({ 0; })
#define HA_RWLOCK_RDUNLOCK(lbl, l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_SKLOCK(lbl,l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_SKTOWR(lbl,l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_WRTOSK(lbl,l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_SKTORD(lbl,l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_WRTORD(lbl,l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_SKUNLOCK(lbl,l) do { /* do nothing */ } while(0)
#define HA_RWLOCK_TRYSKLOCK(lbl,l) ({ 0; })
#define HA_RWLOCK_TRYRDTOSK(lbl,l) ({ 0; })
#define HA_RWLOCK_TRYRDTOWR(lbl,l) ({ 0; })
/* no threads: signal masking falls back to the process-wide sigprocmask() */
#define ha_sigmask(how, set, oldset) sigprocmask(how, set, oldset)
/* Install <thr> as the calling thread's identity. A NULL <thr> selects the
 * first thread and group, which is what early boot code uses before the
 * thread structures are fully initialized.
 */
static inline void ha_set_thread(const struct thread_info *thr)
{
	if (!thr) {
		/* boot-time default: first thread of the first group */
		ti     = &ha_thread_info[0];
		tg     = &ha_tgroup_info[0];
		th_ctx = &ha_thread_ctx[0];
		return;
	}
	ti     = thr;
	tg     = ti->tg;
	th_ctx = &ha_thread_ctx[ti->tid];
}
/* Marks the current thread as idle. In single-threaded builds this is a
 * plain flag update: no atomic operation is needed since no other thread
 * can observe it.
 */
static inline void thread_idle_now()
{
	tg_ctx->threads_idle |= ti->ltid_bit;
}
/* Clears the idle flag set by thread_idle_now(). Single-threaded build:
 * plain flag update, no atomics required.
 */
static inline void thread_idle_end()
{
	tg_ctx->threads_idle &= ~ti->ltid_bit;
}
/* Declares the thread harmless; trivial flag update since no other thread
 * can observe it in single-threaded builds.
 */
static inline void thread_harmless_now()
{
	tg_ctx->threads_harmless = tg_ctx->threads_harmless | ti->ltid_bit;
}
/* Reports whether the harmless flag is currently set for this thread. */
static inline int is_thread_harmless()
{
	return (tg_ctx->threads_harmless & ti->ltid_bit) != 0;
}
/* Leaves the harmless state entered via thread_harmless_now(). */
static inline void thread_harmless_end()
{
	tg_ctx->threads_harmless = tg_ctx->threads_harmless & ~ti->ltid_bit;
}
/* Signal-handler-safe variant of thread_harmless_end(); identical in
 * single-threaded builds.
 */
static inline void thread_harmless_end_sig()
{
	tg_ctx->threads_harmless = tg_ctx->threads_harmless & ~ti->ltid_bit;
}
/* Requests exclusive access (isolation): a no-op in single-threaded builds
 * since the caller is trivially alone.
 */
static inline void thread_isolate()
{
}

/* Stronger isolation variant (in threaded builds it waits for idle, not
 * merely harmless, threads): equally a no-op here.
 */
static inline void thread_isolate_full()
{
}
/* Releases a previously obtained isolation: no-op without threads. */
static inline void thread_release()
{
}
/* Without threads, the calling thread is always considered isolated. */
static inline unsigned long thread_isolated()
{
return 1;
}
/* No extra threads to start in single-threaded builds. */
static inline void setup_extra_threads(void *(*handler)(void *))
{
}
/* Nothing to wait for: no other threads exist. */
static inline void wait_for_threads_completion()
{
}
/* CPU affinity setup is skipped in single-threaded builds. */
static inline void set_thread_cpu_affinity()
{
}
/* No pthread identifier without threads: always returns 0. */
static inline unsigned long long ha_get_pthread_id(unsigned int thr)
{
return 0;
}
/* Initializes the shared-counter wrapper <ctr> so that updates apply to the
 * global variable <var>.
 * NOTE(review): <lim> is ignored and ->lim is forced to 0 — presumably
 * because without threads every update is applied immediately so no batching
 * threshold is needed; confirm against the threaded implementation.
 */
static inline void cshared_init(struct cshared *ctr, uint64_t *var, int lim)
{
ctr->global = var;
ctr->diff = 0;
ctr->lim = 0;
}
/* Adds <diff> to the shared counter <ctr>. Without threads there is no
 * batching: the update is applied immediately to the global counter.
 */
static inline void cshared_add(struct cshared *ctr, int diff)
{
	/* BUG fix: "ctr->global += diff" advanced the pointer instead of
	 * updating the pointed-to value; cshared_read() dereferences
	 * ->global, so the counter itself must be updated.
	 */
	*ctr->global += diff;
}
/* Returns the current value of the shared counter <ctr>. */
static inline uint64_t cshared_read(struct cshared *ctr)
{
	uint64_t cur = *ctr->global;

	return cur;
}
#else /* !USE_THREAD */
/********************** THREADS ENABLED ************************/
#define PLOCK_LORW_INLINE_WAIT
#include <import/plock.h>
void thread_harmless_till_end(void);
void thread_isolate(void);
void thread_isolate_full(void);
void thread_release(void);
void ha_spin_init(HA_SPINLOCK_T *l);
void ha_rwlock_init(HA_RWLOCK_T *l);
void setup_extra_threads(void *(*handler)(void *));
void wait_for_threads_completion();
void set_thread_cpu_affinity();
unsigned long long ha_get_pthread_id(unsigned int thr);
/* inter-group thread isolation state: number of threads requesting a
 * rendez-vous, and the ID of the thread currently granted isolation
 * (see thread_isolate() / thread_harmless_end())
 */
extern volatile unsigned int rdv_requests;
extern volatile unsigned int isolated_thread;
extern THREAD_LOCAL unsigned int tid; /* The thread id */
extern THREAD_LOCAL unsigned int tgid; /* The thread group id (starts at 1) */
#define ha_sigmask(how, set, oldset) pthread_sigmask(how, set, oldset)
/* Switches the calling thread's identity to <thr>. A NULL <thr> points all
 * thread-local context at thread 0 / group 1, the boot-time default used
 * before the per-thread structures are fully initialized.
 */
static inline void ha_set_thread(const struct thread_info *thr)
{
	if (!thr) {
		tgid   = 1;
		tid    = 0;
		ti     = &ha_thread_info[0];
		tg     = &ha_tgroup_info[0];
		th_ctx = &ha_thread_ctx[0];
		tg_ctx = &ha_tgroup_ctx[0];
		return;
	}
	/* a valid thread must carry its local bit, group and group id */
	BUG_ON(!thr->ltid_bit);
	BUG_ON(!thr->tg);
	BUG_ON(!thr->tgid);
	ti     = thr;
	tg     = thr->tg;
	tid    = thr->tid;
	tgid   = thr->tgid;
	th_ctx = &ha_thread_ctx[tid];
	tg_ctx = &ha_tgroup_ctx[tgid - 1];
}
/* Marks the thread as idle, which means that not only it's not doing anything
* dangerous, but in addition it has not started anything sensitive either.
* This essentially means that the thread currently is in the poller, thus
* outside of any execution block. Needs to be terminated using
* thread_idle_end(). This is needed to release a concurrent call to
* thread_isolate_full().
*/
static inline void thread_idle_now()
{
	/* atomically advertise this thread as idle in its group's mask so a
	 * concurrent thread_isolate_full() may be granted (see comment above)
	 */
	HA_ATOMIC_OR(&tg_ctx->threads_idle, ti->ltid_bit);
}
/* Ends the harmless period started by thread_idle_now(), i.e. the thread is
* about to restart engaging in sensitive operations. This must not be done on
* a thread marked harmless, as it could cause a deadlock between another
* thread waiting for idle again and thread_harmless_end() in this thread.
*
* The right sequence is thus:
* thread_idle_now();
* thread_harmless_now();
* poll();
* thread_harmless_end();
* thread_idle_end();
*/
static inline void thread_idle_end()
{
	/* atomically clear our idle bit; the required ordering with the
	 * harmless bit is the caller's job (see the sequence above)
	 */
	HA_ATOMIC_AND(&tg_ctx->threads_idle, ~ti->ltid_bit);
}
/* Marks the thread as harmless. Note: this must be true, i.e. the thread must
* not be touching any unprotected shared resource during this period. Usually
* this is called before poll(), but it may also be placed around very slow
* calls (eg: some crypto operations). Needs to be terminated using
* thread_harmless_end().
*/
static inline void thread_harmless_now()
{
/* atomically set our "harmless" bit in this thread group's mask */
HA_ATOMIC_OR(&tg_ctx->threads_harmless, ti->ltid_bit);
}
/* Tells whether the calling thread currently has its harmless bit set. */
static inline int is_thread_harmless()
{
	return (HA_ATOMIC_LOAD(&tg_ctx->threads_harmless) & ti->ltid_bit) != 0;
}
/* Ends the harmless period started by thread_harmless_now(). Usually this is
* placed after the poll() call. If it is discovered that a job was running and
* is relying on the thread still being harmless, the thread waits for the
* other one to finish.
*/
static inline void thread_harmless_end()
{
while (1) {
MAJOR: threads: change thread_isolate to support inter-group synchronization thread_isolate() and thread_isolate_full() were relying on a set of thread masks for all threads in different states (rdv, harmless, idle). This cannot work anymore when the number of threads increases beyond LONGBITS so we need to change the mechanism. What is done here is to have a counter of requesters and the number of the current isolated thread. Threads which want to isolate themselves increment the request counter and wait for all threads to be marked harmless (or idle) by scanning all groups and watching the respective masks. This is possible because threads cannot escape once they discover this counter, unless they also want to isolate and possibly pass first. Once all threads are harmless, the requesting thread tries to self-assign the isolated thread number, and if it fails it loops back to checking all threads. If it wins it's guaranted to be alone, and can drop its harmless bit, so that other competing threads go back to the loop waiting for all threads to be harmless. The benefit of proceeding this way is that there's very little write contention on the thread number (none during work), hence no cache line moves between caches, thus frozen threads do not slow down the isolated one. Once it's done, the isolated thread resets the thread number (hence lets another thread take the place) and decrements the requester count, thus possibly releasing all harmless threads. With this change there's no more need for any global mask to synchronize any thread, and we only need to loop over a number of groups to check 64 threads at a time per iteration. As such, tinfo's threads_want_rdv could be dropped. This was tested with 64 threads spread into 2 groups, running 64 tasks (from the debug dev command), 20 "show sess" (thread_isolate()), 20 "add server blah/blah" (thread_isolate()), and 20 "del server blah/blah" (thread_isolate_full()). 
The load remained very low (limited by external socat forks) and no stuck nor starved thread was found.
2022-07-01 09:08:37 -04:00
HA_ATOMIC_AND(&tg_ctx->threads_harmless, ~ti->ltid_bit);
if (likely(_HA_ATOMIC_LOAD(&rdv_requests) == 0))
break;
thread_harmless_till_end();
}
}
/* Ends the harmless period started by thread_harmless_now(), but without
* waiting for isolated requests. This is meant to be used from signal handlers
* which might be called recursively while a thread already requested an
* isolation that must be ignored. It must not be used past a checkpoint where
* another thread could return and see the current thread as harmless before
* this call (or this could validate an isolation request by accident).
*/
static inline void thread_harmless_end_sig()
{
/* single atomic clear, no waiting: safe from a signal handler even while
 * an isolation request is pending (see the comment above) */
HA_ATOMIC_AND(&tg_ctx->threads_harmless, ~ti->ltid_bit);
}
/* an isolated thread has its ID in isolated_thread */
static inline unsigned long thread_isolated()
{
MAJOR: threads: change thread_isolate to support inter-group synchronization thread_isolate() and thread_isolate_full() were relying on a set of thread masks for all threads in different states (rdv, harmless, idle). This cannot work anymore when the number of threads increases beyond LONGBITS so we need to change the mechanism. What is done here is to have a counter of requesters and the number of the current isolated thread. Threads which want to isolate themselves increment the request counter and wait for all threads to be marked harmless (or idle) by scanning all groups and watching the respective masks. This is possible because threads cannot escape once they discover this counter, unless they also want to isolate and possibly pass first. Once all threads are harmless, the requesting thread tries to self-assign the isolated thread number, and if it fails it loops back to checking all threads. If it wins it's guaranted to be alone, and can drop its harmless bit, so that other competing threads go back to the loop waiting for all threads to be harmless. The benefit of proceeding this way is that there's very little write contention on the thread number (none during work), hence no cache line moves between caches, thus frozen threads do not slow down the isolated one. Once it's done, the isolated thread resets the thread number (hence lets another thread take the place) and decrements the requester count, thus possibly releasing all harmless threads. With this change there's no more need for any global mask to synchronize any thread, and we only need to loop over a number of groups to check 64 threads at a time per iteration. As such, tinfo's threads_want_rdv could be dropped. This was tested with 64 threads spread into 2 groups, running 64 tasks (from the debug dev command), 20 "show sess" (thread_isolate()), 20 "add server blah/blah" (thread_isolate()), and 20 "del server blah/blah" (thread_isolate_full()). 
The load remained very low (limited by external socat forks) and no stuck nor starved thread was found.
2022-07-01 09:08:37 -04:00
return _HA_ATOMIC_LOAD(&isolated_thread) == tid;
}
/* locking levels, for history and debugging: a 2-bit operation code that is
 * combined with the 6-bit lock label to form one byte of the per-thread
 * lock history (see _lock_wait_common() below).
 */
#define _LK_UN 0  /* unlocked (release operation) */
#define _LK_RD 1  /* read lock held */
#define _LK_SK 2  /* seek lock held */
#define _LK_WR 3  /* write lock held */
#if (DEBUG_THREAD < 1) && !defined(DEBUG_FULL)

/* Lock debugging disabled: the accounting wrappers reduce to the bare lock
 * expression. _lock_cond() still evaluates to the expression's result so
 * that trylock callers can test success.
 */
#define _lock_wait(_LK_, bal, lbl, expr) do { (void)(expr); } while (0)
#define _lock_cond(_LK_, bal, lbl, expr) ({ typeof(expr) _expr = (expr); _expr; })

#else
/* principle: each lock operation takes 8 bits, 6 of which (the highest) are
 * the lock label, and two of which (the lowest) are the operation (_LK_*).
 * In order to preserve as much usable history as possible, we try to merge
 * repetitions:
 *  - if a lock is taken just after it was released, the release is erased
 *    from history and replaced with the new operation ;
 *  - if, when replacing an unlock, the new operation is the same as the
 *    one before the unlock, then the new one is not added.
 * This means that sequences like "R:foo U:foo R:foo" just become "R:foo",
 * but that those like "R:foo U:foo W:foo U:foo" become "R:foo W:foo U:foo".
 */

/* Appends operation <_LK_> on lock label <lbl> to the rolling 64-bit
 * per-thread lock_history (8 most recent one-byte entries), applying the
 * compaction rules described above. Label 0 is reserved, hence the +1.
 */
#define _lock_wait_common(_LK_, lbl) do {				\
		ulong _lck = ((lbl + 1) << 2) + _LK_;			\
		if ((uint8_t)th_ctx->lock_history == (uint8_t)(((lbl + 1) << 2) + _LK_UN)) { \
			/* re-lock of just unlocked, try to compact and possibly merge with n-2 */ \
			th_ctx->lock_history >>= 8;			\
			if ((uint8_t)th_ctx->lock_history != (uint8_t)_lck) \
				th_ctx->lock_history = (th_ctx->lock_history << 8) + _lck; \
		}							\
		else							\
			th_ctx->lock_history = (th_ctx->lock_history << 8) + _lck; \
	} while (0)
/* Performs the blocking lock operation <expr> while maintaining the lock
 * accounting used by "show profiling tasks":
 *   - <bal> is the lock-level balance of the operation (+1 lock, -1 unlock,
 *     0 for an upgrade/downgrade), accumulated into th_ctx->lock_level ;
 *   - when task profiling is fully enabled, the time spent waiting for the
 *     lock is added to th_ctx->lock_wait_total, and th_ctx->lock_start_date
 *     / th_ctx->locked_total track the time spent with at least one lock
 *     held (started when the level rises to 1, accounted when it drops
 *     back to 0) ;
 *   - finally the operation is recorded in the thread's lock history,
 *     except for the anonymous OTHER_LOCK label.
 */
#define _lock_wait(_LK_, bal, lbl, expr) do {				\
		uint64_t lock_start = 0;				\
		extern uint64_t now_mono_time(void);			\
		if (_LK_ != _LK_UN) {					\
			th_ctx->lock_level += bal;			\
			if (unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L)) == \
				     (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L))) \
				lock_start = now_mono_time();		\
		}							\
		(void)(expr);						\
		if (_LK_ == _LK_UN) {					\
			th_ctx->lock_level += bal;			\
			if (th_ctx->lock_level == 0 &&			\
			    unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L)) == \
				     (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L))) \
				th_ctx->locked_total += now_mono_time() - th_ctx->lock_start_date; \
		} else if (unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L)) == \
				    (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L))) { \
			uint64_t now = now_mono_time();			\
			if (lock_start)					\
				th_ctx->lock_wait_total += now - lock_start; \
			if (th_ctx->lock_level == 1)			\
				th_ctx->lock_start_date = now;		\
		}							\
		if (lbl != OTHER_LOCK)					\
			_lock_wait_common(_LK_, lbl);			\
	} while (0)
/* Performs the conditional (try) lock operation <expr>, which returns 0 on
 * success, and evaluates to that result so the caller can check whether
 * the lock was obtained. On success only, the same lock-level and
 * task-profiling accounting as _lock_wait() is applied (there is no wait
 * time to measure for a trylock), and the operation is recorded in the
 * thread's lock history unless the label is OTHER_LOCK.
 */
#define _lock_cond(_LK_, bal, lbl, expr) ({				\
		typeof(expr) _expr = (expr);				\
		if (_expr == 0) {					\
			th_ctx->lock_level += bal;			\
			if (unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L)) == \
				     (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L))) { \
				if (_LK_ == _LK_UN && th_ctx->lock_level == 0) \
					th_ctx->locked_total += now_mono_time() - th_ctx->lock_start_date; \
				else if (_LK_ != _LK_UN && th_ctx->lock_level == 1) \
					th_ctx->lock_start_date = now_mono_time(); \
			}						\
		}							\
		if (lbl != OTHER_LOCK && !_expr)			\
			_lock_wait_common(_LK_, lbl);			\
		_expr;							\
	})
#endif
/* Initializes shared counter <ctr>, binding it to the global value <var>.
 * The thread-local accumulator starts empty; it will be flushed into <var>
 * by cshared_add() each time its magnitude reaches <lim>, in either the
 * positive or negative direction.
 */
static inline void cshared_init(struct cshared *ctr, uint64_t *var, int lim)
{
	ctr->diff   = 0;
	ctr->lim    = lim;
	ctr->global = var;
}
/* Accumulates <diff> (positive or negative) into shared counter <ctr>.
 * The value first lands in the thread-local accumulator; once that
 * accumulator reaches the configured limit in either direction, it is
 * atomically merged into the global value and reset, keeping atomic
 * operations rare on the hot path.
 */
static inline void cshared_add(struct cshared *ctr, int diff)
{
	ctr->diff += diff;
	if (ctr->diff >= ctr->lim || ctr->diff <= -(ctr->lim)) {
		/* threshold crossed: publish and restart from zero */
		HA_ATOMIC_ADD(ctr->global, ctr->diff);
		ctr->diff = 0;
	}
}
/* Atomically reads the current global value referenced by shared counter
 * <ctr>. Note that this thread's not-yet-flushed local diff is not part
 * of the returned value.
 */
static inline uint64_t cshared_read(struct cshared *ctr)
{
	uint64_t curr = HA_ATOMIC_LOAD(ctr->global);

	return curr;
}
#if (DEBUG_THREAD < 2) && !defined(DEBUG_FULL)

/* Thread debugging is DISABLED, these are the regular locking functions */
/* Locks are plain plock (pl_*) words initialized to zero. The TRY* macros
 * evaluate to 0 on success and non-zero on failure, hence the '!' in front
 * of the pl_try_*() calls.
 */

#define HA_SPIN_INIT(l) ({ (*l) = 0; })
#define HA_SPIN_DESTROY(l) ({ (*l) = 0; })
#define HA_SPIN_LOCK(lbl, l) _lock_wait(_LK_SK, 1, lbl, pl_take_s(l))
#define HA_SPIN_TRYLOCK(lbl, l) _lock_cond(_LK_SK, 1, lbl, !pl_try_s(l))
#define HA_SPIN_UNLOCK(lbl, l) _lock_wait(_LK_UN, -1, lbl, pl_drop_s(l))

#define HA_RWLOCK_INIT(l) ({ (*l) = 0; })
#define HA_RWLOCK_DESTROY(l) ({ (*l) = 0; })
#define HA_RWLOCK_WRLOCK(lbl,l) _lock_wait(_LK_WR, 1, lbl, pl_take_w(l))
#define HA_RWLOCK_TRYWRLOCK(lbl,l) _lock_cond(_LK_WR, 1, lbl, !pl_try_w(l))
#define HA_RWLOCK_WRUNLOCK(lbl,l) _lock_wait(_LK_UN, -1, lbl, pl_drop_w(l))
#define HA_RWLOCK_RDLOCK(lbl,l) _lock_wait(_LK_RD, 1, lbl, pl_take_r(l))
#define HA_RWLOCK_TRYRDLOCK(lbl,l) _lock_cond(_LK_RD, 1, lbl, (!pl_try_r(l)))
#define HA_RWLOCK_RDUNLOCK(lbl,l) _lock_wait(_LK_UN, -1, lbl, pl_drop_r(l))

/* rwlock upgrades via seek locks */
/* upgrades/downgrades pass bal=0 since the overall lock level is unchanged */
#define HA_RWLOCK_SKLOCK(lbl,l) _lock_wait(_LK_SK, 1, lbl, pl_take_s(l)) /* N --> S */
#define HA_RWLOCK_SKTOWR(lbl,l) _lock_wait(_LK_WR, 0, lbl, pl_stow(l)) /* S --> W */
#define HA_RWLOCK_WRTOSK(lbl,l) _lock_wait(_LK_SK, 0, lbl, pl_wtos(l)) /* W --> S */
#define HA_RWLOCK_SKTORD(lbl,l) _lock_wait(_LK_RD, 0, lbl, pl_stor(l)) /* S --> R */
#define HA_RWLOCK_WRTORD(lbl,l) _lock_wait(_LK_RD, 0, lbl, pl_wtor(l)) /* W --> R */
#define HA_RWLOCK_SKUNLOCK(lbl,l) _lock_wait(_LK_UN, -1, lbl, pl_drop_s(l)) /* S --> N */
#define HA_RWLOCK_TRYSKLOCK(lbl,l) _lock_cond(_LK_SK, 1, lbl, !pl_try_s(l)) /* N -?> S */
#define HA_RWLOCK_TRYRDTOSK(lbl,l) _lock_cond(_LK_SK, 0, lbl, !pl_try_rtos(l)) /* R -?> S */
#define HA_RWLOCK_TRYRDTOWR(lbl, l) _lock_cond(_LK_WR, 0, lbl, !pl_try_rtow(l)) /* R -?> W */
#else /* (DEBUG_THREAD < 2) && !defined(DEBUG_FULL) */

/* Thread debugging is ENABLED, these are the instrumented functions */
/* The bare __SPIN_*/__RWLOCK_* macros below map straight to plock and are
 * used internally by the instrumented __spin_*()/__ha_rwlock_*() functions.
 */

#define __SPIN_INIT(l) ({ (*l) = 0; })
#define __SPIN_DESTROY(l) ({ (*l) = 0; })
#define __SPIN_LOCK(l) pl_take_s(l)
#define __SPIN_TRYLOCK(l) (!pl_try_s(l))
#define __SPIN_UNLOCK(l) pl_drop_s(l)

#define __RWLOCK_INIT(l) ({ (*l) = 0; })
#define __RWLOCK_DESTROY(l) ({ (*l) = 0; })
#define __RWLOCK_WRLOCK(l) pl_take_w(l)
#define __RWLOCK_TRYWRLOCK(l) (!pl_try_w(l))
#define __RWLOCK_WRUNLOCK(l) pl_drop_w(l)
#define __RWLOCK_RDLOCK(l) pl_take_r(l)
#define __RWLOCK_TRYRDLOCK(l) (!pl_try_r(l))
#define __RWLOCK_RDUNLOCK(l) pl_drop_r(l)

/* rwlock upgrades via seek locks */
#define __RWLOCK_SKLOCK(l) pl_take_s(l) /* N --> S */
#define __RWLOCK_SKTOWR(l) pl_stow(l) /* S --> W */
#define __RWLOCK_WRTOSK(l) pl_wtos(l) /* W --> S */
#define __RWLOCK_SKTORD(l) pl_stor(l) /* S --> R */
#define __RWLOCK_WRTORD(l) pl_wtor(l) /* W --> R */
#define __RWLOCK_SKUNLOCK(l) pl_drop_s(l) /* S --> N */
#define __RWLOCK_TRYSKLOCK(l) (!pl_try_s(l)) /* N -?> S */
#define __RWLOCK_TRYRDTOSK(l) (!pl_try_rtos(l)) /* R -?> S */
#define __RWLOCK_TRYRDTOWR(l) (!pl_try_rtow(l)) /* R -?> W */

/* The public HA_* macros route through the instrumented functions, which
 * also record the caller's function/file/line for lock debugging.
 */
#define HA_SPIN_INIT(l) __spin_init(l)
#define HA_SPIN_DESTROY(l) __spin_destroy(l)

#define HA_SPIN_LOCK(lbl, l) _lock_wait(_LK_SK, 1, lbl, __spin_lock(lbl, l, __func__, __FILE__, __LINE__))
#define HA_SPIN_TRYLOCK(lbl, l) _lock_cond(_LK_SK, 1, lbl, __spin_trylock(lbl, l, __func__, __FILE__, __LINE__))
#define HA_SPIN_UNLOCK(lbl, l) _lock_wait(_LK_UN, -1, lbl, __spin_unlock(lbl, l, __func__, __FILE__, __LINE__))

#define HA_RWLOCK_INIT(l) __ha_rwlock_init((l))
#define HA_RWLOCK_DESTROY(l) __ha_rwlock_destroy((l))
#define HA_RWLOCK_WRLOCK(lbl,l) _lock_wait(_LK_WR, 1, lbl, __ha_rwlock_wrlock(lbl, l, __func__, __FILE__, __LINE__))
#define HA_RWLOCK_TRYWRLOCK(lbl,l) _lock_cond(_LK_WR, 1, lbl, __ha_rwlock_trywrlock(lbl, l, __func__, __FILE__, __LINE__))
#define HA_RWLOCK_WRUNLOCK(lbl,l) _lock_wait(_LK_UN, -1, lbl, __ha_rwlock_wrunlock(lbl, l, __func__, __FILE__, __LINE__))
#define HA_RWLOCK_RDLOCK(lbl,l) _lock_wait(_LK_RD, 1, lbl, __ha_rwlock_rdlock(lbl, l))
#define HA_RWLOCK_TRYRDLOCK(lbl,l) _lock_cond(_LK_RD, 1, lbl, __ha_rwlock_tryrdlock(lbl, l))
#define HA_RWLOCK_RDUNLOCK(lbl,l) _lock_wait(_LK_UN, -1, lbl, __ha_rwlock_rdunlock(lbl, l))

#define HA_RWLOCK_SKLOCK(lbl,l) _lock_wait(_LK_SK, 1, lbl, __ha_rwlock_sklock(lbl, l, __func__, __FILE__, __LINE__))
#define HA_RWLOCK_SKTOWR(lbl,l) _lock_wait(_LK_WR, 0, lbl, __ha_rwlock_sktowr(lbl, l, __func__, __FILE__, __LINE__))
#define HA_RWLOCK_WRTOSK(lbl,l) _lock_wait(_LK_SK, 0, lbl, __ha_rwlock_wrtosk(lbl, l, __func__, __FILE__, __LINE__))
#define HA_RWLOCK_SKTORD(lbl,l) _lock_wait(_LK_RD, 0, lbl, __ha_rwlock_sktord(lbl, l, __func__, __FILE__, __LINE__))
#define HA_RWLOCK_WRTORD(lbl,l) _lock_wait(_LK_RD, 0, lbl, __ha_rwlock_wrtord(lbl, l, __func__, __FILE__, __LINE__))
#define HA_RWLOCK_SKUNLOCK(lbl,l) _lock_wait(_LK_UN, -1, lbl, __ha_rwlock_skunlock(lbl, l, __func__, __FILE__, __LINE__))
#define HA_RWLOCK_TRYSKLOCK(lbl,l) _lock_cond(_LK_SK, 1, lbl, __ha_rwlock_trysklock(lbl, l, __func__, __FILE__, __LINE__))
#define HA_RWLOCK_TRYRDTOSK(lbl,l) _lock_cond(_LK_RD, 0, lbl, __ha_rwlock_tryrdtosk(lbl, l, __func__, __FILE__, __LINE__))
#define HA_RWLOCK_TRYRDTOWR(lbl,l) _lock_cond(_LK_WR, 0, lbl, __ha_rwlock_tryrdtowr(lbl, l, __func__, __FILE__, __LINE__))
/* Following functions are used to collect some stats about locks. We wrap
 * pthread functions to know how much time we wait in a lock. */

/* dumps collected lock statistics (implementation in thread.c) */
void show_lock_stats();

/* Instrumented rwlock operations: the func/file/line triplet records the
 * call site for debugging. The try* variants return non-zero on failure.
 */
void __ha_rwlock_init(struct ha_rwlock *l);
void __ha_rwlock_destroy(struct ha_rwlock *l);
void __ha_rwlock_wrlock(enum lock_label lbl, struct ha_rwlock *l,
                        const char *func, const char *file, int line);
int __ha_rwlock_trywrlock(enum lock_label lbl, struct ha_rwlock *l,
                          const char *func, const char *file, int line);
void __ha_rwlock_wrunlock(enum lock_label lbl,struct ha_rwlock *l,
                          const char *func, const char *file, int line);
void __ha_rwlock_rdlock(enum lock_label lbl,struct ha_rwlock *l);
int __ha_rwlock_tryrdlock(enum lock_label lbl,struct ha_rwlock *l);
void __ha_rwlock_rdunlock(enum lock_label lbl,struct ha_rwlock *l);
void __ha_rwlock_wrtord(enum lock_label lbl, struct ha_rwlock *l,
                        const char *func, const char *file, int line);
void __ha_rwlock_wrtosk(enum lock_label lbl, struct ha_rwlock *l,
                        const char *func, const char *file, int line);
void __ha_rwlock_sklock(enum lock_label lbl, struct ha_rwlock *l,
                        const char *func, const char *file, int line);
void __ha_rwlock_sktowr(enum lock_label lbl, struct ha_rwlock *l,
                        const char *func, const char *file, int line);
void __ha_rwlock_sktord(enum lock_label lbl, struct ha_rwlock *l,
                        const char *func, const char *file, int line);
void __ha_rwlock_skunlock(enum lock_label lbl,struct ha_rwlock *l,
                          const char *func, const char *file, int line);
int __ha_rwlock_trysklock(enum lock_label lbl, struct ha_rwlock *l,
                          const char *func, const char *file, int line);
int __ha_rwlock_tryrdtosk(enum lock_label lbl, struct ha_rwlock *l,
                          const char *func, const char *file, int line);
int __ha_rwlock_tryrdtowr(enum lock_label lbl, struct ha_rwlock *l,
                          const char *func, const char *file, int line);

/* Instrumented spinlock operations, same conventions as above */
void __spin_init(struct ha_spinlock *l);
void __spin_destroy(struct ha_spinlock *l);
void __spin_lock(enum lock_label lbl, struct ha_spinlock *l,
                 const char *func, const char *file, int line);
int __spin_trylock(enum lock_label lbl, struct ha_spinlock *l,
                   const char *func, const char *file, int line);
void __spin_unlock(enum lock_label lbl, struct ha_spinlock *l,
                   const char *func, const char *file, int line);
#endif /* DEBUG_THREAD */
#endif /* USE_THREAD */
#endif /* _HAPROXY_THREAD_H */