From e2631ee5f7a1d10b403238ea45d2ff577794de79 Mon Sep 17 00:00:00 2001
From: Willy Tarreau
Date: Tue, 10 Feb 2026 14:17:36 +0100
Subject: [PATCH] MEDIUM: activity: apply and use new fine-grained task
 profiling settings

In continuation of the previous patch, this one makes use of the new
profiling flags. Based on the global "profiling" setting, when
profiling is switched on, two flags are set or cleared on the thread
context, TH_FL_TASK_PROFILING_L and TH_FL_TASK_PROFILING_M, to indicate
whether lock profiling and/or malloc profiling are desired while
profiling is enabled. These flags are then checked along with
TH_FL_TASK_PROFILING to decide when to collect time around a lock or a
malloc. By default we're back to the 3.2 behavior, where neither lock
nor malloc times are collected anymore. This alone is sufficient to
make the CPU usage spent in the VDSO drop from 22% to 2.2% on a highly
loaded system.

This should be backported to 3.3 along with the previous patch.
---
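Note for reviewers, placed below the cut line so that git-am ignores
it: all the checks added by this patch use the combined-mask idiom
"(flags & (A|B)) == (A|B)", which matches only when BOTH bits are set,
i.e. task profiling is currently active AND the corresponding
fine-grained sub-option was requested. Here is a minimal standalone
illustration of that idiom; the flag values and the
lock_profiling_active() helper are made up for this note, only the
flag names come from the patch:

#include <stdio.h>

/* Made-up values for illustration; the real flags are in tinfo-t.h. */
#define TH_FL_TASK_PROFILING    0x00000010
#define TH_FL_TASK_PROFILING_L  0x00000400

/* Hypothetical helper: non-zero only when BOTH bits are set. Masking
 * with the pair and comparing against the pair rejects the cases
 * where only one of the two bits is present.
 */
static int lock_profiling_active(unsigned int flags)
{
	return (flags & (TH_FL_TASK_PROFILING | TH_FL_TASK_PROFILING_L)) ==
	       (TH_FL_TASK_PROFILING | TH_FL_TASK_PROFILING_L);
}

int main(void)
{
	printf("%d\n", lock_profiling_active(0));                      /* 0 */
	printf("%d\n", lock_profiling_active(TH_FL_TASK_PROFILING));   /* 0 */
	printf("%d\n", lock_profiling_active(TH_FL_TASK_PROFILING_L)); /* 0 */
	printf("%d\n", lock_profiling_active(TH_FL_TASK_PROFILING |
	                                     TH_FL_TASK_PROFILING_L)); /* 1 */
	return 0;
}

Compiled and run, this prints 0, 0, 0, 1: only the last call sees both
bits at once.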
 include/haproxy/thread.h  | 13 +++++++++----
 include/haproxy/tinfo-t.h |  2 ++
 src/activity.c            | 14 +++++++++++++-
 src/pool.c                | 12 ++++++++----
 4 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/include/haproxy/thread.h b/include/haproxy/thread.h
index cde5c6aa5..c97566b92 100644
--- a/include/haproxy/thread.h
+++ b/include/haproxy/thread.h
@@ -362,15 +362,19 @@ static inline unsigned long thread_isolated()
 		extern uint64_t now_mono_time(void);				\
 		if (_LK_ != _LK_UN) {						\
 			th_ctx->lock_level += bal;				\
-			if (unlikely(th_ctx->flags & TH_FL_TASK_PROFILING))	\
+			if (unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L)) == \
+			             (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L))) \
 				lock_start = now_mono_time();			\
 		}								\
 		(void)(expr);							\
 		if (_LK_ == _LK_UN) {						\
 			th_ctx->lock_level += bal;				\
-			if (th_ctx->lock_level == 0 && unlikely(th_ctx->flags & TH_FL_TASK_PROFILING)) \
+			if (th_ctx->lock_level == 0 &&\
+			    unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L)) == \
+			             (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L))) \
 				th_ctx->locked_total += now_mono_time() - th_ctx->lock_start_date; \
-		} else if (unlikely(th_ctx->flags & TH_FL_TASK_PROFILING)) {	\
+		} else if (unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L)) == \
+		                    (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L))) { \
 			uint64_t now = now_mono_time();				\
 			if (lock_start)						\
 				th_ctx->lock_wait_total += now - lock_start;	\
@@ -384,7 +388,8 @@ static inline unsigned long thread_isolated()
 		typeof(expr) _expr = (expr);					\
 		if (_expr == 0) {						\
 			th_ctx->lock_level += bal;				\
-			if (unlikely(th_ctx->flags & TH_FL_TASK_PROFILING)) {	\
+			if (unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L)) == \
+			             (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_L))) { \
 				if (_LK_ == _LK_UN && th_ctx->lock_level == 0)	\
 					th_ctx->locked_total += now_mono_time() - th_ctx->lock_start_date; \
 				else if (_LK_ != _LK_UN && th_ctx->lock_level == 1) \
diff --git a/include/haproxy/tinfo-t.h b/include/haproxy/tinfo-t.h
index 62e87f87e..dc713cc1f 100644
--- a/include/haproxy/tinfo-t.h
+++ b/include/haproxy/tinfo-t.h
@@ -69,6 +69,8 @@ enum {
 #define TH_FL_IN_DBG_HANDLER    0x00000100 /* thread currently in the debug signal handler */
 #define TH_FL_IN_WDT_HANDLER    0x00000200 /* thread currently in the wdt signal handler */
 #define TH_FL_IN_ANY_HANDLER    0x00000380 /* mask to test if the thread is in any signal handler */
+#define TH_FL_TASK_PROFILING_L  0x00000400 /* task profiling in locks (also requires TASK_PROFILING) */
+#define TH_FL_TASK_PROFILING_M  0x00000800 /* task profiling in mem alloc (also requires TASK_PROFILING) */
 
 /* we have 4 buffer-wait queues, in highest to lowest emergency order */
 #define DYNBUF_NBQ 4
diff --git a/src/activity.c b/src/activity.c
index 4cc2386de..8f794abb1 100644
--- a/src/activity.c
+++ b/src/activity.c
@@ -659,8 +659,20 @@ void activity_count_runtime(uint32_t run_time)
 	if (!(_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_TASK_PROFILING)) {
 		if (unlikely((profiling & HA_PROF_TASKS_MASK) == HA_PROF_TASKS_ON ||
 		             ((profiling & HA_PROF_TASKS_MASK) == HA_PROF_TASKS_AON &&
-		             swrate_avg(run_time, TIME_STATS_SAMPLES) >= up)))
+		             swrate_avg(run_time, TIME_STATS_SAMPLES) >= up))) {
+
+			if (profiling & HA_PROF_TASKS_LOCK)
+				_HA_ATOMIC_OR(&th_ctx->flags, TH_FL_TASK_PROFILING_L);
+			else
+				_HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_TASK_PROFILING_L);
+
+			if (profiling & HA_PROF_TASKS_MEM)
+				_HA_ATOMIC_OR(&th_ctx->flags, TH_FL_TASK_PROFILING_M);
+			else
+				_HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_TASK_PROFILING_M);
+
 			_HA_ATOMIC_OR(&th_ctx->flags, TH_FL_TASK_PROFILING);
+		}
 	} else {
 		if (unlikely((profiling & HA_PROF_TASKS_MASK) == HA_PROF_TASKS_OFF ||
 		             ((profiling & HA_PROF_TASKS_MASK) == HA_PROF_TASKS_AOFF &&
diff --git a/src/pool.c b/src/pool.c
index b76bd8368..c52da365e 100644
--- a/src/pool.c
+++ b/src/pool.c
@@ -806,7 +806,8 @@ void pool_put_to_cache(struct pool_head *pool, void *ptr, const void *caller)
 	if (unlikely(pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
 		uint64_t mem_wait_start = 0;
 
-		if (unlikely(th_ctx->flags & TH_FL_TASK_PROFILING))
+		if (unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_M)) ==
+		             (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_M)))
 			mem_wait_start = now_mono_time();
 
 		if (ph->count >= 16 + pool_cache_count / 8 + CONFIG_HAP_POOL_CLUSTER_SIZE)
@@ -969,7 +970,8 @@ void pool_gc(struct pool_head *pool_ctx)
 	uint64_t mem_wait_start = 0;
 	int isolated = thread_isolated();
 
-	if (unlikely(th_ctx->flags & TH_FL_TASK_PROFILING))
+	if (unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_M)) ==
+	             (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_M)))
 		mem_wait_start = now_mono_time();
 
 	if (!isolated)
@@ -1031,7 +1033,8 @@ void *__pool_alloc(struct pool_head *pool, unsigned int flags)
 		/* count allocation time only for cache misses */
 		uint64_t mem_wait_start = 0;
 
-		if (unlikely(th_ctx->flags & TH_FL_TASK_PROFILING))
+		if (unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_M)) ==
+		             (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_M)))
 			mem_wait_start = now_mono_time();
 
 		p = pool_alloc_nocache(pool, caller);
@@ -1109,7 +1112,8 @@ void __pool_free(struct pool_head *pool, void *ptr)
 	             global.tune.pool_cache_size < pool->size)) {
 		uint64_t mem_wait_start = 0;
 
-		if (unlikely(th_ctx->flags & TH_FL_TASK_PROFILING))
+		if (unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_M)) ==
+		             (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_M)))
 			mem_wait_start = now_mono_time();
 
 		pool_free_nocache(pool, ptr);
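
Post-scriptum for reviewers: the four pool.c hunks above all guard the
same measurement window. Here is a simplified, self-contained sketch
of that window. Everything in it is a stand-in made up for this note
(the flag values, the local th_ctx and its mem_wait_total
accumulator); only now_mono_time()'s contract of returning monotonic
nanoseconds matches the real function, and its (VDSO) clock read is
exactly what the flag check skips by default:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

/* made-up values; the real flags live in include/haproxy/tinfo-t.h */
#define TH_FL_TASK_PROFILING    0x00000010
#define TH_FL_TASK_PROFILING_M  0x00000800

/* minimal stand-in for the per-thread context */
static struct {
	unsigned int flags;
	uint64_t mem_wait_total;   /* hypothetical accumulator */
} th_ctx_storage, *th_ctx = &th_ctx_storage;

/* same contract as HAProxy's now_mono_time(): monotonic nanoseconds */
static uint64_t now_mono_time(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	uint64_t mem_wait_start = 0;

	th_ctx->flags = TH_FL_TASK_PROFILING | TH_FL_TASK_PROFILING_M;

	/* the pattern from pool.c: read the clock only when BOTH bits are set */
	if (unlikely((th_ctx->flags & (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_M)) ==
	             (TH_FL_TASK_PROFILING|TH_FL_TASK_PROFILING_M)))
		mem_wait_start = now_mono_time();

	/* ... the allocation being timed would run here ... */

	if (mem_wait_start)
		th_ctx->mem_wait_total += now_mono_time() - mem_wait_start;

	printf("measured %llu ns\n", (unsigned long long)th_ctx->mem_wait_total);
	return 0;
}

This is also why leaving the sub-options off restores the 3.2
behavior: mem_wait_start stays zero and now_mono_time() is never
called on these paths.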