From c4baa64ea844f96a98902b8b255dd33d55d89d84 Mon Sep 17 00:00:00 2001 From: Paulo Sousa Date: Wed, 21 Jan 2026 14:52:31 +0000 Subject: [PATCH] Optimize peak memory stats by switching from per-command checks to threshold-based (#14692) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR optimizes peak memory tracking by moving from **per-command checks** to a **threshold-based mechanism** in `zmalloc`. Instead of updating peak memory on every command, peak tracking is now triggered only when a thread's memory delta exceeds **100KB**. This reduces runtime overhead while keeping peak memory accuracy acceptable. ## Implementation Details - Peak memory is tracked atomically in `zmalloc` when a thread's memory delta exceeds 100KB - Thread-safe peak updates using CAS - Peak tracking considers both: - current used memory - zmalloc-reported peak memory ## Performance Results (ARM AArch64) All performance numbers were obtained on an **AWS m8g.metal (ARM AArch64)** instance. The database was pre-populated with **1M keys**, each holding a **1KB value**. Benchmarks were executed using memtier with a **10 SET : 90 GET ratio** and **pipeline = 10** ([full benchmark spec. here](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-string-setget200c-1KiB-pipeline-10.yml)). 
| Environment | Baseline `redis/redis` unstable (median ± std.dev) | Comparison `paulorsousa/redis` `f05a4bd273cb4d63ff03d33e6207837b6e51de86` (median) | % change (higher better) | Note | |------------------------------|----------------------------------------------------|----------------------------------------------------------------------------------:|--------------------------|-----------------------| | oss-standalone | 802,830 ± 0.2% (7 datapoints) | 796,660 | -0.8% | No change | | oss-standalone-02-io-threads | 982,698 ± 0.6% (7 datapoints) | 980,520 | -0.2% | No change | | oss-standalone-04-io-threads | 2,573,244 ± 1.9% (7 datapoints) | 2,630,931 | +2.2% | Potential improvement | | oss-standalone-08-io-threads | 2,343,609 ± 1.6% (7 datapoints) | 2,455,630 | +4.8% | Improvement | --- src/atomicvar.h | 11 ++++++ src/rdb.c | 4 +- src/server.c | 23 ++++++----- src/server.h | 2 +- src/zmalloc.c | 61 ++++++++++++++++++++++++++++-- src/zmalloc.h | 4 ++ tests/unit/moduleapi/propagate.tcl | 2 +- 7 files changed, 89 insertions(+), 18 deletions(-) diff --git a/src/atomicvar.h b/src/atomicvar.h index c9093470a..3c332ee69 100644 --- a/src/atomicvar.h +++ b/src/atomicvar.h @@ -11,6 +11,7 @@ * atomicSet(var,value) -- Set the atomic counter value * atomicGetWithSync(var,value) -- 'atomicGet' with inter-thread synchronization * atomicSetWithSync(var,value) -- 'atomicSet' with inter-thread synchronization + * atomicCompareExchange(type,var,expected_var,desired) -- Compare and exchange (CAS) operation * * Atomic operations on flags. * Flag type can be int, long, long long or their unsigned counterparts. 
@@ -110,6 +111,8 @@ } while(0) #define atomicSetWithSync(var,value) \ atomic_store_explicit(&var,value,memory_order_seq_cst) +#define atomicCompareExchange(type,var,expected_var,desired) \ + atomic_compare_exchange_weak_explicit(&var,&expected_var,desired,memory_order_relaxed,memory_order_relaxed) #define atomicFlagGetSet(var,oldvalue_var) \ oldvalue_var = atomic_exchange_explicit(&var,1,memory_order_relaxed) #define REDIS_ATOMIC_API "c11-builtin" @@ -135,6 +138,8 @@ } while(0) #define atomicSetWithSync(var,value) \ __atomic_store_n(&var,value,__ATOMIC_SEQ_CST) +#define atomicCompareExchange(type,var,expected_var,desired) \ + __atomic_compare_exchange_n(&var,&expected_var,desired,1,__ATOMIC_RELAXED,__ATOMIC_RELAXED) #define atomicFlagGetSet(var,oldvalue_var) \ oldvalue_var = __atomic_exchange_n(&var,1,__ATOMIC_RELAXED) #define REDIS_ATOMIC_API "atomic-builtin" @@ -164,6 +169,12 @@ ANNOTATE_HAPPENS_BEFORE(&var); \ while(!__sync_bool_compare_and_swap(&var,var,value,__sync_synchronize)); \ } while(0) +#define atomicCompareExchange(type,var,expected_var,desired) ({ \ + type _old = __sync_val_compare_and_swap(&var,expected_var,desired); \ + int _success = (_old == expected_var); \ + if (!_success) expected_var = _old; \ + _success; \ +}) #define atomicFlagGetSet(var,oldvalue_var) \ oldvalue_var = __sync_val_compare_and_swap(&var,0,1) #define REDIS_ATOMIC_API "sync-builtin" diff --git a/src/rdb.c b/src/rdb.c index 8f9da2e8c..7dc0e8ef1 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3531,13 +3531,13 @@ void startLoadingFile(size_t size, char* filename, int rdbflags) { /* Refresh the absolute loading progress info */ void loadingAbsProgress(off_t pos) { server.loading_loaded_bytes = pos; - updatePeakMemory(zmalloc_used_memory()); + updatePeakMemory(); } /* Refresh the incremental loading progress info */ void loadingIncrProgress(off_t size) { server.loading_loaded_bytes += size; - updatePeakMemory(zmalloc_used_memory()); + updatePeakMemory(); } /* Update the file name currently 
being loaded */ diff --git a/src/server.c b/src/server.c index a0d0af867..8edfb87f7 100644 --- a/src/server.c +++ b/src/server.c @@ -1400,16 +1400,23 @@ void checkChildrenDone(void) { } /* Record the max memory used since the server was started. */ -void updatePeakMemory(size_t used_memory) { - if (unlikely(used_memory > server.stat_peak_memory)) { - server.stat_peak_memory = used_memory; +void updatePeakMemory(void) { + size_t zmalloc_used = zmalloc_used_memory(); + if (zmalloc_used > server.stat_peak_memory) { + server.stat_peak_memory = zmalloc_used; server.stat_peak_memory_time = server.unixtime; } + + size_t zmalloc_peak = zmalloc_get_peak_memory(); + if (zmalloc_peak > server.stat_peak_memory) { + server.stat_peak_memory = zmalloc_peak; + server.stat_peak_memory_time = zmalloc_get_peak_memory_time(); + } } /* Called from serverCron and cronUpdateMemoryStats to update cached memory metrics. */ void cronUpdateMemoryStats(void) { - updatePeakMemory(zmalloc_used_memory()); + updatePeakMemory(); run_with_period(100) { /* Sample the RSS and other metrics here since this is a relatively slow call. @@ -1843,7 +1850,7 @@ extern int ProcessingEventsWhileBlocked; void beforeSleep(struct aeEventLoop *eventLoop) { UNUSED(eventLoop); - updatePeakMemory(zmalloc_used_memory()); + updatePeakMemory(); /* Just call a subset of vital functions in case we are re-entering * the event loop from processEventsWhileBlocked(). Note that in this @@ -4027,10 +4034,6 @@ void call(client *c, int flags) { server.stat_numcommands++; } - /* Record peak memory after each command and before the eviction that runs - * before the next command. */ - updatePeakMemory(zmalloc_used_memory()); - /* Do some maintenance job and cleanup */ afterCommand(c); @@ -6192,7 +6195,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { * may happen that the instantaneous value is slightly bigger than * the peak value. 
This may confuse users, so we update the peak * if found smaller than the current memory usage. */ - updatePeakMemory(zmalloc_used); + updatePeakMemory(); bytesToHuman(hmem,sizeof(hmem),zmalloc_used); bytesToHuman(peak_hmem,sizeof(peak_hmem),server.stat_peak_memory); diff --git a/src/server.h b/src/server.h index 1e0173dfa..96eb286a6 100644 --- a/src/server.h +++ b/src/server.h @@ -3560,7 +3560,7 @@ int zslLexValueLteMax(sds value, zlexrangespec *spec); /* Core functions */ int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level); -void updatePeakMemory(size_t used_memory); +void updatePeakMemory(void); size_t freeMemoryGetNotCountedMemory(void); int overMaxmemoryAfterAlloc(size_t moremem); uint64_t getCommandFlags(client *c); diff --git a/src/zmalloc.c b/src/zmalloc.c index b1d62d157..21d5749e4 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -81,14 +81,18 @@ void je_free_with_usize(void *ptr, size_t *usize); #define MAX_THREADS 16 /* Keep it a power of 2 so we can use '&' instead of '%'. 
*/ #define THREAD_MASK (MAX_THREADS - 1) +#define PEAK_CHECK_THRESHOLD (1024 * 100) /* 100KB */ typedef struct used_memory_entry { redisAtomic long long used_memory; - char padding[CACHE_LINE_SIZE - sizeof(long long)]; + redisAtomic long long last_peak_check; + char padding[CACHE_LINE_SIZE - sizeof(long long) - sizeof(long long)]; } used_memory_entry; static __attribute__((aligned(CACHE_LINE_SIZE))) used_memory_entry used_memory[MAX_THREADS]; static redisAtomic size_t num_active_threads = 0; +static redisAtomic size_t zmalloc_peak = 0; +static redisAtomic time_t zmalloc_peak_time = 0; static __thread long my_thread_index = -1; static inline void init_my_thread_index(void) { @@ -98,9 +102,46 @@ static inline void init_my_thread_index(void) { } } -static void update_zmalloc_stat_alloc(long long num) { +static void update_zmalloc_stat_alloc(long long bytes_delta) { init_my_thread_index(); - atomicIncr(used_memory[my_thread_index].used_memory, num); + + /* Per-thread allocation counter and the last counter value at which we ran a + * global peak check (throttles how often we call zmalloc_used_memory()). */ + long long thread_used, thread_last_peak_check_used; + atomicIncrGet(used_memory[my_thread_index].used_memory, thread_used, bytes_delta); + atomicGet(used_memory[my_thread_index].last_peak_check, thread_last_peak_check_used); + + /* Only run the (expensive) global used/peak check after this thread's + * allocation counter has advanced enough since the last check. */ + if (unlikely(thread_used - thread_last_peak_check_used > PEAK_CHECK_THRESHOLD)) { + /* Snapshot of global used memory across all threads. */ + size_t used_mem = zmalloc_used_memory(); + + /* Current published global peak. */ + size_t published_peak; + atomicGet(zmalloc_peak, published_peak); + + if (used_mem > published_peak) { + /* Try to publish `used_mem` as the new global peak. + * + * Another thread may update `zmalloc_peak` concurrently. 
Use a CAS loop: + * on failure, `old_peak` is refreshed with the latest peak value, and we + * retry only while our snapshot still exceeds it. */ + size_t old_peak = published_peak; + while (used_mem > old_peak && !atomicCompareExchange(size_t, zmalloc_peak, old_peak, used_mem)) { + /* CAS failed: `old_peak` now holds the current `zmalloc_peak`. */ + } + + /* If we raised the peak, record when it was reached. */ + if (used_mem > old_peak) { + atomicSet(zmalloc_peak_time, time(NULL)); + } + } + + /* Record the thread counter value at which we last ran a global peak check, + * to throttle future checks for this thread. */ + atomicSet(used_memory[my_thread_index].last_peak_check, thread_used); + } } static void update_zmalloc_stat_free(long long num) { @@ -183,7 +224,7 @@ void *zmalloc_usable(size_t size, size_t *usable) { void *ptr = ztrymalloc_usable_internal(size, &usable_size); if (!ptr) zmalloc_oom_handler(size); #ifdef HAVE_MALLOC_SIZE - ptr = extend_to_usable(ptr, usable_size); + if (ptr) ptr = extend_to_usable(ptr, usable_size); #endif if (usable) *usable = usable_size; return ptr; } @@ -538,6 +579,18 @@ size_t zmalloc_used_memory(void) { return total_mem; } +size_t zmalloc_get_peak_memory(void) { + size_t peak; + atomicGet(zmalloc_peak, peak); + return peak; +} + +time_t zmalloc_get_peak_memory_time(void) { + time_t t; + atomicGet(zmalloc_peak_time, t); + return t; +} + void zmalloc_set_oom_handler(void (*oom_handler)(size_t)) { zmalloc_oom_handler = oom_handler; } diff --git a/src/zmalloc.h b/src/zmalloc.h index 9cc535d7d..3dda50327 100644 --- a/src/zmalloc.h +++ b/src/zmalloc.h @@ -87,6 +87,8 @@ #define HAVE_ALLOC_WITH_USIZE #endif +#include <time.h> + /* 'noinline' attribute is intended to prevent the `-Wstringop-overread` warning * when using gcc-12 later with LTO enabled. It may be removed once the * bug[https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96503] is fixed. 
*/ @@ -108,6 +110,8 @@ void zfree_usable(void *ptr, size_t *usable); __attribute__((malloc)) char *zstrdup(const char *s); __attribute__((malloc)) char *zstrdup_usable(const char *s, size_t *usable); size_t zmalloc_used_memory(void); +size_t zmalloc_get_peak_memory(void); +time_t zmalloc_get_peak_memory_time(void); void zmalloc_set_oom_handler(void (*oom_handler)(size_t)); size_t zmalloc_get_rss(void); int zmalloc_get_allocator_info(int refresh_stats, size_t *allocated, size_t *active, size_t *resident, diff --git a/tests/unit/moduleapi/propagate.tcl b/tests/unit/moduleapi/propagate.tcl index 6fd315579..d3d08d954 100644 --- a/tests/unit/moduleapi/propagate.tcl +++ b/tests/unit/moduleapi/propagate.tcl @@ -798,7 +798,7 @@ test {Replicas that was marked as CLIENT_CLOSE_ASAP should not keep the replicat # exceed the replica soft limit. Furthermore, as the replica release its reference to # replication backlog, it should be properly trimmed, the memory usage of replication # backlog should not significantly exceed repl-backlog-size (default 1MB). */ - assert_lessthan [getInfoProperty $res used_memory_peak] 10000000;# less than 10mb + assert_lessthan [getInfoProperty $res used_memory_peak] 20000000;# less than 20mb assert_lessthan [getInfoProperty $res mem_replication_backlog] 2000000;# less than 2mb } }