Optimize peak memory stats by switching from per-command checks to threshold-based (#14692)
Some checks are pending
CI / test-ubuntu-latest (push) Waiting to run
CI / test-sanitizer-address (push) Waiting to run
CI / build-debian-old (push) Waiting to run
CI / build-macos-latest (push) Waiting to run
CI / build-32bit (push) Waiting to run
CI / build-libc-malloc (push) Waiting to run
CI / build-centos-jemalloc (push) Waiting to run
CI / build-old-chain-jemalloc (push) Waiting to run
Codecov / code-coverage (push) Waiting to run
External Server Tests / test-external-standalone (push) Waiting to run
External Server Tests / test-external-cluster (push) Waiting to run
External Server Tests / test-external-nodebug (push) Waiting to run
Spellcheck / Spellcheck (push) Waiting to run

This PR optimizes peak memory tracking by moving from **per-command
checks** to a **threshold-based mechanism** in `zmalloc`.

Instead of updating peak memory on every command, peak tracking is now
triggered only when a thread's memory delta exceeds **100KB**. This
reduces runtime overhead while keeping peak memory accuracy acceptable.

## Implementation Details

- Peak memory is tracked atomically in `zmalloc` when a thread's memory
delta exceeds 100KB
- Thread-safe peak updates using CAS
- Peak tracking considers both:
  - current used memory
  - zmalloc-reported peak memory

## Performance Results (ARM AArch64)

All performance numbers were obtained on an **AWS m8g.metal (ARM
AArch64)** instance.

The database was pre-populated with **1M keys**, each holding a **1KB
value**.
Benchmarks were executed using memtier with a **10 SET : 90 GET ratio**
and **pipeline = 10** ([full benchmark spec
here](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-string-setget200c-1KiB-pipeline-10.yml)).

| Environment | Baseline `redis/redis` unstable (median ± std.dev) | Comparison `paulorsousa/redis` `f05a4bd273cb4d63ff03d33e6207837b6e51de86` (median) | % change (higher better) | Note |
|------------------------------|----------------------------------------------------|----------------------------------------------------------------------------------:|--------------------------|-----------------------|
| oss-standalone | 802,830 ± 0.2% (7 datapoints) | 796,660 | -0.8% | No change |
| oss-standalone-02-io-threads | 982,698 ± 0.6% (7 datapoints) | 980,520 | -0.2% | No change |
| oss-standalone-04-io-threads | 2,573,244 ± 1.9% (7 datapoints) | 2,630,931 | +2.2% | Potential improvement |
| oss-standalone-08-io-threads | 2,343,609 ± 1.6% (7 datapoints) | 2,455,630 | +4.8% | Improvement |
This commit is contained in:
Paulo Sousa 2026-01-21 14:52:31 +00:00 committed by GitHub
parent e3c38aab66
commit c4baa64ea8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 89 additions and 18 deletions

View file

@ -11,6 +11,7 @@
* atomicSet(var,value) -- Set the atomic counter value
* atomicGetWithSync(var,value) -- 'atomicGet' with inter-thread synchronization
* atomicSetWithSync(var,value) -- 'atomicSet' with inter-thread synchronization
* atomicCompareExchange(type,var,expected_var,desired) -- Compare and exchange (CAS) operation
*
* Atomic operations on flags.
* Flag type can be int, long, long long or their unsigned counterparts.
@ -110,6 +111,8 @@
} while(0)
#define atomicSetWithSync(var,value) \
atomic_store_explicit(&var,value,memory_order_seq_cst)
#define atomicCompareExchange(type,var,expected_var,desired) \
atomic_compare_exchange_weak_explicit(&var,&expected_var,desired,memory_order_relaxed,memory_order_relaxed)
#define atomicFlagGetSet(var,oldvalue_var) \
oldvalue_var = atomic_exchange_explicit(&var,1,memory_order_relaxed)
#define REDIS_ATOMIC_API "c11-builtin"
@ -135,6 +138,8 @@
} while(0)
#define atomicSetWithSync(var,value) \
__atomic_store_n(&var,value,__ATOMIC_SEQ_CST)
#define atomicCompareExchange(type,var,expected_var,desired) \
__atomic_compare_exchange_n(&var,&expected_var,desired,1,__ATOMIC_RELAXED,__ATOMIC_RELAXED)
#define atomicFlagGetSet(var,oldvalue_var) \
oldvalue_var = __atomic_exchange_n(&var,1,__ATOMIC_RELAXED)
#define REDIS_ATOMIC_API "atomic-builtin"
@ -164,6 +169,12 @@
ANNOTATE_HAPPENS_BEFORE(&var); \
while(!__sync_bool_compare_and_swap(&var,var,value,__sync_synchronize)); \
} while(0)
#define atomicCompareExchange(type,var,expected_var,desired) ({ \
type _old = __sync_val_compare_and_swap(&var,expected_var,desired); \
int _success = (_old == expected_var); \
if (!_success) expected_var = _old; \
_success; \
})
#define atomicFlagGetSet(var,oldvalue_var) \
oldvalue_var = __sync_val_compare_and_swap(&var,0,1)
#define REDIS_ATOMIC_API "sync-builtin"

View file

@ -3531,13 +3531,13 @@ void startLoadingFile(size_t size, char* filename, int rdbflags) {
/* Refresh the absolute loading progress info */
void loadingAbsProgress(off_t pos) {
server.loading_loaded_bytes = pos;
updatePeakMemory(zmalloc_used_memory());
updatePeakMemory();
}
/* Refresh the incremental loading progress info */
void loadingIncrProgress(off_t size) {
server.loading_loaded_bytes += size;
updatePeakMemory(zmalloc_used_memory());
updatePeakMemory();
}
/* Update the file name currently being loaded */

View file

@ -1400,16 +1400,23 @@ void checkChildrenDone(void) {
}
/* Record the max memory used since the server was started. */
void updatePeakMemory(size_t used_memory) {
if (unlikely(used_memory > server.stat_peak_memory)) {
server.stat_peak_memory = used_memory;
/* Record the max memory used since the server was started.
 *
 * Two sources are considered:
 *  1. the current total zmalloc usage, and
 *  2. the peak published by the allocator itself (updated inside zmalloc
 *     when a thread's allocation delta crosses a threshold), which may have
 *     been reached between calls to this function.
 * Updates server.stat_peak_memory and its timestamp when either source
 * exceeds the recorded peak. */
void updatePeakMemory(void) {
/* Snapshot of current global used memory across all allocating threads. */
size_t zmalloc_used = zmalloc_used_memory();
if (zmalloc_used > server.stat_peak_memory) {
server.stat_peak_memory = zmalloc_used;
server.stat_peak_memory_time = server.unixtime;
}
/* Allocator-tracked peak: may exceed the instantaneous value above if a
 * transient spike happened since the last call. Use the allocator's own
 * timestamp in that case. */
size_t zmalloc_peak = zmalloc_get_peak_memory();
if (zmalloc_peak > server.stat_peak_memory) {
server.stat_peak_memory = zmalloc_peak;
server.stat_peak_memory_time = zmalloc_get_peak_memory_time();
}
}
/* Called from serverCron and cronUpdateMemoryStats to update cached memory metrics. */
void cronUpdateMemoryStats(void) {
updatePeakMemory(zmalloc_used_memory());
updatePeakMemory();
run_with_period(100) {
/* Sample the RSS and other metrics here since this is a relatively slow call.
@ -1843,7 +1850,7 @@ extern int ProcessingEventsWhileBlocked;
void beforeSleep(struct aeEventLoop *eventLoop) {
UNUSED(eventLoop);
updatePeakMemory(zmalloc_used_memory());
updatePeakMemory();
/* Just call a subset of vital functions in case we are re-entering
* the event loop from processEventsWhileBlocked(). Note that in this
@ -4027,10 +4034,6 @@ void call(client *c, int flags) {
server.stat_numcommands++;
}
/* Record peak memory after each command and before the eviction that runs
* before the next command. */
updatePeakMemory(zmalloc_used_memory());
/* Do some maintenance job and cleanup */
afterCommand(c);
@ -6192,7 +6195,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) {
* may happen that the instantaneous value is slightly bigger than
* the peak value. This may confuse users, so we update the peak
* if found smaller than the current memory usage. */
updatePeakMemory(zmalloc_used);
updatePeakMemory();
bytesToHuman(hmem,sizeof(hmem),zmalloc_used);
bytesToHuman(peak_hmem,sizeof(peak_hmem),server.stat_peak_memory);

View file

@ -3560,7 +3560,7 @@ int zslLexValueLteMax(sds value, zlexrangespec *spec);
/* Core functions */
int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level);
void updatePeakMemory(size_t used_memory);
void updatePeakMemory(void);
size_t freeMemoryGetNotCountedMemory(void);
int overMaxmemoryAfterAlloc(size_t moremem);
uint64_t getCommandFlags(client *c);

View file

@ -81,14 +81,18 @@ void je_free_with_usize(void *ptr, size_t *usize);
#define MAX_THREADS 16 /* Keep it a power of 2 so we can use '&' instead of '%'. */
#define THREAD_MASK (MAX_THREADS - 1)
#define PEAK_CHECK_THRESHOLD (1024 * 100) /* 100KB */
typedef struct used_memory_entry {
redisAtomic long long used_memory;
char padding[CACHE_LINE_SIZE - sizeof(long long)];
redisAtomic long long last_peak_check;
char padding[CACHE_LINE_SIZE - sizeof(long long) - sizeof(long long)];
} used_memory_entry;
static __attribute__((aligned(CACHE_LINE_SIZE))) used_memory_entry used_memory[MAX_THREADS];
static redisAtomic size_t num_active_threads = 0;
static redisAtomic size_t zmalloc_peak = 0;
static redisAtomic time_t zmalloc_peak_time = 0;
static __thread long my_thread_index = -1;
static inline void init_my_thread_index(void) {
@ -98,9 +102,46 @@ static inline void init_my_thread_index(void) {
}
}
static void update_zmalloc_stat_alloc(long long num) {
static void update_zmalloc_stat_alloc(long long bytes_delta) {
init_my_thread_index();
atomicIncr(used_memory[my_thread_index].used_memory, num);
/* Per-thread allocation counter and the last counter value at which we ran a
* global peak check (throttles how often we call zmalloc_used_memory()). */
long long thread_used, thread_last_peak_check_used;
atomicIncrGet(used_memory[my_thread_index].used_memory, thread_used, bytes_delta);
atomicGet(used_memory[my_thread_index].last_peak_check, thread_last_peak_check_used);
/* Only run the (expensive) global used/peak check after this thread's
* allocation counter has advanced enough since the last check. */
if (unlikely(thread_used - thread_last_peak_check_used > PEAK_CHECK_THRESHOLD)) {
/* Snapshot of global used memory across all threads. */
size_t used_mem = zmalloc_used_memory();
/* Current published global peak. */
size_t published_peak;
atomicGet(zmalloc_peak, published_peak);
if (used_mem > published_peak) {
/* Try to publish `used_mem` as the new global peak.
*
* Another thread may update `zmalloc_peak` concurrently. Use a CAS loop:
* on failure, `old_peak` is refreshed with the latest peak value, and we
* retry only while our snapshot still exceeds it. */
size_t old_peak = published_peak;
while (used_mem > old_peak && !atomicCompareExchange(size_t, zmalloc_peak, old_peak, used_mem)) {
/* CAS failed: `old_peak` now holds the current `zmalloc_peak`. */
}
/* If we raised the peak, record when it was reached. */
if (used_mem > old_peak) {
atomicSet(zmalloc_peak_time, time(NULL));
}
}
/* Record the thread counter value at which we last ran a global peak check,
* to throttle future checks for this thread. */
atomicSet(used_memory[my_thread_index].last_peak_check, thread_used);
}
}
static void update_zmalloc_stat_free(long long num) {
@ -183,7 +224,7 @@ void *zmalloc_usable(size_t size, size_t *usable) {
void *ptr = ztrymalloc_usable_internal(size, &usable_size);
if (!ptr) zmalloc_oom_handler(size);
#ifdef HAVE_MALLOC_SIZE
ptr = extend_to_usable(ptr, usable_size);
if (ptr) ptr = extend_to_usable(ptr, usable_size);
#endif
if (usable) *usable = usable_size;
return ptr;
@ -538,6 +579,18 @@ size_t zmalloc_used_memory(void) {
return total_mem;
}
/* Return the highest total allocated-memory value the allocator has
 * published so far. The read is atomic, so it is safe from any thread. */
size_t zmalloc_get_peak_memory(void) {
size_t current_peak;
atomicGet(zmalloc_peak, current_peak);
return current_peak;
}
/* Return the timestamp recorded when the allocator peak was last raised.
 * Atomic read; safe to call from any thread. */
time_t zmalloc_get_peak_memory_time(void) {
time_t peak_time;
atomicGet(zmalloc_peak_time, peak_time);
return peak_time;
}
void zmalloc_set_oom_handler(void (*oom_handler)(size_t)) {
zmalloc_oom_handler = oom_handler;
}

View file

@ -87,6 +87,8 @@
#define HAVE_ALLOC_WITH_USIZE
#endif
#include <time.h>
/* 'noinline' attribute is intended to prevent the `-Wstringop-overread` warning
 * when using gcc-12 or later with LTO enabled. It may be removed once the
 * bug [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96503] is fixed. */
@ -108,6 +110,8 @@ void zfree_usable(void *ptr, size_t *usable);
__attribute__((malloc)) char *zstrdup(const char *s);
__attribute__((malloc)) char *zstrdup_usable(const char *s, size_t *usable);
size_t zmalloc_used_memory(void);
size_t zmalloc_get_peak_memory(void);
time_t zmalloc_get_peak_memory_time(void);
void zmalloc_set_oom_handler(void (*oom_handler)(size_t));
size_t zmalloc_get_rss(void);
int zmalloc_get_allocator_info(int refresh_stats, size_t *allocated, size_t *active, size_t *resident,

View file

@ -798,7 +798,7 @@ test {Replicas that was marked as CLIENT_CLOSE_ASAP should not keep the replicat
# exceed the replica soft limit. Furthermore, as the replica release its reference to
# replication backlog, it should be properly trimmed, the memory usage of replication
# backlog should not significantly exceed repl-backlog-size (default 1MB). */
assert_lessthan [getInfoProperty $res used_memory_peak] 10000000;# less than 10mb
assert_lessthan [getInfoProperty $res used_memory_peak] 20000000;# less than 20mb
assert_lessthan [getInfoProperty $res mem_replication_backlog] 2000000;# less than 2mb
}
}