From c4baa64ea844f96a98902b8b255dd33d55d89d84 Mon Sep 17 00:00:00 2001 From: Paulo Sousa Date: Wed, 21 Jan 2026 14:52:31 +0000 Subject: [PATCH] Optimize peak memory stats by switching from per-command checks to threshold-based (#14692) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR optimizes peak memory tracking by moving from **per-command checks** to a **threshold-based mechanism** in `zmalloc`. Instead of updating peak memory on every command, peak tracking is now triggered only when a thread's memory delta exceeds **100KB**. This reduces runtime overhead while keeping peak memory accuracy acceptable. ## Implementation Details - Peak memory is tracked atomically in `zmalloc` when a thread's memory delta exceeds 100KB - Thread-safe peak updates using CAS - Peak tracking considers both: - current used memory - zmalloc-reported peak memory ## Performance Results (ARM AArch64) All performance numbers were obtained on an **AWS m8g.metal (ARM AArch64)** instance. The database was pre-populated with **1M keys**, each holding a **1KB value**. Benchmarks were executed using memtier with a **10 SET : 90 GET ratio** and **pipeline = 10** ([full benchmark spec. here](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-string-setget200c-1KiB-pipeline-10.yml)). 
| Environment | Baseline `redis/redis` unstable (median ± std.dev) | Comparison `paulorsousa/redis` `f05a4bd273cb4d63ff03d33e6207837b6e51de86` (median) | % change (higher better) | Note | |------------------------------|----------------------------------------------------|----------------------------------------------------------------------------------:|--------------------------|-----------------------| | oss-standalone | 802,830 ± 0.2% (7 datapoints) | 796,660 | -0.8% | No change | | oss-standalone-02-io-threads | 982,698 ± 0.6% (7 datapoints) | 980,520 | -0.2% | No change | | oss-standalone-04-io-threads | 2,573,244 ± 1.9% (7 datapoints) | 2,630,931 | +2.2% | Potential improvement | | oss-standalone-08-io-threads | 2,343,609 ± 1.6% (7 datapoints) | 2,455,630 | +4.8% | Improvement | --- src/atomicvar.h | 11 ++++++ src/rdb.c | 4 +- src/server.c | 23 ++++++----- src/server.h | 2 +- src/zmalloc.c | 61 ++++++++++++++++++++++++++++-- src/zmalloc.h | 4 ++ tests/unit/moduleapi/propagate.tcl | 2 +- 7 files changed, 89 insertions(+), 18 deletions(-) diff --git a/src/atomicvar.h b/src/atomicvar.h index c9093470a..3c332ee69 100644 --- a/src/atomicvar.h +++ b/src/atomicvar.h @@ -11,6 +11,7 @@ * atomicSet(var,value) -- Set the atomic counter value * atomicGetWithSync(var,value) -- 'atomicGet' with inter-thread synchronization * atomicSetWithSync(var,value) -- 'atomicSet' with inter-thread synchronization + * atomicCompareExchange(type,var,expected_var,desired) -- Compare and exchange (CAS) operation * * Atomic operations on flags. * Flag type can be int, long, long long or their unsigned counterparts. 
@@ -110,6 +111,8 @@ } while(0) #define atomicSetWithSync(var,value) \ atomic_store_explicit(&var,value,memory_order_seq_cst) +#define atomicCompareExchange(type,var,expected_var,desired) \ + atomic_compare_exchange_weak_explicit(&var,&expected_var,desired,memory_order_relaxed,memory_order_relaxed) #define atomicFlagGetSet(var,oldvalue_var) \ oldvalue_var = atomic_exchange_explicit(&var,1,memory_order_relaxed) #define REDIS_ATOMIC_API "c11-builtin" @@ -135,6 +138,8 @@ } while(0) #define atomicSetWithSync(var,value) \ __atomic_store_n(&var,value,__ATOMIC_SEQ_CST) +#define atomicCompareExchange(type,var,expected_var,desired) \ + __atomic_compare_exchange_n(&var,&expected_var,desired,1,__ATOMIC_RELAXED,__ATOMIC_RELAXED) #define atomicFlagGetSet(var,oldvalue_var) \ oldvalue_var = __atomic_exchange_n(&var,1,__ATOMIC_RELAXED) #define REDIS_ATOMIC_API "atomic-builtin" @@ -164,6 +169,12 @@ ANNOTATE_HAPPENS_BEFORE(&var); \ while(!__sync_bool_compare_and_swap(&var,var,value,__sync_synchronize)); \ } while(0) +#define atomicCompareExchange(type,var,expected_var,desired) ({ \ + type _old = __sync_val_compare_and_swap(&var,expected_var,desired); \ + int _success = (_old == expected_var); \ + if (!_success) expected_var = _old; \ + _success; \ +}) #define atomicFlagGetSet(var,oldvalue_var) \ oldvalue_var = __sync_val_compare_and_swap(&var,0,1) #define REDIS_ATOMIC_API "sync-builtin" diff --git a/src/rdb.c b/src/rdb.c index 8f9da2e8c..7dc0e8ef1 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3531,13 +3531,13 @@ void startLoadingFile(size_t size, char* filename, int rdbflags) { /* Refresh the absolute loading progress info */ void loadingAbsProgress(off_t pos) { server.loading_loaded_bytes = pos; - updatePeakMemory(zmalloc_used_memory()); + updatePeakMemory(); } /* Refresh the incremental loading progress info */ void loadingIncrProgress(off_t size) { server.loading_loaded_bytes += size; - updatePeakMemory(zmalloc_used_memory()); + updatePeakMemory(); } /* Update the file name currently 
being loaded */ diff --git a/src/server.c b/src/server.c index a0d0af867..8edfb87f7 100644 --- a/src/server.c +++ b/src/server.c @@ -1400,16 +1400,23 @@ void checkChildrenDone(void) { } /* Record the max memory used since the server was started. */ -void updatePeakMemory(size_t used_memory) { - if (unlikely(used_memory > server.stat_peak_memory)) { - server.stat_peak_memory = used_memory; +void updatePeakMemory(void) { + size_t zmalloc_used = zmalloc_used_memory(); + if (zmalloc_used > server.stat_peak_memory) { + server.stat_peak_memory = zmalloc_used; server.stat_peak_memory_time = server.unixtime; } + + size_t zmalloc_peak = zmalloc_get_peak_memory(); + if (zmalloc_peak > server.stat_peak_memory) { + server.stat_peak_memory = zmalloc_peak; + server.stat_peak_memory_time = zmalloc_get_peak_memory_time(); + } } /* Called from serverCron and cronUpdateMemoryStats to update cached memory metrics. */ void cronUpdateMemoryStats(void) { - updatePeakMemory(zmalloc_used_memory()); + updatePeakMemory(); run_with_period(100) { /* Sample the RSS and other metrics here since this is a relatively slow call. @@ -1843,7 +1850,7 @@ extern int ProcessingEventsWhileBlocked; void beforeSleep(struct aeEventLoop *eventLoop) { UNUSED(eventLoop); - updatePeakMemory(zmalloc_used_memory()); + updatePeakMemory(); /* Just call a subset of vital functions in case we are re-entering * the event loop from processEventsWhileBlocked(). Note that in this @@ -4027,10 +4034,6 @@ void call(client *c, int flags) { server.stat_numcommands++; } - /* Record peak memory after each command and before the eviction that runs - * before the next command. */ - updatePeakMemory(zmalloc_used_memory()); - /* Do some maintenance job and cleanup */ afterCommand(c); @@ -6192,7 +6195,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { * may happen that the instantaneous value is slightly bigger than * the peak value. 
This may confuse users, so we update the peak * if found smaller than the current memory usage. */ - updatePeakMemory(zmalloc_used); + updatePeakMemory(); bytesToHuman(hmem,sizeof(hmem),zmalloc_used); bytesToHuman(peak_hmem,sizeof(peak_hmem),server.stat_peak_memory); diff --git a/src/server.h b/src/server.h index 1e0173dfa..96eb286a6 100644 --- a/src/server.h +++ b/src/server.h @@ -3560,7 +3560,7 @@ int zslLexValueLteMax(sds value, zlexrangespec *spec); /* Core functions */ int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level); -void updatePeakMemory(size_t used_memory); +void updatePeakMemory(void); size_t freeMemoryGetNotCountedMemory(void); int overMaxmemoryAfterAlloc(size_t moremem); uint64_t getCommandFlags(client *c); diff --git a/src/zmalloc.c b/src/zmalloc.c index b1d62d157..21d5749e4 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -81,14 +81,18 @@ void je_free_with_usize(void *ptr, size_t *usize); #define MAX_THREADS 16 /* Keep it a power of 2 so we can use '&' instead of '%'. 
*/ #define THREAD_MASK (MAX_THREADS - 1) +#define PEAK_CHECK_THRESHOLD (1024 * 100) /* 100KB */ typedef struct used_memory_entry { redisAtomic long long used_memory; - char padding[CACHE_LINE_SIZE - sizeof(long long)]; + redisAtomic long long last_peak_check; + char padding[CACHE_LINE_SIZE - sizeof(long long) - sizeof(long long)]; } used_memory_entry; static __attribute__((aligned(CACHE_LINE_SIZE))) used_memory_entry used_memory[MAX_THREADS]; static redisAtomic size_t num_active_threads = 0; +static redisAtomic size_t zmalloc_peak = 0; +static redisAtomic time_t zmalloc_peak_time = 0; static __thread long my_thread_index = -1; static inline void init_my_thread_index(void) { @@ -98,9 +102,46 @@ static inline void init_my_thread_index(void) { } } -static void update_zmalloc_stat_alloc(long long num) { +static void update_zmalloc_stat_alloc(long long bytes_delta) { init_my_thread_index(); - atomicIncr(used_memory[my_thread_index].used_memory, num); + + /* Per-thread allocation counter and the last counter value at which we ran a + * global peak check (throttles how often we call zmalloc_used_memory()). */ + long long thread_used, thread_last_peak_check_used; + atomicIncrGet(used_memory[my_thread_index].used_memory, thread_used, bytes_delta); + atomicGet(used_memory[my_thread_index].last_peak_check, thread_last_peak_check_used); + + /* Only run the (expensive) global used/peak check after this thread's + * allocation counter has advanced enough since the last check. */ + if (unlikely(thread_used - thread_last_peak_check_used > PEAK_CHECK_THRESHOLD)) { + /* Snapshot of global used memory across all threads. */ + size_t used_mem = zmalloc_used_memory(); + + /* Current published global peak. */ + size_t published_peak; + atomicGet(zmalloc_peak, published_peak); + + if (used_mem > published_peak) { + /* Try to publish `used_mem` as the new global peak. + * + * Another thread may update `zmalloc_peak` concurrently. 
Use a CAS loop: + * on failure, `old_peak` is refreshed with the latest peak value, and we + * retry only while our snapshot still exceeds it. */ + size_t old_peak = published_peak; + while (used_mem > old_peak && !atomicCompareExchange(size_t, zmalloc_peak, old_peak, used_mem)) { + /* CAS failed: `old_peak` now holds the current `zmalloc_peak`. */ + } + + /* If we raised the peak, record when it was reached. */ + if (used_mem > old_peak) { + atomicSet(zmalloc_peak_time, time(NULL)); + } + } + + /* Record the thread counter value at which we last ran a global peak check, + * to throttle future checks for this thread. */ + atomicSet(used_memory[my_thread_index].last_peak_check, thread_used); + } } static void update_zmalloc_stat_free(long long num) { @@ -183,7 +224,7 @@ void *zmalloc_usable(size_t size, size_t *usable) { void *ptr = ztrymalloc_usable_internal(size, &usable_size); if (!ptr) zmalloc_oom_handler(size); #ifdef HAVE_MALLOC_SIZE - ptr = extend_to_usable(ptr, usable_size); + if (ptr) ptr = extend_to_usable(ptr, usable_size); #endif if (usable) *usable = usable_size; return ptr; } @@ -538,6 +579,18 @@ size_t zmalloc_used_memory(void) { return total_mem; } +size_t zmalloc_get_peak_memory(void) { + size_t peak; + atomicGet(zmalloc_peak, peak); + return peak; +} + +time_t zmalloc_get_peak_memory_time(void) { + time_t t; + atomicGet(zmalloc_peak_time, t); + return t; +} + void zmalloc_set_oom_handler(void (*oom_handler)(size_t)) { zmalloc_oom_handler = oom_handler; } diff --git a/src/zmalloc.h b/src/zmalloc.h index 9cc535d7d..3dda50327 100644 --- a/src/zmalloc.h +++ b/src/zmalloc.h @@ -87,6 +87,8 @@ #define HAVE_ALLOC_WITH_USIZE #endif +#include <time.h> + /* 'noinline' attribute is intended to prevent the `-Wstringop-overread` warning * when using gcc-12 later with LTO enabled. It may be removed once the * bug[https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96503] is fixed. 
*/ @@ -108,6 +110,8 @@ void zfree_usable(void *ptr, size_t *usable); __attribute__((malloc)) char *zstrdup(const char *s); __attribute__((malloc)) char *zstrdup_usable(const char *s, size_t *usable); size_t zmalloc_used_memory(void); +size_t zmalloc_get_peak_memory(void); +time_t zmalloc_get_peak_memory_time(void); void zmalloc_set_oom_handler(void (*oom_handler)(size_t)); size_t zmalloc_get_rss(void); int zmalloc_get_allocator_info(int refresh_stats, size_t *allocated, size_t *active, size_t *resident, diff --git a/tests/unit/moduleapi/propagate.tcl b/tests/unit/moduleapi/propagate.tcl index 6fd315579..d3d08d954 100644 --- a/tests/unit/moduleapi/propagate.tcl +++ b/tests/unit/moduleapi/propagate.tcl @@ -798,7 +798,7 @@ test {Replicas that was marked as CLIENT_CLOSE_ASAP should not keep the replicat # exceed the replica soft limit. Furthermore, as the replica release its reference to # replication backlog, it should be properly trimmed, the memory usage of replication # backlog should not significantly exceed repl-backlog-size (default 1MB). */ - assert_lessthan [getInfoProperty $res used_memory_peak] 10000000;# less than 10mb + assert_lessthan [getInfoProperty $res used_memory_peak] 20000000;# less than 20mb assert_lessthan [getInfoProperty $res mem_replication_backlog] 2000000;# less than 2mb } }