Optimize peak memory stats by switching from per-command checks to threshold-based (#14692)
Some checks are pending
CI / test-ubuntu-latest (push) Waiting to run
CI / test-sanitizer-address (push) Waiting to run
CI / build-debian-old (push) Waiting to run
CI / build-macos-latest (push) Waiting to run
CI / build-32bit (push) Waiting to run
CI / build-libc-malloc (push) Waiting to run
CI / build-centos-jemalloc (push) Waiting to run
CI / build-old-chain-jemalloc (push) Waiting to run
Codecov / code-coverage (push) Waiting to run
External Server Tests / test-external-standalone (push) Waiting to run
External Server Tests / test-external-cluster (push) Waiting to run
External Server Tests / test-external-nodebug (push) Waiting to run
Spellcheck / Spellcheck (push) Waiting to run

This PR optimizes peak memory tracking by moving from **per-command
checks** to a **threshold-based mechanism** in `zmalloc`.

Instead of updating peak memory on every command, peak tracking is now
triggered only when a thread's memory delta exceeds **100KB**. This
reduces runtime overhead while keeping peak memory accuracy acceptable.

## Implementation Details

- Peak memory is tracked atomically in `zmalloc` when a thread's memory
delta exceeds 100KB
- Thread-safe peak updates using CAS
- Peak tracking considers both:
  - current used memory
  - zmalloc-reported peak memory

## Performance Results (ARM AArch64)

All performance numbers were obtained on an **AWS m8g.metal (ARM
AArch64)** instance.

The database was pre-populated with **1M keys**, each holding a **1KB
value**.
Benchmarks were executed using memtier with a **10 SET : 90 GET ratio**
and **pipeline = 10** ([full benchmark spec
here](https://github.com/redis/redis-benchmarks-specification/blob/main/redis_benchmarks_specification/test-suites/memtier_benchmark-1Mkeys-string-setget200c-1KiB-pipeline-10.yml)).

| Environment | Baseline `redis/redis` unstable (median ± std.dev) | Comparison `paulorsousa/redis` `f05a4bd273cb4d63ff03d33e6207837b6e51de86` (median) | % change (higher better) | Note |
|------------------------------|----------------------------------------------------|----------------------------------------------------------------------------------:|--------------------------|-----------------------|
| oss-standalone | 802,830 ± 0.2% (7 datapoints) | 796,660 | -0.8% | No change |
| oss-standalone-02-io-threads | 982,698 ± 0.6% (7 datapoints) | 980,520 | -0.2% | No change |
| oss-standalone-04-io-threads | 2,573,244 ± 1.9% (7 datapoints) | 2,630,931 | +2.2% | Potential improvement |
| oss-standalone-08-io-threads | 2,343,609 ± 1.6% (7 datapoints) | 2,455,630 | +4.8% | Improvement |
This commit is contained in:
Paulo Sousa 2026-01-21 14:52:31 +00:00 committed by GitHub
parent e3c38aab66
commit c4baa64ea8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 89 additions and 18 deletions

View file

@ -11,6 +11,7 @@
* atomicSet(var,value) -- Set the atomic counter value
* atomicGetWithSync(var,value) -- 'atomicGet' with inter-thread synchronization
* atomicSetWithSync(var,value) -- 'atomicSet' with inter-thread synchronization
* atomicCompareExchange(type,var,expected_var,desired) -- Compare and exchange (CAS) operation
*
* Atomic operations on flags.
* Flag type can be int, long, long long or their unsigned counterparts.
@ -110,6 +111,8 @@
} while(0)
#define atomicSetWithSync(var,value) \
atomic_store_explicit(&var,value,memory_order_seq_cst)
#define atomicCompareExchange(type,var,expected_var,desired) \
atomic_compare_exchange_weak_explicit(&var,&expected_var,desired,memory_order_relaxed,memory_order_relaxed)
#define atomicFlagGetSet(var,oldvalue_var) \
oldvalue_var = atomic_exchange_explicit(&var,1,memory_order_relaxed)
#define REDIS_ATOMIC_API "c11-builtin"
@ -135,6 +138,8 @@
} while(0)
#define atomicSetWithSync(var,value) \
__atomic_store_n(&var,value,__ATOMIC_SEQ_CST)
#define atomicCompareExchange(type,var,expected_var,desired) \
__atomic_compare_exchange_n(&var,&expected_var,desired,1,__ATOMIC_RELAXED,__ATOMIC_RELAXED)
#define atomicFlagGetSet(var,oldvalue_var) \
oldvalue_var = __atomic_exchange_n(&var,1,__ATOMIC_RELAXED)
#define REDIS_ATOMIC_API "atomic-builtin"
@ -164,6 +169,12 @@
ANNOTATE_HAPPENS_BEFORE(&var); \
while(!__sync_bool_compare_and_swap(&var,var,value,__sync_synchronize)); \
} while(0)
#define atomicCompareExchange(type,var,expected_var,desired) ({ \
type _old = __sync_val_compare_and_swap(&var,expected_var,desired); \
int _success = (_old == expected_var); \
if (!_success) expected_var = _old; \
_success; \
})
#define atomicFlagGetSet(var,oldvalue_var) \
oldvalue_var = __sync_val_compare_and_swap(&var,0,1)
#define REDIS_ATOMIC_API "sync-builtin"

View file

@ -3531,13 +3531,13 @@ void startLoadingFile(size_t size, char* filename, int rdbflags) {
/* Refresh the absolute loading progress info */
void loadingAbsProgress(off_t pos) {
server.loading_loaded_bytes = pos;
updatePeakMemory(zmalloc_used_memory());
updatePeakMemory();
}
/* Refresh the incremental loading progress info */
void loadingIncrProgress(off_t size) {
server.loading_loaded_bytes += size;
updatePeakMemory(zmalloc_used_memory());
updatePeakMemory();
}
/* Update the file name currently being loaded */

View file

@ -1400,16 +1400,23 @@ void checkChildrenDone(void) {
}
/* Record the max memory used since the server was started. */
void updatePeakMemory(size_t used_memory) {
if (unlikely(used_memory > server.stat_peak_memory)) {
server.stat_peak_memory = used_memory;
/* Record the max memory used since the server was started.
 *
 * Two sources are considered:
 *  1. the current total zmalloc usage, and
 *  2. the peak published by the allocator itself (updated inside zmalloc
 *     when a thread's allocation delta crosses a threshold), which may have
 *     been reached between calls to this function.
 * Updates server.stat_peak_memory and its timestamp when either source
 * exceeds the recorded peak. */
void updatePeakMemory(void) {
/* Snapshot of current global used memory across all allocating threads. */
size_t zmalloc_used = zmalloc_used_memory();
if (zmalloc_used > server.stat_peak_memory) {
server.stat_peak_memory = zmalloc_used;
server.stat_peak_memory_time = server.unixtime;
}
/* Allocator-tracked peak: may exceed the instantaneous value above if a
 * transient spike happened since the last call. Use the allocator's own
 * timestamp in that case. */
size_t zmalloc_peak = zmalloc_get_peak_memory();
if (zmalloc_peak > server.stat_peak_memory) {
server.stat_peak_memory = zmalloc_peak;
server.stat_peak_memory_time = zmalloc_get_peak_memory_time();
}
}
/* Called from serverCron and cronUpdateMemoryStats to update cached memory metrics. */
void cronUpdateMemoryStats(void) {
updatePeakMemory(zmalloc_used_memory());
updatePeakMemory();
run_with_period(100) {
/* Sample the RSS and other metrics here since this is a relatively slow call.
@ -1843,7 +1850,7 @@ extern int ProcessingEventsWhileBlocked;
void beforeSleep(struct aeEventLoop *eventLoop) {
UNUSED(eventLoop);
updatePeakMemory(zmalloc_used_memory());
updatePeakMemory();
/* Just call a subset of vital functions in case we are re-entering
* the event loop from processEventsWhileBlocked(). Note that in this
@ -4027,10 +4034,6 @@ void call(client *c, int flags) {
server.stat_numcommands++;
}
/* Record peak memory after each command and before the eviction that runs
* before the next command. */
updatePeakMemory(zmalloc_used_memory());
/* Do some maintenance job and cleanup */
afterCommand(c);
@ -6192,7 +6195,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) {
* may happen that the instantaneous value is slightly bigger than
* the peak value. This may confuse users, so we update the peak
* if found smaller than the current memory usage. */
updatePeakMemory(zmalloc_used);
updatePeakMemory();
bytesToHuman(hmem,sizeof(hmem),zmalloc_used);
bytesToHuman(peak_hmem,sizeof(peak_hmem),server.stat_peak_memory);

View file

@ -3560,7 +3560,7 @@ int zslLexValueLteMax(sds value, zlexrangespec *spec);
/* Core functions */
int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level);
void updatePeakMemory(size_t used_memory);
void updatePeakMemory(void);
size_t freeMemoryGetNotCountedMemory(void);
int overMaxmemoryAfterAlloc(size_t moremem);
uint64_t getCommandFlags(client *c);

View file

@ -81,14 +81,18 @@ void je_free_with_usize(void *ptr, size_t *usize);
#define MAX_THREADS 16 /* Keep it a power of 2 so we can use '&' instead of '%'. */
#define THREAD_MASK (MAX_THREADS - 1)
#define PEAK_CHECK_THRESHOLD (1024 * 100) /* 100KB */
typedef struct used_memory_entry {
redisAtomic long long used_memory;
char padding[CACHE_LINE_SIZE - sizeof(long long)];
redisAtomic long long last_peak_check;
char padding[CACHE_LINE_SIZE - sizeof(long long) - sizeof(long long)];
} used_memory_entry;
static __attribute__((aligned(CACHE_LINE_SIZE))) used_memory_entry used_memory[MAX_THREADS];
static redisAtomic size_t num_active_threads = 0;
static redisAtomic size_t zmalloc_peak = 0;
static redisAtomic time_t zmalloc_peak_time = 0;
static __thread long my_thread_index = -1;
static inline void init_my_thread_index(void) {
@ -98,9 +102,46 @@ static inline void init_my_thread_index(void) {
}
}
static void update_zmalloc_stat_alloc(long long num) {
static void update_zmalloc_stat_alloc(long long bytes_delta) {
init_my_thread_index();
atomicIncr(used_memory[my_thread_index].used_memory, num);
/* Per-thread allocation counter and the last counter value at which we ran a
* global peak check (throttles how often we call zmalloc_used_memory()). */
long long thread_used, thread_last_peak_check_used;
atomicIncrGet(used_memory[my_thread_index].used_memory, thread_used, bytes_delta);
atomicGet(used_memory[my_thread_index].last_peak_check, thread_last_peak_check_used);
/* Only run the (expensive) global used/peak check after this thread's
* allocation counter has advanced enough since the last check. */
if (unlikely(thread_used - thread_last_peak_check_used > PEAK_CHECK_THRESHOLD)) {
/* Snapshot of global used memory across all threads. */
size_t used_mem = zmalloc_used_memory();
/* Current published global peak. */
size_t published_peak;
atomicGet(zmalloc_peak, published_peak);
if (used_mem > published_peak) {
/* Try to publish `used_mem` as the new global peak.
*
* Another thread may update `zmalloc_peak` concurrently. Use a CAS loop:
* on failure, `old_peak` is refreshed with the latest peak value, and we
* retry only while our snapshot still exceeds it. */
size_t old_peak = published_peak;
while (used_mem > old_peak && !atomicCompareExchange(size_t, zmalloc_peak, old_peak, used_mem)) {
/* CAS failed: `old_peak` now holds the current `zmalloc_peak`. */
}
/* If we raised the peak, record when it was reached. */
if (used_mem > old_peak) {
atomicSet(zmalloc_peak_time, time(NULL));
}
}
/* Record the thread counter value at which we last ran a global peak check,
* to throttle future checks for this thread. */
atomicSet(used_memory[my_thread_index].last_peak_check, thread_used);
}
}
static void update_zmalloc_stat_free(long long num) {
@ -183,7 +224,7 @@ void *zmalloc_usable(size_t size, size_t *usable) {
void *ptr = ztrymalloc_usable_internal(size, &usable_size);
if (!ptr) zmalloc_oom_handler(size);
#ifdef HAVE_MALLOC_SIZE
ptr = extend_to_usable(ptr, usable_size);
if (ptr) ptr = extend_to_usable(ptr, usable_size);
#endif
if (usable) *usable = usable_size;
return ptr;
@ -538,6 +579,18 @@ size_t zmalloc_used_memory(void) {
return total_mem;
}
/* Return the highest total allocated-memory value the allocator has
 * published so far. The read is atomic, so it is safe from any thread. */
size_t zmalloc_get_peak_memory(void) {
size_t current_peak;
atomicGet(zmalloc_peak, current_peak);
return current_peak;
}
/* Return the timestamp recorded when the allocator peak was last raised.
 * Atomic read; safe to call from any thread. */
time_t zmalloc_get_peak_memory_time(void) {
time_t peak_time;
atomicGet(zmalloc_peak_time, peak_time);
return peak_time;
}
void zmalloc_set_oom_handler(void (*oom_handler)(size_t)) {
zmalloc_oom_handler = oom_handler;
}

View file

@ -87,6 +87,8 @@
#define HAVE_ALLOC_WITH_USIZE
#endif
#include <time.h>
/* 'noinline' attribute is intended to prevent the `-Wstringop-overread` warning
 * when using gcc-12 or later with LTO enabled. It may be removed once the
 * bug [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96503] is fixed. */
@ -108,6 +110,8 @@ void zfree_usable(void *ptr, size_t *usable);
__attribute__((malloc)) char *zstrdup(const char *s);
__attribute__((malloc)) char *zstrdup_usable(const char *s, size_t *usable);
size_t zmalloc_used_memory(void);
size_t zmalloc_get_peak_memory(void);
time_t zmalloc_get_peak_memory_time(void);
void zmalloc_set_oom_handler(void (*oom_handler)(size_t));
size_t zmalloc_get_rss(void);
int zmalloc_get_allocator_info(int refresh_stats, size_t *allocated, size_t *active, size_t *resident,

View file

@ -798,7 +798,7 @@ test {Replicas that was marked as CLIENT_CLOSE_ASAP should not keep the replicat
# exceed the replica soft limit. Furthermore, as the replica release its reference to
# replication backlog, it should be properly trimmed, the memory usage of replication
# backlog should not significantly exceed repl-backlog-size (default 1MB). */
assert_lessthan [getInfoProperty $res used_memory_peak] 10000000;# less than 10mb
assert_lessthan [getInfoProperty $res used_memory_peak] 20000000;# less than 20mb
assert_lessthan [getInfoProperty $res mem_replication_backlog] 2000000;# less than 2mb
}
}