bind9/lib/isc/qsbr.c
Tony Finch 9b7aa536ba QSBR: safe memory reclamation for lock-free data structures
This "quiescent state based reclamation" module provides support for
the qp-trie module in dns/qp. It is a replacement for liburcu, written
without reference to the urcu source code, and in fact it works in a
significantly different way.

A few specifics of BIND make this variant of QSBR somewhat simpler:

  * We can require that wait-free access to a qp-trie only happens in
    an isc_loop callback. The loop provides a natural quiescent state,
    after the callbacks are done, when no qp-trie access occurs.

  * We can dispense with any API like rcu_synchronize(). In practice,
    it takes far too long to wait for a grace period to elapse for each
    write to a data structure.

  * We use the idea of "phases" (aka epochs or eras) from EBR to
    reduce the amount of bookkeeping needed to track memory that is no
    longer needed, knowing that the qp-trie does most of that work
    already.

I considered hazard pointers for safe memory reclamation. They have
more read-side overhead (updating the hazard pointers) and it wasn't
clear to me how to nicely schedule the cleanup work. Another
alternative, epoch-based reclamation, is designed for fine-grained
lock-free updates, so it needs some rethinking to work well with the
heavily read-biased design of the qp-trie. QSBR has the fastest read
side of the basic SMR algorithms (with no barriers), and fits well
into a libuv loop. More recent hybrid SMR algorithms do not appear to
have enough benefits to justify the extra complexity.
2023-02-23 15:57:53 +00:00

393 lines
12 KiB
C

/*
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
*
* SPDX-License-Identifier: MPL-2.0
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, you can obtain one at https://mozilla.org/MPL/2.0/.
*
* See the COPYRIGHT file distributed with this work for additional
* information regarding copyright ownership.
*/
#include <isc/atomic.h>
#include <isc/log.h>
#include <isc/loop.h>
#include <isc/qsbr.h>
#include <isc/stack.h>
#include <isc/tid.h>
#include <isc/time.h>
#include <isc/types.h>
#include <isc/uv.h>
#include "loop_p.h"
/*
 * Longest time a grace period may last before maybe_wakeup() asks the
 * loop manager to wake the other threads.
 *
 * The replacement list is parenthesized so that the macro behaves as a
 * single value wherever it is expanded (e.g. next to `%`, `/`, or a
 * cast), not just in the additive expression that currently uses it.
 */
#define MAX_GRACE_PERIOD_NS (53 * NS_PER_MS)
/*
 * Debug tracing, compiled out by default. Changing `#if 0` to `#if 1`
 * makes every TRACE() call log at ISC_LOG_DEBUG(7), prefixed with the
 * source file, line, function name and the value of isc_tid().
 * In the default configuration TRACE() expands to nothing, so its
 * arguments are never evaluated.
 */
#if 0
#define TRACE(fmt, ...) \
isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_OTHER, \
ISC_LOG_DEBUG(7), "%s:%u:%s():t%u: " fmt, __FILE__, \
__LINE__, __func__, isc_tid(), ##__VA_ARGS__)
#else
#define TRACE(...)
#endif
/*
 * Stack of registered reclaimers: isc__qsbr_register() pushes entries
 * onto it and reclaim_cb() walks it, calling each entry's `func`.
 */
static ISC_STACK(isc_qsbr_registered_t) qsbreclaimers = ISC_STACK_INITIALIZER;
/*
 * Forward declarations: qsbr_shutdown() and phase_transition() refer to
 * these callbacks before their definitions further down the file.
 */
static void
reclaim_cb(void *arg);
static void
reclaimed_cb(void *arg);
/**********************************************************************/
/*
 * Return the phase that follows `phase`.
 *
 * Phases count downwards and wrap around to ISC_QSBR_PHASE_MAX instead
 * of reaching zero: 3,2,1,3,2,1,...
 */
static isc_qsbr_phase_t
change_phase(isc_qsbr_phase_t phase) {
	isc_qsbr_phase_t lower = phase - 1;

	if (lower > 0) {
		return (lower);
	}
	return (ISC_QSBR_PHASE_MAX);
}
/*
 * Return the bit that represents `phase` in the `activated` mask, for
 * marking or checking that a phase has cleanup work to do.
 *
 * The shift is done on an unsigned constant so that the arithmetic
 * happens in the same (unsigned) type as the returned mask, rather
 * than shifting a signed int and converting afterwards.
 */
static unsigned int
active_bit(isc_qsbr_phase_t phase) {
	return (1U << phase);
}
/*
 * Load the grace period state using memory order `m_o` and return
 * only its phase field, i.e. the global phase.
 */
static isc_qsbr_phase_t
global_phase(isc_qsbr_t *qsbr, memory_order m_o) {
	const uint32_t state = atomic_load_explicit(&qsbr->grace, m_o);

	return (ISC_QSBR_GRACE_PHASE(state));
}
/*
 * Count the calling thread out of the current grace period.
 *
 * Returns true while other threads have yet to pass the barrier, and
 * false when the caller was the last one outstanding.
 *
 * ATOMIC: acquire-release, so that this decrement is not reordered
 * with respect to read-only accesses to lock-free data structures.
 * That ordering is what makes this point a quiescent state.
 */
static bool
fuzzy_barrier_not_yet(isc_qsbr_t *qsbr) {
	/* the fetch-and-subtract yields the state from before we left */
	const uint32_t before = atomic_fetch_sub_acq_rel(&qsbr->grace,
							 ISC_QSBR_ONE_THREAD);

	/* a prior count of exactly one means nobody is left but us */
	return (ISC_QSBR_GRACE_THREADS(before) > 1);
}
/*
 * Shutdown path: run the reclaimers over and over, stepping the global
 * phase by hand each time, until no phase is marked as having cleanup
 * work left.
 *
 * ATOMIC: everything here is relaxed, on the assumption that concurrent
 * readers have already finished. Several threads may race through this
 * function; that is OK because `reclaim_cb()` checks the `activated`
 * flags before doing any work.
 */
static void
qsbr_shutdown(isc_loopmgr_t *loopmgr) {
	isc_qsbr_t *qsbr = &loopmgr->qsbr;
	isc_qsbr_phase_t phase = global_phase(qsbr, memory_order_relaxed);
	uint32_t nloops = isc_loopmgr_nloops(loopmgr);

	for (;;) {
		if (atomic_load_relaxed(&qsbr->activated) == 0) {
			break;
		}

		/* reclaim for the phase currently published in `grace` ... */
		reclaim_cb(loopmgr);

		/* ... then publish the following phase for the next pass */
		phase = change_phase(phase);
		uint32_t grace = ISC_QSBR_GRACE(nloops, phase);
		atomic_store_relaxed(&qsbr->grace, grace);
	}
}
/*
 * Guard against grace periods that never end.
 *
 * On a quiet server without enough network traffic to keep every
 * thread spinning, a grace period could extend indefinitely. If cleanup
 * work is pending and the last phase change was at least
 * MAX_GRACE_PERIOD_NS ago, ask the loop manager to wake up all the
 * loops so that each one cycles through a quiescent state.
 */
static void
maybe_wakeup(isc_loop_t *loop) {
	isc_loopmgr_t *loopmgr = loop->loopmgr;
	isc_qsbr_t *qsbr = &loopmgr->qsbr;

	/*
	 * Nothing is waiting to be reclaimed, so nothing to hurry along.
	 * ATOMIC: relaxed is OK because we don't use any values guarded
	 * by the `activated` flags.
	 */
	if (atomic_load_relaxed(&qsbr->activated) == 0) {
		return;
	}

	if (loop->shuttingdown) {
		qsbr_shutdown(loopmgr);
		return;
	}

	/*
	 * ATOMIC: relaxed, because the `transition_time` doesn't guard
	 * any other values, just the isc_loopmgr_wakeup() call below.
	 */
	atomic_uint_fast64_t *timer = &qsbr->transition_time;
	isc_nanosecs_t now = isc_time_monotonic();
	isc_nanosecs_t since = atomic_load_relaxed(timer);
	bool overdue = (now >= since + MAX_GRACE_PERIOD_NS);

	/*
	 * Only one thread should call `isc_loopmgr_wakeup()`, so try to
	 * move the timer forward to `now` (expecting that it will not
	 * trigger again) and do nothing if another thread changed it
	 * first.
	 * ATOMIC: relaxed, as above; strong, because there is no retry
	 * loop.
	 */
	if (overdue &&
	    atomic_compare_exchange_strong_relaxed(timer, &since, now))
	{
		TRACE("long grace period of %llu ns, waking up other threads",
		      (unsigned long long)(now - since));
		isc_loopmgr_wakeup(loopmgr);
	}
}
/*
 * End the grace period for `current_phase` and start the next one.
 *
 * Callers use the fuzzy barrier to ensure that only one thread can be
 * in this function at a time.
 *
 * Phase transitions happen at roughly the same frequency that IO event
 * loops cycle, limited by the slowest loop in each cycle.
 */
static void
phase_transition(isc_loop_t *loop, isc_qsbr_phase_t current_phase) {
	isc_loopmgr_t *loopmgr = loop->loopmgr;
	isc_qsbr_t *qsbr = &loopmgr->qsbr;

	if (loop->shuttingdown) {
		qsbr_shutdown(loopmgr);
		return;
	}

	/*
	 * Once the phase has changed, threads will be in either
	 * `current_phase` or `next_phase`; memory is reclaimed from
	 * `third_phase`, if it was marked as having work to do.
	 *
	 * ATOMIC: relaxed is OK here because the necessary
	 * synchronization happens in `reclaim_cb()`.
	 */
	isc_qsbr_phase_t next_phase = change_phase(current_phase);
	isc_qsbr_phase_t third_phase = change_phase(next_phase);
	bool need_reclaim = (atomic_load_relaxed(&qsbr->activated) &
			     active_bit(third_phase)) != 0;

	/*
	 * Restart the wakeup timer; the previous value is only needed
	 * for logging how long the grace period lasted.
	 * ATOMIC: relaxed, per the commentary in `maybe_wakeup()`.
	 */
	atomic_uint_fast64_t *timer = &qsbr->transition_time;
	isc_nanosecs_t now = isc_time_monotonic();
	isc_nanosecs_t start = atomic_exchange_relaxed(timer, now);
	TRACE("phase %u -> %u after grace period of %f ms", current_phase,
	      next_phase, (double)(now - start) / NS_PER_MS);
	UNUSED(start); /* ifndef TRACE() */

	/*
	 * The thread counter for the new grace period covers every loop,
	 * plus one for the reclamation worker when there is one. The
	 * extra count prevents the phase from changing again before the
	 * worker is done: if it changed too early, newly detached
	 * objects would be marked with the same phase as the running
	 * reclaimer, which might lead to them being free()d too soon.
	 */
	uint32_t threads = isc_loopmgr_nloops(loopmgr);
	if (need_reclaim) {
		threads += 1;
	}

	/*
	 * Publish the new grace period before handing off the work.
	 *
	 * ATOMIC: release, to pair with the load-acquire in
	 * `reclaim_cb()`, which is spawned in a separate worker thread.
	 */
	uint32_t grace = ISC_QSBR_GRACE(threads, next_phase);
	atomic_store_release(&qsbr->grace, grace);

	if (need_reclaim) {
		isc_work_enqueue(loop, reclaim_cb, reclaimed_cb, loopmgr);
	}
}
/*
 * Report that `loop` is in a quiescent state.
 *
 * This is called once per cycle of each IO event loop, by the
 * `uv_prepare` callback `isc__qsbr_quiescent_cb()`.
 */
void
isc__qsbr_quiescent_state(isc_loop_t *loop) {
	isc_loopmgr_t *loopmgr = loop->loopmgr;
	isc_qsbr_t *qsbr = &loopmgr->qsbr;

	/*
	 * ATOMIC: relaxed. If we are in phase then we don't need to
	 * synchronize; if we are not then this thread's presence in
	 * the thread counter will prevent the phase from changing
	 * before we get to the fuzzy barrier.
	 */
	isc_qsbr_phase_t phase = global_phase(qsbr, memory_order_relaxed);

	if (loop->qsbr_phase != phase) {
		/*
		 * Enter the current phase and count ourselves out of
		 * the previous one.
		 */
		loop->qsbr_phase = phase;

		if (!fuzzy_barrier_not_yet(qsbr)) {
			/*
			 * We were the last thread to enter the current
			 * phase, so the grace period is up. No other
			 * thread can reach this point.
			 */
			phase_transition(loop, phase);
			return;
		}
	}

	/*
	 * Either we were already in phase, or other threads have not
	 * caught up yet: check whether they need a prod.
	 */
	maybe_wakeup(loop);
}
/*
 * `uv_prepare` callback: the handle's data pointer is the loop that
 * owns it, which is passed on to `isc__qsbr_quiescent_state()`.
 */
void
isc__qsbr_quiescent_cb(uv_prepare_t *handle) {
	uv_handle_t *base = (uv_handle_t *)handle;

	isc__qsbr_quiescent_state(uv_handle_get_data(base));
}
/*
 * Completion callback paired with `reclaim_cb()` by `phase_transition()`.
 * It counts the reclaimers out of the grace period, and starts the next
 * phase transition if they were the last to finish.
 */
static void
reclaimed_cb(void *arg) {
	/* we are back on a loop thread */
	isc_loopmgr_t *loopmgr = arg;
	isc_qsbr_t *qsbr = &loopmgr->qsbr;
	isc_loop_t *loop = CURRENT_LOOP(loopmgr);

	/*
	 * Remove the reclaimers from the thread count, so that the next
	 * grace period can start.
	 */
	bool others_pending = fuzzy_barrier_not_yet(qsbr);

	if (!others_pending) {
		/*
		 * The reclaimers were the last to be counted out: every
		 * other thread already passed through a quiescent state.
		 *
		 * We expect loop->qsbr_phase == global_phase() here,
		 * except during shutdown when the phase shifts rapidly.
		 * Also, the current loop might not have received the
		 * shutdown message yet, so it seems easiest to omit the
		 * assertion.
		 *
		 * ATOMIC: relaxed, the fuzzy barrier already synchronized.
		 */
		TRACE("reclaimers overran");
		phase_transition(loop,
				 global_phase(qsbr, memory_order_relaxed));
	}
}
static void
reclaim_cb(void *arg) {
/* we are on a work thread not a loop thread */
isc_loopmgr_t *loopmgr = arg;
isc_qsbr_t *qsbr = &loopmgr->qsbr;
/*
* The global phase has just been bumped by a `phase_transition()`
* and it cannot change again until the grace period is up, which
* cannot happen until we have finished working.
*
* ATOMIC: acquire, to pair with the release in `phase_transition()`.
*
* The phase we are to clean up is 2 before the current phase,
* which is the same as the one after the current phase (mod 3).
*/
isc_qsbr_phase_t cur_phase = global_phase(qsbr, memory_order_acquire);
isc_qsbr_phase_t third_phase = change_phase(cur_phase);
unsigned int third_bit = active_bit(third_phase);
/*
* If any reclaimers need to be called again later, they can use
* `isc_qsbr_activate()`, so we need to clear the bit first.
*
* ATOMIC: acquire, so that `isc_qsbr_activate()` happens before
* the callbacks are invoked.
*/
uint32_t activated = atomic_fetch_and_explicit(
&qsbr->activated, ~third_bit, memory_order_acquire);
/* this can happen when we are racing to clean up on shutdown */
if ((activated & third_bit) == 0) {
return;
}
isc_qsbr_registered_t *reclaimer = ISC_STACK_TOP(qsbreclaimers);
while (reclaimer != NULL) {
reclaimer->func(third_phase);
reclaimer = ISC_SLINK_NEXT(reclaimer, link);
}
}
/*
 * Add `reclaimer` to the stack of callbacks that `reclaim_cb()` invokes.
 *
 * Requires:
 *	`reclaimer` is not NULL and has a non-NULL `func`.
 */
void
isc__qsbr_register(isc_qsbr_registered_t *reclaimer) {
	/* check the pointer itself before looking inside it */
	REQUIRE(reclaimer != NULL);
	REQUIRE(reclaimer->func != NULL);

	ISC_STACK_PUSH(qsbreclaimers, reclaimer, link);
}
/*
* ATOMIC: This function needs to ensure that the global phase is read
* after a write has committed. Acquire/release ordering is not sufficient
* for ordering between separate atomics (the data structure's root pointer
* and the global phase), so it must be sequentially consistent.
*
* In general, the phases up to and including the next phase transition
* look like:
*
* 1. local phase
* 2. global phase
* 3. next phase
* 1. third phase
*
* i.e. some threads are still one behind the global phase, on the same
* phase that will be cleaned up immediately after the phase transition.
*
* This function is called just after a write commits. It's likely that
* some threads on the global phase (2) are using a version of the data
* structure from before the write, and they can continue using it while
* the straggler threads (1) catch up and cause a phase transition.
*
* The writer can be one of the straggler threads. If it incorrectly marks
* cleanup work with its local phase (1), memory will be reclaimed
* immediately after the next phase transition (when the third phase is
* also 1), which could be almost immediately when the writer returns to
* the event loop. This will cause a use-after-free for existing readers
* (in phase 2).
*
* More straightforwardly, we need to be able to queue up reclaim work from
* a thread that isn't running a loop, which also means this function has
* to return the global phase.
*/
isc_qsbr_phase_t
isc_qsbr_phase(isc_loopmgr_t *loopmgr) {
	/*
	 * Sequentially consistent load of the grace period state; only
	 * its phase field is returned to the caller.
	 */
	const uint32_t state = atomic_load_explicit(&loopmgr->qsbr.grace,
						    memory_order_seq_cst);

	return (ISC_QSBR_GRACE_PHASE(state));
}
/*
 * Mark `phase` as having cleanup work to do.
 */
void
isc_qsbr_activate(isc_loopmgr_t *loopmgr, isc_qsbr_phase_t phase) {
	isc_qsbr_t *qsbr = &loopmgr->qsbr;
	unsigned int bit = active_bit(phase);

	/*
	 * ATOMIC: release ordering ensures that writing the cleanup
	 * lists happens before the callback is invoked from a worker
	 * thread.
	 */
	atomic_fetch_or_release(&qsbr->activated, bit);
}