mirror of
https://github.com/isc-projects/bind9.git
synced 2026-02-24 02:10:30 -05:00
This "quiescent state based reclamation" module provides support for
the qp-trie module in dns/qp. It is a replacement for liburcu, written
without reference to the urcu source code, and in fact it works in a
significantly different way.
A few specifics of BIND make this variant of QSBR somewhat simpler:
* We can require that wait-free access to a qp-trie only happens in
an isc_loop callback. The loop provides a natural quiescent state,
after the callbacks are done, when no qp-trie access occurs.
* We can dispense with any API like rcu_synchronize(). In practice,
it takes far too long to wait for a grace period to elapse for each
write to a data structure.
* We use the idea of "phases" (aka epochs or eras) from EBR to
reduce the amount of bookkeeping needed to track memory that is no
longer needed, knowing that the qp-trie does most of that work
already.
I considered hazard pointers for safe memory reclamation. They have
more read-side overhead (updating the hazard pointers) and it wasn't
clear to me how to nicely schedule the cleanup work. Another
alternative, epoch-based reclamation, is designed for fine-grained
lock-free updates, so it needs some rethinking to work well with the
heavily read-biased design of the qp-trie. QSBR has the fastest read
side of the basic SMR algorithms (with no barriers), and fits well
into a libuv loop. More recent hybrid SMR algorithms do not appear to
have enough benefits to justify the extra complexity.
393 lines
12 KiB
C
393 lines
12 KiB
C
/*
|
|
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
|
|
*
|
|
* SPDX-License-Identifier: MPL-2.0
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, you can obtain one at https://mozilla.org/MPL/2.0/.
|
|
*
|
|
* See the COPYRIGHT file distributed with this work for additional
|
|
* information regarding copyright ownership.
|
|
*/
|
|
|
|
#include <isc/atomic.h>
|
|
#include <isc/log.h>
|
|
#include <isc/loop.h>
|
|
#include <isc/qsbr.h>
|
|
#include <isc/stack.h>
|
|
#include <isc/tid.h>
|
|
#include <isc/time.h>
|
|
#include <isc/types.h>
|
|
#include <isc/uv.h>
|
|
|
|
#include "loop_p.h"
|
|
|
|
/*
 * The longest time (in nanoseconds) that `maybe_wakeup()` lets pass
 * since the last phase change before it wakes up the other loops.
 *
 * The replacement list is parenthesized so that the macro behaves as a
 * single value wherever it is expanded (e.g. next to `/` or `%`).
 */
#define MAX_GRACE_PERIOD_NS (53 * NS_PER_MS)
|
|
|
|
/*
 * Optional debug tracing for this module. It is compiled out by default;
 * flip the `#if 0` below to `#if 1` to have every TRACE() call emit a
 * debug level 7 log message prefixed with the file, line, function and
 * thread id of the caller.
 */
#if 0
#define TRACE(fmt, ...)                                                       \
	isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_OTHER, \
		      ISC_LOG_DEBUG(7), "%s:%u:%s():t%u: " fmt, __FILE__,     \
		      __LINE__, __func__, isc_tid(), ##__VA_ARGS__)
#else /* tracing disabled: TRACE() expands to nothing */
#define TRACE(...)
#endif
|
|
|
|
static ISC_STACK(isc_qsbr_registered_t) qsbreclaimers = ISC_STACK_INITIALIZER;
|
|
|
|
static void
|
|
reclaim_cb(void *arg);
|
|
static void
|
|
reclaimed_cb(void *arg);
|
|
|
|
/**********************************************************************/
|
|
|
|
/*
|
|
* 3,2,1,3,2,1,...
|
|
*/
|
|
static isc_qsbr_phase_t
|
|
change_phase(isc_qsbr_phase_t phase) {
|
|
return (--phase > 0 ? phase : ISC_QSBR_PHASE_MAX);
|
|
}
|
|
|
|
/*
|
|
* For marking or checking that a phase has cleanup work to do.
|
|
*/
|
|
static unsigned int
|
|
active_bit(isc_qsbr_phase_t phase) {
|
|
return (1 << phase);
|
|
}
|
|
|
|
/*
|
|
* Extract the global phase from the grace period state.
|
|
*/
|
|
static isc_qsbr_phase_t
|
|
global_phase(isc_qsbr_t *qsbr, memory_order m_o) {
|
|
uint32_t grace = atomic_load_explicit(&qsbr->grace, m_o);
|
|
return (ISC_QSBR_GRACE_PHASE(grace));
|
|
}
|
|
|
|
/*
|
|
* Record that the current thread has passed the barrier.
|
|
* Returns true if more threads still need to pass.
|
|
*
|
|
* ATOMIC: acquire-release, to ensure that this is not reordered wrt
|
|
* read-only accesses to lock-free data structures. This implements the
|
|
* ordering requirements of a quiescent state.
|
|
*/
|
|
static bool
|
|
fuzzy_barrier_not_yet(isc_qsbr_t *qsbr) {
|
|
uint32_t grace = atomic_fetch_sub_acq_rel(&qsbr->grace,
|
|
ISC_QSBR_ONE_THREAD);
|
|
uint32_t threads = ISC_QSBR_GRACE_THREADS(grace);
|
|
return (threads > 1);
|
|
}
|
|
|
|
/*
|
|
* Ungracefully drive all cleanup work to completion.
|
|
*
|
|
* ATOMIC: everything is relaxed, because we assume that concurrent
|
|
* readers have already finished. `reclaim_cb()` uses the `activated`
|
|
* flags to ensure it is OK that threads will race to complete the
|
|
* cleanup.
|
|
*/
|
|
static void
|
|
qsbr_shutdown(isc_loopmgr_t *loopmgr) {
|
|
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
|
isc_qsbr_phase_t phase = global_phase(qsbr, memory_order_relaxed);
|
|
uint32_t threads = isc_loopmgr_nloops(loopmgr);
|
|
uint32_t grace;
|
|
|
|
while (atomic_load_relaxed(&qsbr->activated) != 0) {
|
|
reclaim_cb(loopmgr);
|
|
phase = change_phase(phase);
|
|
grace = ISC_QSBR_GRACE(threads, phase);
|
|
atomic_store_relaxed(&qsbr->grace, grace);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* On a quiet server that does not have enough network traffic to keep
|
|
* all its threads spinning, grace periods might extend indefinitely.
|
|
* So check if we have been waiting an unreasonably long time since
|
|
* the last phase change. If so, send a no-op async request to every
|
|
* thread to make them all cycle through a quiescent state.
|
|
*/
|
|
static void
|
|
maybe_wakeup(isc_loop_t *loop) {
|
|
isc_loopmgr_t *loopmgr = loop->loopmgr;
|
|
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
|
|
|
/*
|
|
* ATOMIC: relaxed is OK here because we don't use any values guarded
|
|
* by the `activated` flags.
|
|
*/
|
|
if (atomic_load_relaxed(&qsbr->activated) == 0) {
|
|
return;
|
|
}
|
|
if (loop->shuttingdown) {
|
|
qsbr_shutdown(loopmgr);
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* ATOMIC: relaxed, because the `transition_time` doesn't guard any
|
|
* other values, just the isc_loopmgr_wakeup() call below.
|
|
*/
|
|
atomic_uint_fast64_t *qsbr_ttp = &qsbr->transition_time;
|
|
isc_nanosecs_t now = isc_time_monotonic();
|
|
isc_nanosecs_t start = atomic_load_relaxed(qsbr_ttp);
|
|
if (now < start + MAX_GRACE_PERIOD_NS) {
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* To stop other threads from also invoking `isc_loopmgr_wakeup()`,
|
|
* we try to push the timer into the future (expecting that it will
|
|
* not trigger again), and quit if someone else got there first.
|
|
* ATOMIC: relaxed, as before; strong, because there is no retry loop.
|
|
*/
|
|
if (!atomic_compare_exchange_strong_relaxed(qsbr_ttp, &start, now)) {
|
|
return;
|
|
}
|
|
|
|
TRACE("long grace period of %llu ns, waking up other threads",
|
|
(unsigned long long)(now - start));
|
|
|
|
isc_loopmgr_wakeup(loopmgr);
|
|
}
|
|
|
|
/*
|
|
* Callers use the fuzzy barrier to ensure only one thread can enter
|
|
* this function at a time.
|
|
*
|
|
* Phase transitions happen at roughly the same frequency that IO
|
|
* event loops cycle, limited by the slowest loop in each cycle.
|
|
*/
|
|
static void
|
|
phase_transition(isc_loop_t *loop, isc_qsbr_phase_t current_phase) {
|
|
isc_loopmgr_t *loopmgr = loop->loopmgr;
|
|
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
|
|
|
if (loop->shuttingdown) {
|
|
qsbr_shutdown(loopmgr);
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* After we change phase, threads will be in either the `current_phase`
|
|
* or the `next_phase`. We will reclaim memory from the `third_phase`.
|
|
*
|
|
* ATOMIC: relaxed is OK here because the necessary synchronization
|
|
* happens in `reclaim_cb()`.
|
|
*/
|
|
isc_qsbr_phase_t next_phase = change_phase(current_phase);
|
|
isc_qsbr_phase_t third_phase = change_phase(next_phase);
|
|
bool activated = atomic_load_relaxed(&qsbr->activated) &
|
|
active_bit(third_phase);
|
|
|
|
/*
|
|
* Reset the wakeup timer, and log the length of the grace period.
|
|
* ATOMIC: relaxed, per the commentary in `maybe_wakeup()`.
|
|
*/
|
|
atomic_uint_fast64_t *qsbr_tt = &qsbr->transition_time;
|
|
isc_nanosecs_t now = isc_time_monotonic();
|
|
isc_nanosecs_t start = atomic_exchange_relaxed(qsbr_tt, now);
|
|
TRACE("phase %u -> %u after grace period of %f ms", current_phase,
|
|
next_phase, (double)(now - start) / NS_PER_MS);
|
|
UNUSED(start); /* ifndef TRACE() */
|
|
|
|
/*
|
|
* Work out the threads counter for this grace period.
|
|
*
|
|
* We need to add one for any reclamation worker thread, to
|
|
* prevent us from changing phase before the work is done. If
|
|
* we change too early, any newly detached objects will be
|
|
* marked with the same phase as the running reclaimer, which
|
|
* might lead to them being free()d too soon.
|
|
*/
|
|
uint32_t threads = isc_loopmgr_nloops(loopmgr) + (activated ? 1 : 0);
|
|
|
|
/*
|
|
* Start the new grace period.
|
|
*
|
|
* ATOMIC: release, to pair with the load-acquire in `reclaim_cb()`
|
|
* which is spawned in a separate worker thread.
|
|
*/
|
|
uint32_t grace = ISC_QSBR_GRACE(threads, next_phase);
|
|
atomic_store_release(&qsbr->grace, grace);
|
|
|
|
if (activated) {
|
|
isc_work_enqueue(loop, reclaim_cb, reclaimed_cb, loopmgr);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* This function is called once per cycle of each IO event loop by the
|
|
* `uv_prepare` callback below.
|
|
*/
|
|
void
|
|
isc__qsbr_quiescent_state(isc_loop_t *loop) {
|
|
isc_loopmgr_t *loopmgr = loop->loopmgr;
|
|
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
|
|
|
/*
|
|
* ATOMIC: relaxed. If we are in phase then we don't need to
|
|
* synchronize; if we are not then this thread's presence in
|
|
* the thread counter will prevent the phase from changing
|
|
* before we get to the fuzzy barrier.
|
|
*/
|
|
isc_qsbr_phase_t phase = global_phase(qsbr, memory_order_relaxed);
|
|
if (loop->qsbr_phase == phase) {
|
|
maybe_wakeup(loop);
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Enter the current phase and count us out of the previous phase.
|
|
*/
|
|
loop->qsbr_phase = phase;
|
|
if (fuzzy_barrier_not_yet(qsbr)) {
|
|
maybe_wakeup(loop);
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* We were the last thread to enter the current phase so the
|
|
* grace period is up. No other thread can reach this point.
|
|
*/
|
|
phase_transition(loop, phase);
|
|
}
|
|
|
|
void
|
|
isc__qsbr_quiescent_cb(uv_prepare_t *handle) {
|
|
isc_loop_t *loop = uv_handle_get_data((uv_handle_t *)handle);
|
|
isc__qsbr_quiescent_state(loop);
|
|
}
|
|
|
|
static void
|
|
reclaimed_cb(void *arg) {
|
|
/* we are back on a loop thread */
|
|
isc_loopmgr_t *loopmgr = arg;
|
|
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
|
isc_loop_t *loop = CURRENT_LOOP(loopmgr);
|
|
|
|
/*
|
|
* Remove the reclaimers from the thread count, so that the
|
|
* next grace period can start.
|
|
*/
|
|
if (fuzzy_barrier_not_yet(qsbr)) {
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* The reclaimers were the last thread to be counted out: every
|
|
* other thread already passed through a quiescent state.
|
|
*
|
|
* We expect loop->qsbr_phase == global_phase() at this point,
|
|
* except during shutdown when the phase shifts rapidly. Also,
|
|
* the current loop might not have received the shutdown
|
|
* message yet, so it seems easiest to omit the assertion.
|
|
*
|
|
* ATOMIC: relaxed, the fuzzy barrier already synchronized.
|
|
*/
|
|
TRACE("reclaimers overran");
|
|
phase_transition(loop, global_phase(qsbr, memory_order_relaxed));
|
|
}
|
|
|
|
static void
|
|
reclaim_cb(void *arg) {
|
|
/* we are on a work thread not a loop thread */
|
|
isc_loopmgr_t *loopmgr = arg;
|
|
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
|
|
|
/*
|
|
* The global phase has just been bumped by a `phase_transition()`
|
|
* and it cannot change again until the grace period is up, which
|
|
* cannot happen until we have finished working.
|
|
*
|
|
* ATOMIC: acquire, to pair with the release in `phase_transition()`.
|
|
*
|
|
* The phase we are to clean up is 2 before the current phase,
|
|
* which is the same as the one after the current phase (mod 3).
|
|
*/
|
|
isc_qsbr_phase_t cur_phase = global_phase(qsbr, memory_order_acquire);
|
|
isc_qsbr_phase_t third_phase = change_phase(cur_phase);
|
|
unsigned int third_bit = active_bit(third_phase);
|
|
|
|
/*
|
|
* If any reclaimers need to be called again later, they can use
|
|
* `isc_qsbr_activate()`, so we need to clear the bit first.
|
|
*
|
|
* ATOMIC: acquire, so that `isc_qsbr_activate()` happens before
|
|
* the callbacks are invoked.
|
|
*/
|
|
uint32_t activated = atomic_fetch_and_explicit(
|
|
&qsbr->activated, ~third_bit, memory_order_acquire);
|
|
|
|
/* this can happen when we are racing to clean up on shutdown */
|
|
if ((activated & third_bit) == 0) {
|
|
return;
|
|
}
|
|
|
|
isc_qsbr_registered_t *reclaimer = ISC_STACK_TOP(qsbreclaimers);
|
|
while (reclaimer != NULL) {
|
|
reclaimer->func(third_phase);
|
|
reclaimer = ISC_SLINK_NEXT(reclaimer, link);
|
|
}
|
|
}
|
|
|
|
void
|
|
isc__qsbr_register(isc_qsbr_registered_t *reclaimer) {
|
|
REQUIRE(reclaimer->func != NULL);
|
|
ISC_STACK_PUSH(qsbreclaimers, reclaimer, link);
|
|
}
|
|
|
|
/*
|
|
* ATOMIC: This function needs to ensure that the global phase is read
|
|
* after a write has committed. Acquire/release ordering is not sufficient
|
|
* for ordering between separate atomics (the data structure's root pointer
|
|
* and the global phase), so it must be sequentially consistent.
|
|
*
|
|
* In general, the phases up to and including the next phase transition
|
|
* look like:
|
|
*
|
|
* 1. local phase
|
|
* 2. global phase
|
|
* 3. next phase
|
|
* 1. third phase
|
|
*
|
|
* i.e. some threads are still one behind the global phase, on the same
|
|
* phase that will be cleaned up immediately after the phase transition.
|
|
*
|
|
* This function is called just after a write commits. It's likely that
|
|
* some threads on the global phase (2) are using a version of the data
|
|
* structure from before the write, and they can continue using it while
|
|
* the straggler threads (1) catch up and cause a phase transition.
|
|
*
|
|
* The writer can be one of the straggler threads. If it incorrectly marks
|
|
* cleanup work with its local phase (1), memory will be reclaimed
|
|
* immediately after the next phase transition (when the third phase is
|
|
* also 1), which could be almost immediately when the writer returns to
|
|
* the event loop. This will cause a use-after-free for existing readers
|
|
* (in phase 2).
|
|
*
|
|
* More straightforwardly, we need to be able to queue up reclaim work from
|
|
* a thread that isn't running a loop, which also means this function has
|
|
* to return the global phase.
|
|
*/
|
|
isc_qsbr_phase_t
|
|
isc_qsbr_phase(isc_loopmgr_t *loopmgr) {
|
|
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
|
return (global_phase(qsbr, memory_order_seq_cst));
|
|
}
|
|
|
|
void
|
|
isc_qsbr_activate(isc_loopmgr_t *loopmgr, isc_qsbr_phase_t phase) {
|
|
/*
|
|
* ATOMIC: release ordering ensures that writing the cleanup lists
|
|
* happens before the callback is invoked from a worker thread.
|
|
*/
|
|
atomic_fetch_or_release(&loopmgr->qsbr.activated, active_bit(phase));
|
|
}
|