mirror of
https://github.com/postgres/postgres.git
synced 2026-04-11 12:06:55 -04:00
This allows data checksums to be enabled, or disabled, in a running cluster without restricting access to the cluster during processing. Previously, data checksums could only be enabled during initdb or when the cluster was offline using the pg_checksums application. This commit introduces functionality to enable, or disable, data checksums while the cluster is running regardless of how it was initialized. A background worker launcher process is responsible for launching a dynamic per-database background worker which will mark all buffers dirty for all relations with storage in order for them to have data checksums calculated on write. Once all relations in all databases have been processed, the data_checksums state will be set to on and the cluster will at that point be identical to one which had data checksums enabled during initialization or via offline processing. When data checksums are being enabled, concurrent I/O operations from backends other than the data checksums worker will write the checksums but not verify them on reading. Only when all backends have absorbed the procsignalbarrier for setting data_checksums to on will they also start verifying checksums on reading. The same process is repeated during disabling; all backends write checksums but do not verify them until the barrier for setting the state to off has been absorbed by all. This in-progress state is used to ensure there are no false negatives (or positives) due to reading a checksum which is not in sync with the page. A new test module, test_checksums, is introduced with an extensive set of tests covering both online and offline data checksum mode changes. The tests which run concurrent pgbench during online processing are gated behind the PG_TEST_EXTRA flag due to being very expensive to run. Two levels of PG_TEST_EXTRA flags exist to turn on a subset of the expensive tests, or the full suite of multiple runs. 
This work is based on an earlier version of this patch which was reviewed by among others Heikki Linnakangas, Robert Haas, Andres Freund, Tomas Vondra, Michael Banck and Andrey Borodin. During the work on this new version, Tomas Vondra has given invaluable assistance with not only coding and reviewing but very in-depth testing. Author: Daniel Gustafsson <daniel@yesql.se> Author: Magnus Hagander <magnus@hagander.net> Co-authored-by: Tomas Vondra <tomas@vondra.me> Reviewed-by: Tomas Vondra <tomas@vondra.me> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi> Discussion: https://postgr.es/m/CABUevExz9hUUOLnJVr2kpw9Cx=o4MCr1SVKwbupzuxP7ckNutA@mail.gmail.com Discussion: https://postgr.es/m/20181030051643.elbxjww5jjgnjaxg@alap3.anarazel.de Discussion: https://postgr.es/m/CABUevEwE3urLtwxxqdgd5O2oQz9J717ZzMbh+ziCSa5YLLU_BA@mail.gmail.com
406 lines
11 KiB
C
406 lines
11 KiB
C
/* -------------------------------------------------------------------------
|
|
*
|
|
* pgstat_backend.c
|
|
* Implementation of backend statistics.
|
|
*
|
|
* This file contains the implementation of backend statistics. It is kept
|
|
* separate from pgstat.c to enforce the line between the statistics access /
|
|
* storage implementation and the details about individual types of
|
|
* statistics.
|
|
*
|
|
* This statistics kind uses a proc number as object ID for the hash table
|
|
* of pgstats. Entries are created each time a process is spawned, and are
|
|
* dropped when the process exits. These are not written to the pgstats file
|
|
* on disk. Pending statistics are managed without direct interactions with
|
|
* PgStat_EntryRef->pending, relying on PendingBackendStats instead so as it
|
|
* is possible to report data within critical sections.
|
|
*
|
|
* Copyright (c) 2001-2026, PostgreSQL Global Development Group
|
|
*
|
|
* IDENTIFICATION
|
|
* src/backend/utils/activity/pgstat_backend.c
|
|
* -------------------------------------------------------------------------
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "access/xlog.h"
|
|
#include "executor/instrument.h"
|
|
#include "storage/bufmgr.h"
|
|
#include "storage/proc.h"
|
|
#include "storage/procarray.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/pgstat_internal.h"
|
|
|
|
/*
 * Backend statistics counts waiting to be flushed out. These counters may be
 * reported within critical sections so we use static memory in order to avoid
 * memory allocation.
 */
static PgStat_BackendPending PendingBackendStats;

/*
 * True when PendingBackendStats holds I/O numbers that have not yet been
 * flushed to shared memory; cleared by pgstat_flush_backend_entry_io().
 */
static bool backend_has_iostats = false;

/*
 * WAL usage counters saved from pgWalUsage at the previous call to
 * pgstat_flush_backend(). This is used to calculate how much WAL usage
 * happens between pgstat_flush_backend() calls, by subtracting the
 * previous counters from the current ones.
 */
static WalUsage prevBackendWalUsage;
|
|
|
|
/*
 * Utility routines to report I/O stats for backends, kept here to avoid
 * exposing PendingBackendStats to the outside world.
 */

/*
 * Accumulate the time spent on one I/O operation into this backend's
 * pending statistics.
 *
 * The data is stored in static memory (PendingBackendStats), so this can
 * be called within a critical section; the pending counters are pushed to
 * shared memory later by pgstat_flush_backend().
 */
void
pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context,
								IOOp io_op, instr_time io_time)
{
	/* timing should only be reported when some form of I/O timing is on */
	Assert(track_io_timing || track_wal_io_timing);

	/* some backend types do not participate in backend statistics */
	if (!pgstat_tracks_backend_bktype(MyBackendType))
		return;

	Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));

	INSTR_TIME_ADD(PendingBackendStats.pending_io.pending_times[io_object][io_context][io_op],
				   io_time);

	/* remember that there is pending data to flush */
	backend_has_iostats = true;
	pgstat_report_fixed = true;
}
|
|
|
|
/*
 * Accumulate one I/O operation (count and byte amount) into this backend's
 * pending statistics.
 *
 * Like pgstat_count_backend_io_op_time(), the data lands in static memory
 * so this is safe to call within a critical section.
 */
void
pgstat_count_backend_io_op(IOObject io_object, IOContext io_context,
						   IOOp io_op, uint32 cnt, uint64 bytes)
{
	/* some backend types do not participate in backend statistics */
	if (!pgstat_tracks_backend_bktype(MyBackendType))
		return;

	Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));

	PendingBackendStats.pending_io.counts[io_object][io_context][io_op] += cnt;
	PendingBackendStats.pending_io.bytes[io_object][io_context][io_op] += bytes;

	/* remember that there is pending data to flush */
	backend_has_iostats = true;
	pgstat_report_fixed = true;
}
|
|
|
|
/*
|
|
* Returns statistics of a backend by proc number.
|
|
*/
|
|
PgStat_Backend *
|
|
pgstat_fetch_stat_backend(ProcNumber procNumber)
|
|
{
|
|
PgStat_Backend *backend_entry;
|
|
|
|
backend_entry = (PgStat_Backend *) pgstat_fetch_entry(PGSTAT_KIND_BACKEND,
|
|
InvalidOid, procNumber);
|
|
|
|
return backend_entry;
|
|
}
|
|
|
|
/*
|
|
* Returns statistics of a backend by pid.
|
|
*
|
|
* This routine includes sanity checks to ensure that the backend exists and
|
|
* is running. "bktype" can be optionally defined to return the BackendType
|
|
* of the backend whose statistics are returned.
|
|
*/
|
|
PgStat_Backend *
|
|
pgstat_fetch_stat_backend_by_pid(int pid, BackendType *bktype)
|
|
{
|
|
PGPROC *proc;
|
|
PgBackendStatus *beentry;
|
|
ProcNumber procNumber;
|
|
PgStat_Backend *backend_stats;
|
|
|
|
proc = BackendPidGetProc(pid);
|
|
if (bktype)
|
|
*bktype = B_INVALID;
|
|
|
|
/* this could be an auxiliary process */
|
|
if (!proc)
|
|
proc = AuxiliaryPidGetProc(pid);
|
|
|
|
if (!proc)
|
|
return NULL;
|
|
|
|
procNumber = GetNumberFromPGProc(proc);
|
|
|
|
beentry = pgstat_get_beentry_by_proc_number(procNumber);
|
|
if (!beentry)
|
|
return NULL;
|
|
|
|
/* check if the backend type tracks statistics */
|
|
if (!pgstat_tracks_backend_bktype(beentry->st_backendType))
|
|
return NULL;
|
|
|
|
/* if PID does not match, leave */
|
|
if (beentry->st_procpid != pid)
|
|
return NULL;
|
|
|
|
if (bktype)
|
|
*bktype = beentry->st_backendType;
|
|
|
|
/*
|
|
* Retrieve the entry. Note that "beentry" may be freed depending on the
|
|
* value of stats_fetch_consistency, so do not access it from this point.
|
|
*/
|
|
backend_stats = pgstat_fetch_stat_backend(procNumber);
|
|
if (!backend_stats)
|
|
{
|
|
if (bktype)
|
|
*bktype = B_INVALID;
|
|
return NULL;
|
|
}
|
|
|
|
return backend_stats;
|
|
}
|
|
|
|
/*
|
|
* Flush out locally pending backend IO statistics. Locking is managed
|
|
* by the caller.
|
|
*/
|
|
static void
|
|
pgstat_flush_backend_entry_io(PgStat_EntryRef *entry_ref)
|
|
{
|
|
PgStatShared_Backend *shbackendent;
|
|
PgStat_BktypeIO *bktype_shstats;
|
|
PgStat_PendingIO pending_io;
|
|
|
|
/*
|
|
* This function can be called even if nothing at all has happened for IO
|
|
* statistics. In this case, avoid unnecessarily modifying the stats
|
|
* entry.
|
|
*/
|
|
if (!backend_has_iostats)
|
|
return;
|
|
|
|
shbackendent = (PgStatShared_Backend *) entry_ref->shared_stats;
|
|
bktype_shstats = &shbackendent->stats.io_stats;
|
|
pending_io = PendingBackendStats.pending_io;
|
|
|
|
for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
|
|
{
|
|
for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
|
|
{
|
|
for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
|
|
{
|
|
instr_time time;
|
|
|
|
bktype_shstats->counts[io_object][io_context][io_op] +=
|
|
pending_io.counts[io_object][io_context][io_op];
|
|
bktype_shstats->bytes[io_object][io_context][io_op] +=
|
|
pending_io.bytes[io_object][io_context][io_op];
|
|
time = pending_io.pending_times[io_object][io_context][io_op];
|
|
|
|
bktype_shstats->times[io_object][io_context][io_op] +=
|
|
INSTR_TIME_GET_MICROSEC(time);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Clear out the statistics buffer, so it can be re-used.
|
|
*/
|
|
MemSet(&PendingBackendStats.pending_io, 0, sizeof(PgStat_PendingIO));
|
|
|
|
backend_has_iostats = false;
|
|
}
|
|
|
|
/*
|
|
* To determine whether WAL usage happened.
|
|
*/
|
|
static inline bool
|
|
pgstat_backend_wal_have_pending(void)
|
|
{
|
|
return (pgWalUsage.wal_records != prevBackendWalUsage.wal_records);
|
|
}
|
|
|
|
/*
|
|
* Flush out locally pending backend WAL statistics. Locking is managed
|
|
* by the caller.
|
|
*/
|
|
static void
|
|
pgstat_flush_backend_entry_wal(PgStat_EntryRef *entry_ref)
|
|
{
|
|
PgStatShared_Backend *shbackendent;
|
|
PgStat_WalCounters *bktype_shstats;
|
|
WalUsage wal_usage_diff = {0};
|
|
|
|
/*
|
|
* This function can be called even if nothing at all has happened for WAL
|
|
* statistics. In this case, avoid unnecessarily modifying the stats
|
|
* entry.
|
|
*/
|
|
if (!pgstat_backend_wal_have_pending())
|
|
return;
|
|
|
|
shbackendent = (PgStatShared_Backend *) entry_ref->shared_stats;
|
|
bktype_shstats = &shbackendent->stats.wal_counters;
|
|
|
|
/*
|
|
* Calculate how much WAL usage counters were increased by subtracting the
|
|
* previous counters from the current ones.
|
|
*/
|
|
WalUsageAccumDiff(&wal_usage_diff, &pgWalUsage, &prevBackendWalUsage);
|
|
|
|
#define WALSTAT_ACC(fld, var_to_add) \
|
|
(bktype_shstats->fld += var_to_add.fld)
|
|
WALSTAT_ACC(wal_buffers_full, wal_usage_diff);
|
|
WALSTAT_ACC(wal_records, wal_usage_diff);
|
|
WALSTAT_ACC(wal_fpi, wal_usage_diff);
|
|
WALSTAT_ACC(wal_bytes, wal_usage_diff);
|
|
WALSTAT_ACC(wal_fpi_bytes, wal_usage_diff);
|
|
#undef WALSTAT_ACC
|
|
|
|
/*
|
|
* Save the current counters for the subsequent calculation of WAL usage.
|
|
*/
|
|
prevBackendWalUsage = pgWalUsage;
|
|
}
|
|
|
|
/*
|
|
* Flush out locally pending backend statistics
|
|
*
|
|
* "flags" parameter controls which statistics to flush. Returns true
|
|
* if some statistics could not be flushed due to lock contention.
|
|
*/
|
|
bool
|
|
pgstat_flush_backend(bool nowait, uint32 flags)
|
|
{
|
|
PgStat_EntryRef *entry_ref;
|
|
bool has_pending_data = false;
|
|
|
|
if (!pgstat_tracks_backend_bktype(MyBackendType))
|
|
return false;
|
|
|
|
/* Some IO data pending? */
|
|
if ((flags & PGSTAT_BACKEND_FLUSH_IO) && backend_has_iostats)
|
|
has_pending_data = true;
|
|
|
|
/* Some WAL data pending? */
|
|
if ((flags & PGSTAT_BACKEND_FLUSH_WAL) &&
|
|
pgstat_backend_wal_have_pending())
|
|
has_pending_data = true;
|
|
|
|
if (!has_pending_data)
|
|
return false;
|
|
|
|
entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_BACKEND, InvalidOid,
|
|
MyProcNumber, nowait);
|
|
if (!entry_ref)
|
|
return true;
|
|
|
|
/* Flush requested statistics */
|
|
if (flags & PGSTAT_BACKEND_FLUSH_IO)
|
|
pgstat_flush_backend_entry_io(entry_ref);
|
|
|
|
if (flags & PGSTAT_BACKEND_FLUSH_WAL)
|
|
pgstat_flush_backend_entry_wal(entry_ref);
|
|
|
|
pgstat_unlock_entry(entry_ref);
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
 * Callback to flush out locally pending backend statistics.
 *
 * Flushes all statistics kinds tracked for this backend (see
 * pgstat_flush_backend()).  If some stats could not be flushed due to lock
 * contention, return true.
 */
bool
pgstat_backend_flush_cb(bool nowait)
{
	return pgstat_flush_backend(nowait, PGSTAT_BACKEND_FLUSH_ALL);
}
|
|
|
|
/*
|
|
* Create backend statistics entry for proc number.
|
|
*/
|
|
void
|
|
pgstat_create_backend(ProcNumber procnum)
|
|
{
|
|
PgStat_EntryRef *entry_ref;
|
|
PgStatShared_Backend *shstatent;
|
|
|
|
entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_BACKEND, InvalidOid,
|
|
procnum, false);
|
|
shstatent = (PgStatShared_Backend *) entry_ref->shared_stats;
|
|
|
|
/*
|
|
* NB: need to accept that there might be stats from an older backend,
|
|
* e.g. if we previously used this proc number.
|
|
*/
|
|
memset(&shstatent->stats, 0, sizeof(shstatent->stats));
|
|
pgstat_unlock_entry(entry_ref);
|
|
|
|
MemSet(&PendingBackendStats, 0, sizeof(PgStat_BackendPending));
|
|
backend_has_iostats = false;
|
|
|
|
/*
|
|
* Initialize prevBackendWalUsage with pgWalUsage so that
|
|
* pgstat_backend_flush_cb() can calculate how much pgWalUsage counters
|
|
* are increased by subtracting prevBackendWalUsage from pgWalUsage.
|
|
*/
|
|
prevBackendWalUsage = pgWalUsage;
|
|
}
|
|
|
|
/*
|
|
* Backend statistics are not collected for all BackendTypes.
|
|
*
|
|
* The following BackendTypes do not participate in the backend stats
|
|
* subsystem:
|
|
* - The same and for the same reasons as in pgstat_tracks_io_bktype().
|
|
* - B_BG_WRITER, B_CHECKPOINTER, B_STARTUP and B_AUTOVAC_LAUNCHER because their
|
|
* I/O stats are already visible in pg_stat_io and there is only one of those.
|
|
*
|
|
* Function returns true if BackendType participates in the backend stats
|
|
* subsystem and false if it does not.
|
|
*
|
|
* When adding a new BackendType, also consider adding relevant restrictions to
|
|
* pgstat_tracks_io_object() and pgstat_tracks_io_op().
|
|
*/
|
|
bool
|
|
pgstat_tracks_backend_bktype(BackendType bktype)
|
|
{
|
|
/*
|
|
* List every type so that new backend types trigger a warning about
|
|
* needing to adjust this switch.
|
|
*/
|
|
switch (bktype)
|
|
{
|
|
case B_INVALID:
|
|
case B_AUTOVAC_LAUNCHER:
|
|
case B_DEAD_END_BACKEND:
|
|
case B_ARCHIVER:
|
|
case B_LOGGER:
|
|
case B_BG_WRITER:
|
|
case B_CHECKPOINTER:
|
|
case B_IO_WORKER:
|
|
case B_STARTUP:
|
|
case B_DATACHECKSUMSWORKER_LAUNCHER:
|
|
case B_DATACHECKSUMSWORKER_WORKER:
|
|
return false;
|
|
|
|
case B_AUTOVAC_WORKER:
|
|
case B_BACKEND:
|
|
case B_BG_WORKER:
|
|
case B_STANDALONE_BACKEND:
|
|
case B_SLOTSYNC_WORKER:
|
|
case B_WAL_RECEIVER:
|
|
case B_WAL_SENDER:
|
|
case B_WAL_SUMMARIZER:
|
|
case B_WAL_WRITER:
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
void
|
|
pgstat_backend_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts)
|
|
{
|
|
((PgStatShared_Backend *) header)->stats.stat_reset_timestamp = ts;
|
|
}
|