diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 422ba304982..d3fea738ca3 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -7187,6 +7187,7 @@ local0.* /var/log/postgresql bgworker bgwriter checkpointer + checksums ioworker postmaster slotsyncworker diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml index 210b1118bdf..24ecb46542e 100644 --- a/doc/src/sgml/func/func-admin.sgml +++ b/doc/src/sgml/func/func-admin.sgml @@ -3123,4 +3123,82 @@ SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8'); + + Data Checksum Functions + + + The functions shown in can + be used to enable or disable data checksums in a running cluster. + + + Changing data checksums can be done in a cluster with concurrent activity + without blocking queries, but overall system performance will be affected. + See for further details on how changing the + data checksums state can affect a system and possible mitigations for how + to reduce the impact. + + + + Data Checksum Functions + + + + + Function + + + Description + + + + + + + + + pg_enable_data_checksums + + pg_enable_data_checksums ( cost_delay int, cost_limit int ) + void + + + Initiates the process of enabling data checksums for the cluster. This + will set the data checksums state to inprogress-on + as well as start a background worker that will process all pages in all + databases and enable data checksums on them. When all pages have + been processed, the cluster will automatically set data checksums state + to on. This operation is WAL logged and replicated + to all standby nodes. + + + If cost_delay and cost_limit are + specified, the process is throttled using the same principles as + Cost-based Vacuum Delay. + + + + + + + + pg_disable_data_checksums + + pg_disable_data_checksums () + void + + + Disables data checksum calculation and validation for the cluster. 
This + will set the data checksum state to inprogress-off + while data checksums are being disabled. When all active backends have + stopped validating data checksums, the data checksum state will be + set to off. + + + + + +
+ +
+ diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml index 113d7640626..b881ae71198 100644 --- a/doc/src/sgml/glossary.sgml +++ b/doc/src/sgml/glossary.sgml @@ -199,6 +199,8 @@ (but not the autovacuum workers), the background writer, the checkpointer, + the data checksums worker, + the data checksums worker launcher, the logger, the startup process, the WAL archiver, @@ -574,6 +576,28 @@ + + Data Checksums Worker + + + A background worker + which enables data checksums in a specific database. + + + + + + Data Checksums Worker Launcher + + + A background worker + which starts data + checksum worker processes for enabling data checksums in each + database, or disables data checksums cluster-wide. + + + + Database diff --git a/doc/src/sgml/images/Makefile b/doc/src/sgml/images/Makefile index 38f8869d78d..7b8ac0fbb32 100644 --- a/doc/src/sgml/images/Makefile +++ b/doc/src/sgml/images/Makefile @@ -3,6 +3,7 @@ # see README in this directory about image handling ALL_IMAGES = \ + datachecksums.svg \ genetic-algorithm.svg \ gin.svg \ pagelayout.svg \ diff --git a/doc/src/sgml/images/datachecksums.gv b/doc/src/sgml/images/datachecksums.gv new file mode 100644 index 00000000000..dff3ff7340a --- /dev/null +++ b/doc/src/sgml/images/datachecksums.gv @@ -0,0 +1,14 @@ +digraph G { + A -> B [label="SELECT pg_enable_data_checksums()"]; + B -> C; + D -> A; + C -> D [label="SELECT pg_disable_data_checksums()"]; + E -> A [label=" --no-data-checksums"]; + E -> C [label=" --data-checksums"]; + + A [label="off"]; + B [label="inprogress-on"]; + C [label="on"]; + D [label="inprogress-off"]; + E [label="initdb"]; +} diff --git a/doc/src/sgml/images/datachecksums.svg b/doc/src/sgml/images/datachecksums.svg new file mode 100644 index 00000000000..8c58f42922e --- /dev/null +++ b/doc/src/sgml/images/datachecksums.svg @@ -0,0 +1,81 @@ + + + + + +G + + + +A + +off + + + +B + +inprogress-on + + + +A->B + + +SELECT pg_enable_data_checksums() + + + +C + +on + + + +B->C + + + + + +D 
+ +inprogress-off + + + +C->D + + +SELECT pg_disable_data_checksums() + + + +D->A + + + + + +E + +initdb + + + +E->A + + + --no-data-checksums + + + +E->C + + + --data-checksums + + + diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index bb75ed1069b..312374da5e0 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -3885,9 +3885,14 @@ description | Waiting for a newly initialized WAL file to reach durable storage Number of data page checksum failures detected in this - database (or on a shared object), or NULL if data checksums are - disabled. - + database (or on a shared object). Detected failures are not reset if + the setting changes. Clusters + which are initialized without data checksums will show this as + 0. In PostgreSQL version + 18 and earlier, this was set to NULL for clusters + with data checksums disabled. + + @@ -3896,8 +3901,8 @@ description | Waiting for a newly initialized WAL file to reach durable storage Time at which the last data page checksum failure was detected in - this database (or on a shared object), or NULL if data checksums are - disabled. + this database (or on a shared object). Last failure is reported + regardless of the setting. @@ -7634,6 +7639,219 @@ FROM pg_stat_get_backend_idset() AS backendid; + + Data Checksum Progress Reporting + + + pg_stat_progress_data_checksums + + + + When data checksums are being enabled on a running cluster, the + pg_stat_progress_data_checksums view will contain + a row for the launcher process, and one row for each worker process which + is currently calculating and writing checksums for the data pages in a database. + The launcher provides overview of the overall progress (how many databases + have been processed, how many remain), while the workers track progress for + currently processed databases. 
+ + + + <structname>pg_stat_progress_data_checksums</structname> View + + + + + + Column Type + + + Description + + + + + + + + + + pid integer + + + Process ID of the data checksum process, launcher or worker. + + + + + + + + datid oid + + + OID of this database, or 0 for the launcher process. + + + + + + + + datname name + + + Name of this database, or NULL for the + launcher process. + + + + + + + + phase text + + + Current processing phase, see + for description of the phases. + + + + + + + + databases_total integer + + + The total number of databases which will be processed. Only the + launcher process has this value set, the worker processes have this + set to NULL. + + + + + + + + databases_done integer + + + The number of databases which have been processed. Only the launcher + process has this value set, the worker processes have this set to + NULL. + + + + + + + + relations_total integer + + + The total number of relations which will be processed, or + NULL if the worker process hasn't + calculated the number of relations yet. The launcher process has + this set to NULL since it isn't responsible for + processing relations, only launching worker processes. + + + + + + + + relations_done integer + + + The number of relations which have been processed. The launcher + process has this set to NULL. + + + + + + + + blocks_total integer + + + The number of blocks in the current relation which will be processed, + or NULL if the worker process hasn't + calculated the number of blocks yet. The launcher process has + this set to NULL. + + + + + + + + blocks_done integer + + + The number of blocks in the current relation which have been processed. + The launcher process has this set to NULL. + + + + + + +
+ + + Data Checksum Phases + + + + + + Phase + Description + + + + + enabling + + The command is currently enabling data checksums on the cluster. + + + + disabling + + The command is currently disabling data checksums on the cluster. + + + + done + + The command is done and the data checksum state in the cluster has + changed. + + + + waiting on barrier + + The command is currently waiting for the current active backends to + acknowledge the change in data checksum state. + + + + waiting on temporary tables + + The command is currently waiting for all temporary tables which existed + at the time the command was started to be removed. + + + + +
+
+ diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml index b64393c813f..45890324075 100644 --- a/doc/src/sgml/ref/pg_checksums.sgml +++ b/doc/src/sgml/ref/pg_checksums.sgml @@ -45,6 +45,12 @@ PostgreSQL documentation exit status is nonzero if the operation failed. + + When enabling checksums, if checksums were in the process of being enabled + when the cluster was shut down, pg_checksums + will still process all relations regardless of the online processing. + + When verifying checksums, every file in the cluster is scanned. When enabling checksums, each relation file block with a changed checksum is diff --git a/doc/src/sgml/regress.sgml b/doc/src/sgml/regress.sgml index 873387ec168..c74941bfbf2 100644 --- a/doc/src/sgml/regress.sgml +++ b/doc/src/sgml/regress.sgml @@ -275,6 +275,20 @@ make check-world PG_TEST_EXTRA='kerberos ldap ssl load_balance libpq_encryption' The following values are currently supported: + + checksum, checksum_extended + + + Runs additional tests for enabling data checksums which inject faults + to cause re-tries in the processing, as well as tests that run pgbench + concurrently and randomly restarts the cluster. Some of these test + suites require injection points enabled in the installation. + checksum_extended is an extended version with + longer runtime, injected random delays and larger datasets. + + + + kerberos diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index f3b86b26be9..165af8a0cf2 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -246,9 +246,10 @@ Checksums can be disabled when the cluster is initialized using initdb. - They can also be enabled or disabled at a later time as an offline - operation. Data checksums are enabled or disabled at the full cluster - level, and cannot be specified individually for databases or tables. 
+ They can also be enabled or disabled at a later time either as an offline + operation or online in a running cluster allowing concurrent access. Data + checksums are enabled or disabled at the full cluster level, and cannot be + specified individually for databases, tables or replicated cluster members. @@ -265,7 +266,7 @@ - Off-line Enabling of Checksums + Offline Enabling of Checksums The pg_checksums @@ -274,6 +275,123 @@ + + + Online Enabling of Checksums + + Checksums can be enabled or disabled online, by calling the appropriate + functions. + + + Both enabling and disabling data checksums happen in two phases, separated + by a checkpoint to ensure durability. The different states, and their + transitions, are illustrated in + and discussed in further detail in this section. + + +
+ data checksums states + + + + + +
+
+ + + Enabling checksums will set the cluster checksum state to + inprogress-on. During this time, checksums will be + written but not verified. In addition to this, a background worker process + is started that enables checksums on all existing data in the cluster. Once + this worker has completed processing all databases in the cluster, the + checksum state will automatically switch to on. The + processing will consume two background worker processes; make sure that + max_worker_processes allows for at least two + additional processes. + + + The process will initially wait for all open transactions to finish before + it starts, so that it can be certain that there are no tables that have been + created inside a transaction that has not committed yet and thus would not + be visible to the process enabling checksums. It will also, for each database, + wait for all pre-existing temporary tables to get removed before it finishes. + If long-lived temporary tables are used in an application it may be necessary + to terminate these application connections to allow the process to complete. + + + If the cluster is stopped while in inprogress-on state, + for any reason, or processing was interrupted, then the checksum enable + process must be restarted manually. To do this, re-execute the function + pg_enable_data_checksums() once the cluster has been + restarted. The process will start over; there is no support for resuming + work from where it was interrupted. If the cluster is stopped while in + inprogress-off, then the checksum state will be set to + off when the cluster is restarted. + + + Disabling data checksums will set the data checksum state to + inprogress-off. During this time, checksums will be + written but not verified. After all processes acknowledge the change, + the state will automatically be set to off. + + + Disabling data checksums while data checksums are actively being enabled + will abort the current processing. 
+ + + + Impact on system of online operations + + Enabling data checksums can cause significant I/O to the system, as all of the + database pages will need to be rewritten, and will be written both to the + data files and the WAL. The impact may be limited by throttling using the + cost_delay and cost_limit + parameters of the pg_enable_data_checksums() function. + + + + + + I/O: all pages need to have data checksums calculated and written which + will generate a lot of dirty pages that will need to be flushed to disk, + as well as WAL logged. + + + Replication: When the standby receives the data checksum state change + in the WAL stream it will issue a + restartpoint in order to flush the current state into the + pg_control file. The restartpoint will flush the + current state to disk and will block redo until finished. This in turn + will induce replication lag, which on synchronous standbys also blocks + the primary. Reducing before the + process is started can help with reducing the time it takes for the + restartpoint to finish. + + + Shutdown/Restart: If the server is shut down or restarted when data + checksums are being enabled, the process will not resume and all pages + need to be recalculated and rewritten. Enabling data checksums should + be done when there is no need for regular maintenance or during a + service window. + + + + + + No I/O is incurred when disabling data checksums, but checkpoints are + still required. + + + +
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 44194d3ea17..2468a7d2578 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -18,6 +18,7 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/pg_control.h" +#include "storage/checksum.h" #include "utils/guc.h" #include "utils/timestamp.h" @@ -54,6 +55,40 @@ get_wal_level_string(int wal_level) return wal_level_str; } +const char * +get_checksum_state_string(uint32 state) +{ + switch (state) + { + case PG_DATA_CHECKSUM_VERSION: + return "on"; + case PG_DATA_CHECKSUM_INPROGRESS_OFF: + return "inprogress-off"; + case PG_DATA_CHECKSUM_INPROGRESS_ON: + return "inprogress-on"; + case PG_DATA_CHECKSUM_OFF: + return "off"; + } + + Assert(false); + return "?"; +} + +void +xlog2_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG2_CHECKSUMS) + { + xl_checksum_state xlrec; + + memcpy(&xlrec, rec, sizeof(xl_checksum_state)); + appendStringInfoString(buf, get_checksum_state_string(xlrec.new_checksum_state)); + } +} + void xlog_desc(StringInfo buf, XLogReaderState *record) { @@ -69,7 +104,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) "tli %u; prev tli %u; fpw %s; wal_level %s; logical decoding %s; xid %u:%u; oid %u; multi %u; offset %" PRIu64 "; " "oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " - "oldest running xid %u; %s", + "oldest running xid %u; " + "checksums %s; %s", LSN_FORMAT_ARGS(checkpoint->redo), checkpoint->ThisTimeLineID, checkpoint->PrevTimeLineID, @@ -88,6 +124,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) checkpoint->oldestCommitTsXid, checkpoint->newestCommitTsXid, checkpoint->oldestActiveXid, + get_checksum_state_string(checkpoint->dataChecksumState), (info == XLOG_CHECKPOINT_SHUTDOWN) ? 
"shutdown" : "online"); } else if (info == XLOG_NEXTOID) @@ -166,7 +203,9 @@ xlog_desc(StringInfo buf, XLogReaderState *record) xl_checkpoint_redo xlrec; memcpy(&xlrec, rec, sizeof(xl_checkpoint_redo)); - appendStringInfo(buf, "wal_level %s", get_wal_level_string(xlrec.wal_level)); + appendStringInfo(buf, "wal_level %s; checksums %s", + get_wal_level_string(xlrec.wal_level), + get_checksum_state_string(xlrec.data_checksum_version)); } else if (info == XLOG_LOGICAL_DECODING_STATUS_CHANGE) { @@ -241,6 +280,21 @@ xlog_identify(uint8 info) return id; } +const char * +xlog2_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG2_CHECKSUMS: + id = "CHECKSUMS"; + break; + } + + return id; +} + /* * Returns a string giving information about all the blocks in an * XLogRecord. diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 2c1c6f88b74..9e8999bbb61 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -75,6 +75,7 @@ #include "pgstat.h" #include "port/atomics.h" #include "postmaster/bgwriter.h" +#include "postmaster/datachecksum_state.h" #include "postmaster/startup.h" #include "postmaster/walsummarizer.h" #include "postmaster/walwriter.h" @@ -92,6 +93,7 @@ #include "storage/predicate.h" #include "storage/proc.h" #include "storage/procarray.h" +#include "storage/procsignal.h" #include "storage/reinit.h" #include "storage/spin.h" #include "storage/sync.h" @@ -553,6 +555,9 @@ typedef struct XLogCtlData */ XLogRecPtr lastFpwDisableRecPtr; + /* last data_checksum_version we've seen */ + uint32 data_checksum_version; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -650,6 +655,21 @@ static XLogRecPtr LocalMinRecoveryPoint; static TimeLineID LocalMinRecoveryPointTLI; static bool updateMinRecoveryPoint = true; +/* + * Local state for Controlfile data_checksum_version. 
After initialization + * this is only updated when absorbing a procsignal barrier during interrupt + * processing. The reason for keeping a copy in backend-private memory is to + * avoid locking for interrogating the data checksum state. Possible values + * are the data checksum versions defined in storage/checksum.h. + */ +static ChecksumStateType LocalDataChecksumState = 0; + +/* + * Variable backing the GUC, keep it in sync with LocalDataChecksumState. + * See SetLocalDataChecksumState(). + */ +int data_checksums = 0; + /* For WALInsertLockAcquire/Release functions */ static int MyLockNo = 0; static bool holdingAllLocks = false; @@ -717,6 +737,8 @@ static void WALInsertLockAcquireExclusive(void); static void WALInsertLockRelease(void); static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); +static void XLogChecksums(uint32 new_type); + /* * Insert an XLOG record represented by an already-constructed chain of data * chunks. This is a low-level routine; to construct the WAL record header @@ -4254,6 +4276,12 @@ InitControlFile(uint64 sysidentifier, uint32 data_checksum_version) ControlFile->wal_log_hints = wal_log_hints; ControlFile->track_commit_timestamp = track_commit_timestamp; ControlFile->data_checksum_version = data_checksum_version; + + /* + * Set the data_checksum_version value into XLogCtl, which is where all + * processes get the current value from. + */ + XLogCtl->data_checksum_version = data_checksum_version; } static void @@ -4588,10 +4616,6 @@ ReadControlFile(void) (SizeOfXLogLongPHD - SizeOfXLogShortPHD); CalculateCheckpointSegments(); - - /* Make the initdb settings visible as GUC variables, too */ - SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no", - PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); } /* @@ -4625,13 +4649,323 @@ GetMockAuthenticationNonce(void) } /* - * Are checksums enabled for data pages? 
+ * DataChecksumsNeedWrite + * Returns whether data checksums must be written or not + * + * Returns true if data checksums are enabled, or are in the process of being + * enabled. During "inprogress-on" and "inprogress-off" states checksums must + * be written even though they are not verified (see datachecksum_state.c for + * a longer discussion). + * + * This function is intended for callsites which are about to write a data page + * to storage, and need to know whether to re-calculate the checksum for the + * page header. Calling this function must be performed as close to the write + * operation as possible to keep the critical section short. */ bool -DataChecksumsEnabled(void) +DataChecksumsNeedWrite(void) { + return (LocalDataChecksumState == PG_DATA_CHECKSUM_VERSION || + LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_ON || + LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_OFF); +} + +bool +DataChecksumsInProgressOn(void) +{ + return LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_ON; +} + +/* + * DataChecksumsNeedVerify + * Returns whether data checksums must be verified or not + * + * Data checksums are only verified if they are fully enabled in the cluster. + * During the "inprogress-on" and "inprogress-off" states they are only + * updated, not verified (see datachecksum_state.c for a longer discussion). + * + * This function is intended for callsites which have read data and are about + * to perform checksum validation based on the result of this. Calling this + * function must be performed as close to the validation call as possible to + * keep the critical section short. This is in order to protect against time of + * check/time of use situations around data checksum validation. 
+ */ +bool +DataChecksumsNeedVerify(void) +{ + return (LocalDataChecksumState == PG_DATA_CHECKSUM_VERSION); +} + +/* + * SetDataChecksumsOnInProgress + * Sets the data checksum state to "inprogress-on" to enable checksums + * + * To start the process of enabling data checksums in a running cluster the + * data_checksum_version state must be changed to "inprogress-on". See + * SetDataChecksumsOn below for a description on how this state change works. + * This function blocks until all backends in the cluster have acknowledged the + * state transition. + */ +void +SetDataChecksumsOnInProgress(void) +{ + uint64 barrier; + Assert(ControlFile != NULL); - return (ControlFile->data_checksum_version > 0); + + /* + * The state transition is performed in a critical section with + * checkpoints held off to provide crash safety. + */ + START_CRIT_SECTION(); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_ON); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON); + + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + + /* + * Update the controlfile before waiting since if we have an immediate + * shutdown while waiting we want to come back up with checksums enabled. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* + * Await state change in all backends to ensure that all backends are in + * "inprogress-on". Once done we know that all backends are writing data + * checksums. 
+ */ + WaitForProcSignalBarrier(barrier); +} + +/* + * SetDataChecksumsOn + * Set data checksums state to 'on' cluster-wide + * + * Enabling data checksums is performed using two barriers, the first one to + * set the state to "inprogress-on" (done by SetDataChecksumsOnInProgress()) + * and the second one to set the state to "on" (done here). Below is a short + * description of the processing, a more detailed write-up can be found in + * datachecksum_state.c. + * + * To start the process of enabling data checksums in a running cluster the + * data_checksum_version state must be changed to "inprogress-on". This state + * requires data checksums to be written but not verified. This ensures that + * all data pages can be checksummed without the risk of false negatives in + * validation during the process. When all existing pages are guaranteed to + * have checksums, and all new pages will be initiated with checksums, the + * state can be changed to "on". Once the state is "on" checksums will be both + * written and verified. + * + * This function blocks until all backends in the cluster have acknowledged the + * state transition. + */ +void +SetDataChecksumsOn(void) +{ + uint64 barrier; + + Assert(ControlFile != NULL); + + SpinLockAcquire(&XLogCtl->info_lck); + + /* + * The only allowed state transition to "on" is from "inprogress-on" since + * that state ensures that all pages will have data checksums written. No + * such state transition exists, if it does happen it's likely due to a + * programmer error. 
+ */ + if (XLogCtl->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON) + { + SpinLockRelease(&XLogCtl->info_lck); + elog(WARNING, + "cannot set data checksums to \"on\", current state is not \"inprogress-on\", disabling"); + SetDataChecksumsOff(); + return; + } + + SpinLockRelease(&XLogCtl->info_lck); + + INJECTION_POINT("datachecksums-enable-checksums-delay", NULL); + START_CRIT_SECTION(); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + XLogChecksums(PG_DATA_CHECKSUM_VERSION); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_VERSION; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON); + + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + + /* + * Update the controlfile before waiting since if we have an immediate + * shutdown while waiting we want to come back up with checksums enabled. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_VERSION; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST); + + /* + * Await state transition to "on" in all backends. When done we know that + * data checksums are both written and verified in all backends. + */ + WaitForProcSignalBarrier(barrier); +} + +/* + * SetDataChecksumsOff + * Disables data checksums cluster-wide + * + * Disabling data checksums must be performed with two sets of barriers, each + * carrying a different state. The state is first set to "inprogress-off" + * during which checksums are still written but not verified. This ensures that + * backends which have yet to observe the state change from "on" won't get + * validation errors on concurrently modified pages. Once all backends have + * changed to "inprogress-off", the barrier for moving to "off" can be emitted. 
+ * This function blocks until all backends in the cluster have acknowledged the + * state transition. + */ +void +SetDataChecksumsOff(void) +{ + uint64 barrier; + + Assert(ControlFile != NULL); + + SpinLockAcquire(&XLogCtl->info_lck); + + /* If data checksums are already disabled there is nothing to do */ + if (XLogCtl->data_checksum_version == 0) + { + SpinLockRelease(&XLogCtl->info_lck); + return; + } + + /* + * If data checksums are currently enabled we first transition to the + * "inprogress-off" state during which backends continue to write + * checksums without verifying them. When all backends are in + * "inprogress-off" the next transition to "off" can be performed, after + * which all data checksum processing is disabled. + */ + if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_VERSION) + { + SpinLockRelease(&XLogCtl->info_lck); + + START_CRIT_SECTION(); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_OFF); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF); + + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_OFF; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST); + + /* + * Update local state in all backends to ensure that any backend in + * "on" state is changed to "inprogress-off". + */ + WaitForProcSignalBarrier(barrier); + + /* + * At this point we know that no backends are verifying data checksums + * during reading. Next, we can safely move to state "off" to also + * stop writing checksums. 
+ */ + } + else + { + /* + * Ending up here implies that the checksums state is "inprogress-on" + * or "inprogress-off" and we can transition directly to "off" from + * there. + */ + SpinLockRelease(&XLogCtl->info_lck); + } + + START_CRIT_SECTION(); + /* Ensure that we don't incur a checkpoint during disabling checksums */ + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + XLogChecksums(PG_DATA_CHECKSUM_OFF); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = 0; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF); + + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_OFF; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST); + + WaitForProcSignalBarrier(barrier); +} + +/* + * InitLocalDataChecksumState + * + * Set up backend local caches of controldata variables which may change at + * any point during runtime and thus require special cased locking. So far + * this only applies to data_checksum_version, but it's intended to be general + * purpose enough to handle future cases. 
+ */ +void +InitLocalDataChecksumState(void) +{ + SpinLockAcquire(&XLogCtl->info_lck); + SetLocalDataChecksumState(XLogCtl->data_checksum_version); + SpinLockRelease(&XLogCtl->info_lck); +} + +void +SetLocalDataChecksumState(uint32 data_checksum_version) +{ + LocalDataChecksumState = data_checksum_version; + + data_checksums = data_checksum_version; +} + +/* guc hook */ +const char * +show_data_checksums(void) +{ + return get_checksum_state_string(LocalDataChecksumState); } /* @@ -4925,6 +5259,7 @@ LocalProcessControlFile(bool reset) Assert(reset || ControlFile == NULL); ControlFile = palloc_object(ControlFileData); ReadControlFile(); + SetLocalDataChecksumState(ControlFile->data_checksum_version); } /* @@ -5094,6 +5429,11 @@ XLOGShmemInit(void) XLogCtl->InstallXLogFileSegmentActive = false; XLogCtl->WalWriterSleeping = false; + /* Use the checksum info from control file */ + XLogCtl->data_checksum_version = ControlFile->data_checksum_version; + + SetLocalDataChecksumState(XLogCtl->data_checksum_version); + SpinLockInit(&XLogCtl->Insert.insertpos_lck); SpinLockInit(&XLogCtl->info_lck); pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr); @@ -5168,6 +5508,7 @@ BootStrapXLOG(uint32 data_checksum_version) checkPoint.newestCommitTsXid = InvalidTransactionId; checkPoint.time = (pg_time_t) time(NULL); checkPoint.oldestActiveXid = InvalidTransactionId; + checkPoint.dataChecksumState = data_checksum_version; TransamVariables->nextXid = checkPoint.nextXid; TransamVariables->nextOid = checkPoint.nextOid; @@ -6244,6 +6585,47 @@ StartupXLOG(void) pfree(endOfRecoveryInfo->recoveryStopReason); pfree(endOfRecoveryInfo); + /* + * If we reach this point with checksums in the state inprogress-on, it + * means that data checksums were in the process of being enabled when the + * cluster shut down. Since processing didn't finish, the operation will + * have to be restarted from scratch since there is no capability to + * continue where it was when the cluster shut down. 
Thus, revert the + * state back to off, and inform the user with a warning message. Being + * able to restart processing is a TODO, but it wouldn't be possible to + * restart here since we cannot launch a dynamic background worker + * directly from here (it has to be from a regular backend). + */ + if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON) + { + XLogChecksums(PG_DATA_CHECKSUM_OFF); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = 0; + SetLocalDataChecksumState(XLogCtl->data_checksum_version); + SpinLockRelease(&XLogCtl->info_lck); + + ereport(WARNING, + errmsg("enabling data checksums was interrupted"), + errhint("Data checksum processing must be manually restarted for checksums to be enabled")); + } + + /* + * If data checksums were being disabled when the cluster was shut down, + * we know that we have a state where all backends have stopped validating + * checksums and we can move to off instead of prompting the user to + * perform any action. + */ + if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF) + { + XLogChecksums(PG_DATA_CHECKSUM_OFF); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = 0; + SetLocalDataChecksumState(XLogCtl->data_checksum_version); + SpinLockRelease(&XLogCtl->info_lck); + } + /* * All done with end-of-recovery actions. * @@ -6549,7 +6931,7 @@ GetRedoRecPtr(void) XLogRecPtr ptr; /* - * The possibly not up-to-date copy in XlogCtl is enough. Even if we + * The possibly not up-to-date copy in XLogCtl is enough. Even if we * grabbed a WAL insertion lock to read the authoritative value in * Insert->RedoRecPtr, someone might update it just after we've released * the lock. @@ -7127,6 +7509,12 @@ CreateCheckPoint(int flags) checkPoint.fullPageWrites = Insert->fullPageWrites; checkPoint.wal_level = wal_level; + /* + * Get the current data_checksum_version value from xlogctl, valid at the + * time of the checkpoint. 
+ */ + checkPoint.dataChecksumState = XLogCtl->data_checksum_version; + if (shutdown) { XLogRecPtr curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos); @@ -7183,6 +7571,9 @@ CreateCheckPoint(int flags) WALInsertLockAcquire(); redo_rec.wal_level = wal_level; + SpinLockAcquire(&XLogCtl->info_lck); + redo_rec.data_checksum_version = XLogCtl->data_checksum_version; + SpinLockRelease(&XLogCtl->info_lck); WALInsertLockRelease(); /* Include WAL level in record for WAL summarizer's benefit. */ @@ -7243,6 +7634,10 @@ CreateCheckPoint(int flags) checkPoint.nextOid += TransamVariables->oidCount; LWLockRelease(OidGenLock); + SpinLockAcquire(&XLogCtl->info_lck); + checkPoint.dataChecksumState = XLogCtl->data_checksum_version; + SpinLockRelease(&XLogCtl->info_lck); + checkPoint.logicalDecodingEnabled = IsLogicalDecodingEnabled(); MultiXactGetCheckptMulti(shutdown, @@ -7392,6 +7787,9 @@ CreateCheckPoint(int flags) ControlFile->minRecoveryPoint = InvalidXLogRecPtr; ControlFile->minRecoveryPointTLI = 0; + /* make sure we start with the checksum version as of the checkpoint */ + ControlFile->data_checksum_version = checkPoint.dataChecksumState; + /* * Persist unloggedLSN value. 
It's reset on crash recovery, so this goes * unused on non-shutdown checkpoints, but seems useful to store it always @@ -7535,6 +7933,12 @@ CreateEndOfRecoveryRecord(void) LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->minRecoveryPoint = recptr; ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID; + + /* start with the latest checksum version (as of the end of recovery) */ + SpinLockAcquire(&XLogCtl->info_lck); + ControlFile->data_checksum_version = XLogCtl->data_checksum_version; + SpinLockRelease(&XLogCtl->info_lck); + UpdateControlFile(); LWLockRelease(ControlFileLock); @@ -7876,6 +8280,10 @@ CreateRestartPoint(int flags) if (flags & CHECKPOINT_IS_SHUTDOWN) ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; } + + /* we shall start with the latest checksum version */ + ControlFile->data_checksum_version = lastCheckPoint.dataChecksumState; + UpdateControlFile(); } LWLockRelease(ControlFileLock); @@ -8314,6 +8722,24 @@ XLogReportParameters(void) } } +/* + * Log the new state of checksums + */ +static void +XLogChecksums(uint32 new_type) +{ + xl_checksum_state xlrec; + XLogRecPtr recptr; + + xlrec.new_checksum_state = new_type; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_checksum_state)); + + recptr = XLogInsert(RM_XLOG2_ID, XLOG2_CHECKSUMS); + XLogFlush(recptr); +} + /* * Update full_page_writes in shared memory, and write an * XLOG_FPW_CHANGE record if necessary. @@ -8440,6 +8866,11 @@ xlog_redo(XLogReaderState *record) MultiXactAdvanceOldest(checkPoint.oldestMulti, checkPoint.oldestMultiDB); + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = checkPoint.dataChecksumState; + SetLocalDataChecksumState(checkPoint.dataChecksumState); + SpinLockRelease(&XLogCtl->info_lck); + /* * No need to set oldestClogXid here as well; it'll be set when we * redo an xl_clog_truncate if it changed since initialization. 
@@ -8499,6 +8930,7 @@ xlog_redo(XLogReaderState *record) /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + ControlFile->data_checksum_version = checkPoint.dataChecksumState; LWLockRelease(ControlFileLock); /* @@ -8525,6 +8957,8 @@ xlog_redo(XLogReaderState *record) { CheckPoint checkPoint; TimeLineID replayTLI; + bool new_state = false; + int old_state; memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); /* In an ONLINE checkpoint, treat the XID counter as a minimum */ @@ -8563,6 +8997,8 @@ xlog_redo(XLogReaderState *record) /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + old_state = ControlFile->data_checksum_version; + ControlFile->data_checksum_version = checkPoint.dataChecksumState; LWLockRelease(ControlFileLock); /* TLI should not change in an on-line checkpoint */ @@ -8574,6 +9010,18 @@ xlog_redo(XLogReaderState *record) RecoveryRestartPoint(&checkPoint, record); + /* + * If the data checksum state changed we need to emit a barrier. + */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = checkPoint.dataChecksumState; + if (checkPoint.dataChecksumState != old_state) + new_state = true; + SpinLockRelease(&XLogCtl->info_lck); + + if (new_state) + EmitAndWaitDataChecksumsBarrier(checkPoint.dataChecksumState); + /* + * After replaying a checkpoint record, free all smgr objects. 
* Otherwise we would never do so for dropped relations, as the @@ -8735,7 +9183,19 @@ xlog_redo(XLogReaderState *record) } else if (info == XLOG_CHECKPOINT_REDO) { - /* nothing to do here, just for informational purposes */ + xl_checkpoint_redo redo_rec; + bool new_state = false; + + memcpy(&redo_rec, XLogRecGetData(record), sizeof(xl_checkpoint_redo)); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = redo_rec.data_checksum_version; + if (redo_rec.data_checksum_version != ControlFile->data_checksum_version) + new_state = true; + SpinLockRelease(&XLogCtl->info_lck); + + if (new_state) + EmitAndWaitDataChecksumsBarrier(redo_rec.data_checksum_version); } else if (info == XLOG_LOGICAL_DECODING_STATUS_CHANGE) { @@ -8788,6 +9248,30 @@ xlog_redo(XLogReaderState *record) } } +void +xlog2_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG2_CHECKSUMS) + { + xl_checksum_state state; + + memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state)); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = state.new_checksum_state; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Block on a procsignalbarrier to await all processes having seen the + * change to checksum status. Once the barrier has been passed we can + * initiate the corresponding processing. + */ + EmitAndWaitDataChecksumsBarrier(state.new_checksum_state); + } +} + /* * Return the extra open flags used for opening a file, depending on the * value of the GUCs wal_sync_method, fsync and debug_io_direct. 
diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index ab1fbae8001..9c79dadaacc 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -1613,10 +1613,11 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, /* * If we weren't told not to verify checksums, and if checksums are * enabled for this cluster, and if this is a relation file, then verify - * the checksum. + * the checksum. We cannot at this point check if checksums are enabled + * or disabled as that might change, thus we check at each point where we + * could be validating a checksum. */ - if (!noverify_checksums && DataChecksumsEnabled() && - RelFileNumberIsValid(relfilenumber)) + if (!noverify_checksums && RelFileNumberIsValid(relfilenumber)) verify_checksum = true; /* @@ -1749,7 +1750,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, * If the amount of data we were able to read was not a multiple of * BLCKSZ, we cannot verify checksums, which are block-level. */ - if (verify_checksum && (cnt % BLCKSZ != 0)) + if (verify_checksum && DataChecksumsNeedVerify() && (cnt % BLCKSZ != 0)) { ereport(WARNING, (errmsg("could not verify checksum in file \"%s\", block " @@ -1844,9 +1845,10 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, * 'blkno' is the block number of the first page in the bbsink's buffer * relative to the start of the relation. * - * 'verify_checksum' indicates whether we should try to verify checksums - * for the blocks we read. If we do this, we'll update *checksum_failures - * and issue warnings as appropriate. + * 'verify_checksum' determines if the user has asked to verify checksums, but + * since data checksums can be disabled, or become disabled, we need to check + * state before verifying individual pages. If we do this, we'll update + * *checksum_failures and issue warnings as appropriate. 
*/ static off_t read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd, @@ -1872,6 +1874,13 @@ read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd, int reread_cnt; uint16 expected_checksum; + /* + * The data checksum state can change at any point, so we need to + * re-check before each page. + */ + if (!DataChecksumsNeedVerify()) + return cnt; + page = sink->bbs_buffer + BLCKSZ * i; /* If the page is OK, go on to the next one. */ @@ -1894,7 +1903,12 @@ read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd, * allows us to wait until we can be certain that no write to the * block is in progress. Since we don't have any such thing right now, * we just do this and hope for the best. + * + * The data checksum state may also have changed concurrently so check + * again. */ + if (!DataChecksumsNeedVerify()) + return cnt; reread_cnt = basebackup_read_file(fd, sink->bbs_buffer + BLCKSZ * i, BLCKSZ, offset + BLCKSZ * i, @@ -2009,6 +2023,9 @@ verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno, if (PageIsNew(page) || PageGetLSN(page) >= start_lsn) return true; + if (!DataChecksumsNeedVerify()) + return true; + /* Perform the actual checksum calculation. 
*/ checksum = pg_checksum_page(page, blkno); diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 38ef683d4c7..c52c0a6023d 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -35,6 +35,7 @@ #include "port/pg_getopt_ctx.h" #include "postmaster/postmaster.h" #include "storage/bufpage.h" +#include "storage/checksum.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/proc.h" diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index e54018004db..eba25aa3e4d 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1451,6 +1451,25 @@ CREATE VIEW pg_stat_progress_copy AS FROM pg_stat_get_progress_info('COPY') AS S LEFT JOIN pg_database D ON S.datid = D.oid; +CREATE VIEW pg_stat_progress_data_checksums AS + SELECT + S.pid AS pid, S.datid, D.datname AS datname, + CASE S.param1 WHEN 0 THEN 'enabling' + WHEN 1 THEN 'disabling' + WHEN 2 THEN 'waiting on temporary tables' + WHEN 3 THEN 'waiting on barrier' + WHEN 4 THEN 'done' + END AS phase, + CASE S.param2 WHEN -1 THEN NULL ELSE S.param2 END AS databases_total, + S.param3 AS databases_done, + CASE S.param4 WHEN -1 THEN NULL ELSE S.param4 END AS relations_total, + CASE S.param5 WHEN -1 THEN NULL ELSE S.param5 END AS relations_done, + CASE S.param6 WHEN -1 THEN NULL ELSE S.param6 END AS blocks_total, + CASE S.param7 WHEN -1 THEN NULL ELSE S.param7 END AS blocks_done + FROM pg_stat_get_progress_info('DATACHECKSUMS') AS S + LEFT JOIN pg_database D ON S.datid = D.oid + ORDER BY S.datid; -- return the launcher process first + CREATE VIEW pg_user_mappings AS SELECT U.oid AS umid, diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 9b18bb4a17e..f0819d15ab7 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1044,7 +1044,14 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) if 
(pg_strcasecmp(strategy, "wal_log") == 0) dbstrategy = CREATEDB_WAL_LOG; else if (pg_strcasecmp(strategy, "file_copy") == 0) + { + if (DataChecksumsInProgressOn()) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("create database strategy \"%s\" not allowed when data checksums are being enabled", + strategy)); dbstrategy = CREATEDB_FILE_COPY; + } else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index 0f4435d2d97..55044b2bc6f 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -18,6 +18,7 @@ OBJS = \ bgworker.o \ bgwriter.o \ checkpointer.o \ + datachecksum_state.o \ fork_process.o \ interrupt.o \ launch_backend.o \ diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index cf24f662d27..8fdc518b3a1 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -15,6 +15,7 @@ #include #include +#include "access/xlog.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/auxprocess.h" @@ -69,6 +70,24 @@ AuxiliaryProcessMainCommon(void) ProcSignalInit(NULL, 0); + /* + * Initialize a local cache of the data_checksum_version, to be updated by + * the procsignal-based barriers. + * + * This intentionally happens after initializing the procsignal, otherwise + * we might miss a state change. This means we can get a barrier for the + * state we've just initialized - but it can happen only once. + * + * The postmaster (which is what gets forked into the new child process) + * does not handle barriers, therefore it may not have the current value + * of LocalDataChecksumVersion (it'll have the value read from the + * control file, which may be arbitrarily old). + * + * NB: Even if the postmaster handled barriers, the value might still be + * stale, as it might have changed after this process forked. 
+ */ + InitLocalDataChecksumState(); + /* * Auxiliary processes don't run transactions, but they may need a * resource owner anyway to manage buffer pins acquired outside diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index f2a62489d9c..536aff7ca05 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -18,6 +18,7 @@ #include "pgstat.h" #include "port/atomics.h" #include "postmaster/bgworker_internals.h" +#include "postmaster/datachecksum_state.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" @@ -144,7 +145,14 @@ static const struct .fn_name = "TableSyncWorkerMain", .fn_addr = TableSyncWorkerMain }, - + { + .fn_name = "DataChecksumsWorkerLauncherMain", + .fn_addr = DataChecksumsWorkerLauncherMain + }, + { + .fn_name = "DataChecksumsWorkerMain", + .fn_addr = DataChecksumsWorkerMain + } }; /* Private functions. */ diff --git a/src/backend/postmaster/datachecksum_state.c b/src/backend/postmaster/datachecksum_state.c new file mode 100644 index 00000000000..76004bcedc6 --- /dev/null +++ b/src/backend/postmaster/datachecksum_state.c @@ -0,0 +1,1612 @@ +/*------------------------------------------------------------------------- + * + * datachecksum_state.c + * Background worker for enabling or disabling data checksums online as + * well as functionality for manipulating data checksum state + * + * When enabling data checksums on a cluster at initdb time or when shut down + * with pg_checksums, no extra process is required as each page is checksummed, + * and verified, when accessed. When enabling checksums on an already running + * cluster, this worker will ensure that all pages are checksummed before + * verification of the checksums is turned on. In the case of disabling + * checksums, the state transition is performed only in the control file, no + * changes are performed on the data pages. 
+ * + * Checksums can be either enabled or disabled cluster-wide, with on/off being + * the end state for data_checksums. + * + * 1. Enabling checksums + * --------------------- + * When enabling checksums in an online cluster, data_checksums will be set to + * "inprogress-on" which signals that write operations MUST compute and write + * the checksum on the data page, but during reading the checksum SHALL NOT be + * verified. This ensures that all objects created during when checksums are + * being enabled will have checksums set, but reads won't fail due to missing or + * invalid checksums. Invalid checksums can be present in case the cluster had + * checksums enabled, then disabled them and updated the page while they were + * disabled. + * + * The DataChecksumsWorker will compile a list of all databases at the start, + * any databases created concurrently will see the in-progress state and will + * be checksummed automatically. All databases from the original list MUST BE + * successfully processed in order for data checksums to be enabled, the only + * exception are databases which are dropped before having been processed. + * + * For each database, all relations which have storage are read and every data + * page is marked dirty to force a write with the checksum. This will generate + * a lot of WAL as the entire database is read and written. + * + * If the processing is interrupted by a cluster crash or restart, it needs to + * be restarted from the beginning again as state isn't persisted. + * + * 2. Disabling checksums + * ---------------------- + * When disabling checksums, data_checksums will be set to "inprogress-off" + * which signals that checksums are written but no longer need to be verified. + * This ensures that backends which have not yet transitioned to the + * "inprogress-off" state will still see valid checksums on pages. + * + * 3. 
Synchronization and Correctness + * ---------------------------------- + * The processes involved in enabling or disabling data checksums in an + * online cluster must be properly synchronized with the normal backends + * serving concurrent queries to ensure correctness. Correctness is defined + * as the following: + * + * - Backends SHALL NOT violate the data_checksums state they have agreed to + * by acknowledging the procsignalbarrier: This means that all backends + * MUST calculate and write data checksums during all states except off; + * MUST validate checksums only in the 'on' state. + * - Data checksums SHALL NOT be considered enabled cluster-wide until all + * currently connected backends have state "on": This means that all + * backends must wait on the procsignalbarrier to be acknowledged by all + * before proceeding to validate data checksums. + * + * There are two steps of synchronization required for changing data_checksums + * in an online cluster: (i) changing state in the active backends ("on", + * "off", "inprogress-on" and "inprogress-off"), and (ii) ensuring no + * incompatible objects and processes are left in a database when workers end. + * The former deals with cluster-wide agreement on data checksum state and the + * latter with ensuring that any concurrent activity cannot break the data + * checksum contract during processing. + * + * Synchronizing the state change is done with procsignal barriers. Before + * updating the data_checksums state in the control file, all other backends must absorb the + * barrier. Barrier absorption will happen during interrupt processing, which + * means that connected backends will change state at different times. If + * waiting for a barrier is done during startup, for example during replay, it + * is important to realize that any locks held by the startup process might + * cause deadlocks if backends end up waiting for those locks while startup + * is waiting for a procsignalbarrier. 
+ * + * 3.1 When Enabling Data Checksums + * -------------------------------- + * A process which fails to observe data checksums being enabled can induce two + * types of errors: failing to write the checksum when modifying the page and + * failing to validate the data checksum on the page when reading it. + * + * When processing starts all backends belong to one of the below sets, with + * one of Bd and Bi being empty: + * + * Bg: Backend updating the global state and emitting the procsignalbarrier + * Bd: Backends in "off" state + * Bi: Backends in "inprogress-on" state + * + * If processing is started in an online cluster then all backends are in Bd. + * If processing was halted by the cluster shutting down (due to a crash or + * intentional restart), the controlfile state "inprogress-on" will be observed + * on system startup and all backends will be placed in Bd. The controlfile + * state will also be set to "off". + * + * Backends transition Bd -> Bi via a procsignalbarrier which is emitted by the + * DataChecksumsLauncher. When all backends have acknowledged the barrier then + * Bd will be empty and the next phase can begin: calculating and writing data + * checksums with DataChecksumsWorkers. When the DataChecksumsWorker processes + * have finished writing checksums on all pages, data checksums are enabled + * cluster-wide via another procsignalbarrier. There are four sets of backends + * where Bd shall be an empty set: + * + * Bg: Backend updating the global state and emitting the procsignalbarrier + * Bd: Backends in "off" state + * Be: Backends in "on" state + * Bi: Backends in "inprogress-on" state + * + * Backends in Bi and Be will write checksums when modifying a page, but only + * backends in Be will verify the checksum during reading. The Bg backend is + * blocked waiting for all backends in Bi to process interrupts and move to + * Be. 
Any backend starting while Bg is waiting on the procsignalbarrier will + * observe the global state being "on" and will thus automatically belong to + * Be. Checksums are enabled cluster-wide when Bi is an empty set. Bi and Be + * are compatible sets while still operating based on their local state as + * both write data checksums. + * + * 3.2 When Disabling Data Checksums + * --------------------------------- + * A process which fails to observe that data checksums have been disabled + * can induce two types of errors: writing the checksum when modifying the + * page and validating a data checksum which is no longer correct due to + * modifications to the page. The former is not an error per se as data + * integrity is maintained, but it is wasteful. The latter will cause errors + * in user operations. Assuming the following sets of backends: + * + * Bg: Backend updating the global state and emitting the procsignalbarrier + * Bd: Backends in "off" state + * Be: Backends in "on" state + * Bo: Backends in "inprogress-off" state + * Bi: Backends in "inprogress-on" state + * + * Backends transition from the Be state to Bd like so: Be -> Bo -> Bd. From + * all other states, the transition can be straight to Bd. + * + * The goal is to transition all backends to Bd making the others empty sets. + * Backends in Bo write data checksums, but don't validate them, such that + * backends still in Be can continue to validate pages until the barrier has + * been absorbed such that they are in Bo. Once all backends are in Bo, the + * barrier to transition to "off" can be raised and all backends can safely + * stop writing data checksums as no backend is enforcing data checksum + * validation any longer. + * + * 4. Future opportunities for optimizations + * ----------------------------------------- + * Below are some potential optimizations and improvements which were brought + * up during reviews of this feature, but which weren't implemented in the + * initial version. 
These are ideas listed without any validation on their + * feasibility or potential payoff. More discussion on (most of) these can be + * found on the -hackers threads linked to in the commit message of this + * feature. + * + * * Launching datachecksumsworker for resuming operation from the startup + * process: Currently users have to restart processing manually after a + * restart since dynamic background worker cannot be started from the + * postmaster. Changing the startup process could make restarting the + * processing automatic on cluster restart. + * * Avoid dirtying the page when checksums already match: Iff the checksum + * on the page happens to already match we still dirty the page. It should + * be enough to only do the log_newpage_buffer() call in that case. + * * Teach pg_checksums to avoid checksummed pages when pg_checksums is used + * to enable checksums on a cluster which is in inprogress-on state and + * may have checksummed pages (make pg_checksums be able to resume an + * online operation). This should only be attempted for wal_level minimal. + * * Restartability (not necessarily with page granularity). + * * Avoid processing databases which were created during inprogress-on. + * Right now all databases are processed regardless to be safe. + * * Teach CREATE DATABASE to calculate checksums for databases created + * during inprogress-on with a template database which has yet to be + * processed. 
+ * + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/datachecksum_state.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "catalog/indexing.h" +#include "catalog/pg_class.h" +#include "catalog/pg_database.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "common/relpath.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "postmaster/bgwriter.h" +#include "postmaster/datachecksum_state.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lmgr.h" +#include "storage/lwlock.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/injection_point.h" +#include "utils/lsyscache.h" +#include "utils/ps_status.h" +#include "utils/syscache.h" +#include "utils/wait_event.h" + +/* + * Configuration of conditions which must match when absorbing a procsignal + * barrier during data checksum enable/disable operations. A single function + * is used for absorbing all barriers, and the current and target states must + * be defined as a from/to tuple in the checksum_barriers struct. 
+ */ +typedef struct ChecksumBarrierCondition +{ + /* Current state of data checksums */ + int from; + /* Target state for data checksums */ + int to; +} ChecksumBarrierCondition; + +static const ChecksumBarrierCondition checksum_barriers[6] = +{ + /* + * Disabling checksums: If checksums are currently enabled, disabling must + * go through the 'inprogress-off' state. + */ + {PG_DATA_CHECKSUM_VERSION, PG_DATA_CHECKSUM_INPROGRESS_OFF}, + {PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_OFF}, + + /* + * If checksums are in the process of being enabled, but are not yet being + * verified, we can abort by going back to 'off' state. + */ + {PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_OFF}, + + /* + * Enabling checksums must normally go through the 'inprogress-on' state. + */ + {PG_DATA_CHECKSUM_OFF, PG_DATA_CHECKSUM_INPROGRESS_ON}, + {PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_VERSION}, + + /* + * If checksums are being disabled but all backends are still computing + * checksums, we can go straight back to 'on' + */ + {PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_VERSION}, +}; + +/* + * Signaling between backends calling pg_enable/disable_data_checksums, the + * checksums launcher process, and the checksums worker process. + * + * This struct is protected by DataChecksumsWorkerLock + */ +typedef struct DataChecksumsStateStruct +{ + /* + * These are set by pg_{enable|disable}_data_checksums, to tell the + * launcher what the target state is. + */ + DataChecksumsWorkerOperation launch_operation; + int launch_cost_delay; + int launch_cost_limit; + + /* + * Is a launcher process currently running? This is set by the main + * launcher process, after it has read the above launch_* parameters. + */ + bool launcher_running; + + /* + * Is a worker process currently running? This is set by the worker + * launcher when it starts waiting for a worker process to finish. 
+ */ + int worker_pid; + + /* + * These fields indicate the target state that the launcher is currently + * working towards. They can be different from the corresponding launch_* + * fields, if a new pg_enable/disable_data_checksums() call was made while + * the launcher/worker was already running. + * + * The below members are set when the launcher starts, and are only + * accessed read-only by the single worker. Thus, we can access these + * without a lock. If multiple workers, or dynamic cost parameters, are + * supported at some point then this would need to be revisited. + */ + DataChecksumsWorkerOperation operation; + int cost_delay; + int cost_limit; + + /* + * Signaling between the launcher and the worker process. + * + * As there is only a single worker, and the launcher won't read these + * until the worker exits, they can be accessed without the need for a + * lock. If multiple workers are supported then this will have to be + * revisited. + */ + + /* result, set by worker before exiting */ + DataChecksumsWorkerResult success; + + /* + * tells the worker process whether it should also process the shared + * catalogs + */ + bool process_shared_catalogs; +} DataChecksumsStateStruct; + +/* Shared memory segment for datachecksumsworker */ +static DataChecksumsStateStruct *DataChecksumState; + +typedef struct DataChecksumsWorkerDatabase +{ + Oid dboid; + char *dbname; +} DataChecksumsWorkerDatabase; + +/* Flag set by the interrupt handler */ +static volatile sig_atomic_t abort_requested = false; + +/* + * Have we set the DataChecksumsStateStruct->launcher_running flag? + * If we have, we need to clear it before exiting! + */ +static volatile sig_atomic_t launcher_running = false; + +/* Are we enabling data checksums, or disabling them? 
*/ +static DataChecksumsWorkerOperation operation; + +/* Prototypes */ +static bool DatabaseExists(Oid dboid); +static List *BuildDatabaseList(void); +static List *BuildRelationList(bool temp_relations, bool include_shared); +static void FreeDatabaseList(List *dblist); +static DataChecksumsWorkerResult ProcessDatabase(DataChecksumsWorkerDatabase *db); +static bool ProcessAllDatabases(void); +static bool ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy); +static void launcher_cancel_handler(SIGNAL_ARGS); +static void WaitForAllTransactionsToFinish(void); + +/***************************************************************************** + * Functionality for manipulating the data checksum state in the cluster + */ + +void +EmitAndWaitDataChecksumsBarrier(uint32 state) +{ + uint64 barrier; + + switch (state) + { + case PG_DATA_CHECKSUM_INPROGRESS_ON: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON); + WaitForProcSignalBarrier(barrier); + break; + + case PG_DATA_CHECKSUM_INPROGRESS_OFF: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF); + WaitForProcSignalBarrier(barrier); + break; + + case PG_DATA_CHECKSUM_VERSION: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON); + WaitForProcSignalBarrier(barrier); + break; + + case PG_DATA_CHECKSUM_OFF: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF); + WaitForProcSignalBarrier(barrier); + break; + + default: + Assert(false); + } +} + +/* + * AbsorbDataChecksumsBarrier + * Generic function for absorbing data checksum state changes + * + * All procsignalbarriers regarding data checksum state changes are absorbed + * with this function. The set of conditions required for the state change to + * be accepted are listed in the checksum_barriers struct, target_state is + * used to look up the relevant entry. 
+ */ +bool +AbsorbDataChecksumsBarrier(ProcSignalBarrierType barrier) +{ + uint32 target_state; + int current = data_checksums; + bool found = false; + + /* + * Translate the barrier condition to the target state, doing it here + * instead of in the procsignal code saves the latter from knowing about + * checksum states. + */ + switch (barrier) + { + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON: + target_state = PG_DATA_CHECKSUM_INPROGRESS_ON; + break; + case PROCSIGNAL_BARRIER_CHECKSUM_ON: + target_state = PG_DATA_CHECKSUM_VERSION; + break; + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF: + target_state = PG_DATA_CHECKSUM_INPROGRESS_OFF; + break; + case PROCSIGNAL_BARRIER_CHECKSUM_OFF: + target_state = PG_DATA_CHECKSUM_OFF; + break; + default: + elog(ERROR, "incorrect barrier \"%i\" received", barrier); + } + + /* + * If the target state matches the current state then the barrier has been + * repeated. + */ + if (current == target_state) + return true; + + /* + * If the cluster is in recovery we skip the validation of current state + * since the replay is trusted. + */ + if (RecoveryInProgress()) + { + SetLocalDataChecksumState(target_state); + return true; + } + + /* + * Find the barrier condition definition for the target state. Not finding + * a condition would be a grave programmer error as the states are a + * discrete set. + */ + for (int i = 0; i < lengthof(checksum_barriers) && !found; i++) + { + if (checksum_barriers[i].from == current && checksum_barriers[i].to == target_state) + found = true; + } + + /* + * If the relevant state criteria aren't satisfied, throw an error which + * will be caught by the procsignal machinery for a later retry. + */ + if (!found) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("incorrect data checksum state %i for target state %i", + current, target_state)); + + SetLocalDataChecksumState(target_state); + return true; +} + + +/* + * Disables data checksums for the cluster, if applicable. 
Starts a background + * worker which turns off the data checksums. + */ +Datum +disable_data_checksums(PG_FUNCTION_ARGS) +{ + if (!superuser()) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to change data checksum state")); + + StartDataChecksumsWorkerLauncher(DISABLE_DATACHECKSUMS, 0, 0); + PG_RETURN_VOID(); +} + +/* + * Enables data checksums for the cluster, if applicable. Supports vacuum- + * like cost based throttling to limit system load. Starts a background worker + * which updates data checksums on existing data. + */ +Datum +enable_data_checksums(PG_FUNCTION_ARGS) +{ + int cost_delay = PG_GETARG_INT32(0); + int cost_limit = PG_GETARG_INT32(1); + + if (!superuser()) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to change data checksum state")); + + if (cost_delay < 0) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cost delay cannot be a negative value")); + + if (cost_limit <= 0) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cost limit must be greater than zero")); + + StartDataChecksumsWorkerLauncher(ENABLE_DATACHECKSUMS, cost_delay, cost_limit); + + PG_RETURN_VOID(); +} + + +/***************************************************************************** + * Functionality for running the datachecksumsworker and associated launcher + */ + +/* + * StartDataChecksumsWorkerLauncher + * Main entry point for datachecksumsworker launcher process + * + * The main entrypoint for starting data checksums processing for enabling as + * well as disabling. 
+ */ +void +StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op, + int cost_delay, + int cost_limit) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + bool launcher_running; + DataChecksumsWorkerOperation launcher_running_op; + +#ifdef USE_ASSERT_CHECKING + /* The cost delay settings have no effect when disabling */ + if (op == DISABLE_DATACHECKSUMS) + Assert(cost_delay == 0 && cost_limit == 0); +#endif + + INJECTION_POINT("datachecksumsworker-startup-delay", NULL); + + /* Store the desired state in shared memory */ + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + + DataChecksumState->launch_operation = op; + DataChecksumState->launch_cost_delay = cost_delay; + DataChecksumState->launch_cost_limit = cost_limit; + + /* Is the launcher already running? If so, what is it doing? */ + launcher_running = DataChecksumState->launcher_running; + if (launcher_running) + launcher_running_op = DataChecksumState->operation; + + LWLockRelease(DataChecksumsWorkerLock); + + /* + * Launch a new launcher process, if it's not running already. + * + * If the launcher is currently busy enabling the checksums, and we want + * them disabled (or vice versa), the launcher will notice that at latest + * when it's about to exit, and will loop back process the new request. So + * if the launcher is already running, we don't need to do anything more + * here to abort it. + * + * If you call pg_enable/disable_data_checksums() twice in a row, before + * the launcher has had a chance to start up, we still end up launching it + * twice. That's OK, the second invocation will see that a launcher is + * already running and exit quickly. + * + * TODO: We could optimize here and skip launching the launcher, if we are + * already in the desired state, i.e. if the checksums are already enabled + * and you call pg_enable_data_checksums(). + */ + if (!launcher_running) + { + /* + * Prepare the BackgroundWorker and launch it. 
+ */ + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "DataChecksumsWorkerLauncherMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksum launcher"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksum launcher"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = (Datum) 0; + + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("failed to start background worker to process data checksums")); + } + else + { + if (launcher_running_op == op) + ereport(ERROR, + errmsg("data checksum processing already running")); + } +} + +/* + * ProcessSingleRelationFork + * Enable data checksums in a single relation/fork. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. + */ +static bool +ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy) +{ + BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum); + char activity[NAMEDATALEN * 2 + 128]; + char *relns; + + relns = get_namespace_name(RelationGetNamespace(reln)); + + /* Report the current relation to pgstat_activity */ + snprintf(activity, sizeof(activity) - 1, "processing: %s.%s (%s, %u blocks)", + (relns ? relns : ""), RelationGetRelationName(reln), forkNames[forkNum], numblocks); + pgstat_report_activity(STATE_RUNNING, activity); + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL, numblocks); + if (relns) + pfree(relns); + + /* + * We are looping over the blocks which existed at the time of process + * start, which is safe since new blocks are created with checksums set + * already due to the state being "inprogress-on". 
+ */ + for (BlockNumber blknum = 0; blknum < numblocks; blknum++) + { + Buffer buf = ReadBufferExtended(reln, forkNum, blknum, RBM_NORMAL, strategy); + + /* Need to get an exclusive lock to mark the buffer as dirty */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Mark the buffer as dirty and force a full page write. We have to + * re-write the page to WAL even if the checksum hasn't changed, + * because if there is a replica it might have a slightly different + * version of the page with an invalid checksum, caused by unlogged + * changes (e.g. hintbits) on the primary happening while checksums + * were off. This can happen if there was a valid checksum on the page + * at one point in the past, so only when checksums are first on, then + * off, and then turned on again. TODO: investigate if this could be + * avoided if the checksum is calculated to be correct and wal_level + * is set to "minimal", + */ + START_CRIT_SECTION(); + MarkBufferDirty(buf); + log_newpage_buffer(buf, false); + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buf); + + /* + * This is the only place where we check if we are asked to abort, the + * abortion will bubble up from here. + */ + Assert(operation == ENABLE_DATACHECKSUMS); + LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED); + if (DataChecksumState->launch_operation == DISABLE_DATACHECKSUMS) + abort_requested = true; + LWLockRelease(DataChecksumsWorkerLock); + + if (abort_requested) + return false; + + /* update the block counter */ + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_DONE, + (blknum + 1)); + + /* + * Processing is re-using the vacuum cost delay for process + * throttling, hence why we call vacuum APIs here. + */ + vacuum_delay_point(false); + } + + return true; +} + +/* + * ProcessSingleRelationByOid + * Process a single relation based on oid. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. 
+ */ +static bool +ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy) +{ + Relation rel; + bool aborted = false; + + StartTransactionCommand(); + + rel = try_relation_open(relationId, AccessShareLock); + if (rel == NULL) + { + /* + * Relation no longer exists. We don't consider this an error since + * there are no pages in it that need data checksums, and thus return + * true. The worker operates off a list of relations generated at the + * start of processing, so relations being dropped in the meantime is + * to be expected. + */ + CommitTransactionCommand(); + pgstat_report_activity(STATE_IDLE, NULL); + return true; + } + RelationGetSmgr(rel); + + for (ForkNumber fnum = 0; fnum <= MAX_FORKNUM; fnum++) + { + if (smgrexists(rel->rd_smgr, fnum)) + { + if (!ProcessSingleRelationFork(rel, fnum, strategy)) + { + aborted = true; + break; + } + } + } + relation_close(rel, AccessShareLock); + + CommitTransactionCommand(); + pgstat_report_activity(STATE_IDLE, NULL); + + return !aborted; +} + +/* + * ProcessDatabase + * Enable data checksums in a single database. + * + * We do this by launching a dynamic background worker into this database, and + * waiting for it to finish. We have to do this in a separate worker, since + * each process can only be connected to one database during its lifetime. 
 */
static DataChecksumsWorkerResult
ProcessDatabase(DataChecksumsWorkerDatabase *db)
{
	BackgroundWorker bgw;
	BackgroundWorkerHandle *bgw_handle;
	BgwHandleStatus status;
	pid_t		pid;
	char		activity[NAMEDATALEN + 64];

	/*
	 * Default to failure; the worker overwrites this before exiting, and the
	 * launcher only reads it after the worker has shut down (see the
	 * signaling notes on DataChecksumsStateStruct), so no lock is needed.
	 */
	DataChecksumState->success = DATACHECKSUMSWORKER_FAILED;

	/* Set up a per-database dynamic worker running DataChecksumsWorkerMain */
	memset(&bgw, 0, sizeof(bgw));
	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "%s", "DataChecksumsWorkerMain");
	snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksum worker");
	snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksum worker");
	bgw.bgw_restart_time = BGW_NEVER_RESTART;
	bgw.bgw_notify_pid = MyProcPid;
	bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid);

	/*
	 * If there are no worker slots available, there is little we can do. If
	 * we retry in a bit it's still unlikely that the user has managed to
	 * reconfigure in the meantime and we'd be run through retries fast.
	 */
	if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
	{
		ereport(WARNING,
				errmsg("could not start background worker for enabling data checksums in database \"%s\"",
					   db->dbname),
				errhint("The \"%s\" setting might be too low.", "max_worker_processes"));
		return DATACHECKSUMSWORKER_FAILED;
	}

	status = WaitForBackgroundWorkerStartup(bgw_handle, &pid);
	if (status == BGWH_STOPPED)
	{
		ereport(WARNING,
				errmsg("could not start background worker for enabling data checksums in database \"%s\"",
					   db->dbname),
				errhint("More details on the error might be found in the server log."));

		/*
		 * Heuristic to see if the database was dropped, and if it was we can
		 * treat it as not an error, else treat as fatal and error out. TODO:
		 * this could probably be improved with a tighter check.
		 */
		if (DatabaseExists(db->dboid))
			return DATACHECKSUMSWORKER_FAILED;
		else
			return DATACHECKSUMSWORKER_DROPDB;
	}

	/*
	 * If the postmaster crashed we cannot end up with a processed database so
	 * we have no alternative other than exiting. When enabling checksums we
	 * won't at this time have changed the data checksums state in pg_control
	 * to enabled so when the cluster comes back up processing will have to be
	 * restarted.
	 */
	if (status == BGWH_POSTMASTER_DIED)
		ereport(FATAL,
				errcode(ERRCODE_ADMIN_SHUTDOWN),
				errmsg("cannot enable data checksums without the postmaster process"),
				errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));

	Assert(status == BGWH_STARTED);
	ereport(LOG,
			errmsg("initiating data checksum processing in database \"%s\"",
				   db->dbname));

	/* Save the pid of the worker so we can signal it later */
	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
	DataChecksumState->worker_pid = pid;
	LWLockRelease(DataChecksumsWorkerLock);

	snprintf(activity, sizeof(activity) - 1,
			 "Waiting for worker in database %s (pid %ld)", db->dbname, (long) pid);
	pgstat_report_activity(STATE_RUNNING, activity);

	/* Block until the per-database worker has exited */
	status = WaitForBackgroundWorkerShutdown(bgw_handle);
	if (status == BGWH_POSTMASTER_DIED)
		ereport(FATAL,
				errcode(ERRCODE_ADMIN_SHUTDOWN),
				errmsg("postmaster exited during data checksum processing in \"%s\"",
					   db->dbname),
				errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));

	/* Worker has exited; its result can now be read without a lock */
	if (DataChecksumState->success == DATACHECKSUMSWORKER_ABORTED)
		ereport(LOG,
				errmsg("data checksums processing was aborted in database \"%s\"",
					   db->dbname));

	/* Clear the worker pid so launcher_exit won't signal a dead process */
	pgstat_report_activity(STATE_IDLE, NULL);
	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
	DataChecksumState->worker_pid = InvalidPid;
	LWLockRelease(DataChecksumsWorkerLock);

	return DataChecksumState->success;
}

/*
 * launcher_exit
 *
 * Internal routine for cleaning up state when the launcher process exits. We
 * need to clean up the abort flag to ensure that processing can be started
 * again if it was previously aborted (note: started again, *not* restarted
 * from where it left off).
 */
static void
launcher_exit(int code, Datum arg)
{
	abort_requested = false;

	/*
	 * If this launcher had claimed the launcher_running flag, a worker may
	 * still be active; tell it to stop before we go away.
	 */
	if (launcher_running)
	{
		LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
		if (DataChecksumState->worker_pid != InvalidPid)
		{
			ereport(LOG,
					errmsg("data checksums launcher exiting while worker is still running, signalling worker"));
			kill(DataChecksumState->worker_pid, SIGTERM);
		}
		LWLockRelease(DataChecksumsWorkerLock);
	}

	/*
	 * If the launcher is exiting before data checksums are enabled then set
	 * the state to off since processing cannot be resumed.
	 */
	if (DataChecksumsInProgressOn())
		SetDataChecksumsOff();

	/* Clear both the local and the shared running flags */
	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
	launcher_running = false;
	DataChecksumState->launcher_running = false;
	LWLockRelease(DataChecksumsWorkerLock);
}

/*
 * launcher_cancel_handler
 *
 * Internal routine for reacting to SIGINT and flagging the worker to abort.
 * The worker won't be interrupted immediately but will check for abort flag
 * between each block in a relation.
 */
static void
launcher_cancel_handler(SIGNAL_ARGS)
{
	/* Preserve errno: signal handlers must not clobber it */
	int			save_errno = errno;

	abort_requested = true;

	/*
	 * There is no sleeping in the main loop, the flag will be checked
	 * periodically in ProcessSingleRelationFork. The worker does however
	 * sleep when waiting for concurrent transactions to end so we still need
	 * to set the latch.
	 */
	SetLatch(MyLatch);

	errno = save_errno;
}

/*
 * WaitForAllTransactionsToFinish
 *		Blocks awaiting all current transactions to finish
 *
 * Returns when all transactions which are active at the call of the function
 * have ended, or if the postmaster dies while waiting.
If the postmaster dies
 * the abort flag will be set to indicate that the caller of this shouldn't
 * proceed.
 *
 * NB: this will return early, if aborted by SIGINT or if the target state
 * is changed while we're running.
 */
static void
WaitForAllTransactionsToFinish(void)
{
	TransactionId waitforxid;

	/* Snapshot the next xid; everything older than it must finish first */
	LWLockAcquire(XidGenLock, LW_SHARED);
	waitforxid = XidFromFullTransactionId(TransamVariables->nextXid);
	LWLockRelease(XidGenLock);

	while (TransactionIdPrecedes(GetOldestActiveTransactionId(false, true), waitforxid))
	{
		char		activity[64];
		int			rc;

		/* Oldest running xid is older than us, so wait */
		snprintf(activity,
				 sizeof(activity),
				 "Waiting for current transactions to finish (waiting for %u)",
				 waitforxid);
		pgstat_report_activity(STATE_RUNNING, activity);

		/* Retry every 3 seconds */
		ResetLatch(MyLatch);
		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
					   3000,
					   WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION);

		/*
		 * If the postmaster died we won't be able to enable checksums
		 * cluster-wide so abort and hope to continue when restarted.
		 */
		if (rc & WL_POSTMASTER_DEATH)
			ereport(FATAL,
					errcode(ERRCODE_ADMIN_SHUTDOWN),
					errmsg("postmaster exited during data checksums processing"),
					errhint("Data checksums processing must be restarted manually after cluster restart."));

		CHECK_FOR_INTERRUPTS();

		/* A changed target operation means the caller should stop waiting */
		LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
		if (DataChecksumState->launch_operation != operation)
			abort_requested = true;
		LWLockRelease(DataChecksumsWorkerLock);
		if (abort_requested)
			break;
	}

	pgstat_report_activity(STATE_IDLE, NULL);
	return;
}

/*
 * DataChecksumsWorkerLauncherMain
 *
 * Main function for launching dynamic background workers for processing data
 * checksums in databases. This function has the bgworker management, with
 * ProcessAllDatabases being responsible for looping over the databases and
 * initiating processing.
 */
void
DataChecksumsWorkerLauncherMain(Datum arg)
{
	/* Registered first so cleanup runs no matter how this process exits */
	on_shmem_exit(launcher_exit, 0);

	ereport(DEBUG1,
			errmsg("background worker \"datachecksums launcher\" started"));

	pqsignal(SIGTERM, die);
	pqsignal(SIGINT, launcher_cancel_handler);
	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
	pqsignal(SIGUSR2, SIG_IGN);

	BackgroundWorkerUnblockSignals();

	MyBackendType = B_DATACHECKSUMSWORKER_LAUNCHER;
	init_ps_display(NULL);

	INJECTION_POINT("datachecksumsworker-launcher-delay", NULL);

	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);

	if (DataChecksumState->launcher_running)
	{
		ereport(LOG,
				errmsg("background worker \"datachecksums launcher\" already running, exiting"));
		/* Launcher was already running, let it finish */
		LWLockRelease(DataChecksumsWorkerLock);
		return;
	}

	/* Local flag: tells launcher_exit that we own the shared flag */
	launcher_running = true;

	/* Initialize a connection to shared catalogs only */
	BackgroundWorkerInitializeConnectionByOid(InvalidOid, InvalidOid, 0);

	/* Copy the requested launch_* parameters into the active fields */
	operation = DataChecksumState->launch_operation;
	DataChecksumState->launcher_running = true;
	DataChecksumState->operation = operation;
	DataChecksumState->cost_delay = DataChecksumState->launch_cost_delay;
	DataChecksumState->cost_limit = DataChecksumState->launch_cost_limit;
	LWLockRelease(DataChecksumsWorkerLock);

	/*
	 * The target state can change while we are busy enabling/disabling
	 * checksums, if the user calls pg_disable/enable_data_checksums() before
	 * we are finished with the previous request. In that case, we will loop
	 * back here, to process the new request.
	 */
again:

	pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS,
								  InvalidOid);

	if (operation == ENABLE_DATACHECKSUMS)
	{
		/*
		 * If we are asked to enable checksums in a cluster which already has
		 * checksums enabled, exit immediately as there is nothing more to do.
		 */
		if (DataChecksumsNeedVerify())
			goto done;

		ereport(LOG,
				errmsg("enabling data checksums requested, starting data checksum calculation"));

		/*
		 * Set the state to inprogress-on and wait on the procsignal barrier.
		 */
		pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
									 PROGRESS_DATACHECKSUMS_PHASE_ENABLING);
		SetDataChecksumsOnInProgress();

		/*
		 * All backends are now in inprogress-on state and are writing data
		 * checksums. Start processing all data at rest.
		 */
		if (!ProcessAllDatabases())
		{
			/*
			 * If the target state changed during processing then it's not a
			 * failure, so restart processing instead.
			 */
			LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
			if (DataChecksumState->launch_operation != operation)
			{
				LWLockRelease(DataChecksumsWorkerLock);
				goto done;
			}
			LWLockRelease(DataChecksumsWorkerLock);
			ereport(ERROR,
					errcode(ERRCODE_INSUFFICIENT_RESOURCES),
					errmsg("unable to enable data checksums in cluster"));
		}

		/*
		 * Data checksums have been set on all pages, set the state to on in
		 * order to instruct backends to validate checksums on reading.
		 */
		SetDataChecksumsOn();

		ereport(LOG,
				errmsg("data checksums are now enabled"));
	}
	else if (operation == DISABLE_DATACHECKSUMS)
	{
		ereport(LOG,
				errmsg("disabling data checksums requested"));

		pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
									 PROGRESS_DATACHECKSUMS_PHASE_DISABLING);
		SetDataChecksumsOff();
		ereport(LOG,
				errmsg("data checksums are now disabled"));
	}
	else
		Assert(false);

done:

	/*
	 * This state will only be displayed for a fleeting moment, but for the
	 * sake of correctness it is still added before ending the command.
	 */
	pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
								 PROGRESS_DATACHECKSUMS_PHASE_DONE);

	/*
	 * All done. But before we exit, check if the target state was changed
	 * while we were running. In that case we will have to start all over
	 * again.
	 */
	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
	if (DataChecksumState->launch_operation != operation)
	{
		/* Pick up the new request and loop back under a fresh progress entry */
		DataChecksumState->operation = DataChecksumState->launch_operation;
		operation = DataChecksumState->launch_operation;
		DataChecksumState->cost_delay = DataChecksumState->launch_cost_delay;
		DataChecksumState->cost_limit = DataChecksumState->launch_cost_limit;
		LWLockRelease(DataChecksumsWorkerLock);
		goto again;
	}

	/* Shut down progress reporting as we are done */
	pgstat_progress_end_command();

	launcher_running = false;
	DataChecksumState->launcher_running = false;
	LWLockRelease(DataChecksumsWorkerLock);
}

/*
 * ProcessAllDatabases
 *		Compute the list of all databases and process checksums in each
 *
 * This will generate a list of databases to process for enabling checksums.
 * If a database encounters a failure then processing will end immediately and
 * return an error.
 */
static bool
ProcessAllDatabases(void)
{
	List	   *DatabaseList;
	int			cumulative_total = 0;

	/* Set up so first run processes shared catalogs, not once in every db */
	DataChecksumState->process_shared_catalogs = true;

	/* Get a list of all databases to process */
	WaitForAllTransactionsToFinish();
	DatabaseList = BuildDatabaseList();

	/*
	 * Update progress reporting with the total number of databases we need to
	 * process. This number should not be changed during processing, the
	 * columns for processed databases is instead increased such that it can
	 * be compared against the total.
	 */
	{
		const int	index[] = {
			PROGRESS_DATACHECKSUMS_DBS_TOTAL,
			PROGRESS_DATACHECKSUMS_DBS_DONE,
			PROGRESS_DATACHECKSUMS_RELS_TOTAL,
			PROGRESS_DATACHECKSUMS_RELS_DONE,
			PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL,
			PROGRESS_DATACHECKSUMS_BLOCKS_DONE,
		};

		int64		vals[6];

		vals[0] = list_length(DatabaseList);
		vals[1] = 0;
		/* translated to NULL */
		vals[2] = -1;
		vals[3] = -1;
		vals[4] = -1;
		vals[5] = -1;

		pgstat_progress_update_multi_param(6, index, vals);
	}

	foreach_ptr(DataChecksumsWorkerDatabase, db, DatabaseList)
	{
		DataChecksumsWorkerResult result;

		result = ProcessDatabase(db);

		/* Allow a test process to alter the result of the operation */
		INJECTION_POINT("datachecksumsworker-modify-db-result", &result);

		pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_DBS_DONE,
									 ++cumulative_total);

		if (result == DATACHECKSUMSWORKER_FAILED)
		{
			/*
			 * Disable checksums on cluster, because we failed one of the
			 * databases and this is an all or nothing process.
			 */
			SetDataChecksumsOff();
			ereport(ERROR,
					errcode(ERRCODE_INSUFFICIENT_RESOURCES),
					errmsg("data checksums failed to get enabled in all databases, aborting"),
					errhint("The server log might have more information on the cause of the error."));
		}
		else if (result == DATACHECKSUMSWORKER_ABORTED || abort_requested)
		{
			/* Abort flag set, so exit the whole process */
			return false;
		}

		/*
		 * When one database has completed, it will have done shared catalogs
		 * so we don't have to process them again.
		 */
		DataChecksumState->process_shared_catalogs = false;
	}

	FreeDatabaseList(DatabaseList);

	pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
								 PROGRESS_DATACHECKSUMS_PHASE_WAITING_BARRIER);
	return true;
}

/*
 * DataChecksumsShmemSize
 *		Compute required space for datachecksumsworker-related shared memory
 */
Size
DataChecksumsShmemSize(void)
{
	Size		size;

	size = sizeof(DataChecksumsStateStruct);
	size = MAXALIGN(size);

	return size;
}

/*
 * DataChecksumsShmemInit
 *		Allocate and initialize datachecksumsworker-related shared memory
 */
void
DataChecksumsShmemInit(void)
{
	bool		found;

	DataChecksumState = (DataChecksumsStateStruct *)
		ShmemInitStruct("DataChecksumsWorker Data",
						DataChecksumsShmemSize(),
						&found);
	/* Zero the struct only on first allocation, not when re-attaching */
	if (!found)
		MemSet(DataChecksumState, 0, DataChecksumsShmemSize());
}

/*
 * DatabaseExists
 *
 * Scans the system catalog to check if a database with the given Oid exist
 * and returns true if it is found, else false.
 */
static bool
DatabaseExists(Oid dboid)
{
	Relation	rel;
	ScanKeyData skey;
	SysScanDesc scan;
	bool		found;
	HeapTuple	tuple;

	StartTransactionCommand();

	/* Index scan on pg_database by oid under SnapshotSelf */
	rel = table_open(DatabaseRelationId, AccessShareLock);
	ScanKeyInit(&skey,
				Anum_pg_database_oid,
				BTEqualStrategyNumber, F_OIDEQ,
				dboid);
	scan = systable_beginscan(rel, DatabaseOidIndexId, true, SnapshotSelf,
							  1, &skey);
	tuple = systable_getnext(scan);
	found = HeapTupleIsValid(tuple);

	systable_endscan(scan);
	table_close(rel, AccessShareLock);

	CommitTransactionCommand();

	return found;
}

/*
 * BuildDatabaseList
 *		Compile a list of all currently available databases in the cluster
 *
 * This creates the list of databases for the datachecksumsworker workers to
 * add checksums to. If the caller wants to ensure that no concurrently
 * running CREATE DATABASE calls exist, this needs to be preceded by a call
 * to WaitForAllTransactionsToFinish().
+ */ +static List * +BuildDatabaseList(void) +{ + List *DatabaseList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(DatabaseRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_database pgdb = (Form_pg_database) GETSTRUCT(tup); + DataChecksumsWorkerDatabase *db; + + oldctx = MemoryContextSwitchTo(ctx); + + db = (DataChecksumsWorkerDatabase *) palloc0(sizeof(DataChecksumsWorkerDatabase)); + + db->dboid = pgdb->oid; + db->dbname = pstrdup(NameStr(pgdb->datname)); + + DatabaseList = lappend(DatabaseList, db); + + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return DatabaseList; +} + +static void +FreeDatabaseList(List *dblist) +{ + if (!dblist) + return; + + foreach_ptr(DataChecksumsWorkerDatabase, db, dblist) + { + if (db->dbname != NULL) + pfree(db->dbname); + } + + list_free_deep(dblist); +} + +/* + * BuildRelationList + * Compile a list of relations in the database + * + * Returns a list of OIDs for the request relation types. If temp_relations + * is True then only temporary relations are returned. If temp_relations is + * False then non-temporary relations which have data checksums are returned. + * If include_shared is True then shared relations are included as well in a + * non-temporary list. include_shared has no relevance when building a list of + * temporary relations. 
+ */ +static List * +BuildRelationList(bool temp_relations, bool include_shared) +{ + List *RelationList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(RelationRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup); + + /* Only include temporary relations when explicitly asked to */ + if (pgc->relpersistence == RELPERSISTENCE_TEMP) + { + if (!temp_relations) + continue; + } + else + { + /* + * If we are only interested in temp relations then continue + * immediately as the current relation isn't a temp relation. + */ + if (temp_relations) + continue; + + if (!RELKIND_HAS_STORAGE(pgc->relkind)) + continue; + + if (pgc->relisshared && !include_shared) + continue; + } + + oldctx = MemoryContextSwitchTo(ctx); + RelationList = lappend_oid(RelationList, pgc->oid); + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return RelationList; +} + +/* + * DataChecksumsWorkerMain + * + * Main function for enabling checksums in a single database, This is the + * function set as the bgw_function_name in the dynamic background worker + * process initiated for each database by the worker launcher. After enabling + * data checksums in each applicable relation in the database, it will wait for + * all temporary relations that were present when the function started to + * disappear before returning. This is required since we cannot rewrite + * existing temporary relations with data checksums. 
+ */ +void +DataChecksumsWorkerMain(Datum arg) +{ + Oid dboid = DatumGetObjectId(arg); + List *RelationList = NIL; + List *InitialTempTableList = NIL; + BufferAccessStrategy strategy; + bool aborted = false; + int64 rels_done; + + operation = ENABLE_DATACHECKSUMS; + + pqsignal(SIGTERM, die); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_WORKER; + init_ps_display(NULL); + + BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid, + BGWORKER_BYPASS_ALLOWCONN); + + /* worker will have a separate entry in pg_stat_progress_data_checksums */ + pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS, + InvalidOid); + + /* + * Get a list of all temp tables present as we start in this database. We + * need to wait until they are all gone until we are done, since we cannot + * access these relations and modify them. + */ + InitialTempTableList = BuildRelationList(true, false); + + /* + * Enable vacuum cost delay, if any. While this process isn't doing any + * vacuuming, we are re-using the infrastructure that vacuum cost delay + * provides rather than inventing something bespoke. This is an internal + * implementation detail and care should be taken to avoid it bleeding + * through to the user to avoid confusion. + */ + Assert(DataChecksumState->operation == ENABLE_DATACHECKSUMS); + VacuumCostDelay = DataChecksumState->cost_delay; + VacuumCostLimit = DataChecksumState->cost_limit; + VacuumCostActive = (VacuumCostDelay > 0); + VacuumCostBalance = 0; + VacuumCostPageHit = 0; + VacuumCostPageMiss = 0; + VacuumCostPageDirty = 0; + + /* + * Create and set the vacuum strategy as our buffer strategy. + */ + strategy = GetAccessStrategy(BAS_VACUUM); + + RelationList = BuildRelationList(false, + DataChecksumState->process_shared_catalogs); + + /* Update the total number of relations to be processed in this DB. 
*/ + { + const int index[] = { + PROGRESS_DATACHECKSUMS_RELS_TOTAL, + PROGRESS_DATACHECKSUMS_RELS_DONE + }; + + int64 vals[2]; + + vals[0] = list_length(RelationList); + vals[1] = 0; + + pgstat_progress_update_multi_param(2, index, vals); + } + + /* Process the relations */ + rels_done = 0; + foreach_oid(reloid, RelationList) + { + CHECK_FOR_INTERRUPTS(); + + if (!ProcessSingleRelationByOid(reloid, strategy)) + { + aborted = true; + break; + } + + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_RELS_DONE, + ++rels_done); + } + list_free(RelationList); + + if (aborted) + { + DataChecksumState->success = DATACHECKSUMSWORKER_ABORTED; + ereport(DEBUG1, + errmsg("data checksum processing aborted in database OID %u", + dboid)); + return; + } + + /* The worker is about to wait for temporary tables to go away. */ + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL); + + /* + * Wait for all temp tables that existed when we started to go away. This + * is necessary since we cannot "reach" them to enable checksums. Any temp + * tables created after we started will already have checksums in them + * (due to the "inprogress-on" state), so no need to wait for those. + */ + for (;;) + { + List *CurrentTempTables; + int numleft; + char activity[64]; + + CurrentTempTables = BuildRelationList(true, false); + numleft = 0; + foreach_oid(tmptbloid, InitialTempTableList) + { + if (list_member_oid(CurrentTempTables, tmptbloid)) + numleft++; + } + list_free(CurrentTempTables); + + INJECTION_POINT("datachecksumsworker-fake-temptable-wait", &numleft); + + if (numleft == 0) + break; + + /* + * At least one temp table is left to wait for, indicate in pgstat + * activity and progress reporting. 
+ */ + snprintf(activity, + sizeof(activity), + "Waiting for %d temp tables to be removed", numleft); + pgstat_report_activity(STATE_RUNNING, activity); + + /* Retry every 3 seconds */ + ResetLatch(MyLatch); + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 3000, + WAIT_EVENT_CHECKSUM_ENABLE_TEMPTABLE_WAIT); + + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + aborted = DataChecksumState->launch_operation != operation; + LWLockRelease(DataChecksumsWorkerLock); + + CHECK_FOR_INTERRUPTS(); + + if (aborted || abort_requested) + { + DataChecksumState->success = DATACHECKSUMSWORKER_ABORTED; + ereport(LOG, + errmsg("data checksum processing aborted in database OID %u", + dboid)); + return; + } + } + + list_free(InitialTempTableList); + + /* worker done */ + pgstat_progress_end_command(); + + DataChecksumState->success = DATACHECKSUMSWORKER_SUCCESSFUL; +} diff --git a/src/backend/postmaster/meson.build b/src/backend/postmaster/meson.build index e1f70726604..6cba23bbeef 100644 --- a/src/backend/postmaster/meson.build +++ b/src/backend/postmaster/meson.build @@ -6,6 +6,7 @@ backend_sources += files( 'bgworker.c', 'bgwriter.c', 'checkpointer.c', + 'datachecksum_state.c', 'fork_process.c', 'interrupt.c', 'launch_backend.c', diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index abf0c97569e..eb4f3eb72d4 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -2991,6 +2991,11 @@ PostmasterStateMachine(void) B_INVALID, B_STANDALONE_BACKEND); + /* also add data checksums processes */ + remainMask = btmask_add(remainMask, + B_DATACHECKSUMSWORKER_LAUNCHER, + B_DATACHECKSUMSWORKER_WORKER); + /* All types should be included in targetMask or remainMask */ Assert((remainMask.mask | targetMask.mask) == BTYPE_MASK_ALL.mask); } diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 3c027bcb2f7..57aaef57c61 100644 --- 
a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -189,6 +189,22 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) } } +void +xlog2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & ~XLR_INFO_MASK; + + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(buf->record), buf->origptr); + + switch (info) + { + case XLOG2_CHECKSUMS: + break; + default: + elog(ERROR, "unexpected RM_XLOG2_ID record type: %u", info); + } +} + /* * Handle rmgr XACT_ID records for LogicalDecodingProcessRecord(). */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 5c64570020d..3cc0b0bdd92 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8567,6 +8567,13 @@ buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES) piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE; + /* + * If the buffers are marked for zero on error, we want to log that in + * case of a checksum failure. + */ + if (flags & READ_BUFFERS_ZERO_ON_ERROR) + piv_flags |= PIV_ZERO_BUFFERS_ON_ERROR; + /* Check for garbage data. 
*/ if (!failed) { diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index d692d419846..7aab5da3386 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -31,6 +31,7 @@ #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" +#include "postmaster/datachecksum_state.h" #include "postmaster/walsummarizer.h" #include "replication/logicallauncher.h" #include "replication/origin.h" @@ -142,6 +143,7 @@ CalculateShmemSize(void) size = add_size(size, AioShmemSize()); size = add_size(size, WaitLSNShmemSize()); size = add_size(size, LogicalDecodingCtlShmemSize()); + size = add_size(size, DataChecksumsShmemSize()); /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); @@ -310,6 +312,7 @@ CreateOrAttachShmemStructs(void) PgArchShmemInit(); ApplyLauncherShmemInit(); SlotSyncShmemInit(); + DataChecksumsShmemInit(); /* * Set up other modules that need some shared memory space diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 7e017c8d53b..f1ab3aa3fe0 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -22,6 +22,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "port/pg_bitutils.h" +#include "postmaster/datachecksum_state.h" #include "replication/logicalctl.h" #include "replication/logicalworker.h" #include "replication/walsender.h" @@ -582,6 +583,13 @@ ProcessProcSignalBarrier(void) case PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO: processed = ProcessBarrierUpdateXLogLogicalInfo(); break; + + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON: + case PROCSIGNAL_BARRIER_CHECKSUM_ON: + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF: + case PROCSIGNAL_BARRIER_CHECKSUM_OFF: + processed = AbsorbDataChecksumsBarrier(type); + break; } /* diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README index 
e30d7ac59ad..73c36a63908 100644 --- a/src/backend/storage/page/README +++ b/src/backend/storage/page/README @@ -10,7 +10,9 @@ http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed 2010/12/22 on -hackers list. Current implementation requires this be enabled system-wide at initdb time, or -by using the pg_checksums tool on an offline cluster. +by using the pg_checksums tool on an offline cluster. Checksums can also be +enabled at runtime using pg_enable_data_checksums(), and disabled by using +pg_disable_data_checksums(). The checksum is not valid at all times on a data page!! The checksum is valid when the page leaves the shared pool and is checked diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 56f1f7ae9fc..1fdfda59edd 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -107,7 +107,15 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail */ if (!PageIsNew(page)) { - if (DataChecksumsEnabled()) + /* + * There shouldn't be any check for interrupt calls happening in this + * codepath, but just to be on the safe side we hold interrupts since + * if they did happen the data checksum state could change during + * verifying checksums, which could lead to incorrect verification + * results. + */ + HOLD_INTERRUPTS(); + if (DataChecksumsNeedVerify()) { checksum = pg_checksum_page(page, blkno); @@ -118,6 +126,7 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail *checksum_failure_p = true; } } + RESUME_INTERRUPTS(); /* * The following checks don't prove the header is correct, only that @@ -151,8 +160,9 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail if ((flags & (PIV_LOG_WARNING | PIV_LOG_LOG)) != 0) ereport(flags & PIV_LOG_WARNING ? 
WARNING : LOG, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("page verification failed, calculated checksum %u but expected %u", - checksum, p->pd_checksum))); + errmsg("page verification failed, calculated checksum %u but expected %u%s", + checksum, p->pd_checksum, + (flags & PIV_ZERO_BUFFERS_ON_ERROR ? ", buffer will be zeroed" : "")))); if (header_sane && (flags & PIV_IGNORE_CHECKSUM_FAILURE)) return true; @@ -1507,9 +1517,14 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum, void PageSetChecksum(Page page, BlockNumber blkno) { + HOLD_INTERRUPTS(); /* If we don't need a checksum, just return */ - if (PageIsNew(page) || !DataChecksumsEnabled()) + if (PageIsNew(page) || !DataChecksumsNeedWrite()) + { + RESUME_INTERRUPTS(); return; + } ((PageHeader) page)->pd_checksum = pg_checksum_page(page, blkno); + RESUME_INTERRUPTS(); } diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 7727fed3bda..04fe13e64c6 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -380,6 +380,8 @@ pgstat_tracks_backend_bktype(BackendType bktype) case B_CHECKPOINTER: case B_IO_WORKER: case B_STARTUP: + case B_DATACHECKSUMSWORKER_LAUNCHER: + case B_DATACHECKSUMSWORKER_WORKER: return false; case B_AUTOVAC_WORKER: diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 28de24538dc..2be26e92283 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -362,6 +362,8 @@ pgstat_tracks_io_bktype(BackendType bktype) case B_LOGGER: return false; + case B_DATACHECKSUMSWORKER_LAUNCHER: + case B_DATACHECKSUMSWORKER_WORKER: case B_AUTOVAC_LAUNCHER: case B_AUTOVAC_WORKER: case B_BACKEND: diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 6be80d2daad..0a6d16f8154 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ 
b/src/backend/utils/activity/wait_event_names.txt @@ -119,6 +119,8 @@ CHECKPOINT_DELAY_COMPLETE "Waiting for a backend that blocks a checkpoint from c CHECKPOINT_DELAY_START "Waiting for a backend that blocks a checkpoint from starting." CHECKPOINT_DONE "Waiting for a checkpoint to complete." CHECKPOINT_START "Waiting for a checkpoint to start." +CHECKSUM_ENABLE_STARTCONDITION "Waiting for data checksums enabling to start." +CHECKSUM_ENABLE_TEMPTABLE_WAIT "Waiting for temporary tables to be dropped for data checksums to be enabled." EXECUTE_GATHER "Waiting for activity from a child process while executing a Gather plan node." HASH_BATCH_ALLOCATE "Waiting for an elected Parallel Hash participant to allocate a hash table." HASH_BATCH_ELECT "Waiting to elect a Parallel Hash participant to allocate a hash table." @@ -365,6 +367,7 @@ SerialControl "Waiting to read or update shared pg_serial s AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." WaitLSN "Waiting to read or update shared Wait-for-LSN state." LogicalDecodingControl "Waiting to read or update logical decoding status information." +DataChecksumsWorker "Waiting for data checksums worker." 
# # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 9185a8e6b83..1408de387ea 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -297,6 +297,8 @@ pg_stat_get_progress_info(PG_FUNCTION_ARGS) cmdtype = PROGRESS_COMMAND_BASEBACKUP; else if (pg_strcasecmp(cmd, "COPY") == 0) cmdtype = PROGRESS_COMMAND_COPY; + else if (pg_strcasecmp(cmd, "DATACHECKSUMS") == 0) + cmdtype = PROGRESS_COMMAND_DATACHECKSUMS; else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -1182,9 +1184,6 @@ pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS) int64 result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) - PG_RETURN_NULL(); - if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) result = 0; else @@ -1200,9 +1199,6 @@ pg_stat_get_db_checksum_last_failure(PG_FUNCTION_ARGS) TimestampTz result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) - PG_RETURN_NULL(); - if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) result = 0; else diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index ba191977697..7ffc808073a 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -845,7 +845,8 @@ InitializeSessionUserIdStandalone(void) * workers, in slot sync worker and in background workers. 
*/ Assert(!IsUnderPostmaster || AmAutoVacuumWorkerProcess() || - AmLogicalSlotSyncWorkerProcess() || AmBackgroundWorkerProcess()); + AmLogicalSlotSyncWorkerProcess() || AmBackgroundWorkerProcess() || + AmDataChecksumsWorkerProcess()); /* call only once */ Assert(!OidIsValid(AuthenticatedUserId)); diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 783a7400464..6f074013aa9 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -758,6 +758,24 @@ InitPostgres(const char *in_dbname, Oid dboid, ProcSignalInit(MyCancelKey, MyCancelKeyLength); + /* + * Initialize a local cache of the data_checksum_version, to be updated by + * the procsignal-based barriers. + * + * This intentionally happens after initializing the procsignal, otherwise + * we might miss a state change. This means we can get a barrier for the + * state we've just initialized. + * + * The postmaster (which is what gets forked into the new child process) + * does not handle barriers, therefore it may not have the current value + * of LocalDataChecksumVersion value (it'll have the value read from the + * control file, which may be arbitrarily old). + * + * NB: Even if the postmaster handled barriers, the value might still be + * stale, as it might have changed after this process forked. + */ + InitLocalDataChecksumState(); + /* * Also set up timeout handlers needed for backend operation. We need * these in every case except bootstrap. @@ -886,7 +904,7 @@ InitPostgres(const char *in_dbname, Oid dboid, errhint("You should immediately run CREATE USER \"%s\" SUPERUSER;.", username != NULL ? 
username : "postgres"))); } - else if (AmBackgroundWorkerProcess()) + else if (AmBackgroundWorkerProcess() || AmDataChecksumsWorkerProcess()) { if (username == NULL && !OidIsValid(useroid)) { diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index fc0900efe5f..a315c4ab8ab 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -571,11 +571,12 @@ max => '1.0', }, -{ name => 'data_checksums', type => 'bool', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', +{ name => 'data_checksums', type => 'enum', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', short_desc => 'Shows whether data checksums are turned on for this cluster.', flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED', variable => 'data_checksums', - boot_val => 'false', + boot_val => 'PG_DATA_CHECKSUM_OFF', + options => 'data_checksums_options', }, # Can't be set by ALTER SYSTEM as it can lead to recursive definition diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 1e14b7b4af0..d9ca13baff9 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -501,6 +501,14 @@ static const struct config_enum_entry file_extend_method_options[] = { {NULL, 0, false} }; +static const struct config_enum_entry data_checksums_options[] = { + {"on", PG_DATA_CHECKSUM_VERSION, true}, + {"off", PG_DATA_CHECKSUM_OFF, true}, + {"inprogress-on", PG_DATA_CHECKSUM_INPROGRESS_ON, true}, + {"inprogress-off", PG_DATA_CHECKSUM_INPROGRESS_OFF, true}, + {NULL, 0, false} +}; + /* * Options for enum values stored in other modules */ @@ -629,7 +637,6 @@ static int shared_memory_size_in_huge_pages; static int wal_block_size; static int num_os_semaphores; static int effective_wal_level = WAL_LEVEL_REPLICA; -static bool data_checksums; static bool integer_datetimes; #ifdef USE_ASSERT_CHECKING diff --git 
a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index c8194c27aa7..6d0337853e0 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -543,11 +543,11 @@ # archiver autovacuum # backend bgworker # bgwriter checkpointer - # ioworker postmaster - # slotsyncworker startup - # syslogger walreceiver - # walsummarizer walwriter - # walsender + # checksums ioworker + # postmaster slotsyncworker + # startup syslogger + # walreceiver walsummarizer + # walwriter walsender # # Level values in order of decreasing # detail: diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 301e256fbb1..2a38f1d688b 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -585,7 +585,7 @@ main(int argc, char *argv[]) ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) pg_fatal("cluster must be shut down"); - if (ControlFile->data_checksum_version == 0 && + if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_VERSION && mode == PG_MODE_CHECK) pg_fatal("data checksums are not enabled in cluster"); @@ -593,7 +593,7 @@ main(int argc, char *argv[]) mode == PG_MODE_DISABLE) pg_fatal("data checksums are already disabled in cluster"); - if (ControlFile->data_checksum_version > 0 && + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION && mode == PG_MODE_ENABLE) pg_fatal("data checksums are already enabled in cluster"); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index a4060309ae0..fe5fc5ec133 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -287,6 +287,8 @@ main(int argc, char *argv[]) ControlFile->checkPointCopy.oldestCommitTsXid); printf(_("Latest checkpoint's newestCommitTsXid:%u\n"), ControlFile->checkPointCopy.newestCommitTsXid); + printf(_("Latest checkpoint's data_checksum_version:%u\n"), + 
ControlFile->checkPointCopy.dataChecksumState); printf(_("Time of latest checkpoint: %s\n"), ckpttime_str); printf(_("Fake LSN counter for unlogged rels: %X/%08X\n"), diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index aa6e8b4de5d..79053d22dcc 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -15,6 +15,7 @@ #include "access/xlog_internal.h" #include "common/string.h" #include "pg_upgrade.h" +#include "storage/checksum.h" /* @@ -736,6 +737,14 @@ check_control_data(ControlData *oldctrl, * check_for_isn_and_int8_passing_mismatch(). */ + /* + * If data checksums are in any in-progress state then disallow the + * upgrade. The user should either let the process finish, or turn off + * data checksums, before retrying. + */ + if (oldctrl->data_checksum_version > PG_DATA_CHECKSUM_VERSION) + pg_fatal("checksums are being enabled in the old cluster"); + /* * We might eventually allow upgrades from checksum to no-checksum * clusters. 
diff --git a/src/bin/pg_waldump/t/001_basic.pl b/src/bin/pg_waldump/t/001_basic.pl index a268f0f1dd0..7dd1c3dd63e 100644 --- a/src/bin/pg_waldump/t/001_basic.pl +++ b/src/bin/pg_waldump/t/001_basic.pl @@ -79,7 +79,8 @@ BRIN CommitTs ReplicationOrigin Generic -LogicalMessage$/, +LogicalMessage +XLOG2$/, 'rmgr list'); diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 3352b5f8532..ae32ef16d67 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL) PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL) PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, logicalmsg_decode) +PG_RMGR(RM_XLOG2_ID, "XLOG2", xlog2_redo, xlog2_desc, xlog2_identify, NULL, NULL, NULL, xlog2_decode) diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index dcc12eb8cbe..4af38e74ce4 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -57,6 +57,7 @@ extern PGDLLIMPORT int CommitDelay; extern PGDLLIMPORT int CommitSiblings; extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; +extern PGDLLIMPORT int data_checksums; extern PGDLLIMPORT int CheckPointSegments; @@ -119,7 +120,7 @@ extern PGDLLIMPORT bool XLogLogicalInfo; * of the bits make it to disk, but the checksum wouldn't match. Also WAL-log * them if forced by wal_log_hints=on. */ -#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints) +#define XLogHintBitIsNeeded() (wal_log_hints || DataChecksumsNeedWrite()) /* Do we need to WAL-log information required only for Hot Standby and logical replication? 
*/ #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA) @@ -229,8 +230,11 @@ extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); extern XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); extern void xlog_redo(struct XLogReaderState *record); +extern void xlog2_redo(struct XLogReaderState *record); extern void xlog_desc(StringInfo buf, struct XLogReaderState *record); +extern void xlog2_desc(StringInfo buf, struct XLogReaderState *record); extern const char *xlog_identify(uint8 info); +extern const char *xlog2_identify(uint8 info); extern void issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli); @@ -243,7 +247,16 @@ extern XLogRecPtr GetXLogWriteRecPtr(void); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); -extern bool DataChecksumsEnabled(void); +extern bool DataChecksumsNeedWrite(void); +extern bool DataChecksumsNeedVerify(void); +extern bool DataChecksumsInProgressOn(void); +extern void SetDataChecksumsOnInProgress(void); +extern void SetDataChecksumsOn(void); +extern void SetDataChecksumsOff(void); +extern const char *show_data_checksums(void); +extern const char *get_checksum_state_string(uint32 state); +extern void InitLocalDataChecksumState(void); +extern void SetLocalDataChecksumState(uint32 data_checksum_version); extern bool GetDefaultCharSignedness(void); extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern Size XLOGShmemSize(void); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 755835d63bf..10c18d39ff8 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -25,6 +25,7 @@ #include "lib/stringinfo.h" #include "pgtime.h" #include "storage/block.h" +#include "storage/checksum.h" #include "storage/relfilelocator.h" @@ -287,6 +288,12 @@ typedef struct xl_restore_point char rp_name[MAXFNAMELEN]; } xl_restore_point; +/* Information logged when data checksum level is changed */ +typedef struct xl_checksum_state 
+{ + ChecksumStateType new_checksum_state; +} xl_checksum_state; + /* Overwrite of prior contrecord */ typedef struct xl_overwrite_contrecord { @@ -307,6 +314,7 @@ typedef struct xl_end_of_recovery typedef struct xl_checkpoint_redo { int wal_level; + uint32 data_checksum_version; } xl_checkpoint_redo; /* diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index b1c5afc15df..582bb2e2058 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202604021 +#define CATALOG_VERSION_NO 202604031 #endif diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 77a661e818b..80b3a730e03 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -22,7 +22,7 @@ /* Version identifier for this pg_control format */ -#define PG_CONTROL_VERSION 1901 +#define PG_CONTROL_VERSION 1902 /* Nonce key length, see below */ #define MOCK_AUTH_NONCE_LEN 32 @@ -63,6 +63,9 @@ typedef struct CheckPoint * set to InvalidTransactionId. */ TransactionId oldestActiveXid; + + /* data checksums state at the time of the checkpoint */ + uint32 dataChecksumState; } CheckPoint; /* XLOG info values for XLOG rmgr */ @@ -83,6 +86,9 @@ typedef struct CheckPoint #define XLOG_CHECKPOINT_REDO 0xE0 #define XLOG_LOGICAL_DECODING_STATUS_CHANGE 0xF0 +/* XLOG info values for XLOG2 rmgr */ +#define XLOG2_CHECKSUMS 0x00 + /* * System status indicator. 
Note this is stored in pg_control; if you change diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index acf16254b21..bd177aebfcb 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12558,6 +12558,20 @@ proname => 'jsonb_subscript_handler', prorettype => 'internal', proargtypes => 'internal', prosrc => 'jsonb_subscript_handler' }, +# data checksum management functions +{ oid => '9258', + descr => 'disable data checksums', + proname => 'pg_disable_data_checksums', provolatile => 'v', prorettype => 'void', + proparallel => 'r', prosrc => 'disable_data_checksums', proargtypes => '', + proacl => '{POSTGRES=X}'}, +{ oid => '9257', + descr => 'enable data checksums', + proname => 'pg_enable_data_checksums', provolatile => 'v', prorettype => 'void', + proparallel => 'r', proargtypes => 'int4 int4', proallargtypes => '{int4,int4}', + proargmodes => '{i,i}', proargnames => '{cost_delay,cost_limit}', + proargdefaults => '{0,100}', prosrc => 'enable_data_checksums', + proacl => '{POSTGRES=X}'}, + # collation management functions { oid => '3445', descr => 'import collations from operating system', proname => 'pg_import_system_collations', procost => '100', diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index 9c40772706c..67948667a97 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -185,4 +185,20 @@ #define PROGRESS_COPY_TYPE_PIPE 3 #define PROGRESS_COPY_TYPE_CALLBACK 4 +/* Progress parameters for PROGRESS_DATACHECKSUMS */ +#define PROGRESS_DATACHECKSUMS_PHASE 0 +#define PROGRESS_DATACHECKSUMS_DBS_TOTAL 1 +#define PROGRESS_DATACHECKSUMS_DBS_DONE 2 +#define PROGRESS_DATACHECKSUMS_RELS_TOTAL 3 +#define PROGRESS_DATACHECKSUMS_RELS_DONE 4 +#define PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL 5 +#define PROGRESS_DATACHECKSUMS_BLOCKS_DONE 6 + +/* Phases of datachecksumsworker operation */ +#define PROGRESS_DATACHECKSUMS_PHASE_ENABLING 0 +#define 
PROGRESS_DATACHECKSUMS_PHASE_DISABLING 1 +#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL 2 +#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_BARRIER 3 +#define PROGRESS_DATACHECKSUMS_PHASE_DONE 4 + #endif diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 04f29748be7..7277c37e779 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -367,6 +367,9 @@ typedef enum BackendType B_WAL_SUMMARIZER, B_WAL_WRITER, + B_DATACHECKSUMSWORKER_LAUNCHER, + B_DATACHECKSUMSWORKER_WORKER, + /* * Logger is not connected to shared memory and does not have a PGPROC * entry. @@ -392,6 +395,9 @@ extern PGDLLIMPORT BackendType MyBackendType; #define AmWalSummarizerProcess() (MyBackendType == B_WAL_SUMMARIZER) #define AmWalWriterProcess() (MyBackendType == B_WAL_WRITER) #define AmIoWorkerProcess() (MyBackendType == B_IO_WORKER) +#define AmDataChecksumsWorkerProcess() \ + (MyBackendType == B_DATACHECKSUMSWORKER_LAUNCHER || \ + MyBackendType == B_DATACHECKSUMSWORKER_WORKER) #define AmSpecialWorkerProcess() \ (AmAutoVacuumLauncherProcess() || \ diff --git a/src/include/postmaster/datachecksum_state.h b/src/include/postmaster/datachecksum_state.h new file mode 100644 index 00000000000..343494edcc8 --- /dev/null +++ b/src/include/postmaster/datachecksum_state.h @@ -0,0 +1,58 @@ +/*------------------------------------------------------------------------- + * + * datachecksum_state.h + * header file for data checksum helper background worker and data + * checksum state manipulation + * + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/postmaster/datachecksum_state.h + * + *------------------------------------------------------------------------- + */ +#ifndef DATACHECKSUM_STATE_H +#define DATACHECKSUM_STATE_H + +#include "storage/procsignal.h" + +/* Shared memory */ +extern Size DataChecksumsShmemSize(void); +extern void 
DataChecksumsShmemInit(void); + +/* Possible operations the Datachecksumsworker can perform */ +typedef enum DataChecksumsWorkerOperation +{ + ENABLE_DATACHECKSUMS, + DISABLE_DATACHECKSUMS, +} DataChecksumsWorkerOperation; + +/* + * Possible states for a database entry which has been processed. Exported + * here since we want to be able to reference this from injection point tests. + */ +typedef enum +{ + DATACHECKSUMSWORKER_SUCCESSFUL = 0, + DATACHECKSUMSWORKER_ABORTED, + DATACHECKSUMSWORKER_FAILED, + DATACHECKSUMSWORKER_DROPDB, +} DataChecksumsWorkerResult; + +/* Prototypes for data checksum state manipulation */ +bool AbsorbDataChecksumsBarrier(ProcSignalBarrierType target_state); +void EmitAndWaitDataChecksumsBarrier(uint32 state); + +/* Prototypes for data checksum background worker */ + +/* Start the background processes for enabling or disabling checksums */ +void StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op, + int cost_delay, + int cost_limit); + +/* Background worker entrypoints */ +void DataChecksumsWorkerLauncherMain(Datum arg); +void DataChecksumsWorkerMain(Datum arg); + +#endif /* DATACHECKSUM_STATE_H */ diff --git a/src/include/postmaster/proctypelist.h b/src/include/postmaster/proctypelist.h index feac19ba207..b3477e6f17a 100644 --- a/src/include/postmaster/proctypelist.h +++ b/src/include/postmaster/proctypelist.h @@ -38,6 +38,8 @@ PG_PROCTYPE(B_BACKEND, "backend", gettext_noop("client backend"), BackendMain, t PG_PROCTYPE(B_BG_WORKER, "bgworker", gettext_noop("background worker"), BackgroundWorkerMain, true) PG_PROCTYPE(B_BG_WRITER, "bgwriter", gettext_noop("background writer"), BackgroundWriterMain, true) PG_PROCTYPE(B_CHECKPOINTER, "checkpointer", gettext_noop("checkpointer"), CheckpointerMain, true) +PG_PROCTYPE(B_DATACHECKSUMSWORKER_LAUNCHER, "checksums", gettext_noop("datachecksum launcher"), NULL, false) +PG_PROCTYPE(B_DATACHECKSUMSWORKER_WORKER, "checksums", gettext_noop("datachecksum worker"), NULL, false) 
PG_PROCTYPE(B_DEAD_END_BACKEND, "backend", gettext_noop("dead-end client backend"), BackendMain, true) PG_PROCTYPE(B_INVALID, "postmaster", gettext_noop("unrecognized"), NULL, false) PG_PROCTYPE(B_IO_WORKER, "ioworker", gettext_noop("io worker"), IoWorkerMain, true) diff --git a/src/include/replication/decode.h b/src/include/replication/decode.h index 49f00fc48b8..107e43ef750 100644 --- a/src/include/replication/decode.h +++ b/src/include/replication/decode.h @@ -22,6 +22,7 @@ typedef struct XLogRecordBuffer } XLogRecordBuffer; extern void xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +extern void xlog2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); extern void heap_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); extern void heap2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); extern void xact_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index e5267b93fe6..634e1e49ee5 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -230,7 +230,6 @@ typedef PageHeaderData *PageHeader; * handling pages. */ #define PG_PAGE_LAYOUT_VERSION 4 -#define PG_DATA_CHECKSUM_VERSION 1 /* ---------------------------------------------------------------- * page support functions @@ -501,6 +500,7 @@ do { \ #define PIV_LOG_WARNING (1 << 0) #define PIV_LOG_LOG (1 << 1) #define PIV_IGNORE_CHECKSUM_FAILURE (1 << 2) +#define PIV_ZERO_BUFFERS_ON_ERROR (1 << 3) #define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap) \ PageAddItemExtended(page, item, size, offsetNumber, \ diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h index ff417d5ae3e..3b1440c0c95 100644 --- a/src/include/storage/checksum.h +++ b/src/include/storage/checksum.h @@ -15,6 +15,22 @@ #include "storage/block.h" +/* + * Checksum state 0 is used for when data checksums are disabled (OFF). 
+ * PG_DATA_CHECKSUM_INPROGRESS_{ON|OFF} defines that data checksums are either + * currently being enabled or disabled, and PG_DATA_CHECKSUM_VERSION defines + * that data checksums are enabled. The ChecksumStateType is stored in + * pg_control so changing requires a catversion bump, and the values cannot + * be reordered. New states must be added at the end. + */ +typedef enum ChecksumStateType +{ + PG_DATA_CHECKSUM_OFF = 0, + PG_DATA_CHECKSUM_VERSION = 1, + PG_DATA_CHECKSUM_INPROGRESS_OFF = 2, + PG_DATA_CHECKSUM_INPROGRESS_ON = 3, +} ChecksumStateType; + /* * Compute the checksum for a Postgres page. The page must be aligned on a * 4-byte boundary. diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index 59ee097977d..af8553bcb6c 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -87,6 +87,7 @@ PG_LWLOCK(52, SerialControl) PG_LWLOCK(53, AioWorkerSubmissionQueue) PG_LWLOCK(54, WaitLSN) PG_LWLOCK(55, LogicalDecodingControl) +PG_LWLOCK(56, DataChecksumsWorker) /* * There also exist several built-in LWLock tranches. 
As with the predefined diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 348fba53a93..cc4f26aa33d 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -48,6 +48,10 @@ typedef enum PROCSIGNAL_BARRIER_SMGRRELEASE, /* ask smgr to close files */ PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO, /* ask to update * XLogLogicalInfo */ + PROCSIGNAL_BARRIER_CHECKSUM_OFF, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF, + PROCSIGNAL_BARRIER_CHECKSUM_ON, } ProcSignalBarrierType; /* diff --git a/src/include/utils/backend_progress.h b/src/include/utils/backend_progress.h index 6300dbd15d5..61e13c40e28 100644 --- a/src/include/utils/backend_progress.h +++ b/src/include/utils/backend_progress.h @@ -28,6 +28,7 @@ typedef enum ProgressCommandType PROGRESS_COMMAND_BASEBACKUP, PROGRESS_COMMAND_COPY, PROGRESS_COMMAND_REPACK, + PROGRESS_COMMAND_DATACHECKSUMS, } ProgressCommandType; #define PGSTAT_NUM_PROGRESS_PARAM 20 diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 28ce3b35eda..864b407abcf 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -20,6 +20,7 @@ SUBDIRS = \ test_bitmapset \ test_bloomfilter \ test_cloexec \ + test_checksums \ test_copy_callbacks \ test_custom_rmgrs \ test_custom_stats \ diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 3ac291656c1..e5acacd5083 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -20,6 +20,7 @@ subdir('test_binaryheap') subdir('test_bitmapset') subdir('test_bloomfilter') subdir('test_cloexec') +subdir('test_checksums') subdir('test_copy_callbacks') subdir('test_cplusplusext') subdir('test_custom_rmgrs') diff --git a/src/test/modules/test_checksums/.gitignore b/src/test/modules/test_checksums/.gitignore new file mode 100644 index 00000000000..871e943d50e --- /dev/null +++ b/src/test/modules/test_checksums/.gitignore @@ -0,0 +1,2 @@ +# 
Generated by test suite +/tmp_check/ diff --git a/src/test/modules/test_checksums/Makefile b/src/test/modules/test_checksums/Makefile new file mode 100644 index 00000000000..fa85b79ae57 --- /dev/null +++ b/src/test/modules/test_checksums/Makefile @@ -0,0 +1,40 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/test/modules/test_checksums +# +# Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/test/modules/test_checksums/Makefile +# +#------------------------------------------------------------------------- + +EXTRA_INSTALL = src/test/modules/injection_points + +export enable_injection_points + +MODULE_big = test_checksums +OBJS = \ + $(WIN32RES) \ + test_checksums.o +PGFILEDESC = "test_checksums - test code for data checksums" + +EXTENSION = test_checksums +DATA = test_checksums--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_checksums +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) diff --git a/src/test/modules/test_checksums/README b/src/test/modules/test_checksums/README new file mode 100644 index 00000000000..6a23e4ff0ae --- /dev/null +++ b/src/test/modules/test_checksums/README @@ -0,0 +1,30 @@ +src/test/modules/test_checksums/README + +Regression tests for data checksums +=================================== +This directory contains a test suite for enabling, and disabling, data +checksums both offline as well as in a running cluster. 
+ +Running the tests with autoconf +=============================== + + make check + +or + + make installcheck + +Running the tests with meson +============================ +From your build directory, issue the following command: + + meson test -q --print-errorlogs --suite setup --suite test_checksums + +NOTE: This creates a temporary installation (in the case of "make check" or +"--suite setup"), with multiple nodes, be they master or standby(s) for the +purpose of the tests. + +NOTE: This test suite requires TAP tests to be enabled, a subset of the tests +also require injection points to function. In order to run the extended test +then "checksum_extended" must be set in the PG_TEST_EXTRA environment +variable. diff --git a/src/test/modules/test_checksums/meson.build b/src/test/modules/test_checksums/meson.build new file mode 100644 index 00000000000..9b1421a9b91 --- /dev/null +++ b/src/test/modules/test_checksums/meson.build @@ -0,0 +1,38 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +test_checksums_sources = files( + 'test_checksums.c', +) + +test_checksums = shared_module('test_checksums', + test_checksums_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += test_checksums + +test_install_data += files( + 'test_checksums.control', + 'test_checksums--1.0.sql', +) + +tests += { + 'name': 'test_checksums', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 
'yes' : 'no', + }, + 'tests': [ + 't/001_basic.pl', + 't/002_restarts.pl', + 't/003_standby_restarts.pl', + 't/004_offline.pl', + 't/005_injection.pl', + 't/006_pgbench_single.pl', + 't/007_pgbench_standby.pl', + 't/008_pitr.pl', + 't/009_fpi.pl', + ], + }, +} diff --git a/src/test/modules/test_checksums/t/001_basic.pl b/src/test/modules/test_checksums/t/001_basic.pl new file mode 100644 index 00000000000..c008e95fbff --- /dev/null +++ b/src/test/modules/test_checksums/t/001_basic.pl @@ -0,0 +1,63 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize node with checksums disabled. +my $node = PostgreSQL::Test::Cluster->new('basic_node'); +$node->init(no_data_checksums => 1); +$node->start; + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are turned off +test_checksum_state($node, 'off'); + +# Enable data checksums and wait for the state transition to 'on' +enable_data_checksums($node, wait => 'on'); + +# Run a dummy query just to make sure we can read back data +my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1 "); +is($result, '9999', 'ensure checksummed pages can be read back'); + +# Enable data checksums again which should be a no-op so we explicitly don't +# wait for any state transition as none should happen here +enable_data_checksums($node); +test_checksum_state($node, 'on'); +# ..and make sure we can still read/write data +$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '10000', 'ensure checksummed pages 
can be read back'); + +# Disable checksums again and wait for the state transition +disable_data_checksums($node, wait => 1); + +# Test reading data again +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '10000', 'ensure previously checksummed pages can be read back'); + +# Re-enable checksums and make sure that the underlying data has changed to +# ensure that checksums will be different. +$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +enable_data_checksums($node, wait => 'on'); + +# Run a dummy query just to make sure we can read back the data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/002_restarts.pl b/src/test/modules/test_checksums/t/002_restarts.pl new file mode 100644 index 00000000000..bab59be82bd --- /dev/null +++ b/src/test/modules/test_checksums/t/002_restarts.pl @@ -0,0 +1,110 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with a +# restart which breaks processing. +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize node with checksums disabled. 
+my $node = PostgreSQL::Test::Cluster->new('restarts_node'); +$node->init(no_data_checksums => 1); +$node->start; + +# Initialize result storage for queries +my $result; + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are disabled +test_checksum_state($node, 'off'); + +SKIP: +{ + skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 6 + if (!$ENV{PG_TEST_EXTRA} + || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/); + + # Create a barrier for checksum enablement to block on, in this case a pre- + # existing temporary table which is kept open while processing is started. + # We can accomplish this by setting up an interactive psql process which + # keeps the temporary table created as we enable checksums in another psql + # process. + # + # This is a similar test to the synthetic variant in 005_injection.pl + # which fakes this scenario. + my $bsession = $node->background_psql('postgres'); + $bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);'); + + # In another session, make sure we can see the blocking temp table but + # start processing anyways and check that we are blocked with a proper + # wait event. + $result = $node->safe_psql('postgres', + "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';" + ); + is($result, 't', 'ensure we can see the temporary table'); + + # Enabling data checksums shouldn't work as the process is blocked on the + # temporary table held open by $bsession. Ensure that we reach inprogress- + # on before we do more tests. + enable_data_checksums($node, wait => 'inprogress-on'); + + # Wait for processing to finish and the worker waiting for leftover temp + # relations to be able to actually finish + $result = $node->poll_query_until( + 'postgres', + "SELECT wait_event FROM pg_catalog.pg_stat_activity " + . 
"WHERE backend_type = 'datachecksum worker';", + 'ChecksumEnableTemptableWait'); + + # The datachecksumsworker waits for temporary tables to disappear for 3 + # seconds before retrying, so sleep for 4 seconds to be guaranteed to see + # a retry cycle + sleep(4); + + # Re-check the wait event to ensure we are blocked on the right thing. + $result = $node->safe_psql('postgres', + "SELECT wait_event FROM pg_catalog.pg_stat_activity " + . "WHERE backend_type = 'datachecksum worker';"); + is($result, 'ChecksumEnableTemptableWait', + 'ensure the correct wait condition is set'); + test_checksum_state($node, 'inprogress-on'); + + # Stop the cluster while bsession is still attached. We can't close the + # session first since the brief period between closing and stopping might + # be enough for checksums to get enabled. + $node->stop; + $bsession->quit; + $node->start; + + # Ensure the checksums aren't enabled across the restart. This leaves the + # cluster in the same state as before we entered the SKIP block. 
+ test_checksum_state($node, 'off'); +} + +enable_data_checksums($node, wait => 'on'); + +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '9999', 'ensure checksummed pages can be read back'); + +$result = $node->poll_query_until( + 'postgres', + "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksum%';", + '0'); +is($result, 1, 'await datachecksums worker/launcher termination'); + +disable_data_checksums($node, wait => 1); + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/003_standby_restarts.pl b/src/test/modules/test_checksums/t/003_standby_restarts.pl new file mode 100644 index 00000000000..6b016925651 --- /dev/null +++ b/src/test/modules/test_checksums/t/003_standby_restarts.pl @@ -0,0 +1,114 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with +# streaming replication +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('standby_restarts_primary'); +$node_primary->init(allows_streaming => 1, no_data_checksums => 1); +$node_primary->start; + +my $slotname = 'physical_slot'; +$node_primary->safe_psql('postgres', + "SELECT pg_create_physical_replication_slot('$slotname')"); + +# Take backup +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); + +# Create streaming standby linking to primary +my $node_standby = PostgreSQL::Test::Cluster->new('standby_restarts_standby'); +$node_standby->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby->append_conf( + 'postgresql.conf', qq[ +primary_slot_name = '$slotname' +]); +$node_standby->start; + +# Create some content on the primary to have un-checksummed data in the cluster 
+$node_primary->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Wait for standbys to catch up +$node_primary->wait_for_catchup($node_standby, 'replay', + $node_primary->lsn('insert')); + +# Check that checksums are turned off on all nodes +test_checksum_state($node_primary, 'off'); +test_checksum_state($node_standby, 'off'); + +# --------------------------------------------------------------------------- +# Enable checksums for the cluster, and make sure that both the primary and +# standby change state. +# + +# Ensure that the primary switches to "inprogress-on" +enable_data_checksums($node_primary, wait => 'inprogress-on'); +# Wait for checksum enable to be replayed +$node_primary->wait_for_catchup($node_standby, 'replay'); + +# Ensure that the standby has switched to "inprogress-on" or "on". Normally it +# would be "inprogress-on", but it is theoretically possible for the primary to +# complete the checksum enabling *and* have the standby replay that record +# before we reach the check below. 
+my $result = $node_standby->poll_query_until( + 'postgres', + "SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'f'); +is($result, 1, 'ensure standby has absorbed the inprogress-on barrier'); +$result = $node_standby->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); + +is(($result eq 'inprogress-on' || $result eq 'on'), + 1, 'ensure checksums are on, or in progress, on standby_1'); + +# Insert some more data which should be checksummed on INSERT +$node_primary->safe_psql('postgres', + "INSERT INTO t VALUES (generate_series(1, 10000));"); + +# Wait for checksums enabled on the primary and standby +wait_for_checksum_state($node_primary, 'on'); +wait_for_checksum_state($node_standby, 'on'); + +$result = + $node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1"); +is($result, '19998', 'ensure we can safely read all data with checksums'); + +$result = $node_primary->poll_query_until( + 'postgres', + "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksum%';", + '0'); +is($result, 1, 'await datachecksums worker/launcher termination'); + +# +# Disable checksums and ensure it's propagated to standby and that we can +# still read all data +# + +# Disable checksums and wait for the operation to be replayed +disable_data_checksums($node_primary); +$node_primary->wait_for_catchup($node_standby, 'replay'); +# Ensure that the primary and standby has switched to off +wait_for_checksum_state($node_primary, 'off'); +wait_for_checksum_state($node_standby, 'off'); +# Doublecheck reading data without errors +$result = + $node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1"); +is($result, "19998", 'ensure we can safely read all data without checksums'); + +$node_standby->stop; +$node_primary->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/004_offline.pl b/src/test/modules/test_checksums/t/004_offline.pl new file 
mode 100644 index 00000000000..f1972bddff1 --- /dev/null +++ b/src/test/modules/test_checksums/t/004_offline.pl @@ -0,0 +1,82 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums offline from various states +# of checksum processing +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize node with checksums disabled. +my $node = PostgreSQL::Test::Cluster->new('offline_node'); +$node->init(no_data_checksums => 1); +$node->start; + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are disabled +test_checksum_state($node, 'off'); + +# Enable checksums offline using pg_checksums +$node->stop; +$node->checksum_enable_offline; +$node->start; + +# Ensure that checksums are enabled +test_checksum_state($node, 'on'); + +# Run a dummy query just to make sure we can read back some data +my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '9999', 'ensure checksummed pages can be read back'); + +# Disable checksums offline again using pg_checksums +$node->stop; +$node->checksum_disable_offline; +$node->start; + +# Ensure that checksums are disabled +test_checksum_state($node, 'off'); + +# Create a barrier for checksum enablement to block on, in this case a pre- +# existing temporary table which is kept open while processing is started. We +# can accomplish this by setting up an interactive psql process which keeps the +# temporary table created as we enable checksums in another psql process. 
+ +my $bsession = $node->background_psql('postgres'); +$bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);'); + +# In another session, make sure we can see the blocking temp table but start +# processing anyways and check that we are blocked with a proper wait event. +$result = $node->safe_psql('postgres', + "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';"); +is($result, 't', 'ensure we can see the temporary table'); + +enable_data_checksums($node, wait => 'inprogress-on'); + +# Turn the cluster off and enable checksums offline, then start back up +$bsession->quit; +$node->stop; +$node->checksum_enable_offline; +$node->start; + +# Ensure that checksums are now enabled even though processing wasn't +# restarted +test_checksum_state($node, 'on'); + +# Run a dummy query just to make sure we can read back some data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '9999', 'ensure checksummed pages can be read back'); + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/005_injection.pl b/src/test/modules/test_checksums/t/005_injection.pl new file mode 100644 index 00000000000..897f282a1f2 --- /dev/null +++ b/src/test/modules/test_checksums/t/005_injection.pl @@ -0,0 +1,74 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with +# injection point tests injecting failures into the processing + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +# --------------------------------------------------------------------------- +# Test cluster setup +# + +# Initiate testcluster +my $node = 
PostgreSQL::Test::Cluster->new('injection_node'); +$node->init(no_data_checksums => 1); +$node->start; + +# Set up test environment +$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); + +# --------------------------------------------------------------------------- +# Inducing failures and crashes in processing + +# Force enabling checksums to fail by marking one of the databases as having +# failed in processing. +disable_data_checksums($node, wait => 1); +$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(true);'); +enable_data_checksums($node, wait => 'off'); +$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(false);'); + +# Make sure that disabling after a failure works +disable_data_checksums($node); +test_checksum_state($node, 'off'); + +# --------------------------------------------------------------------------- +# Timing and retry related tests +# + +SKIP: +{ + skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 4 + if (!$ENV{PG_TEST_EXTRA} + || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/); + + # Inject a delay in the barrier for enabling checksums + disable_data_checksums($node, wait => 1); + $node->safe_psql('postgres', 'SELECT dcw_inject_delay_barrier();'); + enable_data_checksums($node, wait => 'on'); + + # Fake the existence of a temporary table at the start of processing, which + # will force the processing to wait and retry in order to wait for it to + # disappear. 
+ disable_data_checksums($node, wait => 1); + $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(true);'); + enable_data_checksums($node, wait => 'on'); +} + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/006_pgbench_single.pl b/src/test/modules/test_checksums/t/006_pgbench_single.pl new file mode 100644 index 00000000000..0ab5b04b931 --- /dev/null +++ b/src/test/modules/test_checksums/t/006_pgbench_single.pl @@ -0,0 +1,275 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with +# concurrent activity via pgbench runs + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# This test suite is expensive, or very expensive, to execute. There are two +# PG_TEST_EXTRA options for running it, "checksum" for a pared-down test suite +# and "checksum_extended" for the full suite. The full suite can run for hours +# on slow or constrained systems. +my $extended = undef; +if ($ENV{PG_TEST_EXTRA}) +{ + $extended = 1 if ($ENV{PG_TEST_EXTRA} =~ /\bchecksum_extended\b/); + plan skip_all => 'Expensive data checksums test disabled' + unless ($ENV{PG_TEST_EXTRA} =~ /\bchecksum(_extended)?\b/); +} +else +{ + plan skip_all => 'Expensive data checksums test disabled'; +} + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my $node; +my $node_loglocation = 0; + +# The number of full test iterations which will be performed. The exact number +# of tests performed and the wall time taken is non-deterministic as the test +# performs a lot of randomized actions, but 10 iterations will be a long test +# run regardless. 
+my $TEST_ITERATIONS = 1; +$TEST_ITERATIONS = 10 if ($extended); + +# Variables which record the current state of the cluster +my $data_checksum_state = 'off'; +my $pgbench = undef; + +# Start a pgbench run in the background against the server specified via the +# port passed as parameter. +sub background_rw_pgbench +{ + my $port = shift; + + # If a previous pgbench is still running, start by shutting it down. + $pgbench->finish if $pgbench; + + my $clients = 1; + my $runtime = 2; + + if ($extended) + { + # Randomize the number of pgbench clients a bit (range 1-16) + $clients = 1 + int(rand(15)); + $runtime = 600; + } + my @cmd = ('pgbench', '-p', $port, '-T', $runtime, '-c', $clients); + + # Randomize whether we spawn connections or not + push(@cmd, '-C') if ($extended && cointoss); + # Finally add the database name to use + push(@cmd, 'postgres'); + + $pgbench = IPC::Run::start( + \@cmd, + '<' => '/dev/null', + '>' => '/dev/null', + '2>' => '/dev/null', + IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default)); +} + +# Invert the state of data checksums in the cluster, if data checksums are on +# then disable them and vice versa. Also performs proper validation of the +# before and after state. +sub flip_data_checksums +{ + # First, make sure the cluster is in the state we expect it to be + test_checksum_state($node, $data_checksum_state); + + if ($data_checksum_state eq 'off') + { + # Coin-toss to see if we are injecting a retry due to a temptable + $node->safe_psql('postgres', 'SELECT dcw_fake_temptable();') + if cointoss(); + + # log LSN right before we start changing checksums + my $result = + $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before enabling: " . $result . 
"\n"); + + # Ensure that the primary switches to "inprogress-on" + enable_data_checksums($node, wait => 'inprogress-on'); + + random_sleep() if ($extended); + + # Wait for checksums enabled on the primary + wait_for_checksum_state($node, 'on'); + + # log LSN right after the primary flips checksums to "on" + $result = $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after enabling: " . $result . "\n"); + + random_sleep() if ($extended); + + $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(false);'); + $data_checksum_state = 'on'; + } + elsif ($data_checksum_state eq 'on') + { + random_sleep() if ($extended); + + # log LSN right before we start changing checksums + my $result = + $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before disabling: " . $result . "\n"); + + disable_data_checksums($node); + + # Wait for checksums disabled on the primary + wait_for_checksum_state($node, 'off'); + + # log LSN right after the primary flips checksums to "off" + $result = $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after disabling: " . $result . "\n"); + + random_sleep() if ($extended); + + $data_checksum_state = 'off'; + } + else + { + # This should only happen due to programmer error when hacking on the + # test code, but since that might pass subtly we error out. + BAIL_OUT('data_checksum_state variable has invalid state:' + . $data_checksum_state); + } +} + +# Create and start a cluster with one node +$node = PostgreSQL::Test::Cluster->new('pgbench_single_main'); +$node->init(allows_streaming => 1, no_data_checksums => 1); +# max_connections need to be bumped in order to accommodate for pgbench clients +# and log_statement is dialled down since it otherwise will generate enormous +# amounts of logging. Page verification failures are still logged. 
+$node->append_conf( + 'postgresql.conf', + qq[ +max_connections = 100 +log_statement = none +]); +$node->start; +$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;"); +# Initialize pgbench +my $scalefactor = ($extended ? 10 : 1); +$node->command_ok( + [ + 'pgbench', '-p', $node->port, '-i', + '-s', $scalefactor, '-q', 'postgres' + ]); +# Start the test suite with pgbench running. +background_rw_pgbench($node->port); + +# Main test suite. This loop will start a pgbench run on the cluster and while +# that's running flip the state of data checksums concurrently. It will then +# randomly restart the cluster and then check for +# the desired state. The idea behind doing things randomly is to stress out +# any timing related issues by subjecting the cluster for varied workloads. +# A TODO is to generate a trace such that any test failure can be traced to +# its order of operations for debugging. +for (my $i = 0; $i < $TEST_ITERATIONS; $i++) +{ + note("iteration ", ($i + 1), " of ", $TEST_ITERATIONS); + + if (!$node->is_alive) + { + # Start, to do recovery, and stop + $node->start; + $node->stop('fast'); + + # Since the log isn't being written to now, parse the log and check + # for instances of checksum verification failures. + my $log = PostgreSQL::Test::Utils::slurp_file($node->logfile, + $node_loglocation); + unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in primary log (during WAL recovery)" + ); + $node_loglocation = -s $node->logfile; + + # Randomize the WAL size, to trigger checkpoints less/more often + my $sb = 64 + int(rand(1024)); + $node->append_conf('postgresql.conf', qq[max_wal_size = $sb]); + note("changing max_wal_size to " . 
$sb); + + $node->start; + + # Start a pgbench in the background against the primary + background_rw_pgbench($node->port); + } + + $node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); + + flip_data_checksums(); + random_sleep() if ($extended); + my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); + is($result, '100000', 'ensure data pages can be read back on primary'); + + random_sleep() if ($extended); + + # Potentially powercycle the node + if (cointoss()) + { + $node->stop(stopmode()); + + PostgreSQL::Test::Utils::system_log("pg_controldata", + $node->data_dir); + + my $log = PostgreSQL::Test::Utils::slurp_file($node->logfile, + $node_loglocation); + unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in primary log (outside WAL recovery)" + ); + $node_loglocation = -s $node->logfile; + } + + random_sleep() if ($extended); +} + +# Make sure the node is running +if (!$node->is_alive) +{ + $node->start; +} + +# Testrun is over, ensure that data reads back as expected and perform a final +# verification of the data checksum state. 
+my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '100000', 'ensure data pages can be read back on primary'); +test_checksum_state($node, $data_checksum_state); + +# Perform one final pass over the logs and hunt for unexpected errors +my $log = + PostgreSQL::Test::Utils::slurp_file($node->logfile, $node_loglocation); +unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in primary log"); +$node_loglocation = -s $node->logfile; + +$node->teardown_node; + +done_testing(); diff --git a/src/test/modules/test_checksums/t/007_pgbench_standby.pl b/src/test/modules/test_checksums/t/007_pgbench_standby.pl new file mode 100644 index 00000000000..b0d40d24005 --- /dev/null +++ b/src/test/modules/test_checksums/t/007_pgbench_standby.pl @@ -0,0 +1,400 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster, +# consisting of a primary and a replicated standby, with concurrent activity +# via pgbench runs + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# This test suite is expensive, or very expensive, to execute. There are two +# PG_TEST_EXTRA options for running it, "checksum" for a pared-down test suite +# and "checksum_extended" for the full suite. The full suite can run for hours +# on slow or constrained systems. 
+my $extended = undef; +if ($ENV{PG_TEST_EXTRA}) +{ + $extended = 1 if ($ENV{PG_TEST_EXTRA} =~ /\bchecksum_extended\b/); + plan skip_all => 'Expensive data checksums test disabled' + unless ($ENV{PG_TEST_EXTRA} =~ /\bchecksum(_extended)?\b/); +} +else +{ + plan skip_all => 'Expensive data checksums test disabled'; +} + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my $node_primary_slot = 'physical_slot'; +my $node_primary_backup = 'primary_backup'; +my $node_primary; +my $node_primary_loglocation = 0; +my $node_standby; +my $node_standby_loglocation = 0; + +# The number of full test iterations which will be performed. The exact number +# of tests performed and the wall time taken is non-deterministic as the test +# performs a lot of randomized actions, but 5 iterations will be a long test +# run regardless. +my $TEST_ITERATIONS = 5; +$TEST_ITERATIONS = 1 if ($extended); + +# Variables which record the current state of the cluster +my $data_checksum_state = 'off'; + +my $pgbench_primary = undef; +my $pgbench_standby = undef; + +# Start a pgbench run in the background against the server specified via the +# port passed as parameter +sub background_pgbench +{ + my ($port, $standby) = @_; + my $pgbench = ($standby ? 
\$pgbench_standby : \$pgbench_primary); + + # Terminate any currently running pgbench process before continuing + $$pgbench->finish if $$pgbench; + + my $clients = 1; + my $runtime = 5; + + if ($extended) + { + # Randomize the number of pgbench clients a bit (range 1-16) + $clients = 1 + int(rand(15)); + $runtime = 600; + } + + my @cmd = ('pgbench', '-p', $port, '-T', $runtime, '-c', $clients); + # Randomize whether we spawn connections or not + push(@cmd, '-C') if ($extended && cointoss()); + # If we run on a standby it needs to be a read-only benchmark + push(@cmd, '-S') if ($standby); + # Finally add the database name to use + push(@cmd, 'postgres'); + + $$pgbench = IPC::Run::start( + \@cmd, + '<' => '/dev/null', + '>' => '/dev/null', + '2>' => '/dev/null', + IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default)); +} + +# Invert the state of data checksums in the cluster, if data checksums are on +# then disable them and vice versa. Also performs proper validation of the +# before and after state. +sub flip_data_checksums +{ + # First, make sure the cluster is in the state we expect it to be + test_checksum_state($node_primary, $data_checksum_state); + test_checksum_state($node_standby, $data_checksum_state); + + if ($data_checksum_state eq 'off') + { + # Coin-toss to see if we are injecting a retry due to a temptable + $node_primary->safe_psql('postgres', 'SELECT dcw_fake_temptable();') + if cointoss(); + + # log LSN right before we start changing checksums + my $result = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before enabling: " . $result . "\n"); + + # Ensure that the primary switches to "inprogress-on" + enable_data_checksums($node_primary, wait => 'inprogress-on'); + + random_sleep() if ($extended); + + # Wait for checksum enable to be replayed + $node_primary->wait_for_catchup($node_standby, 'replay'); + + # Ensure that the standby has switched to "inprogress-on" or "on". 
+ # Normally it would be "inprogress-on", but it is theoretically + # possible for the primary to complete the checksum enabling *and* have + # the standby replay that record before we reach the check below. + $result = $node_standby->poll_query_until( + 'postgres', + "SELECT setting = 'off' " + . "FROM pg_catalog.pg_settings " + . "WHERE name = 'data_checksums';", + 'f'); + is($result, 1, + 'ensure standby has absorbed the inprogress-on barrier'); + $result = $node_standby->safe_psql('postgres', + "SELECT setting " + . "FROM pg_catalog.pg_settings " + . "WHERE name = 'data_checksums';"); + + is(($result eq 'inprogress-on' || $result eq 'on'), + 1, 'ensure checksums are on, or in progress, on standby_1'); + + # Wait for checksums enabled on the primary and standby + wait_for_checksum_state($node_primary, 'on'); + + # log LSN right after the primary flips checksums to "on" + $result = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after enabling: " . $result . "\n"); + + random_sleep() if ($extended); + wait_for_checksum_state($node_standby, 'on'); + + $node_primary->safe_psql('postgres', + 'SELECT dcw_fake_temptable(false);'); + $data_checksum_state = 'on'; + } + elsif ($data_checksum_state eq 'on') + { + random_sleep() if ($extended); + + # log LSN right before we start changing checksums + my $result = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before disabling: " . $result . "\n"); + + disable_data_checksums($node_primary); + $node_primary->wait_for_catchup($node_standby, 'replay'); + + # Wait for checksums disabled on the primary and standby + wait_for_checksum_state($node_primary, 'off'); + wait_for_checksum_state($node_standby, 'off'); + + # log LSN right after the primary flips checksums to "off" + $result = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after disabling: " . $result . 
"\n"); + + random_sleep() if ($extended); + wait_for_checksum_state($node_standby, 'off'); + + $data_checksum_state = 'off'; + } + else + { + # This should only happen due to programmer error when hacking on the + # test code, but since that might pass subtly we error out. + BAIL_OUT('data_checksum_state variable has invalid state:' + . $data_checksum_state); + } +} + +# Create and start a cluster with one primary and one standby node, and ensure +# they are caught up and in sync. +$node_primary = PostgreSQL::Test::Cluster->new('pgbench_standby_main'); +$node_primary->init(allows_streaming => 1, no_data_checksums => 1); +# max_connections need to be bumped in order to accommodate for pgbench clients +# and log_statement is dialled down since it otherwise will generate enormous +# amounts of logging. Page verification failures are still logged. +$node_primary->append_conf( + 'postgresql.conf', + qq[ +max_connections = 30 +log_statement = none +]); +$node_primary->start; +$node_primary->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); +# Create some content to have un-checksummed data in the cluster +$node_primary->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;"); +$node_primary->safe_psql('postgres', + "SELECT pg_create_physical_replication_slot('$node_primary_slot');"); +$node_primary->backup($node_primary_backup); + +$node_standby = PostgreSQL::Test::Cluster->new('pgbench_standby_standby'); +$node_standby->init_from_backup($node_primary, $node_primary_backup, + has_streaming => 1); +$node_standby->append_conf( + 'postgresql.conf', qq[ +primary_slot_name = '$node_primary_slot' +]); +$node_standby->start; + +# Initialize pgbench and wait for the objects to be created on the standby +my $scalefactor = ($extended ? 
10 : 1); +$node_primary->command_ok( + [ + 'pgbench', '-p', $node_primary->port, '-i', '-s', $scalefactor, '-q', + 'postgres' + ]); +$node_primary->wait_for_catchup($node_standby, 'replay'); + +# Start the test suite with pgbench running on all nodes +background_pgbench($node_standby->port, 1); +background_pgbench($node_primary->port, 0); + +# Main test suite. This loop will start a pgbench run on the cluster and while +# that's running flip the state of data checksums concurrently. It will then +# randomly restart the cluster and then check for +# the desired state. The idea behind doing things randomly is to stress out +# any timing related issues by subjecting the cluster for varied workloads. +# A TODO is to generate a trace such that any test failure can be traced to +# its order of operations for debugging. +for (my $i = 0; $i < $TEST_ITERATIONS; $i++) +{ + note("iteration ", ($i + 1), " of ", $TEST_ITERATIONS); + + if (!$node_primary->is_alive) + { + # start, to do recovery, and stop + $node_primary->start; + $node_primary->stop('fast'); + + # Since the log isn't being written to now, parse the log and check + # for instances of checksum verification failures. + my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile, + $node_primary_loglocation); + unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in primary log (during WAL recovery)" + ); + $node_primary_loglocation = -s $node_primary->logfile; + + # randomize the WAL size, to trigger checkpoints less/more often + my $sb = 32 + int(rand(960)); + $node_primary->append_conf('postgresql.conf', qq[max_wal_size = $sb]); + + note("changing primary max_wal_size to " . 
$sb); + + $node_primary->start; + + # Start a pgbench in the background against the primary + background_pgbench($node_primary->port, 0); + } + + if (!$node_standby->is_alive) + { + $node_standby->start; + $node_standby->stop('fast'); + + # Since the log isn't being written to now, parse the log and check + # for instances of checksum verification failures. + my $log = + PostgreSQL::Test::Utils::slurp_file($node_standby->logfile, + $node_standby_loglocation); + unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in standby_1 log (during WAL recovery)" + ); + $node_standby_loglocation = -s $node_standby->logfile; + + # randomize the WAL size, to trigger checkpoints less/more often + my $sb = 32 + int(rand(960)); + $node_standby->append_conf('postgresql.conf', qq[max_wal_size = $sb]); + + note("changing standby max_wal_size to " . $sb); + + $node_standby->start; + + # Start a read-only pgbench in the background on the standby + background_pgbench($node_standby->port, 1); + } + + $node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;"); + $node_primary->wait_for_catchup($node_standby, 'write'); + + flip_data_checksums(); + random_sleep() if ($extended); + my $result = $node_primary->safe_psql('postgres', + "SELECT count(*) FROM t WHERE a > 1"); + is($result, '100000', 'ensure data pages can be read back on primary'); + random_sleep(); + + # Potentially powercycle the cluster (the nodes independently). A TODO is + # to randomly stop the nodes in the opposite order too. 
+ if ($extended && cointoss()) + { + $node_primary->stop(stopmode()); + + # print the contents of the control file on the primary + PostgreSQL::Test::Utils::system_log("pg_controldata", + $node_primary->data_dir); + + # slurp the file after shutdown, so that it doesn't interfere with the recovery + my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile, + $node_primary_loglocation); + unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in primary log (outside WAL recovery)" + ); + $node_primary_loglocation = -s $node_primary->logfile; + } + + random_sleep() if ($extended); + + if ($extended && cointoss()) + { + $node_standby->stop(stopmode()); + + # print the contents of the control file on the standby + PostgreSQL::Test::Utils::system_log("pg_controldata", + $node_standby->data_dir); + + # slurp the file after shutdown, so that it doesn't interfere with the recovery + my $log = + PostgreSQL::Test::Utils::slurp_file($node_standby->logfile, + $node_standby_loglocation); + unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in standby_1 log (outside WAL recovery)" + ); + $node_standby_loglocation = -s $node_standby->logfile; + } +} + +# make sure the nodes are running +if (!$node_primary->is_alive) +{ + $node_primary->start; +} + +if (!$node_standby->is_alive) +{ + $node_standby->start; +} + +# Testrun is over, ensure that data reads back as expected and perform a final +# verification of the data checksum state. 
+my $result = + $node_primary->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '100000', 'ensure data pages can be read back on primary'); +test_checksum_state($node_primary, $data_checksum_state); +test_checksum_state($node_standby, $data_checksum_state); + +# Perform one final pass over the logs and hunt for unexpected errors +my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile, + $node_primary_loglocation); +unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in primary log"); +$node_primary_loglocation = -s $node_primary->logfile; +$log = PostgreSQL::Test::Utils::slurp_file($node_standby->logfile, + $node_standby_loglocation); +unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in standby_1 log"); +$node_standby_loglocation = -s $node_standby->logfile; + +$node_standby->teardown_node; +$node_primary->teardown_node; + +done_testing(); diff --git a/src/test/modules/test_checksums/t/008_pitr.pl b/src/test/modules/test_checksums/t/008_pitr.pl new file mode 100644 index 00000000000..b9b89f414ab --- /dev/null +++ b/src/test/modules/test_checksums/t/008_pitr.pl @@ -0,0 +1,189 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# This test suite is expensive, or very expensive, to execute. There are two +# PG_TEST_EXTRA options for running it, "checksum" for a pared-down test suite +# and "checksum_extended" for the full suite. 
+my $extended = undef; +if ($ENV{PG_TEST_EXTRA}) +{ + $extended = 1 if ($ENV{PG_TEST_EXTRA} =~ /\bchecksum_extended\b/); + plan skip_all => 'Expensive data checksums test disabled' + unless ($ENV{PG_TEST_EXTRA} =~ /\bchecksum(_extended)?\b/); +} +else +{ + plan skip_all => 'Expensive data checksums test disabled'; +} + + +my $pgbench = undef; +my $data_checksum_state = 'off'; + +my $node_primary; + +# Invert the state of data checksums in the cluster, if data checksums are on +# then disable them and vice versa. Also performs proper validation of the +# before and after state. +sub flip_data_checksums +{ + my $lsn_pre = undef; + my $lsn_post = undef; + + # First, make sure the cluster is in the state we expect it to be + test_checksum_state($node_primary, $data_checksum_state); + + if ($data_checksum_state eq 'off') + { + # log LSN right before we start changing checksums + $lsn_pre = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before enabling: " . $lsn_pre . "\n"); + + # Wait for checksums enabled on the primary + enable_data_checksums($node_primary, wait => 'on'); + + # log LSN right after the primary flips checksums to "on" + $lsn_post = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after enabling: " . $lsn_post . 
"\n"); + + $data_checksum_state = 'on'; + } + elsif ($data_checksum_state eq 'on') + { + # log LSN right before we start changing checksums + $lsn_pre = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + + disable_data_checksums($node_primary); + + # Wait for checksums disabled on the primary + wait_for_checksum_state($node_primary, 'off'); + + # log LSN right after the primary flips checksums to "off" + $lsn_post = + $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + + $data_checksum_state = 'off'; + } + else + { + # This should only happen due to programmer error when hacking on the + # test code, but since that might pass subtly we error out. + BAIL_OUT('data_checksum_state variable has invalid state:' + . $data_checksum_state); + } + + return ($lsn_pre, $lsn_post); +} +# Start a pgbench run in the background against the server specified via the +# port passed as parameter. +sub background_rw_pgbench +{ + my $port = shift; + + # If a previous pgbench is still running, start by shutting it down. + $pgbench->finish if $pgbench; + + # Randomize the number of pgbench clients in extended mode, else 1 client + my $clients = ($extended ? 1 + int(rand(15)) : 1); + my $runtime = ($extended ? 600 : 5); + + my @cmd = ('pgbench', '-p', $port, '-T', $runtime, '-c', $clients); + + # Randomize whether we spawn connections or not + push(@cmd, '-C') if ($extended && cointoss()); + # Finally add the database name to use + push(@cmd, 'postgres'); + + $pgbench = IPC::Run::start( + \@cmd, + '<' => '/dev/null', + '>' => '/dev/null', + '2>' => '/dev/null', + IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default)); +} + +# Start a primary node with WAL archiving enabled and with enough connections +# available to handle pgbench clients. 
+$node_primary = PostgreSQL::Test::Cluster->new('pitr_main'); +$node_primary->init( + has_archiving => 1, + allows_streaming => 1, + no_data_checksums => 1); +$node_primary->append_conf( + 'postgresql.conf', + qq[ +max_connections = 100 +log_statement = none +]); +$node_primary->start; + +# Prime the cluster with a bit of known data which we can read back to check +# for data consistency as well as page verification faults in the logfile. +$node_primary->safe_psql('postgres', + 'CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;'); +# Initialize and start pgbench in read/write mode against the cluster +my $scalefactor = ($extended ? 10 : 1); +$node_primary->command_ok( + [ + 'pgbench', '-p', $node_primary->port, '-i', '-s', $scalefactor, '-q', + 'postgres' + ]); +background_rw_pgbench($node_primary->port); + +# Take a backup to use for PITR +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); + +my ($pre_lsn, $post_lsn) = flip_data_checksums(); + +$node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +$node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('a');"); +$node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +$node_primary->stop('immediate'); + +my $node_pitr = PostgreSQL::Test::Cluster->new('pitr_backup'); +$node_pitr->init_from_backup( + $node_primary, $backup_name, + standby => 0, + has_restoring => 1); +$node_pitr->append_conf( + 'postgresql.conf', qq{ +recovery_target_lsn = '$post_lsn' +recovery_target_action = 'promote' +recovery_target_inclusive = on +}); + +$node_pitr->start; + +$node_pitr->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';") + or die "Timed out while waiting for PITR promotion"; + +test_checksum_state($node_pitr, $data_checksum_state); +my $result = + $node_pitr->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '99999', 'ensure data pages can be read back on primary'); + +$node_pitr->stop; + +my $log = 
PostgreSQL::Test::Utils::slurp_file($node_pitr->logfile, 0); +unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in pitr log"); + +done_testing(); diff --git a/src/test/modules/test_checksums/t/009_fpi.pl b/src/test/modules/test_checksums/t/009_fpi.pl new file mode 100644 index 00000000000..a1cea91f787 --- /dev/null +++ b/src/test/modules/test_checksums/t/009_fpi.pl @@ -0,0 +1,64 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Create and start a cluster with one node +my $node = PostgreSQL::Test::Cluster->new('fpi_node'); +$node->init(allows_streaming => 1, no_data_checksums => 1); +# max_connections need to be bumped in order to accommodate for pgbench clients +# and log_statement is dialled down since it otherwise will generate enormous +# amounts of logging. Page verification failures are still logged. 
+$node->append_conf( + 'postgresql.conf', + qq[ +max_connections = 100 +log_statement = none +]); +$node->start; +$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1, 1000000) AS a;"); + +# Enable data checksums and wait for the state transition to 'on' +enable_data_checksums($node, wait => 'on'); + +$node->safe_psql('postgres', 'UPDATE t SET a = a + 1;'); + +disable_data_checksums($node, wait => 1); + +$node->append_conf('postgresql.conf', 'full_page_writes = off'); +$node->restart; +test_checksum_state($node, 'off'); + +$node->safe_psql('postgres', 'UPDATE t SET a = a + 1;'); +$node->safe_psql('postgres', 'DELETE FROM t WHERE a < 10000;'); + +$node->adjust_conf('postgresql.conf', 'full_page_writes', 'on'); +$node->restart; +test_checksum_state($node, 'off'); + +enable_data_checksums($node, wait => 'on'); + +my $result = $node->safe_psql('postgres', 'SELECT count(*) FROM t;'); +is($result, '990003', 'Reading back all data from table t'); + +$node->stop; +my $log = PostgreSQL::Test::Utils::slurp_file($node->logfile, 0); +unlike( + $log, + qr/page verification failed,.+\d$/, + "no checksum validation errors in server log"); + +done_testing(); diff --git a/src/test/modules/test_checksums/t/DataChecksums/Utils.pm b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm new file mode 100644 index 00000000000..9a2269e8a92 --- /dev/null +++ b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm @@ -0,0 +1,262 @@ + +# Copyright (c) 2026, PostgreSQL Global Development Group + +=pod + +=head1 NAME + +DataChecksums::Utils - Utility functions for testing data checksums in a running cluster + +=head1 SYNOPSIS + + use PostgreSQL::Test::Cluster; + use DataChecksums::Utils qw( .. 
); + + # Create, and start, a new cluster + my $node = PostgreSQL::Test::Cluster->new('primary'); + $node->init; + $node->start; + + test_checksum_state($node, 'off'); + + enable_data_checksums($node); + + wait_for_checksum_state($node, 'on'); + + +=cut + +package DataChecksums::Utils; + +use strict; +use warnings FATAL => 'all'; +use Exporter 'import'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +our @EXPORT = qw( + cointoss + disable_data_checksums + enable_data_checksums + random_sleep + stopmode + test_checksum_state + wait_for_checksum_state + wait_for_cluster_crash +); + +=pod + +=head1 METHODS + +=over + +=item test_checksum_state(node, state) + +Test that the current value of the data checksum GUC in the server running +at B matches B. If the values differ, a test failure is logged. +Returns True if the values match, otherwise False. + +=cut + +sub test_checksum_state +{ + my ($postgresnode, $state) = @_; + + my $result = $postgresnode->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" + ); + is($result, $state, 'ensure checksums are set to ' . $state); + return $result eq $state; +} + +=item wait_for_checksum_state(node, state) + +Test the value of the data checksum GUC in the server running at B +repeatedly until it matches B or times out. Processing will run for +$PostgreSQL::Test::Utils::timeout_default seconds before timing out. If the +values differ when the process times out, False is returned and a test failure +is logged, otherwise True. + +=cut + +sub wait_for_checksum_state +{ + my ($postgresnode, $state) = @_; + + my $res = $postgresnode->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + $state); + is($res, 1, 'ensure data checksums are transitioned to ' . 
$state); + return $res == 1; +} + +=item wait_for_cluster_crash(node, params) + +Repeatedly test if the cluster running at B responds to connections +and return when it no longer does so, or when it times out. Processing will +run for $PostgreSQL::Test::Utils::timeout_default seconds unless a timeout +value is specified as a parameter. Returns True if the cluster crashed, else +False if the process timed out. + +=over + +=item timeout + +Approximate number of seconds to wait for cluster to crash, default is +$PostgreSQL::Test::Utils::timeout_default. There are no real-time guarantees +that the total process time won't exceed the timeout. + +=back + +=cut + +sub wait_for_cluster_crash +{ + my $postgresnode = shift; + my %params = @_; + my $crash = 0; + + $params{timeout} = $PostgreSQL::Test::Utils::timeout_default + unless (defined($params{timeout})); + + for (my $naps = 0; $naps < $params{timeout}; $naps++) + { + if (!$postgresnode->is_alive) + { + $crash = 1; + last; + } + sleep(1); + } + + return $crash == 1; +} + +=item enable_data_checksums($node, %params) + +Function for enabling data checksums in the cluster running at B. + +=over + +=item cost_delay + +The B to use when enabling data checksums, default is 0. + +=item cost_limit + +The B to use when enabling data checksums, default is 100. + +=item wait + +If defined, the function will wait for the state defined in this parameter, +or the wait timing out, before returning. The function will wait for +$PostgreSQL::Test::Utils::timeout_default seconds before timing out. 
+ +=back + +=cut + +sub enable_data_checksums +{ + my $postgresnode = shift; + my %params = @_; + + # Set sane defaults for the parameters + $params{cost_delay} = 0 unless (defined($params{cost_delay})); + $params{cost_limit} = 100 unless (defined($params{cost_limit})); + + my $query = <<'EOQ'; +SELECT pg_enable_data_checksums(%s, %s); +EOQ + + $postgresnode->safe_psql('postgres', + sprintf($query, $params{cost_delay}, $params{cost_limit})); + + wait_for_checksum_state($postgresnode, $params{wait}) + if (defined($params{wait})); +} + +=item disable_data_checksums($node, %params) + +Function for disabling data checksums in the cluster running at B. + +=over + +=item wait + +If defined, the function will wait for the state to turn to B, or +the wait timing out, before returning. The function will wait for +$PostgreSQL::Test::Utils::timeout_default seconds before timing out. +Unlike in C the value of the parameter is discarded. + +=back + +=cut + +sub disable_data_checksums +{ + my $postgresnode = shift; + my %params = @_; + + $postgresnode->safe_psql('postgres', + 'SELECT pg_disable_data_checksums();'); + + wait_for_checksum_state($postgresnode, 'off') if (defined($params{wait})); +} + +=item cointoss + +Helper for retrieving a binary value with random distribution for deciding +whether to turn things off during testing. + +=back + +=cut + +sub cointoss +{ + return int(rand() < 0.5); +} + +=item random_sleep(max) + +Helper for injecting random sleeps here and there in the testrun. The sleep +duration will be in the range (0,B), but won't be predictable in order to +avoid sleep patterns that manage to avoid race conditions and timing bugs. +The default B is 3 seconds. + +=back + +=cut + +sub random_sleep +{ + my $max = shift; + return if (defined($max) && ($max == 0)); + sleep(int(rand(defined($max) ? $max : 3))) if cointoss; +} + +=item stopmode + +Small helper function for randomly selecting a valid stopmode. 
+ +=back + +=cut + +sub stopmode +{ + return 'immediate' if (cointoss); + return 'fast'; +} + +=pod + +=back + +=cut + +1; diff --git a/src/test/modules/test_checksums/test_checksums--1.0.sql b/src/test/modules/test_checksums/test_checksums--1.0.sql new file mode 100644 index 00000000000..90642d247fa --- /dev/null +++ b/src/test/modules/test_checksums/test_checksums--1.0.sql @@ -0,0 +1,24 @@ +/* src/test/modules/test_checksums/test_checksums--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_checksums" to load this file. \quit + +CREATE FUNCTION dcw_inject_delay_barrier(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_inject_launcher_delay(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_inject_startup_delay(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_inject_fail_database(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_fake_temptable(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_checksums/test_checksums.c b/src/test/modules/test_checksums/test_checksums.c new file mode 100644 index 00000000000..b087a3b4664 --- /dev/null +++ b/src/test/modules/test_checksums/test_checksums.c @@ -0,0 +1,184 @@ +/*-------------------------------------------------------------------------- + * + * test_checksums.c + * Test data checksums + * + * Copyright (c) 2026, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_checksums/test_checksums.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "funcapi.h" +#include "miscadmin.h" +#include "postmaster/datachecksum_state.h" 
+#include "storage/latch.h" +#include "utils/injection_point.h" +#include "utils/wait_event.h" + +PG_MODULE_MAGIC; + +extern PGDLLEXPORT void dc_delay_barrier(const char *name, const void *private_data, void *arg); +extern PGDLLEXPORT void dc_modify_db_result(const char *name, const void *private_data, void *arg); +extern PGDLLEXPORT void dc_fake_temptable(const char *name, const void *private_data, void *arg); + +extern PGDLLEXPORT void crash(const char *name, const void *private_data, void *arg); + +/* + * Test for delaying emission of procsignalbarriers. + */ +void +dc_delay_barrier(const char *name, const void *private_data, void *arg) +{ + (void) name; + (void) private_data; + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + (3 * 1000), + WAIT_EVENT_PG_SLEEP); +} + +PG_FUNCTION_INFO_V1(dcw_inject_delay_barrier); +Datum +dcw_inject_delay_barrier(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksums-enable-checksums-delay", + "test_checksums", + "dc_delay_barrier", + NULL, + 0); + else + InjectionPointDetach("datachecksums-enable-checksums-delay"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(dcw_inject_launcher_delay); +Datum +dcw_inject_launcher_delay(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksumsworker-launcher-delay", + "test_checksums", + "dc_delay_barrier", + NULL, + 0); + else + InjectionPointDetach("datachecksumsworker-launcher-delay"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(dcw_inject_startup_delay); +Datum +dcw_inject_startup_delay(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + 
InjectionPointAttach("datachecksumsworker-startup-delay", + "test_checksums", + "dc_delay_barrier", + NULL, + 0); + else + InjectionPointDetach("datachecksumsworker-startup-delay"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +static uint32 db_fail = DATACHECKSUMSWORKER_FAILED; + +void +dc_modify_db_result(const char *name, const void *private_data, void *arg) +{ + DataChecksumsWorkerResult *res = (DataChecksumsWorkerResult *) arg; + uint32 new_res = *(uint32 *) private_data; + + *res = new_res; +} + +PG_FUNCTION_INFO_V1(dcw_inject_fail_database); +Datum +dcw_inject_fail_database(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksumsworker-modify-db-result", + "test_checksums", + "dc_modify_db_result", + &db_fail, + sizeof(uint32)); + else + InjectionPointDetach("datachecksumsworker-modify-db-result"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +/* + * Test to force waiting for existing temptables. 
+ */ +void +dc_fake_temptable(const char *name, const void *private_data, void *arg) +{ + static bool first_pass = true; + int *numleft = (int *) arg; + + if (first_pass) + *numleft = 1; + first_pass = false; +} + +PG_FUNCTION_INFO_V1(dcw_fake_temptable); +Datum +dcw_fake_temptable(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksumsworker-fake-temptable-wait", + "test_checksums", + "dc_fake_temptable", + NULL, + 0); + else + InjectionPointDetach("datachecksumsworker-fake-temptable-wait"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +void +crash(const char *name, const void *private_data, void *arg) +{ + abort(); +} diff --git a/src/test/modules/test_checksums/test_checksums.control b/src/test/modules/test_checksums/test_checksums.control new file mode 100644 index 00000000000..84b4cc035a7 --- /dev/null +++ b/src/test/modules/test_checksums/test_checksums.control @@ -0,0 +1,4 @@ +comment = 'Test code for data checksums' +default_version = '1.0' +module_pathname = '$libdir/test_checksums' +relocatable = true diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index f8dc732e66e..54e6b646e8f 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -3898,6 +3898,42 @@ sub advance_wal } } +=item $node->checksum_enable_offline() + +Enable data page checksums in an offline cluster with B. The +caller is responsible for ensuring that the cluster is in the right state for +this operation. + +=cut + +sub checksum_enable_offline +{ + my ($self) = @_; + + print "# Enabling checksums in \"$self->data_dir\"\n"; + PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D', + $self->data_dir, '-e'); + return; +} + +=item $node->checksum_disable_offline() + +Disable data page checksums in an offline cluster with B. 
The +caller is responsible for ensuring that the cluster is in the right state for +this operation. + +=cut + +sub checksum_disable_offline +{ + my ($self) = @_; + + print "# Disabling checksums in \"$self->data_dir\"\n"; + PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D', + $self->data_dir, '-d'); + return; +} + =pod =back diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 2b3cf6d8569..81a73c426d2 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2085,6 +2085,41 @@ pg_stat_progress_create_index| SELECT s.pid, s.param15 AS partitions_done FROM (pg_stat_get_progress_info('CREATE INDEX'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) LEFT JOIN pg_database d ON ((s.datid = d.oid))); +pg_stat_progress_data_checksums| SELECT s.pid, + s.datid, + d.datname, + CASE s.param1 + WHEN 0 THEN 'enabling'::text + WHEN 1 THEN 'disabling'::text + WHEN 2 THEN 'waiting on temporary tables'::text + WHEN 3 THEN 'waiting on barrier'::text + WHEN 4 THEN 'done'::text + ELSE NULL::text + END AS phase, + CASE s.param2 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param2 + END AS databases_total, + s.param3 AS databases_done, + CASE s.param4 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param4 + END AS relations_total, + CASE s.param5 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param5 + END AS relations_done, + CASE s.param6 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param6 + END AS blocks_total, + CASE s.param7 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param7 + END AS blocks_done + FROM (pg_stat_get_progress_info('DATACHECKSUMS'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) + 
LEFT JOIN pg_database d ON ((s.datid = d.oid))) + ORDER BY s.datid; pg_stat_progress_repack| SELECT s.pid, s.datid, d.datname, diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index ea7f7846895..35632f83052 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -51,6 +51,22 @@ client backend|relation|vacuum client backend|temp relation|normal client backend|wal|init client backend|wal|normal +datachecksum launcher|relation|bulkread +datachecksum launcher|relation|bulkwrite +datachecksum launcher|relation|init +datachecksum launcher|relation|normal +datachecksum launcher|relation|vacuum +datachecksum launcher|temp relation|normal +datachecksum launcher|wal|init +datachecksum launcher|wal|normal +datachecksum worker|relation|bulkread +datachecksum worker|relation|bulkwrite +datachecksum worker|relation|init +datachecksum worker|relation|normal +datachecksum worker|relation|vacuum +datachecksum worker|temp relation|normal +datachecksum worker|wal|init +datachecksum worker|wal|normal io worker|relation|bulkread io worker|relation|bulkwrite io worker|relation|init @@ -95,7 +111,7 @@ walsummarizer|wal|init walsummarizer|wal|normal walwriter|wal|init walwriter|wal|normal -(79 rows) +(95 rows) \a -- ensure that both seqscan and indexscan plans are allowed SET enable_seqscan TO on; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 91b1225da82..ad999aa48dd 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -438,6 +438,8 @@ CheckPointStmt CheckpointStatsData CheckpointerRequest CheckpointerShmemStruct +ChecksumBarrierCondition +ChecksumStateType Chromosome CkptSortItem CkptTsStatus @@ -610,6 +612,7 @@ CustomScan CustomScanMethods CustomScanState CycleCtr +DataChecksumsWorkerOperation DBState DbOidName DCHCacheEntry @@ -628,6 +631,9 @@ DSMREntryType DSMRegistryCtxStruct DSMRegistryEntry DWORD +DataChecksumsWorkerDatabase 
+DataChecksumsWorkerResult +DataChecksumsStateStruct DataDirSyncMethod DataDumperPtr DataPageDeleteStack @@ -4405,6 +4411,7 @@ xl_btree_unlink_page xl_btree_update xl_btree_vacuum xl_checkpoint_redo +xl_checksum_state xl_clog_truncate xl_commit_ts_truncate xl_dbase_create_file_copy_rec