diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 422ba304982..d3fea738ca3 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -7187,6 +7187,7 @@ local0.* /var/log/postgresql
bgworkerbgwritercheckpointer
+ checksumsioworkerpostmasterslotsyncworker
diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml
index 210b1118bdf..24ecb46542e 100644
--- a/doc/src/sgml/func/func-admin.sgml
+++ b/doc/src/sgml/func/func-admin.sgml
@@ -3123,4 +3123,82 @@ SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8');
+
+ Data Checksum Functions
+
+
+ The functions shown in can
+ be used to enable or disable data checksums in a running cluster.
+
+
+ Changing data checksums can be done in a cluster with concurrent activity
+ without blocking queries, but overall system performance will be affected.
+ See for further details on how changing the
+ data checksums state can affect a system and possible mitigations for how
+ to reduce the impact.
+
+
+
+ Data Checksum Functions
+
+
+
+
+ Function
+
+
+ Description
+
+
+
+
+
+
+
+
+ pg_enable_data_checksums
+
+ pg_enable_data_checksums ( cost_delayint, cost_limitint )
+ void
+
+
+ Initiates the process of enabling data checksums for the cluster. This
+ will set the data checksums state to inprogress-on
+ as well as start a background worker that will process all pages in all
+ databases and enable data checksums on them. When all pages have
+ been processed, the cluster will automatically set data checksums state
+ to on. This operation is WAL logged and replicated
+ to all standby nodes.
+
+
+ If cost_delay and cost_limit are
+ specified, the process is throttled using the same principles as
+ Cost-based Vacuum Delay.
+
+
+
+
+
+
+
+ pg_disable_data_checksums
+
+ pg_disable_data_checksums ()
+ void
+
+
+ Disables data checksum calculation and validation for the cluster. This
+ will set the data checksum state to inprogress-off
+ while data checksums are being disabled. When all active backends have
+ stopped validating data checksums, the data checksum state will be
+ set to off.
+
+
+
+
+
+
+
+
+
diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml
index 113d7640626..b881ae71198 100644
--- a/doc/src/sgml/glossary.sgml
+++ b/doc/src/sgml/glossary.sgml
@@ -199,6 +199,8 @@
(but not the autovacuum workers),
the background writer,
the checkpointer,
+ the data checksums worker,
+ the data checksums worker launcher,
the logger,
the startup process,
the WAL archiver,
@@ -574,6 +576,28 @@
+
+ Data Checksums Worker
+
+
+ A background worker
+ which enables data checksums in a specific database.
+
+
+
+
+
+ Data Checksums Worker Launcher
+
+
+ A background worker
+ which starts data
+ checksum worker processes for enabling data checksums in each
+ database, or disables data checksums cluster-wide.
+
+
+
+
Database
diff --git a/doc/src/sgml/images/Makefile b/doc/src/sgml/images/Makefile
index 38f8869d78d..7b8ac0fbb32 100644
--- a/doc/src/sgml/images/Makefile
+++ b/doc/src/sgml/images/Makefile
@@ -3,6 +3,7 @@
# see README in this directory about image handling
ALL_IMAGES = \
+ datachecksums.svg \
genetic-algorithm.svg \
gin.svg \
pagelayout.svg \
diff --git a/doc/src/sgml/images/datachecksums.gv b/doc/src/sgml/images/datachecksums.gv
new file mode 100644
index 00000000000..dff3ff7340a
--- /dev/null
+++ b/doc/src/sgml/images/datachecksums.gv
@@ -0,0 +1,14 @@
+digraph G {
+ A -> B [label="SELECT pg_enable_data_checksums()"];
+ B -> C;
+ D -> A;
+ C -> D [label="SELECT pg_disable_data_checksums()"];
+ E -> A [label=" --no-data-checksums"];
+ E -> C [label=" --data-checksums"];
+
+ A [label="off"];
+ B [label="inprogress-on"];
+ C [label="on"];
+ D [label="inprogress-off"];
+ E [label="initdb"];
+}
diff --git a/doc/src/sgml/images/datachecksums.svg b/doc/src/sgml/images/datachecksums.svg
new file mode 100644
index 00000000000..8c58f42922e
--- /dev/null
+++ b/doc/src/sgml/images/datachecksums.svg
@@ -0,0 +1,81 @@
+
+
+
+
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index bb75ed1069b..312374da5e0 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -3885,9 +3885,14 @@ description | Waiting for a newly initialized WAL file to reach durable storage
Number of data page checksum failures detected in this
- database (or on a shared object), or NULL if data checksums are
- disabled.
-
+ database (or on a shared object). Detected failures are not reset if
+ the setting changes. Clusters
+ which are initialized without data checksums will show this as
+ 0. In PostgreSQL version
+ 18 and earlier, this was set to NULL for clusters
+ with data checksums disabled.
+
+
@@ -3896,8 +3901,8 @@ description | Waiting for a newly initialized WAL file to reach durable storage
Time at which the last data page checksum failure was detected in
- this database (or on a shared object), or NULL if data checksums are
- disabled.
+ this database (or on a shared object). Last failure is reported
+ regardless of the setting.
@@ -7634,6 +7639,219 @@ FROM pg_stat_get_backend_idset() AS backendid;
+
+ Data Checksum Progress Reporting
+
+
+ pg_stat_progress_data_checksums
+
+
+
+ When data checksums are being enabled on a running cluster, the
+ pg_stat_progress_data_checksums view will contain
+ a row for the launcher process, and one row for each worker process which
+ is currently calculating and writing checksums for the data pages in a database.
+ The launcher provides an overview of the overall progress (how many databases
+ have been processed, how many remain), while the workers track progress for
+ currently processed databases.
+
+
+
+ pg_stat_progress_data_checksums View
+
+
+
+
+
+ Column Type
+
+
+ Description
+
+
+
+
+
+
+
+
+
+ pidinteger
+
+
+ Process ID of the data checksum process, launcher or worker.
+
+
+
+
+
+
+
+ datidoid
+
+
+ OID of this database, or 0 for the launcher process.
+
+
+
+
+
+
+
+ datnamename
+
+
+ Name of this database, or NULL for the
+ launcher process.
+
+
+
+
+
+
+
+ phasetext
+
+
+ Current processing phase, see
+ for description of the phases.
+
+
+
+
+
+
+
+ databases_totalinteger
+
+
+ The total number of databases which will be processed. Only the
+ launcher process has this value set, the worker processes have this
+ set to NULL.
+
+
+
+
+
+
+
+ databases_doneinteger
+
+
+ The number of databases which have been processed. Only the launcher
+ process has this value set; the worker processes have this set to
+ NULL.
+
+
+
+
+
+
+
+ relations_totalinteger
+
+
+ The total number of relations which will be processed, or
+ NULL if the worker process hasn't
+ calculated the number of relations yet. The launcher process has
+ this set to NULL since it isn't responsible for
+ processing relations, only launching worker processes.
+
+
+
+
+
+
+
+ relations_doneinteger
+
+
+ The number of relations which have been processed. The launcher
+ process has this set to NULL.
+
+
+
+
+
+
+
+ blocks_totalinteger
+
+
+ The number of blocks in the current relation which will be processed,
+ or NULL if the worker process hasn't
+ calculated the number of blocks yet. The launcher process has
+ this set to NULL.
+
+
+
+
+
+
+
+ blocks_doneinteger
+
+
+ The number of blocks in the current relation which have been processed.
+ The launcher process has this set to NULL.
+
+
+
+
+
+
+
+
+
+ Data Checksum Phases
+
+
+
+
+
+ Phase
+ Description
+
+
+
+
+ enabling
+
+ The command is currently enabling data checksums on the cluster.
+
+
+
+ disabling
+
+ The command is currently disabling data checksums on the cluster.
+
+
+
+ done
+
+ The command is done and the data checksum state in the cluster has
+ changed.
+
+
+
+ waiting on barrier
+
+ The command is currently waiting for the current active backends to
+ acknowledge the change in data checksum state.
+
+
+
+ waiting on temporary tables
+
+ The command is currently waiting for all temporary tables which existed
+ at the time the command was started to be removed.
+
+
+
+
+
+
+
diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml
index b64393c813f..45890324075 100644
--- a/doc/src/sgml/ref/pg_checksums.sgml
+++ b/doc/src/sgml/ref/pg_checksums.sgml
@@ -45,6 +45,12 @@ PostgreSQL documentation
exit status is nonzero if the operation failed.
+
+ When enabling checksums, if checksums were in the process of being enabled
+ when the cluster was shut down, pg_checksums
+ will still process all relations regardless of any progress made online.
+
+
When verifying checksums, every file in the cluster is scanned. When
enabling checksums, each relation file block with a changed checksum is
diff --git a/doc/src/sgml/regress.sgml b/doc/src/sgml/regress.sgml
index 873387ec168..c74941bfbf2 100644
--- a/doc/src/sgml/regress.sgml
+++ b/doc/src/sgml/regress.sgml
@@ -275,6 +275,20 @@ make check-world PG_TEST_EXTRA='kerberos ldap ssl load_balance libpq_encryption'
The following values are currently supported:
+
+ checksum, checksum_extended
+
+
+ Runs additional tests for enabling data checksums which inject faults
+ to cause retries in the processing, as well as tests that run pgbench
+ concurrently and randomly restart the cluster. Some of these test
+ suites require injection points enabled in the installation.
+ checksum_extended is an extended version with
+ longer runtime, injected random delays and larger datasets.
+
+
+
+
kerberos
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index f3b86b26be9..165af8a0cf2 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -246,9 +246,10 @@
Checksums can be disabled when the cluster is initialized using initdb.
- They can also be enabled or disabled at a later time as an offline
- operation. Data checksums are enabled or disabled at the full cluster
- level, and cannot be specified individually for databases or tables.
+ They can also be enabled or disabled at a later time either as an offline
+ operation or online in a running cluster allowing concurrent access. Data
+ checksums are enabled or disabled at the full cluster level, and cannot be
+ specified individually for databases, tables or replicated cluster members.
@@ -265,7 +266,7 @@
- Off-line Enabling of Checksums
+ Offline Enabling of Checksums
The pg_checksums
@@ -274,6 +275,123 @@
+
+
+ Online Enabling of Checksums
+
+
+ Checksums can be enabled or disabled online, by calling the appropriate
+ functions.
+
+
+
+ Both enabling and disabling data checksums happens in two phases, separated
+ by a checkpoint to ensure durability. The different states, and their
+ transitions, are illustrated in
+ and discussed in further detail in this section.
+
+
+
+
+ data checksums states
+
+
+
+
+
+
+
+
+
+ Enabling checksums will set the cluster checksum state to
+ inprogress-on. During this time, checksums will be
+ written but not verified. In addition to this, a background worker process
+ is started that enables checksums on all existing data in the cluster. Once
+ this worker has completed processing all databases in the cluster, the
+ checksum state will automatically switch to on. The
+ processing will consume two background worker processes; make sure that
+ max_worker_processes allows for at least two more
+ additional processes.
+
+
+
+ The process will initially wait for all open transactions to finish before
+ it starts, so that it can be certain that there are no tables that have been
+ created inside a transaction that has not committed yet and thus would not
+ be visible to the process enabling checksums. It will also, for each database,
+ wait for all pre-existing temporary tables to get removed before it finishes.
+ If long-lived temporary tables are used in an application it may be necessary
+ to terminate these application connections to allow the process to complete.
+
+
+
+ If the cluster is stopped while in inprogress-on state,
+ for any reason, or if processing was interrupted, then the checksum enabling
+ process must be restarted manually. To do this, re-execute the function
+ pg_enable_data_checksums() once the cluster has been
+ restarted. The process will start over, there is no support for resuming
+ work from where it was interrupted. If the cluster is stopped while in
+ inprogress-off, then the checksum state will be set to
+ off when the cluster is restarted.
+
+
+
+ Disabling data checksums will set the data checksum state to
+ inprogress-off. During this time, checksums will be
+ written but not verified. After all processes acknowledge the change,
+ the state will automatically be set to off.
+
+
+
+ Disabling data checksums while data checksums are actively being enabled
+ will abort the current processing.
+
+
+
+ Impact on system of online operations
+
+ Enabling data checksums can cause significant I/O to the system, as all of the
+ database pages will need to be rewritten, and will be written both to the
+ data files and the WAL. The impact may be limited by throttling using the
+ cost_delay and cost_limit
+ parameters of the pg_enable_data_checksums() function.
+
+
+
+
+
+ I/O: all pages need to have data checksums calculated and written which
+ will generate a lot of dirty pages that will need to be flushed to disk,
+ as well as WAL logged.
+
+
+ Replication: When the standby receives the data checksum state change
+ in the WAL stream it will issue a
+ restartpoint in order to flush the current state into the
+ pg_control file. The restartpoint will flush the
+ current state to disk and will block redo until finished. This in turn
+ will induce replication lag, which on synchronous standbys also blocks
+ the primary. Reducing before the
+ process is started can help with reducing the time it takes for the
+ restartpoint to finish.
+
+
+ Shutdown/Restart: If the server is shut down or restarted when data
+ checksums are being enabled, the process will not resume and all pages
+ need to be recalculated and rewritten. Enabling data checksums should
+ be done when there is no need for regular maintenance or during a
+ service window.
+
+
+
+
+
+ No I/O is incurred when disabling data checksums, but checkpoints are
+ still required.
+
+
+
+
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 44194d3ea17..2468a7d2578 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -18,6 +18,7 @@
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "catalog/pg_control.h"
+#include "storage/checksum.h"
#include "utils/guc.h"
#include "utils/timestamp.h"
@@ -54,6 +55,40 @@ get_wal_level_string(int wal_level)
return wal_level_str;
}
+const char *
+get_checksum_state_string(uint32 state)
+{
+ switch (state)
+ {
+ case PG_DATA_CHECKSUM_VERSION:
+ return "on";
+ case PG_DATA_CHECKSUM_INPROGRESS_OFF:
+ return "inprogress-off";
+ case PG_DATA_CHECKSUM_INPROGRESS_ON:
+ return "inprogress-on";
+ case PG_DATA_CHECKSUM_OFF:
+ return "off";
+ }
+
+ Assert(false);
+ return "?";
+}
+
+void
+xlog2_desc(StringInfo buf, XLogReaderState *record)
+{
+ char *rec = XLogRecGetData(record);
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+ if (info == XLOG2_CHECKSUMS)
+ {
+ xl_checksum_state xlrec;
+
+ memcpy(&xlrec, rec, sizeof(xl_checksum_state));
+ appendStringInfoString(buf, get_checksum_state_string(xlrec.new_checksum_state));
+ }
+}
+
void
xlog_desc(StringInfo buf, XLogReaderState *record)
{
@@ -69,7 +104,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
"tli %u; prev tli %u; fpw %s; wal_level %s; logical decoding %s; xid %u:%u; oid %u; multi %u; offset %" PRIu64 "; "
"oldest xid %u in DB %u; oldest multi %u in DB %u; "
"oldest/newest commit timestamp xid: %u/%u; "
- "oldest running xid %u; %s",
+ "oldest running xid %u; "
+ "checksums %s; %s",
LSN_FORMAT_ARGS(checkpoint->redo),
checkpoint->ThisTimeLineID,
checkpoint->PrevTimeLineID,
@@ -88,6 +124,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
checkpoint->oldestCommitTsXid,
checkpoint->newestCommitTsXid,
checkpoint->oldestActiveXid,
+ get_checksum_state_string(checkpoint->dataChecksumState),
(info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
}
else if (info == XLOG_NEXTOID)
@@ -166,7 +203,9 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
xl_checkpoint_redo xlrec;
memcpy(&xlrec, rec, sizeof(xl_checkpoint_redo));
- appendStringInfo(buf, "wal_level %s", get_wal_level_string(xlrec.wal_level));
+ appendStringInfo(buf, "wal_level %s; checksums %s",
+ get_wal_level_string(xlrec.wal_level),
+ get_checksum_state_string(xlrec.data_checksum_version));
}
else if (info == XLOG_LOGICAL_DECODING_STATUS_CHANGE)
{
@@ -241,6 +280,21 @@ xlog_identify(uint8 info)
return id;
}
+const char *
+xlog2_identify(uint8 info)
+{
+ const char *id = NULL;
+
+ switch (info & ~XLR_INFO_MASK)
+ {
+ case XLOG2_CHECKSUMS:
+ id = "CHECKSUMS";
+ break;
+ }
+
+ return id;
+}
+
/*
* Returns a string giving information about all the blocks in an
* XLogRecord.
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 2c1c6f88b74..9e8999bbb61 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -75,6 +75,7 @@
#include "pgstat.h"
#include "port/atomics.h"
#include "postmaster/bgwriter.h"
+#include "postmaster/datachecksum_state.h"
#include "postmaster/startup.h"
#include "postmaster/walsummarizer.h"
#include "postmaster/walwriter.h"
@@ -92,6 +93,7 @@
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
+#include "storage/procsignal.h"
#include "storage/reinit.h"
#include "storage/spin.h"
#include "storage/sync.h"
@@ -553,6 +555,9 @@ typedef struct XLogCtlData
*/
XLogRecPtr lastFpwDisableRecPtr;
+ /* last data_checksum_version we've seen */
+ uint32 data_checksum_version;
+
slock_t info_lck; /* locks shared variables shown above */
} XLogCtlData;
@@ -650,6 +655,21 @@ static XLogRecPtr LocalMinRecoveryPoint;
static TimeLineID LocalMinRecoveryPointTLI;
static bool updateMinRecoveryPoint = true;
+/*
+ * Local state for Controlfile data_checksum_version. After initialization
+ * this is only updated when absorbing a procsignal barrier during interrupt
+ * processing. The reason for keeping a copy in backend-private memory is to
+ * avoid locking for interrogating the data checksum state. Possible values
+ * are the data checksum versions defined in storage/checksum.h.
+ */
+static ChecksumStateType LocalDataChecksumState = 0;
+
+/*
+ * Variable backing the GUC, keep it in sync with LocalDataChecksumState.
+ * See SetLocalDataChecksumState().
+ */
+int data_checksums = 0;
+
/* For WALInsertLockAcquire/Release functions */
static int MyLockNo = 0;
static bool holdingAllLocks = false;
@@ -717,6 +737,8 @@ static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
+static void XLogChecksums(uint32 new_type);
+
/*
* Insert an XLOG record represented by an already-constructed chain of data
* chunks. This is a low-level routine; to construct the WAL record header
@@ -4254,6 +4276,12 @@ InitControlFile(uint64 sysidentifier, uint32 data_checksum_version)
ControlFile->wal_log_hints = wal_log_hints;
ControlFile->track_commit_timestamp = track_commit_timestamp;
ControlFile->data_checksum_version = data_checksum_version;
+
+ /*
+ * Set the data_checksum_version value into XLogCtl, which is where all
+ * processes get the current value from.
+ */
+ XLogCtl->data_checksum_version = data_checksum_version;
}
static void
@@ -4588,10 +4616,6 @@ ReadControlFile(void)
(SizeOfXLogLongPHD - SizeOfXLogShortPHD);
CalculateCheckpointSegments();
-
- /* Make the initdb settings visible as GUC variables, too */
- SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
- PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
}
/*
@@ -4625,13 +4649,323 @@ GetMockAuthenticationNonce(void)
}
/*
- * Are checksums enabled for data pages?
+ * DataChecksumsNeedWrite
+ * Returns whether data checksums must be written or not
+ *
+ * Returns true if data checksums are enabled, or are in the process of being
+ * enabled. During "inprogress-on" and "inprogress-off" states checksums must
+ * be written even though they are not verified (see datachecksum_state.c for
+ * a longer discussion).
+ *
+ * This function is intended for callsites which are about to write a data page
+ * to storage, and need to know whether to re-calculate the checksum for the
+ * page header. Calling this function must be performed as close to the write
+ * operation as possible to keep the critical section short.
*/
bool
-DataChecksumsEnabled(void)
+DataChecksumsNeedWrite(void)
{
+ return (LocalDataChecksumState == PG_DATA_CHECKSUM_VERSION ||
+ LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_ON ||
+ LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_OFF);
+}
+
+bool
+DataChecksumsInProgressOn(void)
+{
+ return LocalDataChecksumState == PG_DATA_CHECKSUM_INPROGRESS_ON;
+}
+
+/*
+ * DataChecksumsNeedVerify
+ * Returns whether data checksums must be verified or not
+ *
+ * Data checksums are only verified if they are fully enabled in the cluster.
+ * During the "inprogress-on" and "inprogress-off" states they are only
+ * updated, not verified (see datachecksum_state.c for a longer discussion).
+ *
+ * This function is intended for callsites which have read data and are about
+ * to perform checksum validation based on the result of this. Calling this
+ * function must be performed as close to the validation call as possible to
+ * keep the critical section short. This is in order to protect against time of
+ * check/time of use situations around data checksum validation.
+ */
+bool
+DataChecksumsNeedVerify(void)
+{
+ return (LocalDataChecksumState == PG_DATA_CHECKSUM_VERSION);
+}
+
+/*
+ * SetDataChecksumsOnInProgress
+ * Sets the data checksum state to "inprogress-on" to enable checksums
+ *
+ * To start the process of enabling data checksums in a running cluster the
+ * data_checksum_version state must be changed to "inprogress-on". See
+ * SetDataChecksumsOn below for a description on how this state change works.
+ * This function blocks until all backends in the cluster have acknowledged the
+ * state transition.
+ */
+void
+SetDataChecksumsOnInProgress(void)
+{
+ uint64 barrier;
+
Assert(ControlFile != NULL);
- return (ControlFile->data_checksum_version > 0);
+
+ /*
+ * The state transition is performed in a critical section with
+ * checkpoints held off to provide crash safety.
+ */
+ START_CRIT_SECTION();
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+
+ XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_ON);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
+
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+ END_CRIT_SECTION();
+
+ /*
+ * Update the controlfile before waiting since if we have an immediate
+ * shutdown while waiting we want to come back up with checksums enabled.
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON;
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
+ /*
+ * Await state change in all backends to ensure that all backends are in
+ * "inprogress-on". Once done we know that all backends are writing data
+ * checksums.
+ */
+ WaitForProcSignalBarrier(barrier);
+}
+
+/*
+ * SetDataChecksumsOn
+ * Set data checksums state to 'on' cluster-wide
+ *
+ * Enabling data checksums is performed using two barriers, the first one to
+ * set the state to "inprogress-on" (done by SetDataChecksumsOnInProgress())
+ * and the second one to set the state to "on" (done here). Below is a short
+ * description of the processing, a more detailed write-up can be found in
+ * datachecksum_state.c.
+ *
+ * To start the process of enabling data checksums in a running cluster the
+ * data_checksum_version state must be changed to "inprogress-on". This state
+ * requires data checksums to be written but not verified. This ensures that
+ * all data pages can be checksummed without the risk of false negatives in
+ * validation during the process. When all existing pages are guaranteed to
+ * have checksums, and all new pages will be initiated with checksums, the
+ * state can be changed to "on". Once the state is "on" checksums will be both
+ * written and verified.
+ *
+ * This function blocks until all backends in the cluster have acknowledged the
+ * state transition.
+ */
+void
+SetDataChecksumsOn(void)
+{
+ uint64 barrier;
+
+ Assert(ControlFile != NULL);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+
+ /*
+ * The only allowed state transition to "on" is from "inprogress-on" since
+ * that state ensures that all pages will have data checksums written. No
+ * other state transition is allowed; if one happens it's likely due to a
+ * programmer error.
+ */
+ if (XLogCtl->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON)
+ {
+ SpinLockRelease(&XLogCtl->info_lck);
+ elog(WARNING,
+ "cannot set data checksums to \"on\", current state is not \"inprogress-on\", disabling");
+ SetDataChecksumsOff();
+ return;
+ }
+
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ INJECTION_POINT("datachecksums-enable-checksums-delay", NULL);
+ START_CRIT_SECTION();
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+
+ XLogChecksums(PG_DATA_CHECKSUM_VERSION);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_VERSION;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
+
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+ END_CRIT_SECTION();
+
+ /*
+ * Update the controlfile before waiting since if we have an immediate
+ * shutdown while waiting we want to come back up with checksums enabled.
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->data_checksum_version = PG_DATA_CHECKSUM_VERSION;
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
+ RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
+
+ /*
+ * Await state transition to "on" in all backends. When done we know that
+ * data checksums are both written and verified in all backends.
+ */
+ WaitForProcSignalBarrier(barrier);
+}
+
+/*
+ * SetDataChecksumsOff
+ * Disables data checksums cluster-wide
+ *
+ * Disabling data checksums must be performed with two sets of barriers, each
+ * carrying a different state. The state is first set to "inprogress-off"
+ * during which checksums are still written but not verified. This ensures that
+ * backends which have yet to observe the state change from "on" won't get
+ * validation errors on concurrently modified pages. Once all backends have
+ * changed to "inprogress-off", the barrier for moving to "off" can be emitted.
+ * This function blocks until all backends in the cluster have acknowledged the
+ * state transition.
+ */
+void
+SetDataChecksumsOff(void)
+{
+ uint64 barrier;
+
+ Assert(ControlFile != NULL);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+
+ /* If data checksums are already disabled there is nothing to do */
+ if (XLogCtl->data_checksum_version == 0)
+ {
+ SpinLockRelease(&XLogCtl->info_lck);
+ return;
+ }
+
+ /*
+ * If data checksums are currently enabled we first transition to the
+ * "inprogress-off" state during which backends continue to write
+ * checksums without verifying them. When all backends are in
+ * "inprogress-off" the next transition to "off" can be performed, after
+ * which all data checksum processing is disabled.
+ */
+ if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
+ {
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ START_CRIT_SECTION();
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+
+ XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_OFF);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF);
+
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+ END_CRIT_SECTION();
+
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->data_checksum_version = PG_DATA_CHECKSUM_OFF;
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
+ RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
+
+ /*
+ * Update local state in all backends to ensure that any backend in
+ * "on" state is changed to "inprogress-off".
+ */
+ WaitForProcSignalBarrier(barrier);
+
+ /*
+ * At this point we know that no backends are verifying data checksums
+ * during reading. Next, we can safely move to state "off" to also
+ * stop writing checksums.
+ */
+ }
+ else
+ {
+ /*
+ * Ending up here implies that the checksums state is "inprogress-on"
+ * or "inprogress-off" and we can transition directly to "off" from
+ * there.
+ */
+ SpinLockRelease(&XLogCtl->info_lck);
+ }
+
+ START_CRIT_SECTION();
+ /* Ensure that we don't incur a checkpoint during disabling checksums */
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+
+ XLogChecksums(PG_DATA_CHECKSUM_OFF);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = 0;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF);
+
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+ END_CRIT_SECTION();
+
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->data_checksum_version = PG_DATA_CHECKSUM_OFF;
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
+ RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_FAST);
+
+ WaitForProcSignalBarrier(barrier);
+}
+
+/*
+ * InitLocalDataChecksumState
+ *
+ * Set up backend local caches of controldata variables which may change at
+ * any point during runtime and thus require special cased locking. So far
+ * this only applies to data_checksum_version, but it's intended to be general
+ * purpose enough to handle future cases.
+ */
+void
+InitLocalDataChecksumState(void)
+{
+ SpinLockAcquire(&XLogCtl->info_lck);
+ SetLocalDataChecksumState(XLogCtl->data_checksum_version);
+ SpinLockRelease(&XLogCtl->info_lck);
+}
+
+void
+SetLocalDataChecksumState(uint32 data_checksum_version)
+{
+ LocalDataChecksumState = data_checksum_version;
+
+ data_checksums = data_checksum_version;
+}
+
+/* guc hook */
+const char *
+show_data_checksums(void)
+{
+ return get_checksum_state_string(LocalDataChecksumState);
}
/*
@@ -4925,6 +5259,7 @@ LocalProcessControlFile(bool reset)
Assert(reset || ControlFile == NULL);
ControlFile = palloc_object(ControlFileData);
ReadControlFile();
+ SetLocalDataChecksumState(ControlFile->data_checksum_version);
}
/*
@@ -5094,6 +5429,11 @@ XLOGShmemInit(void)
XLogCtl->InstallXLogFileSegmentActive = false;
XLogCtl->WalWriterSleeping = false;
+ /* Use the checksum info from control file */
+ XLogCtl->data_checksum_version = ControlFile->data_checksum_version;
+
+ SetLocalDataChecksumState(XLogCtl->data_checksum_version);
+
SpinLockInit(&XLogCtl->Insert.insertpos_lck);
SpinLockInit(&XLogCtl->info_lck);
pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr);
@@ -5168,6 +5508,7 @@ BootStrapXLOG(uint32 data_checksum_version)
checkPoint.newestCommitTsXid = InvalidTransactionId;
checkPoint.time = (pg_time_t) time(NULL);
checkPoint.oldestActiveXid = InvalidTransactionId;
+ checkPoint.dataChecksumState = data_checksum_version;
TransamVariables->nextXid = checkPoint.nextXid;
TransamVariables->nextOid = checkPoint.nextOid;
@@ -6244,6 +6585,47 @@ StartupXLOG(void)
pfree(endOfRecoveryInfo->recoveryStopReason);
pfree(endOfRecoveryInfo);
+ /*
+ * If we reach this point with checksums in the state inprogress-on, it
+ * means that data checksums were in the process of being enabled when the
+ * cluster shut down. Since processing didn't finish, the operation will
+ * have to be restarted from scratch since there is no capability to
+ * continue where it was when the cluster shut down. Thus, revert the
+ * state back to off, and inform the user with a warning message. Being
+ * able to restart processing is a TODO, but it wouldn't be possible to
+ * restart here since we cannot launch a dynamic background worker
+ * directly from here (it has to be from a regular backend).
+ */
+ if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON)
+ {
+ XLogChecksums(PG_DATA_CHECKSUM_OFF);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = 0;
+ SetLocalDataChecksumState(XLogCtl->data_checksum_version);
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ ereport(WARNING,
+ errmsg("enabling data checksums was interrupted"),
+ errhint("Data checksum processing must be manually restarted for checksums to be enabled"));
+ }
+
+ /*
+ * If data checksums were being disabled when the cluster was shut down,
+ * we know that we have a state where all backends have stopped validating
+ * checksums and we can move to off instead of prompting the user to
+ * perform any action.
+ */
+ if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF)
+ {
+ XLogChecksums(PG_DATA_CHECKSUM_OFF);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = 0;
+ SetLocalDataChecksumState(XLogCtl->data_checksum_version);
+ SpinLockRelease(&XLogCtl->info_lck);
+ }
+
/*
* All done with end-of-recovery actions.
*
@@ -6549,7 +6931,7 @@ GetRedoRecPtr(void)
XLogRecPtr ptr;
/*
- * The possibly not up-to-date copy in XlogCtl is enough. Even if we
+ * The possibly not up-to-date copy in XLogCtl is enough. Even if we
* grabbed a WAL insertion lock to read the authoritative value in
* Insert->RedoRecPtr, someone might update it just after we've released
* the lock.
@@ -7127,6 +7509,12 @@ CreateCheckPoint(int flags)
checkPoint.fullPageWrites = Insert->fullPageWrites;
checkPoint.wal_level = wal_level;
+ /*
+ * Get the current data_checksum_version value from XLogCtl, valid at the
+ * time of the checkpoint.
+ */
+ checkPoint.dataChecksumState = XLogCtl->data_checksum_version;
+
if (shutdown)
{
XLogRecPtr curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
@@ -7183,6 +7571,9 @@ CreateCheckPoint(int flags)
WALInsertLockAcquire();
redo_rec.wal_level = wal_level;
+ SpinLockAcquire(&XLogCtl->info_lck);
+ redo_rec.data_checksum_version = XLogCtl->data_checksum_version;
+ SpinLockRelease(&XLogCtl->info_lck);
WALInsertLockRelease();
/* Include WAL level in record for WAL summarizer's benefit. */
@@ -7243,6 +7634,10 @@ CreateCheckPoint(int flags)
checkPoint.nextOid += TransamVariables->oidCount;
LWLockRelease(OidGenLock);
+ SpinLockAcquire(&XLogCtl->info_lck);
+ checkPoint.dataChecksumState = XLogCtl->data_checksum_version;
+ SpinLockRelease(&XLogCtl->info_lck);
+
checkPoint.logicalDecodingEnabled = IsLogicalDecodingEnabled();
MultiXactGetCheckptMulti(shutdown,
@@ -7392,6 +7787,9 @@ CreateCheckPoint(int flags)
ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
ControlFile->minRecoveryPointTLI = 0;
+ /* make sure we start with the checksum version as of the checkpoint */
+ ControlFile->data_checksum_version = checkPoint.dataChecksumState;
+
/*
* Persist unloggedLSN value. It's reset on crash recovery, so this goes
* unused on non-shutdown checkpoints, but seems useful to store it always
@@ -7535,6 +7933,12 @@ CreateEndOfRecoveryRecord(void)
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->minRecoveryPoint = recptr;
ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
+
+ /* start with the latest checksum version (as of the end of recovery) */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ ControlFile->data_checksum_version = XLogCtl->data_checksum_version;
+ SpinLockRelease(&XLogCtl->info_lck);
+
UpdateControlFile();
LWLockRelease(ControlFileLock);
@@ -7876,6 +8280,10 @@ CreateRestartPoint(int flags)
if (flags & CHECKPOINT_IS_SHUTDOWN)
ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
}
+
+ /* we shall start with the latest checksum version */
+ ControlFile->data_checksum_version = lastCheckPoint.dataChecksumState;
+
UpdateControlFile();
}
LWLockRelease(ControlFileLock);
@@ -8314,6 +8722,24 @@ XLogReportParameters(void)
}
}
+/*
+ * Log the new state of checksums
+ */
+static void
+XLogChecksums(uint32 new_type)
+{
+ xl_checksum_state xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.new_checksum_state = new_type;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(xl_checksum_state));
+
+ recptr = XLogInsert(RM_XLOG2_ID, XLOG2_CHECKSUMS);
+ XLogFlush(recptr);
+}
+
/*
* Update full_page_writes in shared memory, and write an
* XLOG_FPW_CHANGE record if necessary.
@@ -8440,6 +8866,11 @@ xlog_redo(XLogReaderState *record)
MultiXactAdvanceOldest(checkPoint.oldestMulti,
checkPoint.oldestMultiDB);
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = checkPoint.dataChecksumState;
+ SetLocalDataChecksumState(checkPoint.dataChecksumState);
+ SpinLockRelease(&XLogCtl->info_lck);
+
/*
* No need to set oldestClogXid here as well; it'll be set when we
* redo an xl_clog_truncate if it changed since initialization.
@@ -8499,6 +8930,7 @@ xlog_redo(XLogReaderState *record)
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
+ ControlFile->data_checksum_version = checkPoint.dataChecksumState;
LWLockRelease(ControlFileLock);
/*
@@ -8525,6 +8957,8 @@ xlog_redo(XLogReaderState *record)
{
CheckPoint checkPoint;
TimeLineID replayTLI;
+ bool new_state = false;
+ int old_state;
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
/* In an ONLINE checkpoint, treat the XID counter as a minimum */
@@ -8563,6 +8997,8 @@ xlog_redo(XLogReaderState *record)
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
+ old_state = ControlFile->data_checksum_version;
+ ControlFile->data_checksum_version = checkPoint.dataChecksumState;
LWLockRelease(ControlFileLock);
/* TLI should not change in an on-line checkpoint */
@@ -8574,6 +9010,18 @@ xlog_redo(XLogReaderState *record)
RecoveryRestartPoint(&checkPoint, record);
+ /*
+ * If the data checksum state changed, we need to emit a barrier.
+ */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = checkPoint.dataChecksumState;
+ if (checkPoint.dataChecksumState != old_state)
+ new_state = true;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ if (new_state)
+ EmitAndWaitDataChecksumsBarrier(checkPoint.dataChecksumState);
+
/*
* After replaying a checkpoint record, free all smgr objects.
* Otherwise we would never do so for dropped relations, as the
@@ -8735,7 +9183,19 @@ xlog_redo(XLogReaderState *record)
}
else if (info == XLOG_CHECKPOINT_REDO)
{
- /* nothing to do here, just for informational purposes */
+ xl_checkpoint_redo redo_rec;
+ bool new_state = false;
+
+ memcpy(&redo_rec, XLogRecGetData(record), sizeof(xl_checkpoint_redo));
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = redo_rec.data_checksum_version;
+ if (redo_rec.data_checksum_version != ControlFile->data_checksum_version)
+ new_state = true;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ if (new_state)
+ EmitAndWaitDataChecksumsBarrier(redo_rec.data_checksum_version);
}
else if (info == XLOG_LOGICAL_DECODING_STATUS_CHANGE)
{
@@ -8788,6 +9248,30 @@ xlog_redo(XLogReaderState *record)
}
}
+void
+xlog2_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+ if (info == XLOG2_CHECKSUMS)
+ {
+ xl_checksum_state state;
+
+ memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state));
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = state.new_checksum_state;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /*
+ * Block on a procsignalbarrier to await all processes having seen the
+ * change to checksum status. Once the barrier has been passed we can
+ * initiate the corresponding processing.
+ */
+ EmitAndWaitDataChecksumsBarrier(state.new_checksum_state);
+ }
+}
+
/*
* Return the extra open flags used for opening a file, depending on the
* value of the GUCs wal_sync_method, fsync and debug_io_direct.
diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c
index ab1fbae8001..9c79dadaacc 100644
--- a/src/backend/backup/basebackup.c
+++ b/src/backend/backup/basebackup.c
@@ -1613,10 +1613,11 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
/*
* If we weren't told not to verify checksums, and if checksums are
* enabled for this cluster, and if this is a relation file, then verify
- * the checksum.
+ * the checksum. We cannot at this point check if checksums are enabled
+ * or disabled as that might change, thus we check at each point where we
+ * could be validating a checksum.
*/
- if (!noverify_checksums && DataChecksumsEnabled() &&
- RelFileNumberIsValid(relfilenumber))
+ if (!noverify_checksums && RelFileNumberIsValid(relfilenumber))
verify_checksum = true;
/*
@@ -1749,7 +1750,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
* If the amount of data we were able to read was not a multiple of
* BLCKSZ, we cannot verify checksums, which are block-level.
*/
- if (verify_checksum && (cnt % BLCKSZ != 0))
+ if (verify_checksum && DataChecksumsNeedVerify() && (cnt % BLCKSZ != 0))
{
ereport(WARNING,
(errmsg("could not verify checksum in file \"%s\", block "
@@ -1844,9 +1845,10 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
* 'blkno' is the block number of the first page in the bbsink's buffer
* relative to the start of the relation.
*
- * 'verify_checksum' indicates whether we should try to verify checksums
- * for the blocks we read. If we do this, we'll update *checksum_failures
- * and issue warnings as appropriate.
+ * 'verify_checksum' determines if the user has asked to verify checksums, but
+ * since data checksums can be disabled, or become disabled, we need to check
+ * state before verifying individual pages. If we do this, we'll update
+ * *checksum_failures and issue warnings as appropriate.
*/
static off_t
read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd,
@@ -1872,6 +1874,13 @@ read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd,
int reread_cnt;
uint16 expected_checksum;
+ /*
+ * The data checksum state can change at any point, so we need to
+ * re-check before each page.
+ */
+ if (!DataChecksumsNeedVerify())
+ return cnt;
+
page = sink->bbs_buffer + BLCKSZ * i;
/* If the page is OK, go on to the next one. */
@@ -1894,7 +1903,12 @@ read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd,
* allows us to wait until we can be certain that no write to the
* block is in progress. Since we don't have any such thing right now,
* we just do this and hope for the best.
+ *
+ * The data checksum state may also have changed concurrently so check
+ * again.
*/
+ if (!DataChecksumsNeedVerify())
+ return cnt;
reread_cnt =
basebackup_read_file(fd, sink->bbs_buffer + BLCKSZ * i,
BLCKSZ, offset + BLCKSZ * i,
@@ -2009,6 +2023,9 @@ verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno,
if (PageIsNew(page) || PageGetLSN(page) >= start_lsn)
return true;
+ if (!DataChecksumsNeedVerify())
+ return true;
+
/* Perform the actual checksum calculation. */
checksum = pg_checksum_page(page, blkno);
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 38ef683d4c7..c52c0a6023d 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -35,6 +35,7 @@
#include "port/pg_getopt_ctx.h"
#include "postmaster/postmaster.h"
#include "storage/bufpage.h"
+#include "storage/checksum.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/proc.h"
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index e54018004db..eba25aa3e4d 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1451,6 +1451,25 @@ CREATE VIEW pg_stat_progress_copy AS
FROM pg_stat_get_progress_info('COPY') AS S
LEFT JOIN pg_database D ON S.datid = D.oid;
+CREATE VIEW pg_stat_progress_data_checksums AS
+ SELECT
+ S.pid AS pid, S.datid, D.datname AS datname,
+ CASE S.param1 WHEN 0 THEN 'enabling'
+ WHEN 1 THEN 'disabling'
+ WHEN 2 THEN 'waiting on temporary tables'
+ WHEN 3 THEN 'waiting on barrier'
+ WHEN 4 THEN 'done'
+ END AS phase,
+ CASE S.param2 WHEN -1 THEN NULL ELSE S.param2 END AS databases_total,
+ S.param3 AS databases_done,
+ CASE S.param4 WHEN -1 THEN NULL ELSE S.param4 END AS relations_total,
+ CASE S.param5 WHEN -1 THEN NULL ELSE S.param5 END AS relations_done,
+ CASE S.param6 WHEN -1 THEN NULL ELSE S.param6 END AS blocks_total,
+ CASE S.param7 WHEN -1 THEN NULL ELSE S.param7 END AS blocks_done
+ FROM pg_stat_get_progress_info('DATACHECKSUMS') AS S
+ LEFT JOIN pg_database D ON S.datid = D.oid
+ ORDER BY S.datid; -- return the launcher process first
+
CREATE VIEW pg_user_mappings AS
SELECT
U.oid AS umid,
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 9b18bb4a17e..f0819d15ab7 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1044,7 +1044,14 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
if (pg_strcasecmp(strategy, "wal_log") == 0)
dbstrategy = CREATEDB_WAL_LOG;
else if (pg_strcasecmp(strategy, "file_copy") == 0)
+ {
+ if (DataChecksumsInProgressOn())
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("create database strategy \"%s\" not allowed when data checksums are being enabled",
+ strategy));
dbstrategy = CREATEDB_FILE_COPY;
+ }
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile
index 0f4435d2d97..55044b2bc6f 100644
--- a/src/backend/postmaster/Makefile
+++ b/src/backend/postmaster/Makefile
@@ -18,6 +18,7 @@ OBJS = \
bgworker.o \
bgwriter.o \
checkpointer.o \
+ datachecksum_state.o \
fork_process.o \
interrupt.o \
launch_backend.o \
diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c
index cf24f662d27..8fdc518b3a1 100644
--- a/src/backend/postmaster/auxprocess.c
+++ b/src/backend/postmaster/auxprocess.c
@@ -15,6 +15,7 @@
#include
#include
+#include "access/xlog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/auxprocess.h"
@@ -69,6 +70,24 @@ AuxiliaryProcessMainCommon(void)
ProcSignalInit(NULL, 0);
+ /*
+ * Initialize a local cache of the data_checksum_version, to be updated by
+ * the procsignal-based barriers.
+ *
+ * This intentionally happens after initializing the procsignal, otherwise
+ * we might miss a state change. This means we can get a barrier for the
+ * state we've just initialized - but it can happen only once.
+ *
+ * The postmaster (which is what gets forked into the new child process)
+ * does not handle barriers, therefore it may not have the current
+ * LocalDataChecksumState value (it'll have the value read from the
+ * control file, which may be arbitrarily old).
+ *
+ * NB: Even if the postmaster handled barriers, the value might still be
+ * stale, as it might have changed after this process forked.
+ */
+ InitLocalDataChecksumState();
+
/*
* Auxiliary processes don't run transactions, but they may need a
* resource owner anyway to manage buffer pins acquired outside
diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c
index f2a62489d9c..536aff7ca05 100644
--- a/src/backend/postmaster/bgworker.c
+++ b/src/backend/postmaster/bgworker.c
@@ -18,6 +18,7 @@
#include "pgstat.h"
#include "port/atomics.h"
#include "postmaster/bgworker_internals.h"
+#include "postmaster/datachecksum_state.h"
#include "postmaster/postmaster.h"
#include "replication/logicallauncher.h"
#include "replication/logicalworker.h"
@@ -144,7 +145,14 @@ static const struct
.fn_name = "TableSyncWorkerMain",
.fn_addr = TableSyncWorkerMain
},
-
+ {
+ .fn_name = "DataChecksumsWorkerLauncherMain",
+ .fn_addr = DataChecksumsWorkerLauncherMain
+ },
+ {
+ .fn_name = "DataChecksumsWorkerMain",
+ .fn_addr = DataChecksumsWorkerMain
+ }
};
/* Private functions. */
diff --git a/src/backend/postmaster/datachecksum_state.c b/src/backend/postmaster/datachecksum_state.c
new file mode 100644
index 00000000000..76004bcedc6
--- /dev/null
+++ b/src/backend/postmaster/datachecksum_state.c
@@ -0,0 +1,1612 @@
+/*-------------------------------------------------------------------------
+ *
+ * datachecksum_state.c
+ * Background worker for enabling or disabling data checksums online as
+ * well as functionality for manipulating data checksum state
+ *
+ * When enabling data checksums on a cluster at initdb time or when shut down
+ * with pg_checksums, no extra process is required as each page is checksummed,
+ * and verified, when accessed. When enabling checksums on an already running
+ * cluster, this worker will ensure that all pages are checksummed before
+ * verification of the checksums is turned on. In the case of disabling
+ * checksums, the state transition is performed only in the control file, no
+ * changes are performed on the data pages.
+ *
+ * Checksums can be either enabled or disabled cluster-wide, with on/off being
+ * the end state for data_checksums.
+ *
+ * 1. Enabling checksums
+ * ---------------------
+ * When enabling checksums in an online cluster, data_checksums will be set to
+ * "inprogress-on" which signals that write operations MUST compute and write
+ * the checksum on the data page, but during reading the checksum SHALL NOT be
+ * verified. This ensures that all objects created during when checksums are
+ * being enabled will have checksums set, but reads won't fail due to missing or
+ * invalid checksums. Invalid checksums can be present in case the cluster had
+ * checksums enabled, then disabled them and updated the page while they were
+ * disabled.
+ *
+ * The DataChecksumsWorker will compile a list of all databases at the start,
+ * any databases created concurrently will see the in-progress state and will
+ * be checksummed automatically. All databases from the original list MUST BE
+ * successfully processed in order for data checksums to be enabled, the only
+ * exception are databases which are dropped before having been processed.
+ *
+ * For each database, all relations which have storage are read and every data
+ * page is marked dirty to force a write with the checksum. This will generate
+ * a lot of WAL as the entire database is read and written.
+ *
+ * If the processing is interrupted by a cluster crash or restart, it needs to
+ * be restarted from the beginning again as state isn't persisted.
+ *
+ * 2. Disabling checksums
+ * ----------------------
+ * When disabling checksums, data_checksums will be set to "inprogress-off"
+ * which signals that checksums are written but no longer need to be verified.
+ * This ensures that backends which have not yet transitioned to the
+ * "inprogress-off" state will still see valid checksums on pages.
+ *
+ * 3. Synchronization and Correctness
+ * ----------------------------------
+ * The processes involved in enabling or disabling data checksums in an
+ * online cluster must be properly synchronized with the normal backends
+ * serving concurrent queries to ensure correctness. Correctness is defined
+ * as the following:
+ *
+ * - Backends SHALL NOT violate the data_checksums state they have agreed to
+ * by acknowledging the procsignalbarrier: This means that all backends
+ * MUST calculate and write data checksums during all states except off;
+ * MUST validate checksums only in the 'on' state.
+ * - Data checksums SHALL NOT be considered enabled cluster-wide until all
+ * currently connected backends have state "on": This means that all
+ * backends must wait on the procsignalbarrier to be acknowledged by all
+ * before proceeding to validate data checksums.
+ *
+ * There are two steps of synchronization required for changing data_checksums
+ * in an online cluster: (i) changing state in the active backends ("on",
+ * "off", "inprogress-on" and "inprogress-off"), and (ii) ensuring no
+ * incompatible objects and processes are left in a database when workers end.
+ * The former deals with cluster-wide agreement on data checksum state and the
+ * latter with ensuring that any concurrent activity cannot break the data
+ * checksum contract during processing.
+ *
+ * Synchronizing the state change is done with procsignal barriers. Before
+ * updating the data_checksums state in the control file, all other backends
+ * must absorb the barrier. Barrier absorption happens during interrupt processing, which
+ * means that connected backends will change state at different times. If
+ * waiting for a barrier is done during startup, for example during replay, it
+ * is important to realize that any locks held by the startup process might
+ * cause deadlocks if backends end up waiting for those locks while startup
+ * is waiting for a procsignalbarrier.
+ *
+ * 3.1 When Enabling Data Checksums
+ * --------------------------------
+ * A process which fails to observe data checksums being enabled can induce two
+ * types of errors: failing to write the checksum when modifying the page and
+ * failing to validate the data checksum on the page when reading it.
+ *
+ * When processing starts all backends belong to one of the below sets, with
+ * one of Bd and Bi being empty:
+ *
+ * Bg: Backend updating the global state and emitting the procsignalbarrier
+ * Bd: Backends in "off" state
+ * Bi: Backends in "inprogress-on" state
+ *
+ * If processing is started in an online cluster then all backends are in Bd.
+ * If processing was halted by the cluster shutting down (due to a crash or
+ * intentional restart), the controlfile state "inprogress-on" will be observed
+ * on system startup and all backends will be placed in Bd. The controlfile
+ * state will also be set to "off".
+ *
+ * Backends transition Bd -> Bi via a procsignalbarrier which is emitted by the
+ * DataChecksumsLauncher. When all backends have acknowledged the barrier then
+ * Bd will be empty and the next phase can begin: calculating and writing data
+ * checksums with DataChecksumsWorkers. When the DataChecksumsWorker processes
+ * have finished writing checksums on all pages, data checksums are enabled
+ * cluster-wide via another procsignalbarrier. There are four sets of backends
+ * where Bd shall be an empty set:
+ *
+ * Bg: Backend updating the global state and emitting the procsignalbarrier
+ * Bd: Backends in "off" state
+ * Be: Backends in "on" state
+ * Bi: Backends in "inprogress-on" state
+ *
+ * Backends in Bi and Be will write checksums when modifying a page, but only
+ * backends in Be will verify the checksum during reading. The Bg backend is
+ * blocked waiting for all backends in Bi to process interrupts and move to
+ * Be. Any backend starting while Bg is waiting on the procsignalbarrier will
+ * observe the global state being "on" and will thus automatically belong to
+ * Be. Checksums are enabled cluster-wide when Bi is an empty set. Bi and Be
+ * are compatible sets while still operating based on their local state as
+ * both write data checksums.
+ *
+ * 3.2 When Disabling Data Checksums
+ * ---------------------------------
+ * A process which fails to observe that data checksums have been disabled
+ * can induce two types of errors: writing the checksum when modifying the
+ * page and validating a data checksum which is no longer correct due to
+ * modifications to the page. The former is not an error per se as data
+ * integrity is maintained, but it is wasteful. The latter will cause errors
+ * in user operations. Assuming the following sets of backends:
+ *
+ * Bg: Backend updating the global state and emitting the procsignalbarrier
+ * Bd: Backends in "off" state
+ * Be: Backends in "on" state
+ * Bo: Backends in "inprogress-off" state
+ * Bi: Backends in "inprogress-on" state
+ *
+ * Backends transition from the Be state to Bd like so: Be -> Bo -> Bd. From
+ * all other states, the transition can be straight to Bd.
+ *
+ * The goal is to transition all backends to Bd making the others empty sets.
+ * Backends in Bo write data checksums, but don't validate them, such that
+ * backends still in Be can continue to validate pages until the barrier has
+ * been absorbed such that they are in Bo. Once all backends are in Bo, the
+ * barrier to transition to "off" can be raised and all backends can safely
+ * stop writing data checksums as no backend is enforcing data checksum
+ * validation any longer.
+ *
+ * 4. Future opportunities for optimizations
+ * -----------------------------------------
+ * Below are some potential optimizations and improvements which were brought
+ * up during reviews of this feature, but which weren't implemented in the
+ * initial version. These are ideas listed without any validation on their
+ * feasibility or potential payoff. More discussion on (most of) these can be
+ * found on the -hackers threads linked to in the commit message of this
+ * feature.
+ *
+ * * Launching datachecksumsworker for resuming operation from the startup
+ * process: Currently users have to restart processing manually after a
+ * restart since dynamic background worker cannot be started from the
+ * postmaster. Changing the startup process could make restarting the
+ * processing automatic on cluster restart.
+ * * Avoid dirtying the page when checksums already match: Iff the checksum
+ * on the page happens to already match we still dirty the page. It should
+ * be enough to only do the log_newpage_buffer() call in that case.
+ * * Teach pg_checksums to avoid checksummed pages when pg_checksums is used
+ * to enable checksums on a cluster which is in inprogress-on state and
+ * may have checksummed pages (make pg_checksums be able to resume an
+ * online operation). This should only be attempted for wal_level minimal.
+ * * Restartability (not necessarily with page granularity).
+ * * Avoid processing databases which were created during inprogress-on.
+ * Right now all databases are processed regardless to be safe.
+ * * Teach CREATE DATABASE to calculate checksums for databases created
+ * during inprogress-on with a template database which has yet to be
+ * processed.
+ *
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/postmaster/datachecksum_state.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
+#include "catalog/indexing.h"
+#include "catalog/pg_class.h"
+#include "catalog/pg_database.h"
+#include "commands/progress.h"
+#include "commands/vacuum.h"
+#include "common/relpath.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/bgwriter.h"
+#include "postmaster/datachecksum_state.h"
+#include "storage/bufmgr.h"
+#include "storage/checksum.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/lmgr.h"
+#include "storage/lwlock.h"
+#include "storage/procarray.h"
+#include "storage/smgr.h"
+#include "tcop/tcopprot.h"
+#include "utils/builtins.h"
+#include "utils/fmgroids.h"
+#include "utils/injection_point.h"
+#include "utils/lsyscache.h"
+#include "utils/ps_status.h"
+#include "utils/syscache.h"
+#include "utils/wait_event.h"
+
+/*
+ * Configuration of conditions which must match when absorbing a procsignal
+ * barrier during data checksum enable/disable operations. A single function
+ * is used for absorbing all barriers, and the current and target states must
+ * be defined as a from/to tuple in the checksum_barriers struct.
+ */
+typedef struct ChecksumBarrierCondition
+{
+ /* Current state of data checksums */
+ int from;
+ /* Target state for data checksums */
+ int to;
+} ChecksumBarrierCondition;
+
+static const ChecksumBarrierCondition checksum_barriers[6] =
+{
+ /*
+ * Disabling checksums: If checksums are currently enabled, disabling must
+ * go through the 'inprogress-off' state.
+ */
+ {PG_DATA_CHECKSUM_VERSION, PG_DATA_CHECKSUM_INPROGRESS_OFF},
+ {PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_OFF},
+
+ /*
+ * If checksums are in the process of being enabled, but are not yet being
+ * verified, we can abort by going back to 'off' state.
+ */
+ {PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_OFF},
+
+ /*
+ * Enabling checksums must normally go through the 'inprogress-on' state.
+ */
+ {PG_DATA_CHECKSUM_OFF, PG_DATA_CHECKSUM_INPROGRESS_ON},
+ {PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_VERSION},
+
+ /*
+ * If checksums are being disabled but all backends are still computing
+ * checksums, we can go straight back to 'on'
+ */
+ {PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_VERSION},
+};
+
+/*
+ * Signaling between backends calling pg_enable/disable_data_checksums, the
+ * checksums launcher process, and the checksums worker process.
+ *
+ * This struct is protected by DataChecksumsWorkerLock
+ */
+typedef struct DataChecksumsStateStruct
+{
+ /*
+ * These are set by pg_{enable|disable}_data_checksums, to tell the
+ * launcher what the target state is.
+ */
+ DataChecksumsWorkerOperation launch_operation;
+ int launch_cost_delay;
+ int launch_cost_limit;
+
+ /*
+ * Is a launcher process is currently running? This is set by the main
+ * launcher process, after it has read the above launch_* parameters.
+ */
+ bool launcher_running;
+
+ /*
+ * Is a worker process currently running? This is set by the worker
+ * launcher when it starts waiting for a worker process to finish.
+ */
+ int worker_pid;
+
+ /*
+ * These fields indicate the target state that the launcher is currently
+ * working towards. They can be different from the corresponding launch_*
+ * fields, if a new pg_enable/disable_data_checksums() call was made while
+ * the launcher/worker was already running.
+ *
+ * The below members are set when the launcher starts, and are only
+ * accessed read-only by the single worker. Thus, we can access these
+ * without a lock. If multiple workers, or dynamic cost parameters, are
+ * supported at some point then this would need to be revisited.
+ */
+ DataChecksumsWorkerOperation operation;
+ int cost_delay;
+ int cost_limit;
+
+ /*
+ * Signaling between the launcher and the worker process.
+ *
+ * As there is only a single worker, and the launcher won't read these
+ * until the worker exits, they can be accessed without the need for a
+ * lock. If multiple workers are supported then this will have to be
+ * revisited.
+ */
+
+ /* result, set by worker before exiting */
+ DataChecksumsWorkerResult success;
+
+ /*
+ * tells the worker process whether it should also process the shared
+ * catalogs
+ */
+ bool process_shared_catalogs;
+} DataChecksumsStateStruct;
+
+/* Shared memory segment for datachecksumsworker */
+static DataChecksumsStateStruct *DataChecksumState;
+
+typedef struct DataChecksumsWorkerDatabase
+{
+ Oid dboid;
+ char *dbname;
+} DataChecksumsWorkerDatabase;
+
+/* Flag set by the interrupt handler */
+static volatile sig_atomic_t abort_requested = false;
+
+/*
+ * Have we set the DataChecksumsStateStruct->launcher_running flag?
+ * If we have, we need to clear it before exiting!
+ */
+static volatile sig_atomic_t launcher_running = false;
+
+/* Are we enabling data checksums, or disabling them? */
+static DataChecksumsWorkerOperation operation;
+
+/* Prototypes */
+static bool DatabaseExists(Oid dboid);
+static List *BuildDatabaseList(void);
+static List *BuildRelationList(bool temp_relations, bool include_shared);
+static void FreeDatabaseList(List *dblist);
+static DataChecksumsWorkerResult ProcessDatabase(DataChecksumsWorkerDatabase *db);
+static bool ProcessAllDatabases(void);
+static bool ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy);
+static void launcher_cancel_handler(SIGNAL_ARGS);
+static void WaitForAllTransactionsToFinish(void);
+
+/*****************************************************************************
+ * Functionality for manipulating the data checksum state in the cluster
+ */
+
+void
+EmitAndWaitDataChecksumsBarrier(uint32 state)
+{
+ uint64 barrier;
+
+ switch (state)
+ {
+ case PG_DATA_CHECKSUM_INPROGRESS_ON:
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
+ WaitForProcSignalBarrier(barrier);
+ break;
+
+ case PG_DATA_CHECKSUM_INPROGRESS_OFF:
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF);
+ WaitForProcSignalBarrier(barrier);
+ break;
+
+ case PG_DATA_CHECKSUM_VERSION:
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
+ WaitForProcSignalBarrier(barrier);
+ break;
+
+ case PG_DATA_CHECKSUM_OFF:
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF);
+ WaitForProcSignalBarrier(barrier);
+ break;
+
+ default:
+ Assert(false);
+ }
+}
+
+/*
+ * AbsorbDataChecksumsBarrier
+ * Generic function for absorbing data checksum state changes
+ *
+ * All procsignalbarriers regarding data checksum state changes are absorbed
+ * with this function. The set of conditions required for the state change to
+ * be accepted are listed in the checksum_barriers struct, target_state is
+ * used to look up the relevant entry.
+ */
+bool
+AbsorbDataChecksumsBarrier(ProcSignalBarrierType barrier)
+{
+ uint32 target_state;
+ int current = data_checksums;
+ bool found = false;
+
+ /*
+ * Translate the barrier condition to the target state, doing it here
+ * instead of in the procsignal code saves the latter from knowing about
+ * checksum states.
+ */
+ switch (barrier)
+ {
+ case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON:
+ target_state = PG_DATA_CHECKSUM_INPROGRESS_ON;
+ break;
+ case PROCSIGNAL_BARRIER_CHECKSUM_ON:
+ target_state = PG_DATA_CHECKSUM_VERSION;
+ break;
+ case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF:
+ target_state = PG_DATA_CHECKSUM_INPROGRESS_OFF;
+ break;
+ case PROCSIGNAL_BARRIER_CHECKSUM_OFF:
+ target_state = PG_DATA_CHECKSUM_OFF;
+ break;
+ default:
+ elog(ERROR, "incorrect barrier \"%i\" received", barrier);
+ }
+
+ /*
+ * If the target state matches the current state then the barrier has been
+ * repeated.
+ */
+ if (current == target_state)
+ return true;
+
+ /*
+ * If the cluster is in recovery we skip the validation of current state
+ * since the replay is trusted.
+ */
+ if (RecoveryInProgress())
+ {
+ SetLocalDataChecksumState(target_state);
+ return true;
+ }
+
+ /*
+ * Find the barrier condition definition for the target state. Not finding
+ * a condition would be a grave programmer error as the states are a
+ * discrete set.
+ */
+ for (int i = 0; i < lengthof(checksum_barriers) && !found; i++)
+ {
+ if (checksum_barriers[i].from == current && checksum_barriers[i].to == target_state)
+ found = true;
+ }
+
+ /*
+ * If the relevant state criteria aren't satisfied, throw an error which
+ * will be caught by the procsignal machinery for a later retry.
+ */
+ if (!found)
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("incorrect data checksum state %i for target state %i",
+ current, target_state));
+
+ SetLocalDataChecksumState(target_state);
+ return true;
+}
+
+
+/*
+ * Disables data checksums for the cluster, if applicable. Starts a background
+ * worker which turns off the data checksums.
+ */
+Datum
+disable_data_checksums(PG_FUNCTION_ARGS)
+{
+ if (!superuser())
+ ereport(ERROR,
+ errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be superuser to change data checksum state"));
+
+ StartDataChecksumsWorkerLauncher(DISABLE_DATACHECKSUMS, 0, 0);
+ PG_RETURN_VOID();
+}
+
+/*
+ * Enables data checksums for the cluster, if applicable. Supports vacuum-
+ * like cost based throttling to limit system load. Starts a background worker
+ * which updates data checksums on existing data.
+ */
+Datum
+enable_data_checksums(PG_FUNCTION_ARGS)
+{
+ int cost_delay = PG_GETARG_INT32(0);
+ int cost_limit = PG_GETARG_INT32(1);
+
+ if (!superuser())
+ ereport(ERROR,
+ errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be superuser to change data checksum state"));
+
+ if (cost_delay < 0)
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("cost delay cannot be a negative value"));
+
+ if (cost_limit <= 0)
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("cost limit must be greater than zero"));
+
+ StartDataChecksumsWorkerLauncher(ENABLE_DATACHECKSUMS, cost_delay, cost_limit);
+
+ PG_RETURN_VOID();
+}
+
+
+/*****************************************************************************
+ * Functionality for running the datachecksumsworker and associated launcher
+ */
+
+/*
+ * StartDataChecksumsWorkerLauncher
+ *		Request data checksum processing and start the launcher process
+ *
+ * The main entrypoint for starting data checksums processing for enabling as
+ * well as disabling.
+ */
+void
+StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op,
+ int cost_delay,
+ int cost_limit)
+{
+ BackgroundWorker bgw;
+ BackgroundWorkerHandle *bgw_handle;
+ bool launcher_running;
+ DataChecksumsWorkerOperation launcher_running_op;
+
+#ifdef USE_ASSERT_CHECKING
+ /* The cost delay settings have no effect when disabling */
+ if (op == DISABLE_DATACHECKSUMS)
+ Assert(cost_delay == 0 && cost_limit == 0);
+#endif
+
+ INJECTION_POINT("datachecksumsworker-startup-delay", NULL);
+
+ /* Store the desired state in shared memory */
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+
+ DataChecksumState->launch_operation = op;
+ DataChecksumState->launch_cost_delay = cost_delay;
+ DataChecksumState->launch_cost_limit = cost_limit;
+
+ /* Is the launcher already running? If so, what is it doing? */
+ launcher_running = DataChecksumState->launcher_running;
+ if (launcher_running)
+ launcher_running_op = DataChecksumState->operation;
+
+ LWLockRelease(DataChecksumsWorkerLock);
+
+ /*
+ * Launch a new launcher process, if it's not running already.
+ *
+ * If the launcher is currently busy enabling the checksums, and we want
+ * them disabled (or vice versa), the launcher will notice that at latest
+	 * when it's about to exit, and will loop back to process the new request. So
+ * if the launcher is already running, we don't need to do anything more
+ * here to abort it.
+ *
+ * If you call pg_enable/disable_data_checksums() twice in a row, before
+ * the launcher has had a chance to start up, we still end up launching it
+ * twice. That's OK, the second invocation will see that a launcher is
+ * already running and exit quickly.
+ *
+ * TODO: We could optimize here and skip launching the launcher, if we are
+ * already in the desired state, i.e. if the checksums are already enabled
+ * and you call pg_enable_data_checksums().
+ */
+ if (!launcher_running)
+ {
+ /*
+ * Prepare the BackgroundWorker and launch it.
+ */
+ memset(&bgw, 0, sizeof(bgw));
+ bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
+ bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+ snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
+ snprintf(bgw.bgw_function_name, BGW_MAXLEN, "DataChecksumsWorkerLauncherMain");
+ snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksum launcher");
+ snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksum launcher");
+ bgw.bgw_restart_time = BGW_NEVER_RESTART;
+ bgw.bgw_notify_pid = MyProcPid;
+ bgw.bgw_main_arg = (Datum) 0;
+
+ if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+ ereport(ERROR,
+ errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("failed to start background worker to process data checksums"));
+ }
+ else
+ {
+ if (launcher_running_op == op)
+ ereport(ERROR,
+ errmsg("data checksum processing already running"));
+ }
+}
+
+/*
+ * ProcessSingleRelationFork
+ * Enable data checksums in a single relation/fork.
+ *
+ * Returns true if successful, and false if *aborted*. On error, an actual
+ * error is raised in the lower levels.
+ */
+static bool
+ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy)
+{
+ BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum);
+ char activity[NAMEDATALEN * 2 + 128];
+ char *relns;
+
+ relns = get_namespace_name(RelationGetNamespace(reln));
+
+ /* Report the current relation to pgstat_activity */
+ snprintf(activity, sizeof(activity) - 1, "processing: %s.%s (%s, %u blocks)",
+ (relns ? relns : ""), RelationGetRelationName(reln), forkNames[forkNum], numblocks);
+ pgstat_report_activity(STATE_RUNNING, activity);
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL, numblocks);
+ if (relns)
+ pfree(relns);
+
+ /*
+ * We are looping over the blocks which existed at the time of process
+ * start, which is safe since new blocks are created with checksums set
+ * already due to the state being "inprogress-on".
+ */
+ for (BlockNumber blknum = 0; blknum < numblocks; blknum++)
+ {
+ Buffer buf = ReadBufferExtended(reln, forkNum, blknum, RBM_NORMAL, strategy);
+
+ /* Need to get an exclusive lock to mark the buffer as dirty */
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * Mark the buffer as dirty and force a full page write. We have to
+ * re-write the page to WAL even if the checksum hasn't changed,
+ * because if there is a replica it might have a slightly different
+ * version of the page with an invalid checksum, caused by unlogged
+ * changes (e.g. hintbits) on the primary happening while checksums
+ * were off. This can happen if there was a valid checksum on the page
+ * at one point in the past, so only when checksums are first on, then
+ * off, and then turned on again. TODO: investigate if this could be
+ * avoided if the checksum is calculated to be correct and wal_level
+		 * is set to "minimal".
+ */
+ START_CRIT_SECTION();
+ MarkBufferDirty(buf);
+ log_newpage_buffer(buf, false);
+ END_CRIT_SECTION();
+
+ UnlockReleaseBuffer(buf);
+
+ /*
+		 * This is the only place where we check if we are asked to abort; the
+		 * abort will bubble up from here.
+ */
+ Assert(operation == ENABLE_DATACHECKSUMS);
+ LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
+ if (DataChecksumState->launch_operation == DISABLE_DATACHECKSUMS)
+ abort_requested = true;
+ LWLockRelease(DataChecksumsWorkerLock);
+
+ if (abort_requested)
+ return false;
+
+ /* update the block counter */
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_DONE,
+ (blknum + 1));
+
+ /*
+ * Processing is re-using the vacuum cost delay for process
+ * throttling, hence why we call vacuum APIs here.
+ */
+ vacuum_delay_point(false);
+ }
+
+ return true;
+}
+
+/*
+ * ProcessSingleRelationByOid
+ * Process a single relation based on oid.
+ *
+ * Returns true if successful, and false if *aborted*. On error, an actual
+ * error is raised in the lower levels.
+ */
+static bool
+ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy)
+{
+ Relation rel;
+ bool aborted = false;
+
+ StartTransactionCommand();
+
+ rel = try_relation_open(relationId, AccessShareLock);
+ if (rel == NULL)
+ {
+ /*
+ * Relation no longer exists. We don't consider this an error since
+ * there are no pages in it that need data checksums, and thus return
+ * true. The worker operates off a list of relations generated at the
+ * start of processing, so relations being dropped in the meantime is
+ * to be expected.
+ */
+ CommitTransactionCommand();
+ pgstat_report_activity(STATE_IDLE, NULL);
+ return true;
+ }
+ RelationGetSmgr(rel);
+
+ for (ForkNumber fnum = 0; fnum <= MAX_FORKNUM; fnum++)
+ {
+ if (smgrexists(rel->rd_smgr, fnum))
+ {
+ if (!ProcessSingleRelationFork(rel, fnum, strategy))
+ {
+ aborted = true;
+ break;
+ }
+ }
+ }
+ relation_close(rel, AccessShareLock);
+
+ CommitTransactionCommand();
+ pgstat_report_activity(STATE_IDLE, NULL);
+
+ return !aborted;
+}
+
+/*
+ * ProcessDatabase
+ * Enable data checksums in a single database.
+ *
+ * We do this by launching a dynamic background worker into this database, and
+ * waiting for it to finish. We have to do this in a separate worker, since
+ * each process can only be connected to one database during its lifetime.
+ */
+static DataChecksumsWorkerResult
+ProcessDatabase(DataChecksumsWorkerDatabase *db)
+{
+ BackgroundWorker bgw;
+ BackgroundWorkerHandle *bgw_handle;
+ BgwHandleStatus status;
+ pid_t pid;
+ char activity[NAMEDATALEN + 64];
+
+ DataChecksumState->success = DATACHECKSUMSWORKER_FAILED;
+
+ memset(&bgw, 0, sizeof(bgw));
+ bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
+ bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+ snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
+ snprintf(bgw.bgw_function_name, BGW_MAXLEN, "%s", "DataChecksumsWorkerMain");
+ snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksum worker");
+ snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksum worker");
+ bgw.bgw_restart_time = BGW_NEVER_RESTART;
+ bgw.bgw_notify_pid = MyProcPid;
+ bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid);
+
+ /*
+ * If there are no worker slots available, there is little we can do. If
+ * we retry in a bit it's still unlikely that the user has managed to
+ * reconfigure in the meantime and we'd be run through retries fast.
+ */
+ if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+ {
+ ereport(WARNING,
+ errmsg("could not start background worker for enabling data checksums in database \"%s\"",
+ db->dbname),
+ errhint("The \"%s\" setting might be too low.", "max_worker_processes"));
+ return DATACHECKSUMSWORKER_FAILED;
+ }
+
+ status = WaitForBackgroundWorkerStartup(bgw_handle, &pid);
+ if (status == BGWH_STOPPED)
+ {
+ ereport(WARNING,
+ errmsg("could not start background worker for enabling data checksums in database \"%s\"",
+ db->dbname),
+ errhint("More details on the error might be found in the server log."));
+
+ /*
+ * Heuristic to see if the database was dropped, and if it was we can
+ * treat it as not an error, else treat as fatal and error out. TODO:
+ * this could probably be improved with a tighter check.
+ */
+ if (DatabaseExists(db->dboid))
+ return DATACHECKSUMSWORKER_FAILED;
+ else
+ return DATACHECKSUMSWORKER_DROPDB;
+ }
+
+ /*
+ * If the postmaster crashed we cannot end up with a processed database so
+ * we have no alternative other than exiting. When enabling checksums we
+ * won't at this time have changed the data checksums state in pg_control
+ * to enabled so when the cluster comes back up processing will have to be
+ * restarted.
+ */
+ if (status == BGWH_POSTMASTER_DIED)
+ ereport(FATAL,
+ errcode(ERRCODE_ADMIN_SHUTDOWN),
+ errmsg("cannot enable data checksums without the postmaster process"),
+ errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));
+
+ Assert(status == BGWH_STARTED);
+ ereport(LOG,
+ errmsg("initiating data checksum processing in database \"%s\"",
+ db->dbname));
+
+ /* Save the pid of the worker so we can signal it later */
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ DataChecksumState->worker_pid = pid;
+ LWLockRelease(DataChecksumsWorkerLock);
+
+ snprintf(activity, sizeof(activity) - 1,
+ "Waiting for worker in database %s (pid %ld)", db->dbname, (long) pid);
+ pgstat_report_activity(STATE_RUNNING, activity);
+
+ status = WaitForBackgroundWorkerShutdown(bgw_handle);
+ if (status == BGWH_POSTMASTER_DIED)
+ ereport(FATAL,
+ errcode(ERRCODE_ADMIN_SHUTDOWN),
+ errmsg("postmaster exited during data checksum processing in \"%s\"",
+ db->dbname),
+ errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));
+
+ if (DataChecksumState->success == DATACHECKSUMSWORKER_ABORTED)
+ ereport(LOG,
+ errmsg("data checksums processing was aborted in database \"%s\"",
+ db->dbname));
+
+ pgstat_report_activity(STATE_IDLE, NULL);
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ DataChecksumState->worker_pid = InvalidPid;
+ LWLockRelease(DataChecksumsWorkerLock);
+
+ return DataChecksumState->success;
+}
+
+/*
+ * launcher_exit
+ *
+ * Internal routine for cleaning up state when the launcher process exits. We
+ * need to reset the abort flag to ensure that processing can be started again if
+ * it was previously aborted (note: started again, *not* restarted from where
+ * it left off).
+ */
+static void
+launcher_exit(int code, Datum arg)
+{
+ abort_requested = false;
+
+ if (launcher_running)
+ {
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ if (DataChecksumState->worker_pid != InvalidPid)
+ {
+ ereport(LOG,
+ errmsg("data checksums launcher exiting while worker is still running, signalling worker"));
+ kill(DataChecksumState->worker_pid, SIGTERM);
+ }
+ LWLockRelease(DataChecksumsWorkerLock);
+ }
+
+ /*
+ * If the launcher is exiting before data checksums are enabled then set
+ * the state to off since processing cannot be resumed.
+ */
+ if (DataChecksumsInProgressOn())
+ SetDataChecksumsOff();
+
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ launcher_running = false;
+ DataChecksumState->launcher_running = false;
+ LWLockRelease(DataChecksumsWorkerLock);
+}
+
+/*
+ * launcher_cancel_handler
+ *
+ * Internal routine for reacting to SIGINT and flagging the worker to abort.
+ * The worker won't be interrupted immediately but will check for abort flag
+ * between each block in a relation.
+ */
+static void
+launcher_cancel_handler(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ abort_requested = true;
+
+ /*
+ * There is no sleeping in the main loop, the flag will be checked
+ * periodically in ProcessSingleRelationFork. The worker does however
+ * sleep when waiting for concurrent transactions to end so we still need
+ * to set the latch.
+ */
+ SetLatch(MyLatch);
+
+ errno = save_errno;
+}
+
+/*
+ * WaitForAllTransactionsToFinish
+ * Blocks awaiting all current transactions to finish
+ *
+ * Returns when all transactions which are active at the call of the function
+ * have ended, or if the postmaster dies while waiting. If the postmaster dies
+ * the abort flag will be set to indicate that the caller of this shouldn't
+ * proceed.
+ *
+ * NB: this will return early, if aborted by SIGINT or if the target state
+ * is changed while we're running.
+ */
+static void
+WaitForAllTransactionsToFinish(void)
+{
+ TransactionId waitforxid;
+
+ LWLockAcquire(XidGenLock, LW_SHARED);
+ waitforxid = XidFromFullTransactionId(TransamVariables->nextXid);
+ LWLockRelease(XidGenLock);
+
+ while (TransactionIdPrecedes(GetOldestActiveTransactionId(false, true), waitforxid))
+ {
+ char activity[64];
+ int rc;
+
+ /* Oldest running xid is older than us, so wait */
+ snprintf(activity,
+ sizeof(activity),
+ "Waiting for current transactions to finish (waiting for %u)",
+ waitforxid);
+ pgstat_report_activity(STATE_RUNNING, activity);
+
+ /* Retry every 3 seconds */
+ ResetLatch(MyLatch);
+ rc = WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+ 3000,
+ WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION);
+
+ /*
+ * If the postmaster died we won't be able to enable checksums
+ * cluster-wide so abort and hope to continue when restarted.
+ */
+ if (rc & WL_POSTMASTER_DEATH)
+ ereport(FATAL,
+ errcode(ERRCODE_ADMIN_SHUTDOWN),
+ errmsg("postmaster exited during data checksums processing"),
+ errhint("Data checksums processing must be restarted manually after cluster restart."));
+
+ CHECK_FOR_INTERRUPTS();
+
+ LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
+ if (DataChecksumState->launch_operation != operation)
+ abort_requested = true;
+ LWLockRelease(DataChecksumsWorkerLock);
+ if (abort_requested)
+ break;
+ }
+
+ pgstat_report_activity(STATE_IDLE, NULL);
+ return;
+}
+
+/*
+ * DataChecksumsWorkerLauncherMain
+ *
+ * Main function for launching dynamic background workers for processing data
+ * checksums in databases. This function has the bgworker management, with
+ * ProcessAllDatabases being responsible for looping over the databases and
+ * initiating processing.
+ */
+void
+DataChecksumsWorkerLauncherMain(Datum arg)
+{
+ on_shmem_exit(launcher_exit, 0);
+
+ ereport(DEBUG1,
+ errmsg("background worker \"datachecksums launcher\" started"));
+
+ pqsignal(SIGTERM, die);
+ pqsignal(SIGINT, launcher_cancel_handler);
+ pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+ pqsignal(SIGUSR2, SIG_IGN);
+
+ BackgroundWorkerUnblockSignals();
+
+ MyBackendType = B_DATACHECKSUMSWORKER_LAUNCHER;
+ init_ps_display(NULL);
+
+ INJECTION_POINT("datachecksumsworker-launcher-delay", NULL);
+
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+
+ if (DataChecksumState->launcher_running)
+ {
+ ereport(LOG,
+ errmsg("background worker \"datachecksums launcher\" already running, exiting"));
+ /* Launcher was already running, let it finish */
+ LWLockRelease(DataChecksumsWorkerLock);
+ return;
+ }
+
+ launcher_running = true;
+
+ /* Initialize a connection to shared catalogs only */
+ BackgroundWorkerInitializeConnectionByOid(InvalidOid, InvalidOid, 0);
+
+ operation = DataChecksumState->launch_operation;
+ DataChecksumState->launcher_running = true;
+ DataChecksumState->operation = operation;
+ DataChecksumState->cost_delay = DataChecksumState->launch_cost_delay;
+ DataChecksumState->cost_limit = DataChecksumState->launch_cost_limit;
+ LWLockRelease(DataChecksumsWorkerLock);
+
+ /*
+ * The target state can change while we are busy enabling/disabling
+ * checksums, if the user calls pg_disable/enable_data_checksums() before
+ * we are finished with the previous request. In that case, we will loop
+ * back here, to process the new request.
+ */
+again:
+
+ pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS,
+ InvalidOid);
+
+ if (operation == ENABLE_DATACHECKSUMS)
+ {
+ /*
+ * If we are asked to enable checksums in a cluster which already has
+ * checksums enabled, exit immediately as there is nothing more to do.
+ */
+ if (DataChecksumsNeedVerify())
+ goto done;
+
+ ereport(LOG,
+ errmsg("enabling data checksums requested, starting data checksum calculation"));
+
+ /*
+ * Set the state to inprogress-on and wait on the procsignal barrier.
+ */
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
+ PROGRESS_DATACHECKSUMS_PHASE_ENABLING);
+ SetDataChecksumsOnInProgress();
+
+ /*
+ * All backends are now in inprogress-on state and are writing data
+ * checksums. Start processing all data at rest.
+ */
+ if (!ProcessAllDatabases())
+ {
+ /*
+ * If the target state changed during processing then it's not a
+ * failure, so restart processing instead.
+ */
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ if (DataChecksumState->launch_operation != operation)
+ {
+ LWLockRelease(DataChecksumsWorkerLock);
+ goto done;
+ }
+ LWLockRelease(DataChecksumsWorkerLock);
+ ereport(ERROR,
+ errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("unable to enable data checksums in cluster"));
+ }
+
+ /*
+ * Data checksums have been set on all pages, set the state to on in
+ * order to instruct backends to validate checksums on reading.
+ */
+ SetDataChecksumsOn();
+
+ ereport(LOG,
+ errmsg("data checksums are now enabled"));
+ }
+ else if (operation == DISABLE_DATACHECKSUMS)
+ {
+ ereport(LOG,
+ errmsg("disabling data checksums requested"));
+
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
+ PROGRESS_DATACHECKSUMS_PHASE_DISABLING);
+ SetDataChecksumsOff();
+ ereport(LOG,
+ errmsg("data checksums are now disabled"));
+ }
+ else
+ Assert(false);
+
+done:
+
+ /*
+ * This state will only be displayed for a fleeting moment, but for the
+ * sake of correctness it is still added before ending the command.
+ */
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
+ PROGRESS_DATACHECKSUMS_PHASE_DONE);
+
+ /*
+ * All done. But before we exit, check if the target state was changed
+ * while we were running. In that case we will have to start all over
+ * again.
+ */
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ if (DataChecksumState->launch_operation != operation)
+ {
+ DataChecksumState->operation = DataChecksumState->launch_operation;
+ operation = DataChecksumState->launch_operation;
+ DataChecksumState->cost_delay = DataChecksumState->launch_cost_delay;
+ DataChecksumState->cost_limit = DataChecksumState->launch_cost_limit;
+ LWLockRelease(DataChecksumsWorkerLock);
+ goto again;
+ }
+
+ /* Shut down progress reporting as we are done */
+ pgstat_progress_end_command();
+
+ launcher_running = false;
+ DataChecksumState->launcher_running = false;
+ LWLockRelease(DataChecksumsWorkerLock);
+}
+
+/*
+ * ProcessAllDatabases
+ * Compute the list of all databases and process checksums in each
+ *
+ * This will generate a list of databases to process for enabling checksums.
+ * If a database encounters a failure then processing will end immediately and
+ * return an error.
+ */
+static bool
+ProcessAllDatabases(void)
+{
+ List *DatabaseList;
+ int cumulative_total = 0;
+
+ /* Set up so first run processes shared catalogs, not once in every db */
+ DataChecksumState->process_shared_catalogs = true;
+
+ /* Get a list of all databases to process */
+ WaitForAllTransactionsToFinish();
+ DatabaseList = BuildDatabaseList();
+
+ /*
+ * Update progress reporting with the total number of databases we need to
+ * process. This number should not be changed during processing, the
+ * columns for processed databases is instead increased such that it can
+ * be compared against the total.
+ */
+ {
+ const int index[] = {
+ PROGRESS_DATACHECKSUMS_DBS_TOTAL,
+ PROGRESS_DATACHECKSUMS_DBS_DONE,
+ PROGRESS_DATACHECKSUMS_RELS_TOTAL,
+ PROGRESS_DATACHECKSUMS_RELS_DONE,
+ PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL,
+ PROGRESS_DATACHECKSUMS_BLOCKS_DONE,
+ };
+
+ int64 vals[6];
+
+ vals[0] = list_length(DatabaseList);
+ vals[1] = 0;
+ /* translated to NULL */
+ vals[2] = -1;
+ vals[3] = -1;
+ vals[4] = -1;
+ vals[5] = -1;
+
+ pgstat_progress_update_multi_param(6, index, vals);
+ }
+
+ foreach_ptr(DataChecksumsWorkerDatabase, db, DatabaseList)
+ {
+ DataChecksumsWorkerResult result;
+
+ result = ProcessDatabase(db);
+
+ /* Allow a test process to alter the result of the operation */
+ INJECTION_POINT("datachecksumsworker-modify-db-result", &result);
+
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_DBS_DONE,
+ ++cumulative_total);
+
+ if (result == DATACHECKSUMSWORKER_FAILED)
+ {
+ /*
+ * Disable checksums on cluster, because we failed one of the
+ * databases and this is an all or nothing process.
+ */
+ SetDataChecksumsOff();
+ ereport(ERROR,
+ errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("data checksums failed to get enabled in all databases, aborting"),
+ errhint("The server log might have more information on the cause of the error."));
+ }
+ else if (result == DATACHECKSUMSWORKER_ABORTED || abort_requested)
+ {
+ /* Abort flag set, so exit the whole process */
+ return false;
+ }
+
+ /*
+ * When one database has completed, it will have done shared catalogs
+ * so we don't have to process them again.
+ */
+ DataChecksumState->process_shared_catalogs = false;
+ }
+
+ FreeDatabaseList(DatabaseList);
+
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
+ PROGRESS_DATACHECKSUMS_PHASE_WAITING_BARRIER);
+ return true;
+}
+
+/*
+ * DataChecksumsShmemSize
+ * Compute required space for datachecksumsworker-related shared memory
+ */
+Size
+DataChecksumsShmemSize(void)
+{
+ Size size;
+
+ size = sizeof(DataChecksumsStateStruct);
+ size = MAXALIGN(size);
+
+ return size;
+}
+
+/*
+ * DataChecksumsShmemInit
+ * Allocate and initialize datachecksumsworker-related shared memory
+ */
+void
+DataChecksumsShmemInit(void)
+{
+ bool found;
+
+ DataChecksumState = (DataChecksumsStateStruct *)
+ ShmemInitStruct("DataChecksumsWorker Data",
+ DataChecksumsShmemSize(),
+ &found);
+ if (!found)
+ MemSet(DataChecksumState, 0, DataChecksumsShmemSize());
+}
+
+/*
+ * DatabaseExists
+ *
+ * Scans the system catalog to check if a database with the given Oid exists
+ * and returns true if it is found, else false.
+ */
+static bool
+DatabaseExists(Oid dboid)
+{
+ Relation rel;
+ ScanKeyData skey;
+ SysScanDesc scan;
+ bool found;
+ HeapTuple tuple;
+
+ StartTransactionCommand();
+
+ rel = table_open(DatabaseRelationId, AccessShareLock);
+ ScanKeyInit(&skey,
+ Anum_pg_database_oid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ dboid);
+ scan = systable_beginscan(rel, DatabaseOidIndexId, true, SnapshotSelf,
+ 1, &skey);
+ tuple = systable_getnext(scan);
+ found = HeapTupleIsValid(tuple);
+
+ systable_endscan(scan);
+ table_close(rel, AccessShareLock);
+
+ CommitTransactionCommand();
+
+ return found;
+}
+
+/*
+ * BuildDatabaseList
+ * Compile a list of all currently available databases in the cluster
+ *
+ * This creates the list of databases for the datachecksumsworker workers to
+ * add checksums to. If the caller wants to ensure that no concurrently
+ * running CREATE DATABASE calls exist, this needs to be preceded by a call
+ * to WaitForAllTransactionsToFinish().
+ */
+static List *
+BuildDatabaseList(void)
+{
+ List *DatabaseList = NIL;
+ Relation rel;
+ TableScanDesc scan;
+ HeapTuple tup;
+ MemoryContext ctx = CurrentMemoryContext;
+ MemoryContext oldctx;
+
+ StartTransactionCommand();
+
+ rel = table_open(DatabaseRelationId, AccessShareLock);
+ scan = table_beginscan_catalog(rel, 0, NULL);
+
+ while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
+ {
+ Form_pg_database pgdb = (Form_pg_database) GETSTRUCT(tup);
+ DataChecksumsWorkerDatabase *db;
+
+ oldctx = MemoryContextSwitchTo(ctx);
+
+ db = (DataChecksumsWorkerDatabase *) palloc0(sizeof(DataChecksumsWorkerDatabase));
+
+ db->dboid = pgdb->oid;
+ db->dbname = pstrdup(NameStr(pgdb->datname));
+
+ DatabaseList = lappend(DatabaseList, db);
+
+ MemoryContextSwitchTo(oldctx);
+ }
+
+ table_endscan(scan);
+ table_close(rel, AccessShareLock);
+
+ CommitTransactionCommand();
+
+ return DatabaseList;
+}
+
+static void
+FreeDatabaseList(List *dblist)
+{
+ if (!dblist)
+ return;
+
+ foreach_ptr(DataChecksumsWorkerDatabase, db, dblist)
+ {
+ if (db->dbname != NULL)
+ pfree(db->dbname);
+ }
+
+ list_free_deep(dblist);
+}
+
+/*
+ * BuildRelationList
+ * Compile a list of relations in the database
+ *
+ * Returns a list of OIDs for the requested relation types. If temp_relations
+ * is True then only temporary relations are returned. If temp_relations is
+ * False then non-temporary relations which have storage are returned.
+ * If include_shared is True then shared relations are included as well in a
+ * non-temporary list. include_shared has no relevance when building a list of
+ * temporary relations.
+ */
+static List *
+BuildRelationList(bool temp_relations, bool include_shared)
+{
+ List *RelationList = NIL;
+ Relation rel;
+ TableScanDesc scan;
+ HeapTuple tup;
+ MemoryContext ctx = CurrentMemoryContext;
+ MemoryContext oldctx;
+
+ StartTransactionCommand();
+
+ rel = table_open(RelationRelationId, AccessShareLock);
+ scan = table_beginscan_catalog(rel, 0, NULL);
+
+ while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
+ {
+ Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup);
+
+ /* Only include temporary relations when explicitly asked to */
+ if (pgc->relpersistence == RELPERSISTENCE_TEMP)
+ {
+ if (!temp_relations)
+ continue;
+ }
+ else
+ {
+ /*
+ * If we are only interested in temp relations then continue
+ * immediately as the current relation isn't a temp relation.
+ */
+ if (temp_relations)
+ continue;
+
+ if (!RELKIND_HAS_STORAGE(pgc->relkind))
+ continue;
+
+ if (pgc->relisshared && !include_shared)
+ continue;
+ }
+
+ oldctx = MemoryContextSwitchTo(ctx);
+ RelationList = lappend_oid(RelationList, pgc->oid);
+ MemoryContextSwitchTo(oldctx);
+ }
+
+ table_endscan(scan);
+ table_close(rel, AccessShareLock);
+
+ CommitTransactionCommand();
+
+ return RelationList;
+}
+
+/*
+ * DataChecksumsWorkerMain
+ *
+ * Main function for enabling checksums in a single database.  This is the
+ * function set as the bgw_function_name in the dynamic background worker
+ * process initiated for each database by the worker launcher.  After enabling
+ * data checksums in each applicable relation in the database, it will wait
+ * for all temporary relations that were present when the function started to
+ * disappear before returning.  This is required since we cannot rewrite
+ * existing temporary relations with data checksums.
+ */
+void
+DataChecksumsWorkerMain(Datum arg)
+{
+	Oid			dboid = DatumGetObjectId(arg);	/* database to process */
+	List	   *RelationList = NIL;
+	List	   *InitialTempTableList = NIL;
+	BufferAccessStrategy strategy;
+	bool		aborted = false;
+	int64		rels_done;
+
+	/* This worker only ever enables checksums */
+	operation = ENABLE_DATACHECKSUMS;
+
+	pqsignal(SIGTERM, die);
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+
+	BackgroundWorkerUnblockSignals();
+
+	MyBackendType = B_DATACHECKSUMSWORKER_WORKER;
+	init_ps_display(NULL);
+
+	BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid,
+											  BGWORKER_BYPASS_ALLOWCONN);
+
+	/* worker will have a separate entry in pg_stat_progress_data_checksums */
+	pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS,
+								  InvalidOid);
+
+	/*
+	 * Get a list of all temp tables present as we start in this database.
+	 * We need to wait until they are all gone before we are done, since we
+	 * cannot access these relations and modify them.
+	 */
+	InitialTempTableList = BuildRelationList(true, false);
+
+	/*
+	 * Enable vacuum cost delay, if any. While this process isn't doing any
+	 * vacuuming, we are re-using the infrastructure that vacuum cost delay
+	 * provides rather than inventing something bespoke. This is an internal
+	 * implementation detail and care should be taken to avoid it bleeding
+	 * through to the user to avoid confusion.
+	 */
+	Assert(DataChecksumState->operation == ENABLE_DATACHECKSUMS);
+	VacuumCostDelay = DataChecksumState->cost_delay;
+	VacuumCostLimit = DataChecksumState->cost_limit;
+	VacuumCostActive = (VacuumCostDelay > 0);
+	VacuumCostBalance = 0;
+	VacuumCostPageHit = 0;
+	VacuumCostPageMiss = 0;
+	VacuumCostPageDirty = 0;
+
+	/*
+	 * Create and set the vacuum strategy as our buffer strategy.
+	 */
+	strategy = GetAccessStrategy(BAS_VACUUM);
+
+	RelationList = BuildRelationList(false,
+									 DataChecksumState->process_shared_catalogs);
+
+	/* Update the total number of relations to be processed in this DB. */
+	{
+		const int	index[] = {
+			PROGRESS_DATACHECKSUMS_RELS_TOTAL,
+			PROGRESS_DATACHECKSUMS_RELS_DONE
+		};
+
+		int64		vals[2];
+
+		vals[0] = list_length(RelationList);
+		vals[1] = 0;
+
+		pgstat_progress_update_multi_param(2, index, vals);
+	}
+
+	/* Process the relations */
+	rels_done = 0;
+	foreach_oid(reloid, RelationList)
+	{
+		CHECK_FOR_INTERRUPTS();
+
+		/* Bail out on the first relation that fails to be processed */
+		if (!ProcessSingleRelationByOid(reloid, strategy))
+		{
+			aborted = true;
+			break;
+		}
+
+		pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_RELS_DONE,
+									 ++rels_done);
+	}
+	list_free(RelationList);
+
+	if (aborted)
+	{
+		/*
+		 * NOTE(review): this abort path logs at DEBUG1 while the abort path
+		 * in the temp-table wait loop below logs the same message at LOG --
+		 * confirm whether the differing log levels are intentional.
+		 */
+		DataChecksumState->success = DATACHECKSUMSWORKER_ABORTED;
+		ereport(DEBUG1,
+				errmsg("data checksum processing aborted in database OID %u",
+					   dboid));
+		return;
+	}
+
+	/* The worker is about to wait for temporary tables to go away. */
+	pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
+								 PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL);
+
+	/*
+	 * Wait for all temp tables that existed when we started to go away. This
+	 * is necessary since we cannot "reach" them to enable checksums. Any temp
+	 * tables created after we started will already have checksums in them
+	 * (due to the "inprogress-on" state), so no need to wait for those.
+	 */
+	for (;;)
+	{
+		List	   *CurrentTempTables;
+		int			numleft;
+		char		activity[64];
+
+		/* Count how many of the initial temp tables still exist */
+		CurrentTempTables = BuildRelationList(true, false);
+		numleft = 0;
+		foreach_oid(tmptbloid, InitialTempTableList)
+		{
+			if (list_member_oid(CurrentTempTables, tmptbloid))
+				numleft++;
+		}
+		list_free(CurrentTempTables);
+
+		INJECTION_POINT("datachecksumsworker-fake-temptable-wait", &numleft);
+
+		if (numleft == 0)
+			break;
+
+		/*
+		 * At least one temp table is left to wait for, indicate in pgstat
+		 * activity and progress reporting.
+		 */
+		snprintf(activity,
+				 sizeof(activity),
+				 "Waiting for %d temp tables to be removed", numleft);
+		pgstat_report_activity(STATE_RUNNING, activity);
+
+		/* Retry every 3 seconds */
+		ResetLatch(MyLatch);
+		(void) WaitLatch(MyLatch,
+						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+						 3000,
+						 WAIT_EVENT_CHECKSUM_ENABLE_TEMPTABLE_WAIT);
+
+		/*
+		 * If the requested operation has changed since this worker was
+		 * launched, abort rather than finish enabling checksums.
+		 */
+		LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+		aborted = DataChecksumState->launch_operation != operation;
+		LWLockRelease(DataChecksumsWorkerLock);
+
+		CHECK_FOR_INTERRUPTS();
+
+		if (aborted || abort_requested)
+		{
+			DataChecksumState->success = DATACHECKSUMSWORKER_ABORTED;
+			ereport(LOG,
+					errmsg("data checksum processing aborted in database OID %u",
+						   dboid));
+			return;
+		}
+	}
+
+	list_free(InitialTempTableList);
+
+	/* worker done */
+	pgstat_progress_end_command();
+
+	/*
+	 * NOTE(review): success is stored without holding
+	 * DataChecksumsWorkerLock, while launch_operation above is read under
+	 * the lock -- confirm that unlocked writes to this field are safe.
+	 */
+	DataChecksumState->success = DATACHECKSUMSWORKER_SUCCESSFUL;
+}
diff --git a/src/backend/postmaster/meson.build b/src/backend/postmaster/meson.build
index e1f70726604..6cba23bbeef 100644
--- a/src/backend/postmaster/meson.build
+++ b/src/backend/postmaster/meson.build
@@ -6,6 +6,7 @@ backend_sources += files(
'bgworker.c',
'bgwriter.c',
'checkpointer.c',
+ 'datachecksum_state.c',
'fork_process.c',
'interrupt.c',
'launch_backend.c',
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index abf0c97569e..eb4f3eb72d4 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2991,6 +2991,11 @@ PostmasterStateMachine(void)
B_INVALID,
B_STANDALONE_BACKEND);
+ /* also add data checksums processes */
+ remainMask = btmask_add(remainMask,
+ B_DATACHECKSUMSWORKER_LAUNCHER,
+ B_DATACHECKSUMSWORKER_WORKER);
+
/* All types should be included in targetMask or remainMask */
Assert((remainMask.mask | targetMask.mask) == BTYPE_MASK_ALL.mask);
}
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 3c027bcb2f7..57aaef57c61 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -189,6 +189,22 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
}
}
+/*
+ * Handle rmgr XLOG2_ID records for LogicalDecodingProcessRecord().
+ *
+ * The xid is passed to the reorder buffer, but XLOG2_CHECKSUMS records
+ * otherwise carry nothing of interest to logical decoding, so no changes
+ * are produced.  Any other info value for this rmgr is unexpected.
+ */
+void
+xlog2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	uint8		info = XLogRecGetInfo(buf->record) & ~XLR_INFO_MASK;
+
+	ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(buf->record), buf->origptr);
+
+	switch (info)
+	{
+		case XLOG2_CHECKSUMS:
+			/* checksum state changes do not affect logical decoding */
+			break;
+		default:
+			elog(ERROR, "unexpected RM_XLOG2_ID record type: %u", info);
+	}
+}
+
/*
* Handle rmgr XACT_ID records for LogicalDecodingProcessRecord().
*/
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 5c64570020d..3cc0b0bdd92 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8567,6 +8567,13 @@ buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
+ /*
+ * If the buffers are marked for zero on error, we want to log that in
+ * case of a checksum failure.
+ */
+ if (flags & READ_BUFFERS_ZERO_ON_ERROR)
+ piv_flags |= PIV_ZERO_BUFFERS_ON_ERROR;
+
/* Check for garbage data. */
if (!failed)
{
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index d692d419846..7aab5da3386 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -31,6 +31,7 @@
#include "postmaster/autovacuum.h"
#include "postmaster/bgworker_internals.h"
#include "postmaster/bgwriter.h"
+#include "postmaster/datachecksum_state.h"
#include "postmaster/walsummarizer.h"
#include "replication/logicallauncher.h"
#include "replication/origin.h"
@@ -142,6 +143,7 @@ CalculateShmemSize(void)
size = add_size(size, AioShmemSize());
size = add_size(size, WaitLSNShmemSize());
size = add_size(size, LogicalDecodingCtlShmemSize());
+ size = add_size(size, DataChecksumsShmemSize());
/* include additional requested shmem from preload libraries */
size = add_size(size, total_addin_request);
@@ -310,6 +312,7 @@ CreateOrAttachShmemStructs(void)
PgArchShmemInit();
ApplyLauncherShmemInit();
SlotSyncShmemInit();
+ DataChecksumsShmemInit();
/*
* Set up other modules that need some shared memory space
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
index 7e017c8d53b..f1ab3aa3fe0 100644
--- a/src/backend/storage/ipc/procsignal.c
+++ b/src/backend/storage/ipc/procsignal.c
@@ -22,6 +22,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "port/pg_bitutils.h"
+#include "postmaster/datachecksum_state.h"
#include "replication/logicalctl.h"
#include "replication/logicalworker.h"
#include "replication/walsender.h"
@@ -582,6 +583,13 @@ ProcessProcSignalBarrier(void)
case PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO:
processed = ProcessBarrierUpdateXLogLogicalInfo();
break;
+
+ case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON:
+ case PROCSIGNAL_BARRIER_CHECKSUM_ON:
+ case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF:
+ case PROCSIGNAL_BARRIER_CHECKSUM_OFF:
+ processed = AbsorbDataChecksumsBarrier(type);
+ break;
}
/*
diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README
index e30d7ac59ad..73c36a63908 100644
--- a/src/backend/storage/page/README
+++ b/src/backend/storage/page/README
@@ -10,7 +10,9 @@ http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed
2010/12/22 on -hackers list.
Current implementation requires this be enabled system-wide at initdb time, or
-by using the pg_checksums tool on an offline cluster.
+by using the pg_checksums tool on an offline cluster. Checksums can also be
+enabled at runtime using pg_enable_data_checksums(), and disabled by using
+pg_disable_data_checksums().
The checksum is not valid at all times on a data page!!
The checksum is valid when the page leaves the shared pool and is checked
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 56f1f7ae9fc..1fdfda59edd 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -107,7 +107,15 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail
*/
if (!PageIsNew(page))
{
- if (DataChecksumsEnabled())
+	/*
+	 * There shouldn't be any checks for interrupts happening in this
+	 * codepath, but just to be on the safe side we hold interrupts, since
+	 * if one did happen the data checksum state could change while we are
+	 * verifying checksums, which could lead to incorrect verification
+	 * results.
+	 */
+ HOLD_INTERRUPTS();
+ if (DataChecksumsNeedVerify())
{
checksum = pg_checksum_page(page, blkno);
@@ -118,6 +126,7 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail
*checksum_failure_p = true;
}
}
+ RESUME_INTERRUPTS();
/*
* The following checks don't prove the header is correct, only that
@@ -151,8 +160,9 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail
if ((flags & (PIV_LOG_WARNING | PIV_LOG_LOG)) != 0)
ereport(flags & PIV_LOG_WARNING ? WARNING : LOG,
(errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("page verification failed, calculated checksum %u but expected %u",
- checksum, p->pd_checksum)));
+ errmsg("page verification failed, calculated checksum %u but expected %u%s",
+ checksum, p->pd_checksum,
+ (flags & PIV_ZERO_BUFFERS_ON_ERROR ? ", buffer will be zeroed" : ""))));
if (header_sane && (flags & PIV_IGNORE_CHECKSUM_FAILURE))
return true;
@@ -1507,9 +1517,14 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
void
PageSetChecksum(Page page, BlockNumber blkno)
{
+ HOLD_INTERRUPTS();
/* If we don't need a checksum, just return */
- if (PageIsNew(page) || !DataChecksumsEnabled())
+ if (PageIsNew(page) || !DataChecksumsNeedWrite())
+ {
+ RESUME_INTERRUPTS();
return;
+ }
((PageHeader) page)->pd_checksum = pg_checksum_page(page, blkno);
+ RESUME_INTERRUPTS();
}
diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c
index 7727fed3bda..04fe13e64c6 100644
--- a/src/backend/utils/activity/pgstat_backend.c
+++ b/src/backend/utils/activity/pgstat_backend.c
@@ -380,6 +380,8 @@ pgstat_tracks_backend_bktype(BackendType bktype)
case B_CHECKPOINTER:
case B_IO_WORKER:
case B_STARTUP:
+ case B_DATACHECKSUMSWORKER_LAUNCHER:
+ case B_DATACHECKSUMSWORKER_WORKER:
return false;
case B_AUTOVAC_WORKER:
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index 28de24538dc..2be26e92283 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -362,6 +362,8 @@ pgstat_tracks_io_bktype(BackendType bktype)
case B_LOGGER:
return false;
+ case B_DATACHECKSUMSWORKER_LAUNCHER:
+ case B_DATACHECKSUMSWORKER_WORKER:
case B_AUTOVAC_LAUNCHER:
case B_AUTOVAC_WORKER:
case B_BACKEND:
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 6be80d2daad..0a6d16f8154 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -119,6 +119,8 @@ CHECKPOINT_DELAY_COMPLETE "Waiting for a backend that blocks a checkpoint from c
CHECKPOINT_DELAY_START "Waiting for a backend that blocks a checkpoint from starting."
CHECKPOINT_DONE "Waiting for a checkpoint to complete."
CHECKPOINT_START "Waiting for a checkpoint to start."
+CHECKSUM_ENABLE_STARTCONDITION "Waiting for data checksums enabling to start."
+CHECKSUM_ENABLE_TEMPTABLE_WAIT "Waiting for temporary tables to be dropped for data checksums to be enabled."
EXECUTE_GATHER "Waiting for activity from a child process while executing a Gather plan node."
HASH_BATCH_ALLOCATE "Waiting for an elected Parallel Hash participant to allocate a hash table."
HASH_BATCH_ELECT "Waiting to elect a Parallel Hash participant to allocate a hash table."
@@ -365,6 +367,7 @@ SerialControl "Waiting to read or update shared pg_serial s
AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue."
WaitLSN "Waiting to read or update shared Wait-for-LSN state."
LogicalDecodingControl "Waiting to read or update logical decoding status information."
+DataChecksumsWorker "Waiting for data checksums worker."
#
# END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE)
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 9185a8e6b83..1408de387ea 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -297,6 +297,8 @@ pg_stat_get_progress_info(PG_FUNCTION_ARGS)
cmdtype = PROGRESS_COMMAND_BASEBACKUP;
else if (pg_strcasecmp(cmd, "COPY") == 0)
cmdtype = PROGRESS_COMMAND_COPY;
+ else if (pg_strcasecmp(cmd, "DATACHECKSUMS") == 0)
+ cmdtype = PROGRESS_COMMAND_DATACHECKSUMS;
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -1182,9 +1184,6 @@ pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS)
int64 result;
PgStat_StatDBEntry *dbentry;
- if (!DataChecksumsEnabled())
- PG_RETURN_NULL();
-
if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
result = 0;
else
@@ -1200,9 +1199,6 @@ pg_stat_get_db_checksum_last_failure(PG_FUNCTION_ARGS)
TimestampTz result;
PgStat_StatDBEntry *dbentry;
- if (!DataChecksumsEnabled())
- PG_RETURN_NULL();
-
if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
result = 0;
else
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index ba191977697..7ffc808073a 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -845,7 +845,8 @@ InitializeSessionUserIdStandalone(void)
* workers, in slot sync worker and in background workers.
*/
Assert(!IsUnderPostmaster || AmAutoVacuumWorkerProcess() ||
- AmLogicalSlotSyncWorkerProcess() || AmBackgroundWorkerProcess());
+ AmLogicalSlotSyncWorkerProcess() || AmBackgroundWorkerProcess() ||
+ AmDataChecksumsWorkerProcess());
/* call only once */
Assert(!OidIsValid(AuthenticatedUserId));
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 783a7400464..6f074013aa9 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -758,6 +758,24 @@ InitPostgres(const char *in_dbname, Oid dboid,
ProcSignalInit(MyCancelKey, MyCancelKeyLength);
+ /*
+ * Initialize a local cache of the data_checksum_version, to be updated by
+ * the procsignal-based barriers.
+ *
+ * This intentionally happens after initializing the procsignal, otherwise
+ * we might miss a state change. This means we can get a barrier for the
+ * state we've just initialized.
+ *
+ * The postmaster (which is what gets forked into the new child process)
+ * does not handle barriers, therefore it may not have the current value
+ * of LocalDataChecksumVersion value (it'll have the value read from the
+ * control file, which may be arbitrarily old).
+ *
+ * NB: Even if the postmaster handled barriers, the value might still be
+ * stale, as it might have changed after this process forked.
+ */
+ InitLocalDataChecksumState();
+
/*
* Also set up timeout handlers needed for backend operation. We need
* these in every case except bootstrap.
@@ -886,7 +904,7 @@ InitPostgres(const char *in_dbname, Oid dboid,
errhint("You should immediately run CREATE USER \"%s\" SUPERUSER;.",
username != NULL ? username : "postgres")));
}
- else if (AmBackgroundWorkerProcess())
+ else if (AmBackgroundWorkerProcess() || AmDataChecksumsWorkerProcess())
{
if (username == NULL && !OidIsValid(useroid))
{
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index fc0900efe5f..a315c4ab8ab 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -571,11 +571,12 @@
max => '1.0',
},
-{ name => 'data_checksums', type => 'bool', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS',
+{ name => 'data_checksums', type => 'enum', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS',
short_desc => 'Shows whether data checksums are turned on for this cluster.',
flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED',
variable => 'data_checksums',
- boot_val => 'false',
+ boot_val => 'PG_DATA_CHECKSUM_OFF',
+ options => 'data_checksums_options',
},
# Can't be set by ALTER SYSTEM as it can lead to recursive definition
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 1e14b7b4af0..d9ca13baff9 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -501,6 +501,14 @@ static const struct config_enum_entry file_extend_method_options[] = {
{NULL, 0, false}
};
+static const struct config_enum_entry data_checksums_options[] = {
+ {"on", PG_DATA_CHECKSUM_VERSION, true},
+ {"off", PG_DATA_CHECKSUM_OFF, true},
+ {"inprogress-on", PG_DATA_CHECKSUM_INPROGRESS_ON, true},
+ {"inprogress-off", PG_DATA_CHECKSUM_INPROGRESS_OFF, true},
+ {NULL, 0, false}
+};
+
/*
* Options for enum values stored in other modules
*/
@@ -629,7 +637,6 @@ static int shared_memory_size_in_huge_pages;
static int wal_block_size;
static int num_os_semaphores;
static int effective_wal_level = WAL_LEVEL_REPLICA;
-static bool data_checksums;
static bool integer_datetimes;
#ifdef USE_ASSERT_CHECKING
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index c8194c27aa7..6d0337853e0 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -543,11 +543,11 @@
# archiver autovacuum
# backend bgworker
# bgwriter checkpointer
- # ioworker postmaster
- # slotsyncworker startup
- # syslogger walreceiver
- # walsummarizer walwriter
- # walsender
+ # checksums ioworker
+ # postmaster slotsyncworker
+ # startup syslogger
+ # walreceiver walsummarizer
+ # walwriter walsender
#
# Level values in order of decreasing
# detail:
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 301e256fbb1..2a38f1d688b 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -585,7 +585,7 @@ main(int argc, char *argv[])
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
pg_fatal("cluster must be shut down");
- if (ControlFile->data_checksum_version == 0 &&
+ if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_VERSION &&
mode == PG_MODE_CHECK)
pg_fatal("data checksums are not enabled in cluster");
@@ -593,7 +593,7 @@ main(int argc, char *argv[])
mode == PG_MODE_DISABLE)
pg_fatal("data checksums are already disabled in cluster");
- if (ControlFile->data_checksum_version > 0 &&
+ if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION &&
mode == PG_MODE_ENABLE)
pg_fatal("data checksums are already enabled in cluster");
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index a4060309ae0..fe5fc5ec133 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -287,6 +287,8 @@ main(int argc, char *argv[])
ControlFile->checkPointCopy.oldestCommitTsXid);
printf(_("Latest checkpoint's newestCommitTsXid:%u\n"),
ControlFile->checkPointCopy.newestCommitTsXid);
+ printf(_("Latest checkpoint's data_checksum_version:%u\n"),
+ ControlFile->checkPointCopy.dataChecksumState);
printf(_("Time of latest checkpoint: %s\n"),
ckpttime_str);
printf(_("Fake LSN counter for unlogged rels: %X/%08X\n"),
diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c
index aa6e8b4de5d..79053d22dcc 100644
--- a/src/bin/pg_upgrade/controldata.c
+++ b/src/bin/pg_upgrade/controldata.c
@@ -15,6 +15,7 @@
#include "access/xlog_internal.h"
#include "common/string.h"
#include "pg_upgrade.h"
+#include "storage/checksum.h"
/*
@@ -736,6 +737,14 @@ check_control_data(ControlData *oldctrl,
* check_for_isn_and_int8_passing_mismatch().
*/
+ /*
+ * If data checksums are in any in-progress state then disallow the
+ * upgrade. The user should either let the process finish, or turn off
+ * data checksums, before retrying.
+ */
+ if (oldctrl->data_checksum_version > PG_DATA_CHECKSUM_VERSION)
+ pg_fatal("checksums are being enabled in the old cluster");
+
/*
* We might eventually allow upgrades from checksum to no-checksum
* clusters.
diff --git a/src/bin/pg_waldump/t/001_basic.pl b/src/bin/pg_waldump/t/001_basic.pl
index a268f0f1dd0..7dd1c3dd63e 100644
--- a/src/bin/pg_waldump/t/001_basic.pl
+++ b/src/bin/pg_waldump/t/001_basic.pl
@@ -79,7 +79,8 @@ BRIN
CommitTs
ReplicationOrigin
Generic
-LogicalMessage$/,
+LogicalMessage
+XLOG2$/,
'rmgr list');
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index 3352b5f8532..ae32ef16d67 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i
PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL)
PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL)
PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, logicalmsg_decode)
+PG_RMGR(RM_XLOG2_ID, "XLOG2", xlog2_redo, xlog2_desc, xlog2_identify, NULL, NULL, NULL, xlog2_decode)
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index dcc12eb8cbe..4af38e74ce4 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -57,6 +57,7 @@ extern PGDLLIMPORT int CommitDelay;
extern PGDLLIMPORT int CommitSiblings;
extern PGDLLIMPORT bool track_wal_io_timing;
extern PGDLLIMPORT int wal_decode_buffer_size;
+extern PGDLLIMPORT int data_checksums;
extern PGDLLIMPORT int CheckPointSegments;
@@ -119,7 +120,7 @@ extern PGDLLIMPORT bool XLogLogicalInfo;
* of the bits make it to disk, but the checksum wouldn't match. Also WAL-log
* them if forced by wal_log_hints=on.
*/
-#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints)
+#define XLogHintBitIsNeeded() (wal_log_hints || DataChecksumsNeedWrite())
/* Do we need to WAL-log information required only for Hot Standby and logical replication? */
#define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA)
@@ -229,8 +230,11 @@ extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn);
extern XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
extern void xlog_redo(struct XLogReaderState *record);
+extern void xlog2_redo(struct XLogReaderState *record);
extern void xlog_desc(StringInfo buf, struct XLogReaderState *record);
+extern void xlog2_desc(StringInfo buf, struct XLogReaderState *record);
extern const char *xlog_identify(uint8 info);
+extern const char *xlog2_identify(uint8 info);
extern void issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli);
@@ -243,7 +247,16 @@ extern XLogRecPtr GetXLogWriteRecPtr(void);
extern uint64 GetSystemIdentifier(void);
extern char *GetMockAuthenticationNonce(void);
-extern bool DataChecksumsEnabled(void);
+extern bool DataChecksumsNeedWrite(void);
+extern bool DataChecksumsNeedVerify(void);
+extern bool DataChecksumsInProgressOn(void);
+extern void SetDataChecksumsOnInProgress(void);
+extern void SetDataChecksumsOn(void);
+extern void SetDataChecksumsOff(void);
+extern const char *show_data_checksums(void);
+extern const char *get_checksum_state_string(uint32 state);
+extern void InitLocalDataChecksumState(void);
+extern void SetLocalDataChecksumState(uint32 data_checksum_version);
extern bool GetDefaultCharSignedness(void);
extern XLogRecPtr GetFakeLSNForUnloggedRel(void);
extern Size XLOGShmemSize(void);
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 755835d63bf..10c18d39ff8 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -25,6 +25,7 @@
#include "lib/stringinfo.h"
#include "pgtime.h"
#include "storage/block.h"
+#include "storage/checksum.h"
#include "storage/relfilelocator.h"
@@ -287,6 +288,12 @@ typedef struct xl_restore_point
char rp_name[MAXFNAMELEN];
} xl_restore_point;
+/* Information logged when data checksum level is changed */
+typedef struct xl_checksum_state
+{
+ ChecksumStateType new_checksum_state;
+} xl_checksum_state;
+
/* Overwrite of prior contrecord */
typedef struct xl_overwrite_contrecord
{
@@ -307,6 +314,7 @@ typedef struct xl_end_of_recovery
typedef struct xl_checkpoint_redo
{
int wal_level;
+ uint32 data_checksum_version;
} xl_checkpoint_redo;
/*
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index b1c5afc15df..582bb2e2058 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202604021
+#define CATALOG_VERSION_NO 202604031
#endif
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 77a661e818b..80b3a730e03 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -22,7 +22,7 @@
/* Version identifier for this pg_control format */
-#define PG_CONTROL_VERSION 1901
+#define PG_CONTROL_VERSION 1902
/* Nonce key length, see below */
#define MOCK_AUTH_NONCE_LEN 32
@@ -63,6 +63,9 @@ typedef struct CheckPoint
* set to InvalidTransactionId.
*/
TransactionId oldestActiveXid;
+
+ /* data checksums state at the time of the checkpoint */
+ uint32 dataChecksumState;
} CheckPoint;
/* XLOG info values for XLOG rmgr */
@@ -83,6 +86,9 @@ typedef struct CheckPoint
#define XLOG_CHECKPOINT_REDO 0xE0
#define XLOG_LOGICAL_DECODING_STATUS_CHANGE 0xF0
+/* XLOG info values for XLOG2 rmgr */
+#define XLOG2_CHECKSUMS 0x00
+
/*
* System status indicator. Note this is stored in pg_control; if you change
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index acf16254b21..bd177aebfcb 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -12558,6 +12558,20 @@
proname => 'jsonb_subscript_handler', prorettype => 'internal',
proargtypes => 'internal', prosrc => 'jsonb_subscript_handler' },
+# data checksum management functions
+{ oid => '9258',
+ descr => 'disable data checksums',
+ proname => 'pg_disable_data_checksums', provolatile => 'v', prorettype => 'void',
+ proparallel => 'r', prosrc => 'disable_data_checksums', proargtypes => '',
+ proacl => '{POSTGRES=X}'},
+{ oid => '9257',
+ descr => 'enable data checksums',
+ proname => 'pg_enable_data_checksums', provolatile => 'v', prorettype => 'void',
+ proparallel => 'r', proargtypes => 'int4 int4', proallargtypes => '{int4,int4}',
+ proargmodes => '{i,i}', proargnames => '{cost_delay,cost_limit}',
+ proargdefaults => '{0,100}', prosrc => 'enable_data_checksums',
+ proacl => '{POSTGRES=X}'},
+
# collation management functions
{ oid => '3445', descr => 'import collations from operating system',
proname => 'pg_import_system_collations', procost => '100',
diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h
index 9c40772706c..67948667a97 100644
--- a/src/include/commands/progress.h
+++ b/src/include/commands/progress.h
@@ -185,4 +185,20 @@
#define PROGRESS_COPY_TYPE_PIPE 3
#define PROGRESS_COPY_TYPE_CALLBACK 4
+/* Progress parameters for PROGRESS_DATACHECKSUMS */
+#define PROGRESS_DATACHECKSUMS_PHASE 0
+#define PROGRESS_DATACHECKSUMS_DBS_TOTAL 1
+#define PROGRESS_DATACHECKSUMS_DBS_DONE 2
+#define PROGRESS_DATACHECKSUMS_RELS_TOTAL 3
+#define PROGRESS_DATACHECKSUMS_RELS_DONE 4
+#define PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL 5
+#define PROGRESS_DATACHECKSUMS_BLOCKS_DONE 6
+
+/* Phases of datachecksumsworker operation */
+#define PROGRESS_DATACHECKSUMS_PHASE_ENABLING 0
+#define PROGRESS_DATACHECKSUMS_PHASE_DISABLING 1
+#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL 2
+#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_BARRIER 3
+#define PROGRESS_DATACHECKSUMS_PHASE_DONE 4
+
#endif
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 04f29748be7..7277c37e779 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -367,6 +367,9 @@ typedef enum BackendType
B_WAL_SUMMARIZER,
B_WAL_WRITER,
+ B_DATACHECKSUMSWORKER_LAUNCHER,
+ B_DATACHECKSUMSWORKER_WORKER,
+
/*
* Logger is not connected to shared memory and does not have a PGPROC
* entry.
@@ -392,6 +395,9 @@ extern PGDLLIMPORT BackendType MyBackendType;
#define AmWalSummarizerProcess() (MyBackendType == B_WAL_SUMMARIZER)
#define AmWalWriterProcess() (MyBackendType == B_WAL_WRITER)
#define AmIoWorkerProcess() (MyBackendType == B_IO_WORKER)
+#define AmDataChecksumsWorkerProcess() \
+ (MyBackendType == B_DATACHECKSUMSWORKER_LAUNCHER || \
+ MyBackendType == B_DATACHECKSUMSWORKER_WORKER)
#define AmSpecialWorkerProcess() \
(AmAutoVacuumLauncherProcess() || \
diff --git a/src/include/postmaster/datachecksum_state.h b/src/include/postmaster/datachecksum_state.h
new file mode 100644
index 00000000000..343494edcc8
--- /dev/null
+++ b/src/include/postmaster/datachecksum_state.h
@@ -0,0 +1,58 @@
+/*-------------------------------------------------------------------------
+ *
+ * datachecksum_state.h
+ * header file for data checksum helper background worker and data
+ * checksum state manipulation
+ *
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/postmaster/datachecksum_state.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef DATACHECKSUM_STATE_H
+#define DATACHECKSUM_STATE_H
+
+#include "storage/procsignal.h"
+
+/* Shared memory */
+extern Size DataChecksumsShmemSize(void);
+extern void DataChecksumsShmemInit(void);
+
+/* Possible operations the Datachecksumsworker can perform */
+typedef enum DataChecksumsWorkerOperation
+{
+ ENABLE_DATACHECKSUMS,
+ DISABLE_DATACHECKSUMS,
+} DataChecksumsWorkerOperation;
+
+/*
+ * Possible states for a database entry which has been processed. Exported
+ * here since we want to be able to reference this from injection point tests.
+ */
+typedef enum
+{
+ DATACHECKSUMSWORKER_SUCCESSFUL = 0,
+ DATACHECKSUMSWORKER_ABORTED,
+ DATACHECKSUMSWORKER_FAILED,
+ DATACHECKSUMSWORKER_DROPDB,
+} DataChecksumsWorkerResult;
+
+/* Prototypes for data checksum state manipulation */
+extern bool AbsorbDataChecksumsBarrier(ProcSignalBarrierType target_state);
+extern void EmitAndWaitDataChecksumsBarrier(uint32 state);
+
+/* Prototypes for data checksum background worker */
+
+/* Start the background processes for enabling or disabling checksums */
+extern void StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op,
+											 int cost_delay,
+											 int cost_limit);
+
+/* Background worker entrypoints */
+extern void DataChecksumsWorkerLauncherMain(Datum arg);
+extern void DataChecksumsWorkerMain(Datum arg);
+
+#endif /* DATACHECKSUM_STATE_H */
diff --git a/src/include/postmaster/proctypelist.h b/src/include/postmaster/proctypelist.h
index feac19ba207..b3477e6f17a 100644
--- a/src/include/postmaster/proctypelist.h
+++ b/src/include/postmaster/proctypelist.h
@@ -38,6 +38,8 @@ PG_PROCTYPE(B_BACKEND, "backend", gettext_noop("client backend"), BackendMain, t
PG_PROCTYPE(B_BG_WORKER, "bgworker", gettext_noop("background worker"), BackgroundWorkerMain, true)
PG_PROCTYPE(B_BG_WRITER, "bgwriter", gettext_noop("background writer"), BackgroundWriterMain, true)
PG_PROCTYPE(B_CHECKPOINTER, "checkpointer", gettext_noop("checkpointer"), CheckpointerMain, true)
+PG_PROCTYPE(B_DATACHECKSUMSWORKER_LAUNCHER, "checksums", gettext_noop("datachecksum launcher"), NULL, false)
+PG_PROCTYPE(B_DATACHECKSUMSWORKER_WORKER, "checksums", gettext_noop("datachecksum worker"), NULL, false)
PG_PROCTYPE(B_DEAD_END_BACKEND, "backend", gettext_noop("dead-end client backend"), BackendMain, true)
PG_PROCTYPE(B_INVALID, "postmaster", gettext_noop("unrecognized"), NULL, false)
PG_PROCTYPE(B_IO_WORKER, "ioworker", gettext_noop("io worker"), IoWorkerMain, true)
diff --git a/src/include/replication/decode.h b/src/include/replication/decode.h
index 49f00fc48b8..107e43ef750 100644
--- a/src/include/replication/decode.h
+++ b/src/include/replication/decode.h
@@ -22,6 +22,7 @@ typedef struct XLogRecordBuffer
} XLogRecordBuffer;
extern void xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+extern void xlog2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
extern void heap_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
extern void heap2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
extern void xact_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index e5267b93fe6..634e1e49ee5 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -230,7 +230,6 @@ typedef PageHeaderData *PageHeader;
* handling pages.
*/
#define PG_PAGE_LAYOUT_VERSION 4
-#define PG_DATA_CHECKSUM_VERSION 1
/* ----------------------------------------------------------------
* page support functions
@@ -501,6 +500,7 @@ do { \
#define PIV_LOG_WARNING (1 << 0)
#define PIV_LOG_LOG (1 << 1)
#define PIV_IGNORE_CHECKSUM_FAILURE (1 << 2)
+#define PIV_ZERO_BUFFERS_ON_ERROR (1 << 3)
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap) \
PageAddItemExtended(page, item, size, offsetNumber, \
diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h
index ff417d5ae3e..3b1440c0c95 100644
--- a/src/include/storage/checksum.h
+++ b/src/include/storage/checksum.h
@@ -15,6 +15,22 @@
#include "storage/block.h"
+/*
+ * Checksum state 0 is used when data checksums are disabled (OFF).
+ * PG_DATA_CHECKSUM_INPROGRESS_{ON|OFF} defines that data checksums are either
+ * currently being enabled or disabled, and PG_DATA_CHECKSUM_VERSION defines
+ * that data checksums are enabled. The ChecksumStateType is stored in
+ * pg_control, so changing it requires a pg_control version bump, and the
+ * values cannot be reordered. New states must be added at the end.
+ */
+typedef enum ChecksumStateType
+{
+ PG_DATA_CHECKSUM_OFF = 0,
+ PG_DATA_CHECKSUM_VERSION = 1,
+ PG_DATA_CHECKSUM_INPROGRESS_OFF = 2,
+ PG_DATA_CHECKSUM_INPROGRESS_ON = 3,
+} ChecksumStateType;
+
/*
* Compute the checksum for a Postgres page. The page must be aligned on a
* 4-byte boundary.
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h
index 59ee097977d..af8553bcb6c 100644
--- a/src/include/storage/lwlocklist.h
+++ b/src/include/storage/lwlocklist.h
@@ -87,6 +87,7 @@ PG_LWLOCK(52, SerialControl)
PG_LWLOCK(53, AioWorkerSubmissionQueue)
PG_LWLOCK(54, WaitLSN)
PG_LWLOCK(55, LogicalDecodingControl)
+PG_LWLOCK(56, DataChecksumsWorker)
/*
* There also exist several built-in LWLock tranches. As with the predefined
diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h
index 348fba53a93..cc4f26aa33d 100644
--- a/src/include/storage/procsignal.h
+++ b/src/include/storage/procsignal.h
@@ -48,6 +48,10 @@ typedef enum
PROCSIGNAL_BARRIER_SMGRRELEASE, /* ask smgr to close files */
PROCSIGNAL_BARRIER_UPDATE_XLOG_LOGICAL_INFO, /* ask to update
* XLogLogicalInfo */
+ PROCSIGNAL_BARRIER_CHECKSUM_OFF,
+ PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON,
+ PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF,
+ PROCSIGNAL_BARRIER_CHECKSUM_ON,
} ProcSignalBarrierType;
/*
diff --git a/src/include/utils/backend_progress.h b/src/include/utils/backend_progress.h
index 6300dbd15d5..61e13c40e28 100644
--- a/src/include/utils/backend_progress.h
+++ b/src/include/utils/backend_progress.h
@@ -28,6 +28,7 @@ typedef enum ProgressCommandType
PROGRESS_COMMAND_BASEBACKUP,
PROGRESS_COMMAND_COPY,
PROGRESS_COMMAND_REPACK,
+ PROGRESS_COMMAND_DATACHECKSUMS,
} ProgressCommandType;
#define PGSTAT_NUM_PROGRESS_PARAM 20
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 28ce3b35eda..864b407abcf 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -20,6 +20,7 @@ SUBDIRS = \
test_bitmapset \
test_bloomfilter \
+ test_checksums \
test_cloexec \
test_copy_callbacks \
test_custom_rmgrs \
test_custom_stats \
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index 3ac291656c1..e5acacd5083 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -20,6 +20,7 @@ subdir('test_binaryheap')
subdir('test_bitmapset')
subdir('test_bloomfilter')
+subdir('test_checksums')
subdir('test_cloexec')
subdir('test_copy_callbacks')
subdir('test_cplusplusext')
subdir('test_custom_rmgrs')
diff --git a/src/test/modules/test_checksums/.gitignore b/src/test/modules/test_checksums/.gitignore
new file mode 100644
index 00000000000..871e943d50e
--- /dev/null
+++ b/src/test/modules/test_checksums/.gitignore
@@ -0,0 +1,2 @@
+# Generated by test suite
+/tmp_check/
diff --git a/src/test/modules/test_checksums/Makefile b/src/test/modules/test_checksums/Makefile
new file mode 100644
index 00000000000..fa85b79ae57
--- /dev/null
+++ b/src/test/modules/test_checksums/Makefile
@@ -0,0 +1,40 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for src/test/modules/test_checksums
+#
+# Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+#
+# src/test/modules/test_checksums/Makefile
+#
+#-------------------------------------------------------------------------
+
+EXTRA_INSTALL = src/test/modules/injection_points
+
+export enable_injection_points
+
+MODULE_big = test_checksums
+OBJS = \
+ $(WIN32RES) \
+ test_checksums.o
+PGFILEDESC = "test_checksums - test code for data checksums"
+
+EXTENSION = test_checksums
+DATA = test_checksums--1.0.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_checksums
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+check:
+ $(prove_check)
+
+installcheck:
+ $(prove_installcheck)
diff --git a/src/test/modules/test_checksums/README b/src/test/modules/test_checksums/README
new file mode 100644
index 00000000000..6a23e4ff0ae
--- /dev/null
+++ b/src/test/modules/test_checksums/README
@@ -0,0 +1,30 @@
+src/test/modules/test_checksums/README
+
+Regression tests for data checksums
+===================================
+This directory contains a test suite for enabling, and disabling, data
+checksums both offline as well as in a running cluster.
+
+Running the tests with autoconf
+===============================
+
+ make check
+
+or
+
+ make installcheck
+
+Running the tests with meson
+============================
+From your build directory, issue the following command:
+
+ meson test -q --print-errorlogs --suite setup --suite test_checksums
+
+NOTE: This creates a temporary installation (in the case of "make check" or
+"--suite setup") with multiple nodes, a primary as well as standby(s), for
+the purpose of the tests.
+
+NOTE: This test suite requires TAP tests to be enabled, and a subset of the
+tests also requires injection points to function. In order to run the
+extended tests, "checksum_extended" must be set in the PG_TEST_EXTRA
+environment variable.
diff --git a/src/test/modules/test_checksums/meson.build b/src/test/modules/test_checksums/meson.build
new file mode 100644
index 00000000000..9b1421a9b91
--- /dev/null
+++ b/src/test/modules/test_checksums/meson.build
@@ -0,0 +1,38 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+test_checksums_sources = files(
+ 'test_checksums.c',
+)
+
+test_checksums = shared_module('test_checksums',
+ test_checksums_sources,
+ kwargs: pg_test_mod_args,
+)
+test_install_libs += test_checksums
+
+test_install_data += files(
+ 'test_checksums.control',
+ 'test_checksums--1.0.sql',
+)
+
+tests += {
+ 'name': 'test_checksums',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'tap': {
+ 'env': {
+ 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no',
+ },
+ 'tests': [
+ 't/001_basic.pl',
+ 't/002_restarts.pl',
+ 't/003_standby_restarts.pl',
+ 't/004_offline.pl',
+ 't/005_injection.pl',
+ 't/006_pgbench_single.pl',
+ 't/007_pgbench_standby.pl',
+ 't/008_pitr.pl',
+ 't/009_fpi.pl',
+ ],
+ },
+}
diff --git a/src/test/modules/test_checksums/t/001_basic.pl b/src/test/modules/test_checksums/t/001_basic.pl
new file mode 100644
index 00000000000..c008e95fbff
--- /dev/null
+++ b/src/test/modules/test_checksums/t/001_basic.pl
@@ -0,0 +1,63 @@
+
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums in an online cluster
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+# Initialize node with checksums disabled.
+my $node = PostgreSQL::Test::Cluster->new('basic_node');
+$node->init(no_data_checksums => 1);
+$node->start;
+
+# Create some content to have un-checksummed data in the cluster
+$node->safe_psql('postgres',
+ "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;");
+
+# Ensure that checksums are turned off
+test_checksum_state($node, 'off');
+
+# Enable data checksums and wait for the state transition to 'on'
+enable_data_checksums($node, wait => 'on');
+
+# Run a dummy query just to make sure we can read back data
+my $result =
+ $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1 ");
+is($result, '9999', 'ensure checksummed pages can be read back');
+
+# Enable data checksums again which should be a no-op so we explicitly don't
+# wait for any state transition as none should happen here
+enable_data_checksums($node);
+test_checksum_state($node, 'on');
+# ..and make sure we can still read/write data
+$node->safe_psql('postgres', "UPDATE t SET a = a + 1;");
+$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '10000', 'ensure checksummed pages can be read back');
+
+# Disable checksums again and wait for the state transition
+disable_data_checksums($node, wait => 1);
+
+# Test reading data again
+$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '10000', 'ensure previously checksummed pages can be read back');
+
+# Re-enable checksums and make sure that the underlying data has changed to
+# ensure that checksums will be different.
+$node->safe_psql('postgres', "UPDATE t SET a = a + 1;");
+enable_data_checksums($node, wait => 'on');
+
+# Run a dummy query just to make sure we can read back the data
+$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '10000', 'ensure checksummed pages can be read back');
+
+$node->stop;
+done_testing();
diff --git a/src/test/modules/test_checksums/t/002_restarts.pl b/src/test/modules/test_checksums/t/002_restarts.pl
new file mode 100644
index 00000000000..bab59be82bd
--- /dev/null
+++ b/src/test/modules/test_checksums/t/002_restarts.pl
@@ -0,0 +1,110 @@
+
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums in an online cluster with a
+# restart which breaks processing.
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+# Initialize node with checksums disabled.
+my $node = PostgreSQL::Test::Cluster->new('restarts_node');
+$node->init(no_data_checksums => 1);
+$node->start;
+
+# Initialize result storage for queries
+my $result;
+
+# Create some content to have un-checksummed data in the cluster
+$node->safe_psql('postgres',
+ "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;");
+
+# Ensure that checksums are disabled
+test_checksum_state($node, 'off');
+
+SKIP:
+{
+ skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 6
+ if (!$ENV{PG_TEST_EXTRA}
+ || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/);
+
+ # Create a barrier for checksum enablement to block on, in this case a pre-
+ # existing temporary table which is kept open while processing is started.
+ # We can accomplish this by setting up an interactive psql process which
+ # keeps the temporary table created as we enable checksums in another psql
+ # process.
+ #
+ # This is a similar test to the synthetic variant in 005_injection.pl
+ # which fakes this scenario.
+ my $bsession = $node->background_psql('postgres');
+ $bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);');
+
+ # In another session, make sure we can see the blocking temp table but
+ # start processing anyways and check that we are blocked with a proper
+ # wait event.
+ $result = $node->safe_psql('postgres',
+ "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';"
+ );
+ is($result, 't', 'ensure we can see the temporary table');
+
+ # Enabling data checksums shouldn't work as the process is blocked on the
+ # temporary table held open by $bsession. Ensure that we reach the
+ # inprogress-on state before we do more tests.
+ enable_data_checksums($node, wait => 'inprogress-on');
+
+ # Wait for processing to finish and the worker waiting for leftover temp
+ # relations to be able to actually finish
+ $result = $node->poll_query_until(
+ 'postgres',
+ "SELECT wait_event FROM pg_catalog.pg_stat_activity "
+ . "WHERE backend_type = 'datachecksum worker';",
+ 'ChecksumEnableTemptableWait');
+
+ # The datachecksumsworker waits for temporary tables to disappear for 3
+ # seconds before retrying, so sleep for 4 seconds to be guaranteed to see
+ # a retry cycle
+ sleep(4);
+
+ # Re-check the wait event to ensure we are blocked on the right thing.
+ $result = $node->safe_psql('postgres',
+ "SELECT wait_event FROM pg_catalog.pg_stat_activity "
+ . "WHERE backend_type = 'datachecksum worker';");
+ is($result, 'ChecksumEnableTemptableWait',
+ 'ensure the correct wait condition is set');
+ test_checksum_state($node, 'inprogress-on');
+
+ # Stop the cluster while bsession is still attached. We can't close the
+ # session first since the brief period between closing and stopping might
+ # be enough for checksums to get enabled.
+ $node->stop;
+ $bsession->quit;
+ $node->start;
+
+ # Ensure the checksums aren't enabled across the restart. This leaves the
+ # cluster in the same state as before we entered the SKIP block.
+ test_checksum_state($node, 'off');
+}
+
+enable_data_checksums($node, wait => 'on');
+
+$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '9999', 'ensure checksummed pages can be read back');
+
+$result = $node->poll_query_until(
+ 'postgres',
+ "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksum%';",
+ '0');
+is($result, 1, 'await datachecksums worker/launcher termination');
+
+disable_data_checksums($node, wait => 1);
+
+$node->stop;
+done_testing();
diff --git a/src/test/modules/test_checksums/t/003_standby_restarts.pl b/src/test/modules/test_checksums/t/003_standby_restarts.pl
new file mode 100644
index 00000000000..6b016925651
--- /dev/null
+++ b/src/test/modules/test_checksums/t/003_standby_restarts.pl
@@ -0,0 +1,114 @@
+
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums in an online cluster with
+# streaming replication
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+# Initialize primary node
+my $node_primary = PostgreSQL::Test::Cluster->new('standby_restarts_primary');
+$node_primary->init(allows_streaming => 1, no_data_checksums => 1);
+$node_primary->start;
+
+my $slotname = 'physical_slot';
+$node_primary->safe_psql('postgres',
+ "SELECT pg_create_physical_replication_slot('$slotname')");
+
+# Take backup
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# Create streaming standby linking to primary
+my $node_standby = PostgreSQL::Test::Cluster->new('standby_restarts_standby');
+$node_standby->init_from_backup($node_primary, $backup_name,
+ has_streaming => 1);
+$node_standby->append_conf(
+ 'postgresql.conf', qq[
+primary_slot_name = '$slotname'
+]);
+$node_standby->start;
+
+# Create some content on the primary to have un-checksummed data in the cluster
+$node_primary->safe_psql('postgres',
+ "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;");
+
+# Wait for standbys to catch up
+$node_primary->wait_for_catchup($node_standby, 'replay',
+ $node_primary->lsn('insert'));
+
+# Check that checksums are turned off on all nodes
+test_checksum_state($node_primary, 'off');
+test_checksum_state($node_standby, 'off');
+
+# ---------------------------------------------------------------------------
+# Enable checksums for the cluster, and make sure that both the primary and
+# standby change state.
+#
+
+# Ensure that the primary switches to "inprogress-on"
+enable_data_checksums($node_primary, wait => 'inprogress-on');
+# Wait for checksum enable to be replayed
+$node_primary->wait_for_catchup($node_standby, 'replay');
+
+# Ensure that the standby has switched to "inprogress-on" or "on". Normally it
+# would be "inprogress-on", but it is theoretically possible for the primary to
+# complete the checksum enabling *and* have the standby replay that record
+# before we reach the check below.
+my $result = $node_standby->poll_query_until(
+ 'postgres',
+ "SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';",
+ 'f');
+is($result, 1, 'ensure standby has absorbed the inprogress-on barrier');
+$result = $node_standby->safe_psql('postgres',
+ "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"
+);
+
+is(($result eq 'inprogress-on' || $result eq 'on'),
+ 1, 'ensure checksums are on, or in progress, on standby_1');
+
+# Insert some more data which should be checksummed on INSERT
+$node_primary->safe_psql('postgres',
+ "INSERT INTO t VALUES (generate_series(1, 10000));");
+
+# Wait for checksums enabled on the primary and standby
+wait_for_checksum_state($node_primary, 'on');
+wait_for_checksum_state($node_standby, 'on');
+
+$result =
+ $node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1");
+is($result, '19998', 'ensure we can safely read all data with checksums');
+
+$result = $node_primary->poll_query_until(
+ 'postgres',
+ "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksum%';",
+ '0');
+is($result, 1, 'await datachecksums worker/launcher termination');
+
+#
+# Disable checksums and ensure it's propagated to standby and that we can
+# still read all data
+#
+
+# Disable checksums and wait for the operation to be replayed
+disable_data_checksums($node_primary);
+$node_primary->wait_for_catchup($node_standby, 'replay');
+# Ensure that the primary and standby has switched to off
+wait_for_checksum_state($node_primary, 'off');
+wait_for_checksum_state($node_standby, 'off');
+# Doublecheck reading data without errors
+$result =
+ $node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1");
+is($result, "19998", 'ensure we can safely read all data without checksums');
+
+$node_standby->stop;
+$node_primary->stop;
+done_testing();
diff --git a/src/test/modules/test_checksums/t/004_offline.pl b/src/test/modules/test_checksums/t/004_offline.pl
new file mode 100644
index 00000000000..f1972bddff1
--- /dev/null
+++ b/src/test/modules/test_checksums/t/004_offline.pl
@@ -0,0 +1,82 @@
+
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums offline from various states
+# of checksum processing
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+# Initialize node with checksums disabled.
+my $node = PostgreSQL::Test::Cluster->new('offline_node');
+$node->init(no_data_checksums => 1);
+$node->start;
+
+# Create some content to have un-checksummed data in the cluster
+$node->safe_psql('postgres',
+ "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;");
+
+# Ensure that checksums are disabled
+test_checksum_state($node, 'off');
+
+# Enable checksums offline using pg_checksums
+$node->stop;
+$node->checksum_enable_offline;
+$node->start;
+
+# Ensure that checksums are enabled
+test_checksum_state($node, 'on');
+
+# Run a dummy query just to make sure we can read back some data
+my $result =
+ $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '9999', 'ensure checksummed pages can be read back');
+
+# Disable checksums offline again using pg_checksums
+$node->stop;
+$node->checksum_disable_offline;
+$node->start;
+
+# Ensure that checksums are disabled
+test_checksum_state($node, 'off');
+
+# Create a barrier for checksum enablement to block on, in this case a pre-
+# existing temporary table which is kept open while processing is started. We
+# can accomplish this by setting up an interactive psql process which keeps the
+# temporary table created as we enable checksums in another psql process.
+
+my $bsession = $node->background_psql('postgres');
+$bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);');
+
+# In another session, make sure we can see the blocking temp table but start
+# processing anyways and check that we are blocked with a proper wait event.
+$result = $node->safe_psql('postgres',
+ "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';");
+is($result, 't', 'ensure we can see the temporary table');
+
+enable_data_checksums($node, wait => 'inprogress-on');
+
+# Turn the cluster off and enable checksums offline, then start back up
+$bsession->quit;
+$node->stop;
+$node->checksum_enable_offline;
+$node->start;
+
+# Ensure that checksums are now enabled even though processing wasn't
+# restarted
+test_checksum_state($node, 'on');
+
+# Run a dummy query just to make sure we can read back some data
+$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '9999', 'ensure checksummed pages can be read back');
+
+$node->stop;
+done_testing();
diff --git a/src/test/modules/test_checksums/t/005_injection.pl b/src/test/modules/test_checksums/t/005_injection.pl
new file mode 100644
index 00000000000..897f282a1f2
--- /dev/null
+++ b/src/test/modules/test_checksums/t/005_injection.pl
@@ -0,0 +1,74 @@
+
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums in an online cluster with
+# injection point tests injecting failures into the processing
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+if ($ENV{enable_injection_points} ne 'yes')
+{
+ plan skip_all => 'Injection points not supported by this build';
+}
+
+# ---------------------------------------------------------------------------
+# Test cluster setup
+#
+
+# Initiate testcluster
+my $node = PostgreSQL::Test::Cluster->new('injection_node');
+$node->init(no_data_checksums => 1);
+$node->start;
+
+# Set up test environment
+$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
+
+# ---------------------------------------------------------------------------
+# Inducing failures and crashes in processing
+
+# Force enabling checksums to fail by marking one of the databases as having
+# failed in processing.
+disable_data_checksums($node, wait => 1);
+$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(true);');
+enable_data_checksums($node, wait => 'off');
+$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(false);');
+
+# Make sure that disabling after a failure works
+disable_data_checksums($node);
+test_checksum_state($node, 'off');
+
+# ---------------------------------------------------------------------------
+# Timing and retry related tests
+#
+
+SKIP:
+{
+ skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 4
+ if (!$ENV{PG_TEST_EXTRA}
+ || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/);
+
+ # Inject a delay in the barrier for enabling checksums
+ disable_data_checksums($node, wait => 1);
+ $node->safe_psql('postgres', 'SELECT dcw_inject_delay_barrier();');
+ enable_data_checksums($node, wait => 'on');
+
+ # Fake the existence of a temporary table at the start of processing, which
+ # will force the processing to wait and retry in order to wait for it to
+ # disappear.
+ disable_data_checksums($node, wait => 1);
+ $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(true);');
+ enable_data_checksums($node, wait => 'on');
+}
+
+$node->stop;
+done_testing();
diff --git a/src/test/modules/test_checksums/t/006_pgbench_single.pl b/src/test/modules/test_checksums/t/006_pgbench_single.pl
new file mode 100644
index 00000000000..0ab5b04b931
--- /dev/null
+++ b/src/test/modules/test_checksums/t/006_pgbench_single.pl
@@ -0,0 +1,275 @@
+
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums in an online cluster with
+# concurrent activity via pgbench runs
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+# This test suite is expensive, or very expensive, to execute. There are two
+# PG_TEST_EXTRA options for running it, "checksum" for a pared-down test suite
+# and "checksum_extended" for the full suite. The full suite can run for hours
+# on slow or constrained systems.
+my $extended = undef;
+if ($ENV{PG_TEST_EXTRA})
+{
+ $extended = 1 if ($ENV{PG_TEST_EXTRA} =~ /\bchecksum_extended\b/);
+ plan skip_all => 'Expensive data checksums test disabled'
+ unless ($ENV{PG_TEST_EXTRA} =~ /\bchecksum(_extended)?\b/);
+}
+else
+{
+ plan skip_all => 'Expensive data checksums test disabled';
+}
+
+if ($ENV{enable_injection_points} ne 'yes')
+{
+ plan skip_all => 'Injection points not supported by this build';
+}
+
+my $node;
+my $node_loglocation = 0;
+
+# The number of full test iterations which will be performed. The exact number
+# of tests performed and the wall time taken is non-deterministic as the test
+# performs a lot of randomized actions, but 10 iterations will be a long test
+# run regardless.
+my $TEST_ITERATIONS = 1;
+$TEST_ITERATIONS = 10 if ($extended);
+
+# Variables which record the current state of the cluster
+my $data_checksum_state = 'off';
+my $pgbench = undef;
+
+# Start a pgbench run in the background against the server specified via the
+# port passed as parameter.
+sub background_rw_pgbench
+{
+ my $port = shift;
+
+ # If a previous pgbench is still running, start by shutting it down.
+ $pgbench->finish if $pgbench;    # NOTE(review): finish() waits for the -T run to end rather than killing it -- confirm intended
+
+ my $clients = 1;
+ my $runtime = 2;    # seconds; short run for the quick suite
+
+ if ($extended)
+ {
+ # Randomize the number of pgbench clients a bit (range 1-16)
+ $clients = 1 + int(rand(15));
+ $runtime = 600;
+ }
+ my @cmd = ('pgbench', '-p', $port, '-T', $runtime, '-c', $clients);
+
+ # Randomize whether we spawn connections or not
+ push(@cmd, '-C') if ($extended && cointoss);    # -C: open a new connection per transaction
+ # Finally add the database name to use
+ push(@cmd, 'postgres');
+
+ $pgbench = IPC::Run::start(
+ \@cmd,
+ '<' => '/dev/null',
+ '>' => '/dev/null',
+ '2>' => '/dev/null',
+ IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default));
+}
+
+# Invert the state of data checksums in the cluster, if data checksums are on
+# then disable them and vice versa. Also performs proper validation of the
+# before and after state. Updates the $data_checksum_state tracking variable.
+sub flip_data_checksums
+{
+ # First, make sure the cluster is in the state we expect it to be
+ test_checksum_state($node, $data_checksum_state);
+
+ if ($data_checksum_state eq 'off')
+ {
+ # Coin-toss to see if we are injecting a retry due to a temptable
+ $node->safe_psql('postgres', 'SELECT dcw_fake_temptable();')    # no arg -- presumably defaults to enabling; cf. (true) in 005 -- confirm
+ if cointoss();
+
+ # log LSN right before we start changing checksums
+ my $result =
+ $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+ note("LSN before enabling: " . $result . "\n");
+
+ # Ensure that the primary switches to "inprogress-on"
+ enable_data_checksums($node, wait => 'inprogress-on');
+
+ random_sleep() if ($extended);
+
+ # Wait for checksums enabled on the primary
+ wait_for_checksum_state($node, 'on');
+
+ # log LSN right after the primary flips checksums to "on"
+ $result = $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+ note("LSN after enabling: " . $result . "\n");
+
+ random_sleep() if ($extended);
+
+ $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(false);');    # clear the injected temptable regardless of the coin-toss above
+ $data_checksum_state = 'on';
+ }
+ elsif ($data_checksum_state eq 'on')
+ {
+ random_sleep() if ($extended);
+
+ # log LSN right before we start changing checksums
+ my $result =
+ $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+ note("LSN before disabling: " . $result . "\n");
+
+ disable_data_checksums($node);
+
+ # Wait for checksums disabled on the primary
+ wait_for_checksum_state($node, 'off');
+
+ # log LSN right after the primary flips checksums to "off"
+ $result = $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+ note("LSN after disabling: " . $result . "\n");
+
+ random_sleep() if ($extended);
+
+ $data_checksum_state = 'off';
+ }
+ else
+ {
+ # This should only happen due to programmer error when hacking on the
+ # test code, but since that might pass subtly we error out.
+ BAIL_OUT('data_checksum_state variable has invalid state:'
+ . $data_checksum_state);
+ }
+}
+
+# Create and start a cluster with one node
+$node = PostgreSQL::Test::Cluster->new('pgbench_single_main');
+$node->init(allows_streaming => 1, no_data_checksums => 1);
+# max_connections need to be bumped in order to accommodate for pgbench clients
+# and log_statement is dialled down since it otherwise will generate enormous
+# amounts of logging. Page verification failures are still logged.
+$node->append_conf(
+ 'postgresql.conf',
+ qq[
+max_connections = 100
+log_statement = none
+]);
+$node->start;
+$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
+# Create some content to have un-checksummed data in the cluster
+$node->safe_psql('postgres',
+ "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;");
+# Initialize pgbench
+my $scalefactor = ($extended ? 10 : 1);
+$node->command_ok(
+ [
+ 'pgbench', '-p', $node->port, '-i',
+ '-s', $scalefactor, '-q', 'postgres'
+ ]);
+# Start the test suite with pgbench running.
+background_rw_pgbench($node->port);
+
+# Main test suite. This loop will start a pgbench run on the cluster and while
+# that's running flip the state of data checksums concurrently. It will then
+# randomly restart the cluster and then check for
+# the desired state. The idea behind doing things randomly is to stress out
+# any timing related issues by subjecting the cluster for varied workloads.
+# A TODO is to generate a trace such that any test failure can be traced to
+# its order of operations for debugging.
+for (my $i = 0; $i < $TEST_ITERATIONS; $i++)
+{
+ note("iteration ", ($i + 1), " of ", $TEST_ITERATIONS);
+
+ if (!$node->is_alive)
+ {
+ # Start, to do recovery, and stop
+ $node->start;
+ $node->stop('fast');
+
+ # Since the log isn't being written to now, parse the log and check
+ # for instances of checksum verification failures.
+ my $log = PostgreSQL::Test::Utils::slurp_file($node->logfile,
+ $node_loglocation);
+ unlike(
+ $log,
+ qr/page verification failed,.+\d$/m,    # /m: match failures on any line of the multi-line log
+ "no checksum validation errors in primary log (during WAL recovery)"
+ );
+ $node_loglocation = -s $node->logfile;
+
+ # Randomize the WAL size, to trigger checkpoints less/more often
+ my $sb = 64 + int(rand(1024));
+ $node->append_conf('postgresql.conf', qq[max_wal_size = $sb]);
+ note("changing max_wal_size to " . $sb);
+
+ $node->start;
+
+ # Start a pgbench in the background against the primary
+ background_rw_pgbench($node->port);
+ }
+
+ $node->safe_psql('postgres', "UPDATE t SET a = a + 1;");
+
+ flip_data_checksums();
+ random_sleep() if ($extended);
+ my $result =
+ $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+ is($result, '100000', 'ensure data pages can be read back on primary');
+
+ random_sleep() if ($extended);
+
+ # Potentially powercycle the node
+ if (cointoss())
+ {
+ $node->stop(stopmode());
+
+ PostgreSQL::Test::Utils::system_log("pg_controldata",
+ $node->data_dir);
+
+ my $log = PostgreSQL::Test::Utils::slurp_file($node->logfile,
+ $node_loglocation);
+ unlike(
+ $log,
+ qr/page verification failed,.+\d$/m,    # /m: match failures on any line of the multi-line log
+ "no checksum validation errors in primary log (outside WAL recovery)"
+ );
+ $node_loglocation = -s $node->logfile;
+ }
+
+ random_sleep() if ($extended);
+}
+
+# Make sure the node is running
+if (!$node->is_alive)
+{
+ $node->start;
+}
+
+# Testrun is over, ensure that data reads back as expected and perform a final
+# verification of the data checksum state.
+my $result =
+ $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '100000', 'ensure data pages can be read back on primary');
+test_checksum_state($node, $data_checksum_state);
+
+# Perform one final pass over the logs and hunt for unexpected errors
+my $log =
+ PostgreSQL::Test::Utils::slurp_file($node->logfile, $node_loglocation);
+unlike(
+ $log,
+ qr/page verification failed,.+\d$/m,    # /m: match failures on any line of the multi-line log
+ "no checksum validation errors in primary log");
+$node_loglocation = -s $node->logfile;
+
+$node->teardown_node;
+
+done_testing();
diff --git a/src/test/modules/test_checksums/t/007_pgbench_standby.pl b/src/test/modules/test_checksums/t/007_pgbench_standby.pl
new file mode 100644
index 00000000000..b0d40d24005
--- /dev/null
+++ b/src/test/modules/test_checksums/t/007_pgbench_standby.pl
@@ -0,0 +1,400 @@
+
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums in an online cluster,
+# consisting of a primary and a replicated standby, with concurrent activity
+# via pgbench runs
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+# This test suite is expensive, or very expensive, to execute. There are two
+# PG_TEST_EXTRA options for running it, "checksum" for a pared-down test suite
+# and "checksum_extended" for the full suite. The full suite can run for hours
+# on slow or constrained systems.
+my $extended = undef;
+if ($ENV{PG_TEST_EXTRA})
+{
+ $extended = 1 if ($ENV{PG_TEST_EXTRA} =~ /\bchecksum_extended\b/);
+ plan skip_all => 'Expensive data checksums test disabled'
+ unless ($ENV{PG_TEST_EXTRA} =~ /\bchecksum(_extended)?\b/);
+}
+else
+{
+ plan skip_all => 'Expensive data checksums test disabled';
+}
+
+if ($ENV{enable_injection_points} ne 'yes')
+{
+ plan skip_all => 'Injection points not supported by this build';
+}
+
+my $node_primary_slot = 'physical_slot';
+my $node_primary_backup = 'primary_backup';
+my $node_primary;
+my $node_primary_loglocation = 0;
+my $node_standby;
+my $node_standby_loglocation = 0;
+
+# The number of full test iterations which will be performed. The exact number
+# of tests performed and the wall time taken is non-deterministic as the test
+# performs a lot of randomized actions, but 5 iterations will be a long test
+# run regardless.
+my $TEST_ITERATIONS = 5;
+$TEST_ITERATIONS = 1 if ($extended);    # NOTE(review): inverted vs 006 (1 quick / 10 extended) -- intentional? confirm
+
+# Variables which record the current state of the cluster
+my $data_checksum_state = 'off';
+
+my $pgbench_primary = undef;
+my $pgbench_standby = undef;
+
+# Start a pgbench run in the background against the server specified via the
+# port passed as parameter
+sub background_pgbench
+{
+ my ($port, $standby) = @_;
+ my $pgbench = ($standby ? \$pgbench_standby : \$pgbench_primary);    # ref to the per-node handle variable
+
+ # Terminate any currently running pgbench process before continuing
+ $$pgbench->finish if $$pgbench;    # NOTE(review): finish() waits for the -T run to end rather than killing it -- confirm intended
+
+ my $clients = 1;
+ my $runtime = 5;
+
+ if ($extended)
+ {
+ # Randomize the number of pgbench clients a bit (range 1-16)
+ $clients = 1 + int(rand(15));
+ $runtime = 600;
+ }
+
+ my @cmd = ('pgbench', '-p', $port, '-T', $runtime, '-c', $clients);
+ # Randomize whether we spawn connections or not
+ push(@cmd, '-C') if ($extended && cointoss());
+ # If we run on a standby it needs to be a read-only benchmark
+ push(@cmd, '-S') if ($standby);
+ # Finally add the database name to use
+ push(@cmd, 'postgres');
+
+ $$pgbench = IPC::Run::start(
+ \@cmd,
+ '<' => '/dev/null',
+ '>' => '/dev/null',
+ '2>' => '/dev/null',
+ IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default));
+}
+
+# Invert the state of data checksums in the cluster, if data checksums are on
+# then disable them and vice versa. Also performs proper validation of the
+# before and after state on both primary and standby.
+sub flip_data_checksums
+{
+ # First, make sure the cluster is in the state we expect it to be
+ test_checksum_state($node_primary, $data_checksum_state);
+ test_checksum_state($node_standby, $data_checksum_state);
+
+ if ($data_checksum_state eq 'off')
+ {
+ # Coin-toss to see if we are injecting a retry due to a temptable
+ $node_primary->safe_psql('postgres', 'SELECT dcw_fake_temptable();')    # no arg -- presumably defaults to enabling; confirm
+ if cointoss();
+
+ # log LSN right before we start changing checksums
+ my $result =
+ $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+ note("LSN before enabling: " . $result . "\n");
+
+ # Ensure that the primary switches to "inprogress-on"
+ enable_data_checksums($node_primary, wait => 'inprogress-on');
+
+ random_sleep() if ($extended);
+
+ # Wait for checksum enable to be replayed
+ $node_primary->wait_for_catchup($node_standby, 'replay');
+
+ # Ensure that the standby has switched to "inprogress-on" or "on".
+ # Normally it would be "inprogress-on", but it is theoretically
+ # possible for the primary to complete the checksum enabling *and* have
+ # the standby replay that record before we reach the check below.
+ $result = $node_standby->poll_query_until(
+ 'postgres',
+ "SELECT setting = 'off' "
+ . "FROM pg_catalog.pg_settings "
+ . "WHERE name = 'data_checksums';",
+ 'f');
+ is($result, 1,
+ 'ensure standby has absorbed the inprogress-on barrier');
+ $result = $node_standby->safe_psql('postgres',
+ "SELECT setting "
+ . "FROM pg_catalog.pg_settings "
+ . "WHERE name = 'data_checksums';");
+
+ is(($result eq 'inprogress-on' || $result eq 'on'),
+ 1, 'ensure checksums are on, or in progress, on standby_1');
+
+ # Wait for checksums enabled on the primary and standby
+ wait_for_checksum_state($node_primary, 'on');
+
+ # log LSN right after the primary flips checksums to "on"
+ $result =
+ $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+ note("LSN after enabling: " . $result . "\n");
+
+ random_sleep() if ($extended);
+ wait_for_checksum_state($node_standby, 'on');
+
+ $node_primary->safe_psql('postgres',
+ 'SELECT dcw_fake_temptable(false);');
+ $data_checksum_state = 'on';
+ }
+ elsif ($data_checksum_state eq 'on')
+ {
+ random_sleep() if ($extended);
+
+ # log LSN right before we start changing checksums
+ my $result =
+ $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+ note("LSN before disabling: " . $result . "\n");
+
+ disable_data_checksums($node_primary);
+ $node_primary->wait_for_catchup($node_standby, 'replay');
+
+ # Wait for checksums disabled on the primary and standby
+ wait_for_checksum_state($node_primary, 'off');
+ wait_for_checksum_state($node_standby, 'off');
+
+ # log LSN right after the primary flips checksums to "off"
+ $result =
+ $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+ note("LSN after disabling: " . $result . "\n");
+
+ random_sleep() if ($extended);
+ wait_for_checksum_state($node_standby, 'off');    # NOTE(review): redundant -- already waited for 'off' above; harmless
+
+ $data_checksum_state = 'off';
+ }
+ else
+ {
+ # This should only happen due to programmer error when hacking on the
+ # test code, but since that might pass subtly we error out.
+ BAIL_OUT('data_checksum_state variable has invalid state:'
+ . $data_checksum_state);
+ }
+}
+
+# Create and start a cluster with one primary and one standby node, and ensure
+# they are caught up and in sync.
+$node_primary = PostgreSQL::Test::Cluster->new('pgbench_standby_main');
+$node_primary->init(allows_streaming => 1, no_data_checksums => 1);
+# max_connections need to be bumped in order to accommodate for pgbench clients
+# and log_statement is dialled down since it otherwise will generate enormous
+# amounts of logging. Page verification failures are still logged.
+$node_primary->append_conf(
+ 'postgresql.conf',
+ qq[
+max_connections = 30
+log_statement = none
+]);
+$node_primary->start;
+$node_primary->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
+# Create some content to have un-checksummed data in the cluster
+$node_primary->safe_psql('postgres',
+ "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;");
+$node_primary->safe_psql('postgres',
+ "SELECT pg_create_physical_replication_slot('$node_primary_slot');");
+$node_primary->backup($node_primary_backup);
+
+$node_standby = PostgreSQL::Test::Cluster->new('pgbench_standby_standby');
+$node_standby->init_from_backup($node_primary, $node_primary_backup,
+ has_streaming => 1);
+$node_standby->append_conf(
+ 'postgresql.conf', qq[
+primary_slot_name = '$node_primary_slot'
+]);
+$node_standby->start;
+
+# Initialize pgbench and wait for the objects to be created on the standby
+my $scalefactor = ($extended ? 10 : 1);
+$node_primary->command_ok(
+ [
+ 'pgbench', '-p', $node_primary->port, '-i', '-s', $scalefactor, '-q',
+ 'postgres'
+ ]);
+$node_primary->wait_for_catchup($node_standby, 'replay');
+
+# Start the test suite with pgbench running on all nodes
+background_pgbench($node_standby->port, 1);
+background_pgbench($node_primary->port, 0);
+
+# Main test suite. This loop will start a pgbench run on the cluster and while
+# that's running flip the state of data checksums concurrently. It will then
+# randomly restart the cluster and then check for
+# the desired state. The idea behind doing things randomly is to stress out
+# any timing related issues by subjecting the cluster for varied workloads.
+# A TODO is to generate a trace such that any test failure can be traced to
+# its order of operations for debugging.
+for (my $i = 0; $i < $TEST_ITERATIONS; $i++)
+{
+ note("iteration ", ($i + 1), " of ", $TEST_ITERATIONS);
+
+ if (!$node_primary->is_alive)
+ {
+ # start, to do recovery, and stop
+ $node_primary->start;
+ $node_primary->stop('fast');
+
+ # Since the log isn't being written to now, parse the log and check
+ # for instances of checksum verification failures.
+ my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile,
+ $node_primary_loglocation);
+ unlike(
+ $log,
+ qr/page verification failed,.+\d$/m,    # /m: match failures on any line of the multi-line log
+ "no checksum validation errors in primary log (during WAL recovery)"
+ );
+ $node_primary_loglocation = -s $node_primary->logfile;
+
+ # randomize the WAL size, to trigger checkpoints less/more often
+ my $sb = 32 + int(rand(960));
+ $node_primary->append_conf('postgresql.conf', qq[max_wal_size = $sb]);
+
+ note("changing primary max_wal_size to " . $sb);
+
+ $node_primary->start;
+
+ # Start a pgbench in the background against the primary
+ background_pgbench($node_primary->port, 0);
+ }
+
+ if (!$node_standby->is_alive)
+ {
+ $node_standby->start;
+ $node_standby->stop('fast');
+
+ # Since the log isn't being written to now, parse the log and check
+ # for instances of checksum verification failures.
+ my $log =
+ PostgreSQL::Test::Utils::slurp_file($node_standby->logfile,
+ $node_standby_loglocation);
+ unlike(
+ $log,
+ qr/page verification failed,.+\d$/m,    # /m: match failures on any line of the multi-line log
+ "no checksum validation errors in standby_1 log (during WAL recovery)"
+ );
+ $node_standby_loglocation = -s $node_standby->logfile;
+
+ # randomize the WAL size, to trigger checkpoints less/more often
+ my $sb = 32 + int(rand(960));
+ $node_standby->append_conf('postgresql.conf', qq[max_wal_size = $sb]);
+
+ note("changing standby max_wal_size to " . $sb);
+
+ $node_standby->start;
+
+ # Start a read-only pgbench in the background on the standby
+ background_pgbench($node_standby->port, 1);
+ }
+
+ $node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;");
+ $node_primary->wait_for_catchup($node_standby, 'write');
+
+ flip_data_checksums();
+ random_sleep() if ($extended);
+ my $result = $node_primary->safe_psql('postgres',
+ "SELECT count(*) FROM t WHERE a > 1");
+ is($result, '100000', 'ensure data pages can be read back on primary');
+ random_sleep() if ($extended);    # gate on $extended like every other sleep in this loop
+
+ # Potentially powercycle the cluster (the nodes independently). A TODO is
+ # to randomly stop the nodes in the opposite order too.
+ if ($extended && cointoss())
+ {
+ $node_primary->stop(stopmode());
+
+ # print the contents of the control file on the primary
+ PostgreSQL::Test::Utils::system_log("pg_controldata",
+ $node_primary->data_dir);
+
+ # slurp the file after shutdown, so that it doesn't interfere with the recovery
+ my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile,
+ $node_primary_loglocation);
+ unlike(
+ $log,
+ qr/page verification failed,.+\d$/m,    # /m: match failures on any line of the multi-line log
+ "no checksum validation errors in primary log (outside WAL recovery)"
+ );
+ $node_primary_loglocation = -s $node_primary->logfile;
+ }
+
+ random_sleep() if ($extended);
+
+ if ($extended && cointoss())
+ {
+ $node_standby->stop(stopmode());
+
+ # print the contents of the control file on the standby
+ PostgreSQL::Test::Utils::system_log("pg_controldata",
+ $node_standby->data_dir);
+
+ # slurp the file after shutdown, so that it doesn't interfere with the recovery
+ my $log =
+ PostgreSQL::Test::Utils::slurp_file($node_standby->logfile,
+ $node_standby_loglocation);
+ unlike(
+ $log,
+ qr/page verification failed,.+\d$/m,    # /m: match failures on any line of the multi-line log
+ "no checksum validation errors in standby_1 log (outside WAL recovery)"
+ );
+ $node_standby_loglocation = -s $node_standby->logfile;
+ }
+}
+
+# make sure the nodes are running
+if (!$node_primary->is_alive)
+{
+ $node_primary->start;
+}
+
+if (!$node_standby->is_alive)
+{
+ $node_standby->start;
+}
+
+# Testrun is over, ensure that data reads back as expected and perform a final
+# verification of the data checksum state.
+my $result =
+ $node_primary->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '100000', 'ensure data pages can be read back on primary');
+test_checksum_state($node_primary, $data_checksum_state);
+test_checksum_state($node_standby, $data_checksum_state);
+
+# Perform one final pass over the logs and hunt for unexpected errors
+my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile,
+ $node_primary_loglocation);
+unlike(
+ $log,
+ qr/page verification failed,.+\d$/m,    # /m: match failures on any line of the multi-line log
+ "no checksum validation errors in primary log");
+$node_primary_loglocation = -s $node_primary->logfile;
+$log = PostgreSQL::Test::Utils::slurp_file($node_standby->logfile,
+ $node_standby_loglocation);
+unlike(
+ $log,
+ qr/page verification failed,.+\d$/m,    # /m: match failures on any line of the multi-line log
+ "no checksum validation errors in standby_1 log");
+$node_standby_loglocation = -s $node_standby->logfile;
+
+$node_standby->teardown_node;
+$node_primary->teardown_node;
+
+done_testing();
diff --git a/src/test/modules/test_checksums/t/008_pitr.pl b/src/test/modules/test_checksums/t/008_pitr.pl
new file mode 100644
index 00000000000..b9b89f414ab
--- /dev/null
+++ b/src/test/modules/test_checksums/t/008_pitr.pl
@@ -0,0 +1,189 @@
+
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+# This test suite is expensive, or very expensive, to execute. There are two
+# PG_TEST_EXTRA options for running it, "checksum" for a pared-down test suite
+# and "checksum_extended" for the full suite.
+my $extended = undef;
+if ($ENV{PG_TEST_EXTRA})
+{
+ $extended = 1 if ($ENV{PG_TEST_EXTRA} =~ /\bchecksum_extended\b/);
+ plan skip_all => 'Expensive data checksums test disabled'
+ unless ($ENV{PG_TEST_EXTRA} =~ /\bchecksum(_extended)?\b/);
+}
+else
+{
+ plan skip_all => 'Expensive data checksums test disabled';
+}
+
+
+my $pgbench = undef;
+my $data_checksum_state = 'off';
+
+my $node_primary;
+
+# Invert the state of data checksums in the cluster: if data checksums are on
+# then disable them and vice versa, validating the before and after state.
+# Returns ($lsn_pre, $lsn_post), the WAL LSNs captured around the change.
+sub flip_data_checksums
+{
+ my $lsn_pre = undef;
+ my $lsn_post = undef;
+
+ # First, make sure the cluster is in the state we expect it to be
+ test_checksum_state($node_primary, $data_checksum_state);
+
+ if ($data_checksum_state eq 'off')
+ {
+ # log LSN right before we start changing checksums
+ $lsn_pre =
+ $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+ note("LSN before enabling: " . $lsn_pre . "\n");
+
+ # Wait for checksums enabled on the primary
+ enable_data_checksums($node_primary, wait => 'on');
+
+ # log LSN right after the primary flips checksums to "on"
+ $lsn_post =
+ $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+ note("LSN after enabling: " . $lsn_post . "\n");
+
+ $data_checksum_state = 'on';
+ }
+ elsif ($data_checksum_state eq 'on')
+ {
+ # log LSN right before we start changing checksums
+ $lsn_pre =
+ $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+
+ disable_data_checksums($node_primary);
+
+ # Wait for checksums disabled on the primary
+ wait_for_checksum_state($node_primary, 'off');
+
+ # log LSN right after the primary flips checksums to "off"
+ $lsn_post =
+ $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+
+ $data_checksum_state = 'off';
+ }
+ else
+ {
+ # This should only happen due to programmer error when hacking on the
+ # test code, but since that might pass subtly we error out.
+ BAIL_OUT('data_checksum_state variable has invalid state:'
+ . $data_checksum_state);
+ }
+
+ return ($lsn_pre, $lsn_post);
+}
+# Start a pgbench run in the background against the server specified via the
+# port passed as parameter.
+sub background_rw_pgbench
+{
+ my $port = shift;
+
+ # If a previous pgbench is still running, start by shutting it down.
+ $pgbench->finish if $pgbench;    # NOTE(review): finish() waits for the -T run to end rather than killing it -- confirm intended
+
+ # Randomize the number of pgbench clients in extended mode, else 1 client
+ my $clients = ($extended ? 1 + int(rand(15)) : 1);
+ my $runtime = ($extended ? 600 : 5);
+
+ my @cmd = ('pgbench', '-p', $port, '-T', $runtime, '-c', $clients);
+
+ # Randomize whether we spawn connections or not
+ push(@cmd, '-C') if ($extended && cointoss());
+ # Finally add the database name to use
+ push(@cmd, 'postgres');
+
+ $pgbench = IPC::Run::start(
+ \@cmd,
+ '<' => '/dev/null',
+ '>' => '/dev/null',
+ '2>' => '/dev/null',
+ IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default));
+}
+
+# Start a primary node with WAL archiving enabled and with enough connections
+# available to handle pgbench clients.
+$node_primary = PostgreSQL::Test::Cluster->new('pitr_main');
+$node_primary->init(
+ has_archiving => 1,
+ allows_streaming => 1,
+ no_data_checksums => 1);
+$node_primary->append_conf(
+ 'postgresql.conf',
+ qq[
+max_connections = 100
+log_statement = none
+]);
+$node_primary->start;
+
+# Prime the cluster with a bit of known data which we can read back to check
+# for data consistency as well as page verification faults in the logfile.
+$node_primary->safe_psql('postgres',
+ 'CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;');
+# Initialize and start pgbench in read/write mode against the cluster
+my $scalefactor = ($extended ? 10 : 1);
+$node_primary->command_ok(
+ [
+ 'pgbench', '-p', $node_primary->port, '-i', '-s', $scalefactor, '-q',
+ 'postgres'
+ ]);
+background_rw_pgbench($node_primary->port);
+
+# Take a backup to use for PITR
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+my ($pre_lsn, $post_lsn) = flip_data_checksums();    # NOTE(review): $pre_lsn is unused below -- kept for debugging?
+
+$node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;");
+$node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('a');");
+$node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;");
+$node_primary->stop('immediate');
+
+my $node_pitr = PostgreSQL::Test::Cluster->new('pitr_backup');
+$node_pitr->init_from_backup(
+ $node_primary, $backup_name,
+ standby => 0,
+ has_restoring => 1);
+$node_pitr->append_conf(
+ 'postgresql.conf', qq{
+recovery_target_lsn = '$post_lsn'
+recovery_target_action = 'promote'
+recovery_target_inclusive = on
+});
+
+$node_pitr->start;
+
+$node_pitr->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
+ or die "Timed out while waiting for PITR promotion";
+
+test_checksum_state($node_pitr, $data_checksum_state);
+my $result =
+ $node_pitr->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '99999', 'ensure data pages can be read back on primary');
+
+$node_pitr->stop;
+
+my $log = PostgreSQL::Test::Utils::slurp_file($node_pitr->logfile, 0);
+unlike(
+ $log,
+ qr/page verification failed,.+\d$/m,    # /m: match failures on any line of the multi-line log
+ "no checksum validation errors in pitr log");
+
+done_testing();
diff --git a/src/test/modules/test_checksums/t/009_fpi.pl b/src/test/modules/test_checksums/t/009_fpi.pl
new file mode 100644
index 00000000000..a1cea91f787
--- /dev/null
+++ b/src/test/modules/test_checksums/t/009_fpi.pl
@@ -0,0 +1,64 @@
+
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+# Create and start a cluster with one node
+my $node = PostgreSQL::Test::Cluster->new('fpi_node');
+$node->init(allows_streaming => 1, no_data_checksums => 1);
+# NOTE(review): no pgbench runs in this test, so the max_connections bump
+# below appears copy-pasted from the pgbench suites; log_statement is dialled
+# down to keep log volume small. Page verification failures are still logged.
+$node->append_conf(
+ 'postgresql.conf',
+ qq[
+max_connections = 100
+log_statement = none
+]);
+$node->start;
+$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
+# Create some content to have un-checksummed data in the cluster
+$node->safe_psql('postgres',
+ "CREATE TABLE t AS SELECT generate_series(1, 1000000) AS a;");
+
+# Enable data checksums and wait for the state transition to 'on'
+enable_data_checksums($node, wait => 'on');
+
+$node->safe_psql('postgres', 'UPDATE t SET a = a + 1;');
+
+disable_data_checksums($node, wait => 1);
+
+$node->append_conf('postgresql.conf', 'full_page_writes = off');
+$node->restart;
+test_checksum_state($node, 'off');
+
+$node->safe_psql('postgres', 'UPDATE t SET a = a + 1;');
+$node->safe_psql('postgres', 'DELETE FROM t WHERE a < 10000;');
+
+$node->adjust_conf('postgresql.conf', 'full_page_writes', 'on');
+$node->restart;
+test_checksum_state($node, 'off');
+
+enable_data_checksums($node, wait => 'on');
+
+my $result = $node->safe_psql('postgres', 'SELECT count(*) FROM t;');
+is($result, '990003', 'Reading back all data from table t');    # 1000000 - 9997 rows deleted (values 3..9999 after two +1 updates)
+
+$node->stop;
+my $log = PostgreSQL::Test::Utils::slurp_file($node->logfile, 0);
+unlike(
+ $log,
+ qr/page verification failed,.+\d$/m,    # /m: match failures on any line of the multi-line log
+ "no checksum validation errors in server log");
+
+done_testing();
diff --git a/src/test/modules/test_checksums/t/DataChecksums/Utils.pm b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm
new file mode 100644
index 00000000000..9a2269e8a92
--- /dev/null
+++ b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm
@@ -0,0 +1,262 @@
+
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+=pod
+
+=head1 NAME
+
+DataChecksums::Utils - Utility functions for testing data checksums in a running cluster
+
+=head1 SYNOPSIS
+
+ use PostgreSQL::Test::Cluster;
+ use DataChecksums::Utils qw( .. );
+
+ # Create, and start, a new cluster
+ my $node = PostgreSQL::Test::Cluster->new('primary');
+ $node->init;
+ $node->start;
+
+ test_checksum_state($node, 'off');
+
+ enable_data_checksums($node);
+
+ wait_for_checksum_state($node, 'on');
+
+
+=cut
+
+package DataChecksums::Utils;
+
+use strict;
+use warnings FATAL => 'all';
+use Exporter 'import';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+our @EXPORT = qw(
+ cointoss
+ disable_data_checksums
+ enable_data_checksums
+ random_sleep
+ stopmode
+ test_checksum_state
+ wait_for_checksum_state
+ wait_for_cluster_crash
+);
+
+=pod
+
+=head1 METHODS
+
+=over
+
+=item test_checksum_state(node, state)
+
+Test that the current value of the data checksum GUC in the server running
+at B matches B. If the values differ, a test failure is logged.
+Returns True if the values match, otherwise False.
+
+=cut
+
+sub test_checksum_state
+{
+	my ($node, $expected) = @_;
+
+	# Read the current value of the data_checksums GUC and compare.
+	my $query = "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';";
+	my $current = $node->safe_psql('postgres', $query);
+	is($current, $expected, 'ensure checksums are set to ' . $expected);
+	return $current eq $expected;
+}
+
+=item wait_for_checksum_state(node, state)
+
+Test the value of the data checksum GUC in the server running at B
+repeatedly until it matches B or times out. Processing will run for
+$PostgreSQL::Test::Utils::timeout_default seconds before timing out. If the
+values differ when the process times out, False is returned and a test failure
+is logged, otherwise True.
+
+=cut
+
+sub wait_for_checksum_state
+{
+	my ($node, $expected) = @_;
+
+	# Poll the data_checksums GUC until it reports the wanted state, or
+	# until poll_query_until() gives up after the default timeout.
+	my $query = "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';";
+	my $reached = $node->poll_query_until('postgres', $query, $expected);
+	is($reached, 1, 'ensure data checksums are transitioned to ' . $expected);
+	return $reached == 1;
+}
+
+=item wait_for_cluster_crash(node, params)
+
+Repeatedly test if the cluster running at B responds to connections
+and return when it no longer does so, or when it times out. Processing will
+run for $PostgreSQL::Test::Utils::timeout_default seconds unless a timeout
+value is specified as a parameter. Returns True if the cluster crashed, else
+False if the process timed out.
+
+=over
+
+=item timeout
+
+Approximate number of seconds to wait for cluster to crash, default is
+$PostgreSQL::Test::Utils::timeout_default. There are no real-time guarantees
+that the total process time won't exceed the timeout.
+
+=back
+
+=cut
+
+sub wait_for_cluster_crash
+{
+	my ($node, %params) = @_;
+
+	# Cap the wait at the caller-provided timeout, or the default one.
+	my $timeout = $params{timeout} // $PostgreSQL::Test::Utils::timeout_default;
+	my $crashed = 0;
+
+	# Probe once per second until the node stops responding.
+	foreach my $nap (1 .. $timeout)
+	{
+		if (!$node->is_alive)
+		{
+			$crashed = 1;
+			last;
+		}
+		sleep(1);
+	}
+
+	return $crashed == 1;
+}
+
+=item enable_data_checksums($node, %params)
+
+Function for enabling data checksums in the cluster running at B.
+
+=over
+
+=item cost_delay
+
+The B to use when enabling data checksums, default is 0.
+
+=item cost_limit
+
+The B to use when enabling data checksums, default is 100.
+
+=item wait
+
+If defined, the function will wait for the state defined in this parameter,
+or until waiting times out, before returning. The function will wait for
+$PostgreSQL::Test::Utils::timeout_default seconds before timing out.
+
+=back
+
+=cut
+
+sub enable_data_checksums
+{
+	my ($node, %params) = @_;
+
+	# Fall back to sane defaults when throttling settings are omitted.
+	my $cost_delay = $params{cost_delay} // 0;
+	my $cost_limit = $params{cost_limit} // 100;
+
+	my $query = <<'EOQ';
+SELECT pg_enable_data_checksums(%s, %s);
+EOQ
+
+	$node->safe_psql('postgres',
+		sprintf($query, $cost_delay, $cost_limit));
+
+	# Optionally block until the requested state has been reached.
+	wait_for_checksum_state($node, $params{wait})
+	  if (defined($params{wait}));
+}
+
+=item disable_data_checksums($node, %params)
+
+Function for disabling data checksums in the cluster running at B.
+
+=over
+
+=item wait
+
+If defined, the function will wait for the state to turn to B, or
+until waiting times out, before returning. The function will wait for
+$PostgreSQL::Test::Utils::timeout_default seconds before timing out.
+Unlike in C the value of the parameter is discarded.
+
+=back
+
+=cut
+
+sub disable_data_checksums
+{
+	my ($node, %params) = @_;
+
+	$node->safe_psql('postgres', 'SELECT pg_disable_data_checksums();');
+
+	# The wait parameter's value is ignored; its presence requests waiting.
+	wait_for_checksum_state($node, 'off')
+	  if (defined($params{wait}));
+}
+
+=item cointoss
+
+Helper for retrieving a binary value with random distribution for deciding
+whether to turn things off during testing.
+
+
+
+=cut
+
+sub cointoss
+{
+	return (rand() < 0.5) ? 1 : 0;
+}
+
+=item random_sleep(max)
+
+Helper for injecting random sleeps here and there in the testrun. The sleep
+duration will be in the range (0,B), but won't be predictable in order to
+avoid sleep patterns that manage to avoid race conditions and timing bugs.
+The default B is 3 seconds.
+
+
+
+=cut
+
+sub random_sleep
+{
+	my $max = shift // 3;
+	return if ($max == 0);
+	sleep(int(rand($max))) if cointoss();
+}
+
+=item stopmode
+
+Small helper function for randomly selecting a valid stopmode.
+
+
+
+=cut
+
+sub stopmode
+{
+	# Pick one of the two valid shutdown modes at random.
+	return cointoss() ? 'immediate' : 'fast';
+}
+
+=pod
+
+=back
+
+=cut
+
+1;
diff --git a/src/test/modules/test_checksums/test_checksums--1.0.sql b/src/test/modules/test_checksums/test_checksums--1.0.sql
new file mode 100644
index 00000000000..90642d247fa
--- /dev/null
+++ b/src/test/modules/test_checksums/test_checksums--1.0.sql
@@ -0,0 +1,24 @@
+/* src/test/modules/test_checksums/test_checksums--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_checksums" to load this file. \quit
+-- STRICT: the C code reads the argument without any PG_ARGISNULL() checks.
+CREATE FUNCTION dcw_inject_delay_barrier(attach boolean DEFAULT true)
+  RETURNS pg_catalog.void
+  AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
+CREATE FUNCTION dcw_inject_launcher_delay(attach boolean DEFAULT true)
+  RETURNS pg_catalog.void
+  AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
+CREATE FUNCTION dcw_inject_startup_delay(attach boolean DEFAULT true)
+  RETURNS pg_catalog.void
+  AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
+CREATE FUNCTION dcw_inject_fail_database(attach boolean DEFAULT true)
+  RETURNS pg_catalog.void
+  AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
+CREATE FUNCTION dcw_fake_temptable(attach boolean DEFAULT true)
+  RETURNS pg_catalog.void
+  AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
diff --git a/src/test/modules/test_checksums/test_checksums.c b/src/test/modules/test_checksums/test_checksums.c
new file mode 100644
index 00000000000..b087a3b4664
--- /dev/null
+++ b/src/test/modules/test_checksums/test_checksums.c
@@ -0,0 +1,184 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_checksums.c
+ * Test data checksums
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/test/modules/test_checksums/test_checksums.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "postmaster/datachecksum_state.h"
+#include "storage/latch.h"
+#include "utils/injection_point.h"
+#include "utils/wait_event.h"
+
+PG_MODULE_MAGIC;
+
+extern PGDLLEXPORT void dc_delay_barrier(const char *name, const void *private_data, void *arg);
+extern PGDLLEXPORT void dc_modify_db_result(const char *name, const void *private_data, void *arg);
+extern PGDLLEXPORT void dc_fake_temptable(const char *name, const void *private_data, void *arg);
+
+extern PGDLLEXPORT void crash(const char *name, const void *private_data, void *arg);
+
+/*
+ * Injection point callback delaying the caller by up to three seconds.
+ */
+void
+dc_delay_barrier(const char *name, const void *private_data, void *arg)
+{
+	(void) name;				/* unused injection point arguments */
+	(void) private_data;
+
+	(void) WaitLatch(MyLatch,	/* wake early on latch set or postmaster death */
+					 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+					 (3 * 1000),	/* timeout in milliseconds */
+					 WAIT_EVENT_PG_SLEEP);
+}
+
+PG_FUNCTION_INFO_V1(dcw_inject_delay_barrier);
+Datum
+dcw_inject_delay_barrier(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	bool		attach = PG_GETARG_BOOL(0);	/* true: attach; false: detach */
+
+	if (attach)
+		InjectionPointAttach("datachecksums-enable-checksums-delay",
+							 "test_checksums",	/* library with the callback */
+							 "dc_delay_barrier",	/* delaying callback above */
+							 NULL,
+							 0);
+	else
+		InjectionPointDetach("datachecksums-enable-checksums-delay");
+#else
+	elog(ERROR,
+		 "test is not working as intended when injection points are disabled");
+#endif
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(dcw_inject_launcher_delay);
+Datum
+dcw_inject_launcher_delay(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	bool		attach = PG_GETARG_BOOL(0);	/* true: attach; false: detach */
+
+	if (attach)
+		InjectionPointAttach("datachecksumsworker-launcher-delay",
+							 "test_checksums",	/* library with the callback */
+							 "dc_delay_barrier",	/* reuse the delay callback */
+							 NULL,
+							 0);
+	else
+		InjectionPointDetach("datachecksumsworker-launcher-delay");
+#else
+	elog(ERROR,
+		 "test is not working as intended when injection points are disabled");
+#endif
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(dcw_inject_startup_delay);
+Datum
+dcw_inject_startup_delay(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	bool		attach = PG_GETARG_BOOL(0);	/* true: attach; false: detach */
+
+	if (attach)
+		InjectionPointAttach("datachecksumsworker-startup-delay",
+							 "test_checksums",	/* library with the callback */
+							 "dc_delay_barrier",	/* reuse the delay callback */
+							 NULL,
+							 0);
+	else
+		InjectionPointDetach("datachecksumsworker-startup-delay");
+#else
+	elog(ERROR,
+		 "test is not working as intended when injection points are disabled");
+#endif
+	PG_RETURN_VOID();
+}
+
+static uint32 db_fail = DATACHECKSUMSWORKER_FAILED;	/* result to inject */
+
+void
+dc_modify_db_result(const char *name, const void *private_data, void *arg)
+{
+	DataChecksumsWorkerResult *res = (DataChecksumsWorkerResult *) arg;	/* worker's result */
+	uint32		new_res = *(uint32 *) private_data;	/* value set at attach time */
+
+	*res = new_res;		/* override the per-database checksum worker result */
+}
+
+PG_FUNCTION_INFO_V1(dcw_inject_fail_database);
+Datum
+dcw_inject_fail_database(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	bool		attach = PG_GETARG_BOOL(0);	/* true: attach; false: detach */
+
+	if (attach)
+		InjectionPointAttach("datachecksumsworker-modify-db-result",
+							 "test_checksums",	/* library with the callback */
+							 "dc_modify_db_result",	/* result-overriding callback */
+							 &db_fail,	/* private data: result to force */
+							 sizeof(uint32));
+	else
+		InjectionPointDetach("datachecksumsworker-modify-db-result");
+#else
+	elog(ERROR,
+		 "test is not working as intended when injection points are disabled");
+#endif
+	PG_RETURN_VOID();
+}
+
+/*
+ * Test to force waiting for existing temptables.
+ */
+void
+dc_fake_temptable(const char *name, const void *private_data, void *arg)
+{
+	static bool first_pass = true;	/* only fake a temp table once */
+	int		   *numleft = (int *) arg;	/* temp tables still remaining */
+
+	if (first_pass)
+		*numleft = 1;	/* pretend one temp table remains on the first call */
+	first_pass = false;
+}
+
+PG_FUNCTION_INFO_V1(dcw_fake_temptable);
+Datum
+dcw_fake_temptable(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	bool		attach = PG_GETARG_BOOL(0);	/* true: attach; false: detach */
+
+	if (attach)
+		InjectionPointAttach("datachecksumsworker-fake-temptable-wait",
+							 "test_checksums",	/* library with the callback */
+							 "dc_fake_temptable",	/* temp-table faking callback */
+							 NULL,
+							 0);
+	else
+		InjectionPointDetach("datachecksumsworker-fake-temptable-wait");
+#else
+	elog(ERROR,
+		 "test is not working as intended when injection points are disabled");
+#endif
+	PG_RETURN_VOID();
+}
+
+void
+crash(const char *name, const void *private_data, void *arg)
+{
+	abort();	/* injection point callback simulating a hard crash */
+}
diff --git a/src/test/modules/test_checksums/test_checksums.control b/src/test/modules/test_checksums/test_checksums.control
new file mode 100644
index 00000000000..84b4cc035a7
--- /dev/null
+++ b/src/test/modules/test_checksums/test_checksums.control
@@ -0,0 +1,4 @@
+comment = 'Test code for data checksums'
+default_version = '1.0'
+module_pathname = '$libdir/test_checksums'
+relocatable = true
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index f8dc732e66e..54e6b646e8f 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -3898,6 +3898,42 @@ sub advance_wal
}
}
+=item $node->checksum_enable_offline()
+
+Enable data page checksums in an offline cluster with B. The
+caller is responsible for ensuring that the cluster is in the right state for
+this operation.
+
+=cut
+
+sub checksum_enable_offline
+{
+	my ($self) = @_;
+	# Use @{[ ]} since method calls don't interpolate inside quoted strings
+	print "# Enabling checksums in \"@{[ $self->data_dir ]}\"\n";
+	PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D',
+		$self->data_dir, '-e');
+	return;
+}
+
+=item $node->checksum_disable_offline()
+
+Disable data page checksums in an offline cluster with B. The
+caller is responsible for ensuring that the cluster is in the right state for
+this operation.
+
+=cut
+
+sub checksum_disable_offline
+{
+	my ($self) = @_;
+	# Use @{[ ]} since method calls don't interpolate inside quoted strings
+	print "# Disabling checksums in \"@{[ $self->data_dir ]}\"\n";
+	PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D',
+		$self->data_dir, '-d');
+	return;
+}
+
=pod
=back
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 2b3cf6d8569..81a73c426d2 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2085,6 +2085,41 @@ pg_stat_progress_create_index| SELECT s.pid,
s.param15 AS partitions_done
FROM (pg_stat_get_progress_info('CREATE INDEX'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20)
LEFT JOIN pg_database d ON ((s.datid = d.oid)));
+pg_stat_progress_data_checksums| SELECT s.pid,
+ s.datid,
+ d.datname,
+ CASE s.param1
+ WHEN 0 THEN 'enabling'::text
+ WHEN 1 THEN 'disabling'::text
+ WHEN 2 THEN 'waiting on temporary tables'::text
+ WHEN 3 THEN 'waiting on barrier'::text
+ WHEN 4 THEN 'done'::text
+ ELSE NULL::text
+ END AS phase,
+ CASE s.param2
+ WHEN '-1'::integer THEN NULL::bigint
+ ELSE s.param2
+ END AS databases_total,
+ s.param3 AS databases_done,
+ CASE s.param4
+ WHEN '-1'::integer THEN NULL::bigint
+ ELSE s.param4
+ END AS relations_total,
+ CASE s.param5
+ WHEN '-1'::integer THEN NULL::bigint
+ ELSE s.param5
+ END AS relations_done,
+ CASE s.param6
+ WHEN '-1'::integer THEN NULL::bigint
+ ELSE s.param6
+ END AS blocks_total,
+ CASE s.param7
+ WHEN '-1'::integer THEN NULL::bigint
+ ELSE s.param7
+ END AS blocks_done
+ FROM (pg_stat_get_progress_info('DATACHECKSUMS'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20)
+ LEFT JOIN pg_database d ON ((s.datid = d.oid)))
+ ORDER BY s.datid;
pg_stat_progress_repack| SELECT s.pid,
s.datid,
d.datname,
diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out
index ea7f7846895..35632f83052 100644
--- a/src/test/regress/expected/stats.out
+++ b/src/test/regress/expected/stats.out
@@ -51,6 +51,22 @@ client backend|relation|vacuum
client backend|temp relation|normal
client backend|wal|init
client backend|wal|normal
+datachecksum launcher|relation|bulkread
+datachecksum launcher|relation|bulkwrite
+datachecksum launcher|relation|init
+datachecksum launcher|relation|normal
+datachecksum launcher|relation|vacuum
+datachecksum launcher|temp relation|normal
+datachecksum launcher|wal|init
+datachecksum launcher|wal|normal
+datachecksum worker|relation|bulkread
+datachecksum worker|relation|bulkwrite
+datachecksum worker|relation|init
+datachecksum worker|relation|normal
+datachecksum worker|relation|vacuum
+datachecksum worker|temp relation|normal
+datachecksum worker|wal|init
+datachecksum worker|wal|normal
io worker|relation|bulkread
io worker|relation|bulkwrite
io worker|relation|init
@@ -95,7 +111,7 @@ walsummarizer|wal|init
walsummarizer|wal|normal
walwriter|wal|init
walwriter|wal|normal
-(79 rows)
+(95 rows)
\a
-- ensure that both seqscan and indexscan plans are allowed
SET enable_seqscan TO on;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 91b1225da82..ad999aa48dd 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -438,6 +438,8 @@ CheckPointStmt
CheckpointStatsData
CheckpointerRequest
CheckpointerShmemStruct
+ChecksumBarrierCondition
+ChecksumStateType
Chromosome
CkptSortItem
CkptTsStatus
@@ -610,6 +612,7 @@ CustomScan
CustomScanMethods
CustomScanState
CycleCtr
+DataChecksumsWorkerOperation
DBState
DbOidName
DCHCacheEntry
@@ -628,6 +631,9 @@ DSMREntryType
DSMRegistryCtxStruct
DSMRegistryEntry
DWORD
+DataChecksumsWorkerDatabase
+DataChecksumsWorkerResult
+DataChecksumsStateStruct
DataDirSyncMethod
DataDumperPtr
DataPageDeleteStack
@@ -4405,6 +4411,7 @@ xl_btree_unlink_page
xl_btree_update
xl_btree_vacuum
xl_checkpoint_redo
+xl_checksum_state
xl_clog_truncate
xl_commit_ts_truncate
xl_dbase_create_file_copy_rec