Detect and fix visibility map corruption in more cases

Move VM corruption detection and repair into heap page pruning. This
allows VM repair during on-access pruning, not only during vacuum.

Also, expand corruption detection to cover pages marked all-visible that
contain dead tuples and tuples inserted or deleted by in-progress
transactions, rather than only all-visible pages with LP_DEAD items.

Pinning the correct VM page before on-access pruning is cheap when
compared to the cost of actually pruning. The vmbuffer is saved in the
scan descriptor, so a query should only need to pin each VM page once,
and a single VM page covers a large number of heap pages.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/bqc4kh5midfn44gnjiqez3bjqv4zogydguvdn446riw45jcf3y%404ez66il7ebvk
This commit is contained in:
Melanie Plageman 2026-03-22 11:52:40 -04:00
parent 516310ed4d
commit 4f7ecca84d
4 changed files with 221 additions and 102 deletions

View file

@ -19,7 +19,7 @@
#include "access/htup_details.h"
#include "access/multixact.h"
#include "access/transam.h"
#include "access/visibilitymapdefs.h"
#include "access/visibilitymap.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "commands/vacuum.h"
@ -114,6 +114,21 @@ typedef struct
*/
HeapPageFreeze pagefrz;
/*-------------------------------------------------------
* Working state for visibility map processing
*-------------------------------------------------------
*/
/*
* Caller must provide a pinned vmbuffer corresponding to the heap block
* passed to heap_page_prune_and_freeze(). We will fix any corruption
* found in the VM.
*/
Buffer vmbuffer;
/* Bits in the vmbuffer for this heap page */
uint8 old_vmbits;
/*-------------------------------------------------------
* Information about what was done
*
@ -162,12 +177,31 @@ typedef struct
TransactionId visibility_cutoff_xid;
} PruneState;
/*
 * Type of visibility map corruption detected on a heap page and its
 * associated VM page. Passed to heap_page_fix_vm_corruption() so the caller
 * can specify what it found rather than having the function rederive the
 * corruption from page state. All variants cause the VM bits for the page
 * to be cleared; variants where PD_ALL_VISIBLE is wrongly set additionally
 * clear that page-level flag.
 */
typedef enum VMCorruptionType
{
	/*
	 * VM bits are set but the heap page-level PD_ALL_VISIBLE flag is not.
	 * Detected before pruning; not attributable to a single tuple.
	 */
	VM_CORRUPT_MISSING_PAGE_HINT,
	/* LP_DEAD line pointers found on a page marked all-visible */
	VM_CORRUPT_LPDEAD,
	/*
	 * Tuple not visible to all transactions (dead version, or in-progress
	 * insert/delete) on a page marked all-visible.
	 */
	VM_CORRUPT_TUPLE_VISIBILITY,
} VMCorruptionType;
/* Local functions */
static void prune_freeze_setup(PruneFreezeParams *params,
TransactionId *new_relfrozen_xid,
MultiXactId *new_relmin_mxid,
PruneFreezeResult *presult,
PruneState *prstate);
static void heap_page_fix_vm_corruption(PruneState *prstate,
OffsetNumber offnum,
VMCorruptionType ctype);
static void prune_freeze_plan(PruneState *prstate,
OffsetNumber *off_loc);
static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
@ -175,7 +209,8 @@ static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
static inline HTSV_Result htsv_get_valid_status(int status);
static void heap_prune_chain(OffsetNumber maxoff,
OffsetNumber rootoffnum, PruneState *prstate);
static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid,
OffsetNumber offnum);
static void heap_prune_record_redirect(PruneState *prstate,
OffsetNumber offnum, OffsetNumber rdoffnum,
bool was_normal);
@ -209,8 +244,9 @@ static bool heap_page_will_freeze(bool did_tuple_hint_fpi, bool do_prune, bool d
* Caller must have pin on the buffer, and must *not* have a lock on it.
*
* This function may pin *vmbuffer. It's passed by reference so the caller can
* reuse the pin across calls, avoiding repeated pin/unpin cycles. Caller is
* responsible for unpinning it.
* reuse the pin across calls, avoiding repeated pin/unpin cycles. If we find
* VM corruption during pruning, we will fix it. Caller is responsible for
* unpinning *vmbuffer.
*/
void
heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer)
@ -277,6 +313,17 @@ heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer)
{
OffsetNumber dummy_off_loc;
PruneFreezeResult presult;
PruneFreezeParams params;
visibilitymap_pin(relation, BufferGetBlockNumber(buffer),
vmbuffer);
params.relation = relation;
params.buffer = buffer;
params.vmbuffer = *vmbuffer;
params.reason = PRUNE_ON_ACCESS;
params.vistest = vistest;
params.cutoffs = NULL;
/*
* We don't pass the HEAP_PAGE_PRUNE_MARK_UNUSED_NOW option
@ -284,14 +331,7 @@ heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer)
* cannot safely determine that during on-access pruning with the
* current implementation.
*/
PruneFreezeParams params = {
.relation = relation,
.buffer = buffer,
.reason = PRUNE_ON_ACCESS,
.options = 0,
.vistest = vistest,
.cutoffs = NULL,
};
params.options = 0;
heap_page_prune_and_freeze(&params, &presult, &dummy_off_loc,
NULL, NULL);
@ -354,6 +394,12 @@ prune_freeze_setup(PruneFreezeParams *params,
prstate->buffer = params->buffer;
prstate->page = BufferGetPage(params->buffer);
Assert(BufferIsValid(params->vmbuffer));
prstate->vmbuffer = params->vmbuffer;
prstate->old_vmbits = visibilitymap_get_status(prstate->relation,
prstate->block,
&prstate->vmbuffer);
/*
* Our strategy is to scan the page and make lists of items to change,
* then apply the changes within a critical section. This keeps as much
@ -770,6 +816,108 @@ heap_page_will_freeze(bool did_tuple_hint_fpi,
return do_freeze;
}
/*
 * Emit a warning about and fix visibility map corruption on the given page.
 *
 * The caller specifies the type of corruption it has already detected via
 * corruption_type, so that we can emit the appropriate warning. All cases
 * result in the VM bits being cleared; corruption types where PD_ALL_VISIBLE
 * is incorrectly set also clear PD_ALL_VISIBLE.
 *
 * offnum identifies the offending tuple for the warning's error context. It
 * is not used for VM_CORRUPT_MISSING_PAGE_HINT, which is not attributable to
 * a single tuple, so callers may pass InvalidOffsetNumber in that case.
 *
 * Must be called while holding an exclusive lock on the heap buffer. Dead
 * items and not all-visible tuples must have been discovered under that same
 * lock. Although we do not hold a lock on the VM buffer, it is pinned, and
 * the heap buffer is exclusively locked, ensuring that no other backend can
 * update the VM bits corresponding to this heap page.
 *
 * This function makes changes to the VM and, potentially, the heap page, but
 * it does not need to be done in a critical section.
 */
static void
heap_page_fix_vm_corruption(PruneState *prstate, OffsetNumber offnum,
							VMCorruptionType corruption_type)
{
	const char *relname = RelationGetRelationName(prstate->relation);
	bool		do_clear_vm = false;
	bool		do_clear_heap = false;

	Assert(BufferIsLockedByMeInMode(prstate->buffer, BUFFER_LOCK_EXCLUSIVE));

	switch (corruption_type)
	{
		case VM_CORRUPT_LPDEAD:
			ereport(WARNING,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("dead line pointer found on page marked all-visible"),
					 errcontext("relation \"%s\", page %u, tuple %u",
								relname, prstate->block, offnum)));
			do_clear_vm = true;
			do_clear_heap = true;
			break;
		case VM_CORRUPT_TUPLE_VISIBILITY:

			/*
			 * A HEAPTUPLE_LIVE tuple on an all-visible page can appear to not
			 * be visible to everyone when
			 * GetOldestNonRemovableTransactionId() returns a conservative
			 * value that's older than the real safe xmin. That is not
			 * corruption -- the PD_ALL_VISIBLE flag is still correct.
			 *
			 * However, dead tuple versions, in-progress inserts, and
			 * in-progress deletes should never appear on a page marked
			 * all-visible. That indicates real corruption. PD_ALL_VISIBLE
			 * should have been cleared by the DML operation that deleted or
			 * inserted the tuple.
			 */
			ereport(WARNING,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("tuple not visible to all transactions found on page marked all-visible"),
					 errcontext("relation \"%s\", page %u, tuple %u",
								relname, prstate->block, offnum)));
			do_clear_vm = true;
			do_clear_heap = true;
			break;
		case VM_CORRUPT_MISSING_PAGE_HINT:

			/*
			 * As of PostgreSQL 9.2, the visibility map bit should never be
			 * set if the page-level bit is clear. However, for vacuum, it's
			 * possible that the bit got cleared after
			 * heap_vac_scan_next_block() was called, so we must recheck now
			 * that we have the buffer lock before concluding that the VM is
			 * corrupt.
			 */
			Assert(!PageIsAllVisible(prstate->page));
			Assert(prstate->old_vmbits & VISIBILITYMAP_VALID_BITS);
			ereport(WARNING,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("page is not marked all-visible but visibility map bit is set"),
					 errcontext("relation \"%s\", page %u",
								relname, prstate->block)));
			do_clear_vm = true;
			break;
	}

	/* Every corruption type must request at least one repair action */
	Assert(do_clear_heap || do_clear_vm);

	/* Avoid marking the buffer dirty if PD_ALL_VISIBLE is already clear */
	if (do_clear_heap)
	{
		Assert(PageIsAllVisible(prstate->page));
		PageClearAllVisible(prstate->page);

		/*
		 * NOTE(review): PD_ALL_VISIBLE is cleared via the hint-bit path
		 * here, whereas the vacuum-side helper this replaces used
		 * MarkBufferDirty -- confirm the hint path is intentional.
		 */
		MarkBufferDirtyHint(prstate->buffer, true);
	}

	if (do_clear_vm)
	{
		visibilitymap_clear(prstate->relation, prstate->block,
							prstate->vmbuffer,
							VISIBILITYMAP_VALID_BITS);

		/* Reflect the repair so later decisions see the VM as clear */
		prstate->old_vmbits = 0;
	}
}
/*
* Prune and repair fragmentation and potentially freeze tuples on the
@ -830,6 +978,16 @@ heap_page_prune_and_freeze(PruneFreezeParams *params,
new_relfrozen_xid, new_relmin_mxid,
presult, &prstate);
/*
* If the VM is set but PD_ALL_VISIBLE is clear, fix that corruption
* before pruning and freezing so that the page and VM start out in a
* consistent state.
*/
if ((prstate.old_vmbits & VISIBILITYMAP_VALID_BITS) &&
!PageIsAllVisible(prstate.page))
heap_page_fix_vm_corruption(&prstate, InvalidOffsetNumber,
VM_CORRUPT_MISSING_PAGE_HINT);
/*
* Examine all line pointers and tuple visibility information to determine
* which line pointers should change state and which tuples may be frozen.
@ -973,6 +1131,7 @@ heap_page_prune_and_freeze(PruneFreezeParams *params,
presult->set_all_visible = prstate.set_all_visible;
presult->set_all_frozen = prstate.set_all_frozen;
presult->hastup = prstate.hastup;
presult->old_vmbits = prstate.old_vmbits;
/*
* For callers planning to update the visibility map, the conflict horizon
@ -1295,7 +1454,8 @@ process_chain:
/* Record lowest soon-prunable XID */
static void
heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
heap_prune_record_prunable(PruneState *prstate, TransactionId xid,
OffsetNumber offnum)
{
/*
* This should exactly match the PageSetPrunable macro. We can't store
@ -1305,6 +1465,14 @@ heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
if (!TransactionIdIsValid(prstate->new_prune_xid) ||
TransactionIdPrecedes(xid, prstate->new_prune_xid))
prstate->new_prune_xid = xid;
/*
* It's incorrect for a page to be marked all-visible if it contains
* prunable items.
*/
if (PageIsAllVisible(prstate->page))
heap_page_fix_vm_corruption(prstate, offnum,
VM_CORRUPT_TUPLE_VISIBILITY);
}
/* Record line pointer to be redirected */
@ -1388,6 +1556,15 @@ heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
heap_prune_record_unused(prstate, offnum, was_normal);
else
heap_prune_record_dead(prstate, offnum, was_normal);
/*
* It's incorrect for the page to be set all-visible if it contains dead
* items. Fix that on the heap page and check the VM for corruption as
* well. Do that here rather than in heap_prune_record_dead() so we also
* cover tuples that are directly marked LP_UNUSED via mark_unused_now.
*/
if (PageIsAllVisible(prstate->page))
heap_page_fix_vm_corruption(prstate, offnum, VM_CORRUPT_LPDEAD);
}
/* Record line pointer to be marked unused */
@ -1527,7 +1704,8 @@ heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum)
* that the page is reconsidered for pruning in future.
*/
heap_prune_record_prunable(prstate,
HeapTupleHeaderGetUpdateXid(htup));
HeapTupleHeaderGetUpdateXid(htup),
offnum);
break;
case HEAPTUPLE_INSERT_IN_PROGRESS:
@ -1542,6 +1720,11 @@ heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum)
prstate->set_all_visible = false;
prstate->set_all_frozen = false;
/* The page should not be marked all-visible */
if (PageIsAllVisible(page))
heap_page_fix_vm_corruption(prstate, offnum,
VM_CORRUPT_TUPLE_VISIBILITY);
/*
* If we wanted to optimize for aborts, we might consider marking
* the page prunable when we see INSERT_IN_PROGRESS. But we
@ -1566,7 +1749,8 @@ heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum)
* the page is reconsidered for pruning in future.
*/
heap_prune_record_prunable(prstate,
HeapTupleHeaderGetUpdateXid(htup));
HeapTupleHeaderGetUpdateXid(htup),
offnum);
break;
default:
@ -1632,6 +1816,13 @@ heap_prune_record_unchanged_lp_dead(PruneState *prstate, OffsetNumber offnum)
/* Record the dead offset for vacuum */
prstate->deadoffsets[prstate->lpdead_items++] = offnum;
/*
* It's incorrect for a page to be marked all-visible if it contains dead
* items.
*/
if (PageIsAllVisible(prstate->page))
heap_page_fix_vm_corruption(prstate, offnum, VM_CORRUPT_LPDEAD);
}
/*

View file

@ -432,11 +432,6 @@ static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis);
static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf,
BlockNumber blkno, Page page,
bool sharelock, Buffer vmbuffer);
static void identify_and_fix_vm_corruption(Relation rel, Buffer heap_buffer,
BlockNumber heap_blk, Page heap_page,
int nlpdead_items,
Buffer vmbuffer,
uint8 *vmbits);
static int lazy_scan_prune(LVRelState *vacrel, Buffer buf,
BlockNumber blkno, Page page,
Buffer vmbuffer,
@ -1989,81 +1984,6 @@ cmpOffsetNumbers(const void *a, const void *b)
return pg_cmp_u16(*(const OffsetNumber *) a, *(const OffsetNumber *) b);
}
/*
 * Helper to correct any corruption detected on a heap page and its
 * corresponding visibility map page after pruning but before setting the
 * visibility map. It examines the heap page, the associated VM page, and the
 * number of dead items previously identified.
 *
 * This function must be called while holding an exclusive lock on the heap
 * buffer, and the dead items must have been discovered under that same lock.
 * The provided vmbits must reflect the current state of the VM block
 * referenced by vmbuffer. Although we do not hold a lock on the VM buffer, it
 * is pinned, and the heap buffer is exclusively locked, ensuring that no
 * other backend can update the VM bits corresponding to this heap page.
 *
 * If it clears corruption, it will zero out vmbits so the caller's cached
 * copy stays in sync with the on-disk VM state.
 */
static void
identify_and_fix_vm_corruption(Relation rel, Buffer heap_buffer,
							   BlockNumber heap_blk, Page heap_page,
							   int nlpdead_items,
							   Buffer vmbuffer,
							   uint8 *vmbits)
{
	/* Caller-supplied vmbits must match the VM's current contents */
	Assert(visibilitymap_get_status(rel, heap_blk, &vmbuffer) == *vmbits);
	Assert(BufferIsLockedByMeInMode(heap_buffer, BUFFER_LOCK_EXCLUSIVE));

	/*
	 * As of PostgreSQL 9.2, the visibility map bit should never be set if the
	 * page-level bit is clear. However, it's possible that the bit got
	 * cleared after heap_vac_scan_next_block() was called, so we must recheck
	 * with buffer lock before concluding that the VM is corrupt.
	 */
	if (!PageIsAllVisible(heap_page) &&
		((*vmbits & VISIBILITYMAP_VALID_BITS) != 0))
	{
		ereport(WARNING,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
						RelationGetRelationName(rel), heap_blk)));
		visibilitymap_clear(rel, heap_blk, vmbuffer,
							VISIBILITYMAP_VALID_BITS);
		*vmbits = 0;
	}

	/*
	 * It's possible for the value returned by
	 * GetOldestNonRemovableTransactionId() to move backwards, so it's not
	 * wrong for us to see tuples that appear to not be visible to everyone
	 * yet, while PD_ALL_VISIBLE is already set. The real safe xmin value
	 * never moves backwards, but GetOldestNonRemovableTransactionId() is
	 * conservative and sometimes returns a value that's unnecessarily small,
	 * so if we see that contradiction it just means that the tuples that we
	 * think are not visible to everyone yet actually are, and the
	 * PD_ALL_VISIBLE flag is correct.
	 *
	 * There should never be LP_DEAD items on a page with PD_ALL_VISIBLE set,
	 * however.
	 */
	else if (PageIsAllVisible(heap_page) && nlpdead_items > 0)
	{
		ereport(WARNING,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u",
						RelationGetRelationName(rel), heap_blk)));
		PageClearAllVisible(heap_page);
		MarkBufferDirty(heap_buffer);
		visibilitymap_clear(rel, heap_blk, vmbuffer,
							VISIBILITYMAP_VALID_BITS);
		*vmbits = 0;
	}
}
/*
* lazy_scan_prune() -- lazy_scan_heap() pruning and freezing.
*
@ -2095,6 +2015,7 @@ lazy_scan_prune(LVRelState *vacrel,
PruneFreezeParams params = {
.relation = rel,
.buffer = buf,
.vmbuffer = vmbuffer,
.reason = PRUNE_VACUUM_SCAN,
.options = HEAP_PAGE_PRUNE_FREEZE,
.vistest = vacrel->vistest,
@ -2204,18 +2125,12 @@ lazy_scan_prune(LVRelState *vacrel,
Assert(!presult.set_all_visible || !(*has_lpdead_items));
Assert(!presult.set_all_frozen || presult.set_all_visible);
old_vmbits = visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer);
identify_and_fix_vm_corruption(vacrel->rel, buf, blkno, page,
presult.lpdead_items, vmbuffer,
&old_vmbits);
if (!presult.set_all_visible)
return presult.ndeleted;
/* Set the visibility map and page visibility hint */
old_vmbits = presult.old_vmbits;
new_vmbits = VISIBILITYMAP_ALL_VISIBLE;
if (presult.set_all_frozen)
new_vmbits |= VISIBILITYMAP_ALL_FROZEN;

View file

@ -262,6 +262,12 @@ typedef struct PruneFreezeParams
Relation relation; /* relation containing buffer to be pruned */
Buffer buffer; /* buffer to be pruned */
/*
* Callers should provide a pinned vmbuffer corresponding to the heap
* block in buffer. We will check for and repair any corruption in the VM.
*/
Buffer vmbuffer;
/*
* The reason pruning was performed. It is used to set the WAL record
* opcode which is used for debugging and analysis purposes.
@ -324,6 +330,12 @@ typedef struct PruneFreezeResult
bool set_all_frozen;
TransactionId vm_conflict_horizon;
/*
* The value of the vmbuffer's vmbits at the beginning of pruning. It is
* cleared if VM corruption is found and corrected.
*/
uint8 old_vmbits;
/*
* Whether or not the page makes rel truncation unsafe. This is set to
* 'true', even if the page contains LP_DEAD items. VACUUM will remove

View file

@ -3277,6 +3277,7 @@ UserAuth
UserContext
UserMapping
UserOpts
VMCorruptionType
VacAttrStats
VacAttrStatsP
VacDeadItemsInfo