diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 8d9f0694206..05dd9c29d96 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -19,7 +19,7 @@ #include "access/htup_details.h" #include "access/multixact.h" #include "access/transam.h" -#include "access/visibilitymapdefs.h" +#include "access/visibilitymap.h" #include "access/xlog.h" #include "access/xloginsert.h" #include "commands/vacuum.h" @@ -114,6 +114,21 @@ typedef struct */ HeapPageFreeze pagefrz; + /*------------------------------------------------------- + * Working state for visibility map processing + *------------------------------------------------------- + */ + + /* + * Caller must provide a pinned vmbuffer corresponding to the heap block + * passed to heap_page_prune_and_freeze(). We will fix any corruption + * found in the VM. + */ + Buffer vmbuffer; + + /* Bits in the vmbuffer for this heap page */ + uint8 old_vmbits; + /*------------------------------------------------------- * Information about what was done * @@ -162,12 +177,31 @@ typedef struct TransactionId visibility_cutoff_xid; } PruneState; +/* + * Type of visibility map corruption detected on a heap page and its + * associated VM page. Passed to heap_page_fix_vm_corruption() so the caller + * can specify what it found rather than having the function rederive the + * corruption from page state. 
+ */ +typedef enum VMCorruptionType +{ + /* VM bits are set but the heap page-level PD_ALL_VISIBLE flag is not */ + VM_CORRUPT_MISSING_PAGE_HINT, + /* LP_DEAD line pointers found on a page marked all-visible */ + VM_CORRUPT_LPDEAD, + /* Tuple not visible to all transactions on a page marked all-visible */ + VM_CORRUPT_TUPLE_VISIBILITY, +} VMCorruptionType; + /* Local functions */ static void prune_freeze_setup(PruneFreezeParams *params, TransactionId *new_relfrozen_xid, MultiXactId *new_relmin_mxid, PruneFreezeResult *presult, PruneState *prstate); +static void heap_page_fix_vm_corruption(PruneState *prstate, + OffsetNumber offnum, + VMCorruptionType ctype); static void prune_freeze_plan(PruneState *prstate, OffsetNumber *off_loc); static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate, @@ -175,7 +209,8 @@ static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate, static inline HTSV_Result htsv_get_valid_status(int status); static void heap_prune_chain(OffsetNumber maxoff, OffsetNumber rootoffnum, PruneState *prstate); -static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid); +static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid, + OffsetNumber offnum); static void heap_prune_record_redirect(PruneState *prstate, OffsetNumber offnum, OffsetNumber rdoffnum, bool was_normal); @@ -209,8 +244,9 @@ static bool heap_page_will_freeze(bool did_tuple_hint_fpi, bool do_prune, bool d * Caller must have pin on the buffer, and must *not* have a lock on it. * * This function may pin *vmbuffer. It's passed by reference so the caller can - * reuse the pin across calls, avoiding repeated pin/unpin cycles. Caller is - * responsible for unpinning it. + * reuse the pin across calls, avoiding repeated pin/unpin cycles. If we find + * VM corruption during pruning, we will fix it. Caller is responsible for + * unpinning *vmbuffer. 
*/ void heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer) @@ -277,6 +313,17 @@ heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer) { OffsetNumber dummy_off_loc; PruneFreezeResult presult; + PruneFreezeParams params; + + visibilitymap_pin(relation, BufferGetBlockNumber(buffer), + vmbuffer); + + params.relation = relation; + params.buffer = buffer; + params.vmbuffer = *vmbuffer; + params.reason = PRUNE_ON_ACCESS; + params.vistest = vistest; + params.cutoffs = NULL; /* * We don't pass the HEAP_PAGE_PRUNE_MARK_UNUSED_NOW option @@ -284,14 +331,7 @@ heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer) * cannot safely determine that during on-access pruning with the * current implementation. */ - PruneFreezeParams params = { - .relation = relation, - .buffer = buffer, - .reason = PRUNE_ON_ACCESS, - .options = 0, - .vistest = vistest, - .cutoffs = NULL, - }; + params.options = 0; heap_page_prune_and_freeze(¶ms, &presult, &dummy_off_loc, NULL, NULL); @@ -354,6 +394,12 @@ prune_freeze_setup(PruneFreezeParams *params, prstate->buffer = params->buffer; prstate->page = BufferGetPage(params->buffer); + Assert(BufferIsValid(params->vmbuffer)); + prstate->vmbuffer = params->vmbuffer; + prstate->old_vmbits = visibilitymap_get_status(prstate->relation, + prstate->block, + &prstate->vmbuffer); + /* * Our strategy is to scan the page and make lists of items to change, * then apply the changes within a critical section. This keeps as much @@ -770,6 +816,108 @@ heap_page_will_freeze(bool did_tuple_hint_fpi, return do_freeze; } +/* + * Emit a warning about and fix visibility map corruption on the given page. + * + * The caller specifies the type of corruption it has already detected via + * corruption_type, so that we can emit the appropriate warning. All cases + * result in the VM bits being cleared; corruption types where PD_ALL_VISIBLE + * is incorrectly set also clear PD_ALL_VISIBLE. 
+ * + * Must be called while holding an exclusive lock on the heap buffer. Dead + * items and not all-visible tuples must have been discovered under that same + * lock. Although we do not hold a lock on the VM buffer, it is pinned, and + * the heap buffer is exclusively locked, ensuring that no other backend can + * update the VM bits corresponding to this heap page. + * + * This function makes changes to the VM and, potentially, the heap page, but + * it does not need to be done in a critical section. + */ +static void +heap_page_fix_vm_corruption(PruneState *prstate, OffsetNumber offnum, + VMCorruptionType corruption_type) +{ + const char *relname = RelationGetRelationName(prstate->relation); + bool do_clear_vm = false; + bool do_clear_heap = false; + + Assert(BufferIsLockedByMeInMode(prstate->buffer, BUFFER_LOCK_EXCLUSIVE)); + + switch (corruption_type) + { + case VM_CORRUPT_LPDEAD: + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("dead line pointer found on page marked all-visible"), + errcontext("relation \"%s\", page %u, tuple %u", + relname, prstate->block, offnum))); + do_clear_vm = true; + do_clear_heap = true; + break; + + case VM_CORRUPT_TUPLE_VISIBILITY: + + /* + * A HEAPTUPLE_LIVE tuple on an all-visible page can appear to not + * be visible to everyone when + * GetOldestNonRemovableTransactionId() returns a conservative + * value that's older than the real safe xmin. That is not + * corruption -- the PD_ALL_VISIBLE flag is still correct. + * + * However, dead tuple versions, in-progress inserts, and + * in-progress deletes should never appear on a page marked + * all-visible. That indicates real corruption. PD_ALL_VISIBLE + * should have been cleared by the DML operation that deleted or + * inserted the tuple. 
+ */ + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("tuple not visible to all transactions found on page marked all-visible"), + errcontext("relation \"%s\", page %u, tuple %u", + relname, prstate->block, offnum))); + do_clear_vm = true; + do_clear_heap = true; + break; + + case VM_CORRUPT_MISSING_PAGE_HINT: + + /* + * As of PostgreSQL 9.2, the visibility map bit should never be + * set if the page-level bit is clear. However, for vacuum, it's + * possible that the bit got cleared after + * heap_vac_scan_next_block() was called, so we must recheck now + * that we have the buffer lock before concluding that the VM is + * corrupt. + */ + Assert(!PageIsAllVisible(prstate->page)); + Assert(prstate->old_vmbits & VISIBILITYMAP_VALID_BITS); + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page is not marked all-visible but visibility map bit is set"), + errcontext("relation \"%s\", page %u", + relname, prstate->block))); + do_clear_vm = true; + break; + } + + Assert(do_clear_heap || do_clear_vm); + + /* Avoid marking the buffer dirty if PD_ALL_VISIBLE is already clear */ + if (do_clear_heap) + { + Assert(PageIsAllVisible(prstate->page)); + PageClearAllVisible(prstate->page); + MarkBufferDirtyHint(prstate->buffer, true); + } + + if (do_clear_vm) + { + visibilitymap_clear(prstate->relation, prstate->block, + prstate->vmbuffer, + VISIBILITYMAP_VALID_BITS); + prstate->old_vmbits = 0; + } +} /* * Prune and repair fragmentation and potentially freeze tuples on the @@ -830,6 +978,16 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, new_relfrozen_xid, new_relmin_mxid, presult, &prstate); + /* + * If the VM is set but PD_ALL_VISIBLE is clear, fix that corruption + * before pruning and freezing so that the page and VM start out in a + * consistent state. 
+ */ + if ((prstate.old_vmbits & VISIBILITYMAP_VALID_BITS) && + !PageIsAllVisible(prstate.page)) + heap_page_fix_vm_corruption(&prstate, InvalidOffsetNumber, + VM_CORRUPT_MISSING_PAGE_HINT); + /* * Examine all line pointers and tuple visibility information to determine * which line pointers should change state and which tuples may be frozen. @@ -973,6 +1131,7 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, presult->set_all_visible = prstate.set_all_visible; presult->set_all_frozen = prstate.set_all_frozen; presult->hastup = prstate.hastup; + presult->old_vmbits = prstate.old_vmbits; /* * For callers planning to update the visibility map, the conflict horizon @@ -1295,7 +1454,8 @@ process_chain: /* Record lowest soon-prunable XID */ static void -heap_prune_record_prunable(PruneState *prstate, TransactionId xid) +heap_prune_record_prunable(PruneState *prstate, TransactionId xid, + OffsetNumber offnum) { /* * This should exactly match the PageSetPrunable macro. We can't store @@ -1305,6 +1465,14 @@ heap_prune_record_prunable(PruneState *prstate, TransactionId xid) if (!TransactionIdIsValid(prstate->new_prune_xid) || TransactionIdPrecedes(xid, prstate->new_prune_xid)) prstate->new_prune_xid = xid; + + /* + * It's incorrect for a page to be marked all-visible if it contains + * prunable items. + */ + if (PageIsAllVisible(prstate->page)) + heap_page_fix_vm_corruption(prstate, offnum, + VM_CORRUPT_TUPLE_VISIBILITY); } /* Record line pointer to be redirected */ @@ -1388,6 +1556,15 @@ heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum, heap_prune_record_unused(prstate, offnum, was_normal); else heap_prune_record_dead(prstate, offnum, was_normal); + + /* + * It's incorrect for the page to be set all-visible if it contains dead + * items. Fix that on the heap page and check the VM for corruption as + * well. Do that here rather than in heap_prune_record_dead() so we also + * cover tuples that are directly marked LP_UNUSED via mark_unused_now. 
+ */ + if (PageIsAllVisible(prstate->page)) + heap_page_fix_vm_corruption(prstate, offnum, VM_CORRUPT_LPDEAD); } /* Record line pointer to be marked unused */ @@ -1527,7 +1704,8 @@ heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum) * that the page is reconsidered for pruning in future. */ heap_prune_record_prunable(prstate, - HeapTupleHeaderGetUpdateXid(htup)); + HeapTupleHeaderGetUpdateXid(htup), + offnum); break; case HEAPTUPLE_INSERT_IN_PROGRESS: @@ -1542,6 +1720,11 @@ heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum) prstate->set_all_visible = false; prstate->set_all_frozen = false; + /* The page should not be marked all-visible */ + if (PageIsAllVisible(page)) + heap_page_fix_vm_corruption(prstate, offnum, + VM_CORRUPT_TUPLE_VISIBILITY); + /* * If we wanted to optimize for aborts, we might consider marking * the page prunable when we see INSERT_IN_PROGRESS. But we @@ -1566,7 +1749,8 @@ heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum) * the page is reconsidered for pruning in future. */ heap_prune_record_prunable(prstate, - HeapTupleHeaderGetUpdateXid(htup)); + HeapTupleHeaderGetUpdateXid(htup), + offnum); break; default: @@ -1632,6 +1816,13 @@ heap_prune_record_unchanged_lp_dead(PruneState *prstate, OffsetNumber offnum) /* Record the dead offset for vacuum */ prstate->deadoffsets[prstate->lpdead_items++] = offnum; + + /* + * It's incorrect for a page to be marked all-visible if it contains dead + * items. 
+ */ + if (PageIsAllVisible(prstate->page)) + heap_page_fix_vm_corruption(prstate, offnum, VM_CORRUPT_LPDEAD); } /* diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index c57432670e7..56722556417 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -432,11 +432,6 @@ static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis); static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, bool sharelock, Buffer vmbuffer); -static void identify_and_fix_vm_corruption(Relation rel, Buffer heap_buffer, - BlockNumber heap_blk, Page heap_page, - int nlpdead_items, - Buffer vmbuffer, - uint8 *vmbits); static int lazy_scan_prune(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, Buffer vmbuffer, @@ -1989,81 +1984,6 @@ cmpOffsetNumbers(const void *a, const void *b) return pg_cmp_u16(*(const OffsetNumber *) a, *(const OffsetNumber *) b); } -/* - * Helper to correct any corruption detected on a heap page and its - * corresponding visibility map page after pruning but before setting the - * visibility map. It examines the heap page, the associated VM page, and the - * number of dead items previously identified. - * - * This function must be called while holding an exclusive lock on the heap - * buffer, and the dead items must have been discovered under that same lock. - - * The provided vmbits must reflect the current state of the VM block - * referenced by vmbuffer. Although we do not hold a lock on the VM buffer, it - * is pinned, and the heap buffer is exclusively locked, ensuring that no - * other backend can update the VM bits corresponding to this heap page. - * - * If it clears corruption, it will zero out vmbits. 
- */ -static void -identify_and_fix_vm_corruption(Relation rel, Buffer heap_buffer, - BlockNumber heap_blk, Page heap_page, - int nlpdead_items, - Buffer vmbuffer, - uint8 *vmbits) -{ - Assert(visibilitymap_get_status(rel, heap_blk, &vmbuffer) == *vmbits); - - Assert(BufferIsLockedByMeInMode(heap_buffer, BUFFER_LOCK_EXCLUSIVE)); - - /* - * As of PostgreSQL 9.2, the visibility map bit should never be set if the - * page-level bit is clear. However, it's possible that the bit got - * cleared after heap_vac_scan_next_block() was called, so we must recheck - * with buffer lock before concluding that the VM is corrupt. - */ - if (!PageIsAllVisible(heap_page) && - ((*vmbits & VISIBILITYMAP_VALID_BITS) != 0)) - { - ereport(WARNING, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - RelationGetRelationName(rel), heap_blk))); - - visibilitymap_clear(rel, heap_blk, vmbuffer, - VISIBILITYMAP_VALID_BITS); - *vmbits = 0; - } - - /* - * It's possible for the value returned by - * GetOldestNonRemovableTransactionId() to move backwards, so it's not - * wrong for us to see tuples that appear to not be visible to everyone - * yet, while PD_ALL_VISIBLE is already set. The real safe xmin value - * never moves backwards, but GetOldestNonRemovableTransactionId() is - * conservative and sometimes returns a value that's unnecessarily small, - * so if we see that contradiction it just means that the tuples that we - * think are not visible to everyone yet actually are, and the - * PD_ALL_VISIBLE flag is correct. - * - * There should never be LP_DEAD items on a page with PD_ALL_VISIBLE set, - * however. 
*/ - else if (PageIsAllVisible(heap_page) && nlpdead_items > 0) - { - ereport(WARNING, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", - RelationGetRelationName(rel), heap_blk))); - - PageClearAllVisible(heap_page); - MarkBufferDirty(heap_buffer); - visibilitymap_clear(rel, heap_blk, vmbuffer, - VISIBILITYMAP_VALID_BITS); - *vmbits = 0; - } -} - /* * lazy_scan_prune() -- lazy_scan_heap() pruning and freezing. * @@ -2095,6 +2015,7 @@ lazy_scan_prune(LVRelState *vacrel, PruneFreezeParams params = { .relation = rel, .buffer = buf, + .vmbuffer = vmbuffer, .reason = PRUNE_VACUUM_SCAN, .options = HEAP_PAGE_PRUNE_FREEZE, .vistest = vacrel->vistest, @@ -2204,18 +2125,12 @@ lazy_scan_prune(LVRelState *vacrel, Assert(!presult.set_all_visible || !(*has_lpdead_items)); Assert(!presult.set_all_frozen || presult.set_all_visible); - old_vmbits = visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer); - - identify_and_fix_vm_corruption(vacrel->rel, buf, blkno, page, - presult.lpdead_items, vmbuffer, - &old_vmbits); - if (!presult.set_all_visible) return presult.ndeleted; /* Set the visibility map and page visibility hint */ + old_vmbits = presult.old_vmbits; new_vmbits = VISIBILITYMAP_ALL_VISIBLE; - if (presult.set_all_frozen) new_vmbits |= VISIBILITYMAP_ALL_FROZEN; diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 2fdc50b865b..00134012137 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -262,6 +262,12 @@ typedef struct PruneFreezeParams Relation relation; /* relation containing buffer to be pruned */ Buffer buffer; /* buffer to be pruned */ + /* + * Callers must provide a pinned vmbuffer corresponding to the heap + * block in buffer. We will check for and repair any corruption in the VM. + */ + Buffer vmbuffer; + /* * The reason pruning was performed. 
It is used to set the WAL record * opcode which is used for debugging and analysis purposes. @@ -324,6 +330,12 @@ typedef struct PruneFreezeResult bool set_all_frozen; TransactionId vm_conflict_horizon; + /* + * The value of the vmbuffer's vmbits at the beginning of pruning. It is + * cleared if VM corruption is found and corrected. + */ + uint8 old_vmbits; + /* * Whether or not the page makes rel truncation unsafe. This is set to * 'true', even if the page contains LP_DEAD items. VACUUM will remove diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 0042c33fa66..0c07c945f05 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3277,6 +3277,7 @@ UserAuth UserContext UserMapping UserOpts +VMCorruptionType VacAttrStats VacAttrStatsP VacDeadItemsInfo