mirror of
https://github.com/postgres/postgres.git
synced 2026-04-21 22:28:15 -04:00
Fix multixact backwards-compatibility with CHECKPOINT race condition
If a CHECKPOINT record with nextMulti N is written to the WAL before
the CREATE_ID record for N, and N happens to be the first multixid on
an offset page, the backwards compatibility logic to tolerate WAL
generated by older minor versions (before commit 789d65364c) failed to
compensate for the missing XLOG_MULTIXACT_ZERO_OFF_PAGE record. In
that case, the latest_page_number was initialized at the start of WAL
replay to the page for nextMulti from the CHECKPOINT record, even if
we had not seen the CREATE_ID record for that multixid yet, which
fooled the backwards compatibility logic into thinking that the page was
already initialized.
To fix, track the last XLOG_MULTIXACT_ZERO_OFF_PAGE that we've seen
separately from latest_page_number. If we haven't seen any
XLOG_MULTIXACT_ZERO_OFF_PAGE records yet, use
SimpleLruDoesPhysicalPageExist() to check if the page needs to be
initialized.
Reported-by: duankunren.dkr <duankunren.dkr@alibaba-inc.com>
Analyzed-by: duankunren.dkr <duankunren.dkr@alibaba-inc.com>
Reviewed-by: Andrey Borodin <x4mmm@yandex-team.ru>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://www.postgresql.org/message-id/c4ef1737-8cba-458e-b6fd-4e2d6011e985.duankunren.dkr@alibaba-inc.com
Backpatch-through: 14-18
This commit is contained in:
parent
6ccfc44922
commit
1ca3850321
2 changed files with 73 additions and 24 deletions
|
|
@ -416,7 +416,17 @@ static MemoryContext MXactContext = NULL;
|
|||
#define debug_elog6(a,b,c,d,e,f)
|
||||
#endif
|
||||
|
||||
/* hack to deal with WAL generated with older minor versions */
|
||||
/*
|
||||
* Hack to deal with WAL generated with older minor versions.
|
||||
*
|
||||
* last_initialized_offsets_page is the page of the last XLOG_MULTIXACT_ZERO_OFF_PAGE record
|
||||
* that we saw during WAL replay, or -1 if we haven't seen any yet.
|
||||
*
|
||||
* pre_initialized_offsets_page is the last page that was implicitly
|
||||
* initialized by replaying an XLOG_MULTIXACT_CREATE_ID record, when we had not
|
||||
* seen an XLOG_MULTIXACT_ZERO_OFF_PAGE record for the page yet.
|
||||
*/
|
||||
static int64 last_initialized_offsets_page = -1;
|
||||
static int64 pre_initialized_offsets_page = -1;
|
||||
|
||||
/* internal MultiXactId management */
|
||||
|
|
@ -976,29 +986,68 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
|
|||
* such a version, the next page might not be initialized yet. Initialize
|
||||
* it now.
|
||||
*/
|
||||
if (InRecovery &&
|
||||
next_pageno != pageno &&
|
||||
pg_atomic_read_u64(&MultiXactOffsetCtl->shared->latest_page_number) == pageno)
|
||||
if (InRecovery && next_pageno != pageno)
|
||||
{
|
||||
elog(DEBUG1, "next offsets page is not initialized, initializing it now");
|
||||
bool init_needed;
|
||||
|
||||
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
|
||||
LWLockAcquire(lock, LW_EXCLUSIVE);
|
||||
|
||||
/* Create and zero the page */
|
||||
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno);
|
||||
|
||||
/* Make sure it's written out */
|
||||
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
|
||||
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
|
||||
|
||||
LWLockRelease(lock);
|
||||
|
||||
/*
|
||||
* Remember that we initialized the page, so that we don't zero it
|
||||
* again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
|
||||
/*----------
|
||||
* Check if the page exists, and if not, initialize it now.
|
||||
*
|
||||
* The straightforward way to check if the page exists is to call
|
||||
* SimpleLruDoesPhysicalPageExist(). However, there are two problems with
|
||||
* that:
|
||||
*
|
||||
* 1. It's somewhat expensive to call on every page switch.
|
||||
*
|
||||
* 2. It does not take into account pages that have been initialized
|
||||
* in the SLRU buffer cache but not yet flushed to disk. For such
|
||||
* pages, it will incorrectly return false.
|
||||
*
|
||||
* To fix both of those problems, if we have replayed any
|
||||
* XLOG_MULTIXACT_ZERO_OFF_PAGE records, we assume that the last page
|
||||
* that was zeroed by XLOG_MULTIXACT_ZERO_OFF_PAGE is the last page
|
||||
* that exists. This works because the XLOG_MULTIXACT_ZERO_OFF_PAGE
|
||||
* records must appear in the WAL in order, unlike CREATE_ID records.
|
||||
* We only resort to SimpleLruDoesPhysicalPageExist() if we haven't
|
||||
* seen any XLOG_MULTIXACT_ZERO_OFF_PAGE records yet, which should
|
||||
* happen at most once after starting WAL recovery.
|
||||
*
|
||||
* As an extra safety measure, if we do resort to
|
||||
* SimpleLruDoesPhysicalPageExist(), flush the SLRU buffers first so
|
||||
* that it will return an accurate result.
|
||||
*----------
|
||||
*/
|
||||
pre_initialized_offsets_page = next_pageno;
|
||||
if (last_initialized_offsets_page == -1)
|
||||
{
|
||||
SimpleLruWriteAll(MultiXactOffsetCtl, false);
|
||||
init_needed = !SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, next_pageno);
|
||||
}
|
||||
else
|
||||
init_needed = (last_initialized_offsets_page == pageno);
|
||||
|
||||
if (init_needed)
|
||||
{
|
||||
elog(DEBUG1, "next offsets page is not initialized, initializing it now");
|
||||
|
||||
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
|
||||
LWLockAcquire(lock, LW_EXCLUSIVE);
|
||||
|
||||
/* Create and zero the page */
|
||||
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno);
|
||||
|
||||
/* Make sure it's written out */
|
||||
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
|
||||
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
|
||||
|
||||
LWLockRelease(lock);
|
||||
|
||||
/*
|
||||
* Remember that we initialized the page, so that we don't zero it
|
||||
* again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
|
||||
*/
|
||||
pre_initialized_offsets_page = next_pageno;
|
||||
last_initialized_offsets_page = next_pageno;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -3554,6 +3603,8 @@ multixact_redo(XLogReaderState *record)
|
|||
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
|
||||
|
||||
LWLockRelease(lock);
|
||||
|
||||
last_initialized_offsets_page = pageno;
|
||||
}
|
||||
else
|
||||
elog(DEBUG1, "skipping initialization of offsets page " INT64_FORMAT " because it was already initialized on multixid creation", pageno);
|
||||
|
|
|
|||
|
|
@ -110,9 +110,7 @@ typedef struct SlruSharedData
|
|||
/*
|
||||
* latest_page_number is the page number of the current end of the log;
|
||||
* this is not critical data, since we use it only to avoid swapping out
|
||||
* the latest page. (An exception: an accurate latest_page_number is
|
||||
* needed on pg_multixact/offsets to replay WAL generated with older minor
|
||||
* versions correctly. See RecordNewMultiXact().)
|
||||
* the latest page.
|
||||
*/
|
||||
pg_atomic_uint64 latest_page_number;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue