mirror of
https://github.com/postgres/postgres.git
synced 2026-03-28 13:23:48 -04:00
bufmgr: Don't copy pages while writing out
After the series of preceding commits introducing and using BufferBeginSetHintBits()/BufferSetHintBits16(), hint bits are not set anymore while IO is going on. Therefore we do not need to copy pages while they are being written out anymore. For the same reason XLogSaveBufferForHint() now does not need to operate on a copy of the page anymore, but can instead use the normal XLogRegisterBuffer() mechanism. For that the assertions and comments to XLogRegisterBuffer() had to be updated to allow share-exclusive locked buffers to be registered. Reviewed-by: Melanie Plageman <melanieplageman@gmail.com> Discussion: https://postgr.es/m/5ubipyssiju5twkb7zgqwdr7q2vhpkpmuelxfpanetlk6ofnop@hvxb4g2amb2d
This commit is contained in:
parent
79ac82125e
commit
41d3d64e87
8 changed files with 26 additions and 95 deletions
|
|
@ -1031,7 +1031,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
|
|||
zerobuf.data,
|
||||
true);
|
||||
|
||||
PageSetChecksumInplace(page, lastblock);
|
||||
PageSetChecksum(page, lastblock);
|
||||
smgrextend(RelationGetSmgr(rel), MAIN_FORKNUM, lastblock, zerobuf.data,
|
||||
false);
|
||||
|
||||
|
|
|
|||
|
|
@ -252,9 +252,9 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
|
|||
Assert(begininsert_called);
|
||||
|
||||
/*
|
||||
* Ordinarily, buffer should be exclusive-locked and marked dirty before
|
||||
* we get here, otherwise we could end up violating one of the rules in
|
||||
* access/transam/README.
|
||||
* Ordinarily, the buffer should be exclusive-locked (or share-exclusive
|
||||
* in case of hint bits) and marked dirty before we get here, otherwise we
|
||||
* could end up violating one of the rules in access/transam/README.
|
||||
*
|
||||
* Some callers intentionally register a clean page and never update that
|
||||
* page's LSN; in that case they can pass the flag REGBUF_NO_CHANGE to
|
||||
|
|
@ -262,8 +262,11 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
|
|||
*/
|
||||
#ifdef USE_ASSERT_CHECKING
|
||||
if (!(flags & REGBUF_NO_CHANGE))
|
||||
Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE) &&
|
||||
BufferIsDirty(buffer));
|
||||
{
|
||||
Assert(BufferIsDirty(buffer));
|
||||
Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE) ||
|
||||
BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE));
|
||||
}
|
||||
#endif
|
||||
|
||||
if (block_id >= max_registered_block_id)
|
||||
|
|
@ -1123,12 +1126,6 @@ XLogCheckBufferNeedsBackup(Buffer buffer)
|
|||
* buffer. The buffer already needs to have been marked dirty by
|
||||
* MarkBufferDirtyHint().
|
||||
*
|
||||
* We can't use the plain backup block mechanism since that relies on the
|
||||
* Buffer being exclusively locked. Since some modifications (setting LSN, hint
|
||||
* bits) are allowed in a sharelocked buffer that can lead to wal checksum
|
||||
* failures. So instead we copy the page and insert the copied data as normal
|
||||
* record data.
|
||||
*
|
||||
* We only need to do something if page has not yet been full page written in
|
||||
* this checkpoint round. The LSN of the inserted wal record is returned if we
|
||||
* had to write, InvalidXLogRecPtr otherwise.
|
||||
|
|
@ -1164,37 +1161,13 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
|
|||
if (lsn <= RedoRecPtr)
|
||||
{
|
||||
int flags = 0;
|
||||
PGAlignedBlock copied_buffer;
|
||||
char *origdata = (char *) BufferGetBlock(buffer);
|
||||
RelFileLocator rlocator;
|
||||
ForkNumber forkno;
|
||||
BlockNumber blkno;
|
||||
|
||||
/*
|
||||
* Copy buffer so we don't have to worry about concurrent hint bit or
|
||||
* lsn updates. We assume pd_lower/upper cannot be changed without an
|
||||
* exclusive lock, so the contents bkp are not racy.
|
||||
*/
|
||||
if (buffer_std)
|
||||
{
|
||||
/* Assume we can omit data between pd_lower and pd_upper */
|
||||
Page page = BufferGetPage(buffer);
|
||||
uint16 lower = ((PageHeader) page)->pd_lower;
|
||||
uint16 upper = ((PageHeader) page)->pd_upper;
|
||||
|
||||
memcpy(copied_buffer.data, origdata, lower);
|
||||
memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
|
||||
}
|
||||
else
|
||||
memcpy(copied_buffer.data, origdata, BLCKSZ);
|
||||
|
||||
XLogBeginInsert();
|
||||
|
||||
if (buffer_std)
|
||||
flags |= REGBUF_STANDARD;
|
||||
|
||||
BufferGetTag(buffer, &rlocator, &forkno, &blkno);
|
||||
XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data, flags);
|
||||
XLogRegisterBuffer(0, buffer, flags);
|
||||
|
||||
recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4440,7 +4440,6 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
|
|||
ErrorContextCallback errcallback;
|
||||
instr_time io_start;
|
||||
Block bufBlock;
|
||||
char *bufToWrite;
|
||||
|
||||
Assert(BufferLockHeldByMeInMode(buf, BUFFER_LOCK_EXCLUSIVE) ||
|
||||
BufferLockHeldByMeInMode(buf, BUFFER_LOCK_SHARE_EXCLUSIVE));
|
||||
|
|
@ -4503,22 +4502,15 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
|
|||
*/
|
||||
bufBlock = BufHdrGetBlock(buf);
|
||||
|
||||
/*
|
||||
* Update page checksum if desired. Since we have only shared lock on the
|
||||
* buffer, other processes might be updating hint bits in it, so we must
|
||||
* copy the page to private storage if we do checksumming.
|
||||
*/
|
||||
bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
|
||||
/* Update page checksum if desired. */
|
||||
PageSetChecksum((Page) bufBlock, buf->tag.blockNum);
|
||||
|
||||
io_start = pgstat_prepare_io_time(track_io_timing);
|
||||
|
||||
/*
|
||||
* bufToWrite is either the shared buffer or a copy, as appropriate.
|
||||
*/
|
||||
smgrwrite(reln,
|
||||
BufTagGetForkNum(&buf->tag),
|
||||
buf->tag.blockNum,
|
||||
bufToWrite,
|
||||
bufBlock,
|
||||
false);
|
||||
|
||||
/*
|
||||
|
|
|
|||
|
|
@ -199,7 +199,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
|
|||
reln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag),
|
||||
MyProcNumber);
|
||||
|
||||
PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
|
||||
PageSetChecksum(localpage, bufHdr->tag.blockNum);
|
||||
|
||||
io_start = pgstat_prepare_io_time(track_io_timing);
|
||||
|
||||
|
|
|
|||
|
|
@ -1492,53 +1492,20 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
|
|||
|
||||
|
||||
/*
|
||||
* Set checksum for a page in shared buffers.
|
||||
* Set checksum on a page.
|
||||
*
|
||||
* If checksums are disabled, or if the page is not initialized, just return
|
||||
* the input. Otherwise, we must make a copy of the page before calculating
|
||||
* the checksum, to prevent concurrent modifications (e.g. setting hint bits)
|
||||
* from making the final checksum invalid. It doesn't matter if we include or
|
||||
* exclude hints during the copy, as long as we write a valid page and
|
||||
* associated checksum.
|
||||
* If the page is in shared buffers, it needs to be locked in at least
|
||||
* share-exclusive mode.
|
||||
*
|
||||
* Returns a pointer to the block-sized data that needs to be written. Uses
|
||||
* statically-allocated memory, so the caller must immediately write the
|
||||
* returned page and not refer to it again.
|
||||
*/
|
||||
char *
|
||||
PageSetChecksumCopy(Page page, BlockNumber blkno)
|
||||
{
|
||||
static char *pageCopy = NULL;
|
||||
|
||||
/* If we don't need a checksum, just return the passed-in data */
|
||||
if (PageIsNew(page) || !DataChecksumsEnabled())
|
||||
return page;
|
||||
|
||||
/*
|
||||
* We allocate the copy space once and use it over on each subsequent
|
||||
* call. The point of palloc'ing here, rather than having a static char
|
||||
* array, is first to ensure adequate alignment for the checksumming code
|
||||
* and second to avoid wasting space in processes that never call this.
|
||||
*/
|
||||
if (pageCopy == NULL)
|
||||
pageCopy = MemoryContextAllocAligned(TopMemoryContext,
|
||||
BLCKSZ,
|
||||
PG_IO_ALIGN_SIZE,
|
||||
0);
|
||||
|
||||
memcpy(pageCopy, page, BLCKSZ);
|
||||
((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);
|
||||
return pageCopy;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set checksum for a page in private memory.
|
||||
* If checksums are disabled, or if the page is not initialized, just
|
||||
* return. Otherwise compute and set the checksum.
|
||||
*
|
||||
* This must only be used when we know that no other process can be modifying
|
||||
* the page buffer.
|
||||
* In the past this needed to be done on a copy of the page, due to the
|
||||
* possibility of e.g., hint bits being set concurrently. However, this is not
|
||||
* necessary anymore as hint bits won't be set while IO is going on.
|
||||
*/
|
||||
void
|
||||
PageSetChecksumInplace(Page page, BlockNumber blkno)
|
||||
PageSetChecksum(Page page, BlockNumber blkno)
|
||||
{
|
||||
/* If we don't need a checksum, just return */
|
||||
if (PageIsNew(page) || !DataChecksumsEnabled())
|
||||
|
|
|
|||
|
|
@ -279,7 +279,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
|
|||
BlockNumber blkno = pending_writes[i].blkno;
|
||||
Page page = pending_writes[i].buf->data;
|
||||
|
||||
PageSetChecksumInplace(page, blkno);
|
||||
PageSetChecksum(page, blkno);
|
||||
|
||||
if (blkno >= bulkstate->relsize)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -537,7 +537,6 @@ extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
|
|||
extern void PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum);
|
||||
extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
|
||||
const void *newtup, Size newsize);
|
||||
extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
|
||||
extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
|
||||
extern void PageSetChecksum(Page page, BlockNumber blkno);
|
||||
|
||||
#endif /* BUFPAGE_H */
|
||||
|
|
|
|||
|
|
@ -288,7 +288,7 @@ modify_rel_block(PG_FUNCTION_ARGS)
|
|||
}
|
||||
else
|
||||
{
|
||||
PageSetChecksumInplace(page, blkno);
|
||||
PageSetChecksum(page, blkno);
|
||||
}
|
||||
|
||||
smgrwrite(RelationGetSmgr(rel),
|
||||
|
|
|
|||
Loading…
Reference in a new issue