mirror of
https://github.com/postgres/postgres.git
synced 2026-03-22 02:20:53 -04:00
This commit includes various optimizations to improve the performance of tuple deformation. We now precalculate CompactAttribute's attcacheoff, which allows us to remove the code from the deform routines which was setting the attcacheoff. Setting the attcacheoff is now handled by TupleDescFinalize(), which must be called before the TupleDesc is used for anything. Having TupleDescFinalize() means we can store the first attribute in the TupleDesc which does not have an offset cached. That allows us to add a dedicated deforming loop to deform all attributes up to the final one with an attcacheoff set, or up to the first NULL attribute, whichever comes first. Here we also improve tuple deformation performance of tuples with NULLs. Previously, if the HEAP_HASNULL bit was set in the tuple's t_infomask, deforming would, one-by-one, check each and every bit in the NULL bitmap to see if it was zero. Now, we process the NULL bitmap 1 byte at a time rather than 1 bit at a time to find the attnum with the first NULL. We can now deform the tuple without checking for NULLs up to just before that attribute. We also record the maximum attribute number which is guaranteed to exist in the tuple, that is, has a NOT NULL constraint and isn't an atthasmissing attribute. When deforming only attributes prior to the guaranteed attnum, we've no need to access the tuple's natt count. As an additional optimization, we only count fixed-width columns when calculating the maximum guaranteed column, as this eliminates the need to emit code to fetch byref types in the deformation loop for guaranteed attributes. Some locations in the code deform tuples that have yet to go through NOT NULL constraint validation. We're unable to perform the guaranteed attribute optimization when that's the case. This optimization is opt-in via the TupleTableSlot using the TTS_FLAG_OBEYS_NOT_NULL_CONSTRAINTS flag. 
This commit also adds a more efficient way of populating the isnull array by using a bit-wise SWAR trick which multiplies the inverse of the tuple's bitmap byte and masks out all but the lowest bit of each boolean's byte. This results in much more efficient code when compared to determining the NULLness via att_isnull(). 8 isnull elements are processed at once using this method, which means we need to round the tts_isnull array size up to the next 8 bytes. The palloc code does this anyway, but the round-up needed to be formalized so as not to overwrite the sentinel byte in MEMORY_CONTEXT_CHECKING builds. Doing this also allows the NULL-checking deforming loop to more efficiently check the isnull array, rather than doing the bit-wise processing for each attribute that att_isnull() does. The level of performance improvement from these changes seems to vary depending on the CPU architecture. Apple's M chips seem particularly fond of the changes, with some of the tested deform-heavy queries going over twice as fast as before. With x86-64, the speedups aren't quite as large. With tables containing only a small number of columns, the speedups will be less. Author: David Rowley <dgrowleyml@gmail.com> Reviewed-by: Chao Li <li.evan.chao@gmail.com> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: John Naylor <johncnaylorls@gmail.com> Reviewed-by: Amit Langote <amitlangote09@gmail.com> Reviewed-by: Zsolt Parragi <zsolt.parragi@percona.com> Reviewed-by: Álvaro Herrera <alvherre@kurilemu.de> Reviewed-by: Junwang Zhao <zhjwpku@gmail.com> Discussion: https://postgr.es/m/CAApHDvpoFjaj3%2Bw_jD5uPnGazaw41A71tVJokLDJg2zfcigpMQ%40mail.gmail.com
498 lines
14 KiB
C
498 lines
14 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* nodeTidrangescan.c
|
|
* Routines to support TID range scans of relations
|
|
*
|
|
* Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
*
|
|
* IDENTIFICATION
|
|
* src/backend/executor/nodeTidrangescan.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include "access/relscan.h"
|
|
#include "access/sysattr.h"
|
|
#include "access/tableam.h"
|
|
#include "catalog/pg_operator.h"
|
|
#include "executor/executor.h"
|
|
#include "executor/nodeTidrangescan.h"
|
|
#include "nodes/nodeFuncs.h"
|
|
#include "utils/rel.h"
|
|
|
|
|
|
/*
|
|
* It's sufficient to check varattno to identify the CTID variable, as any
|
|
* Var in the relation scan qual must be for our table. (Even if it's a
|
|
* parameterized scan referencing some other table's CTID, the other table's
|
|
* Var would have become a Param by the time it gets here.)
|
|
*/
|
|
#define IsCTIDVar(node) \
|
|
((node) != NULL && \
|
|
IsA((node), Var) && \
|
|
((Var *) (node))->varattno == SelfItemPointerAttributeNumber)
|
|
|
|
/*
 * Which end of the TID range a TidOpExpr constrains.
 */
typedef enum
{
	/* ctid must sort before (or equal to) the evaluated TID */
	TIDEXPR_UPPER_BOUND,

	/* ctid must sort after (or equal to) the evaluated TID */
	TIDEXPR_LOWER_BOUND,
} TidExprType;
|
|
|
|
/* Upper or lower range bound for scan */
|
|
typedef struct TidOpExpr
|
|
{
|
|
TidExprType exprtype; /* type of op; lower or upper */
|
|
ExprState *exprstate; /* ExprState for a TID-yielding subexpr */
|
|
bool inclusive; /* whether op is inclusive */
|
|
} TidOpExpr;
|
|
|
|
/*
|
|
* For the given 'expr', build and return an appropriate TidOpExpr taking into
|
|
* account the expr's operator and operand order.
|
|
*/
|
|
static TidOpExpr *
|
|
MakeTidOpExpr(OpExpr *expr, TidRangeScanState *tidstate)
|
|
{
|
|
Node *arg1 = get_leftop((Expr *) expr);
|
|
Node *arg2 = get_rightop((Expr *) expr);
|
|
ExprState *exprstate = NULL;
|
|
bool invert = false;
|
|
TidOpExpr *tidopexpr;
|
|
|
|
if (IsCTIDVar(arg1))
|
|
exprstate = ExecInitExpr((Expr *) arg2, &tidstate->ss.ps);
|
|
else if (IsCTIDVar(arg2))
|
|
{
|
|
exprstate = ExecInitExpr((Expr *) arg1, &tidstate->ss.ps);
|
|
invert = true;
|
|
}
|
|
else
|
|
elog(ERROR, "could not identify CTID variable");
|
|
|
|
tidopexpr = palloc_object(TidOpExpr);
|
|
tidopexpr->inclusive = false; /* for now */
|
|
|
|
switch (expr->opno)
|
|
{
|
|
case TIDLessEqOperator:
|
|
tidopexpr->inclusive = true;
|
|
pg_fallthrough;
|
|
case TIDLessOperator:
|
|
tidopexpr->exprtype = invert ? TIDEXPR_LOWER_BOUND : TIDEXPR_UPPER_BOUND;
|
|
break;
|
|
case TIDGreaterEqOperator:
|
|
tidopexpr->inclusive = true;
|
|
pg_fallthrough;
|
|
case TIDGreaterOperator:
|
|
tidopexpr->exprtype = invert ? TIDEXPR_UPPER_BOUND : TIDEXPR_LOWER_BOUND;
|
|
break;
|
|
default:
|
|
elog(ERROR, "could not identify CTID operator");
|
|
}
|
|
|
|
tidopexpr->exprstate = exprstate;
|
|
|
|
return tidopexpr;
|
|
}
|
|
|
|
/*
|
|
* Extract the qual subexpressions that yield TIDs to search for,
|
|
* and compile them into ExprStates if they're ordinary expressions.
|
|
*/
|
|
static void
|
|
TidExprListCreate(TidRangeScanState *tidrangestate)
|
|
{
|
|
TidRangeScan *node = (TidRangeScan *) tidrangestate->ss.ps.plan;
|
|
List *tidexprs = NIL;
|
|
ListCell *l;
|
|
|
|
foreach(l, node->tidrangequals)
|
|
{
|
|
OpExpr *opexpr = lfirst(l);
|
|
TidOpExpr *tidopexpr;
|
|
|
|
if (!IsA(opexpr, OpExpr))
|
|
elog(ERROR, "could not identify CTID expression");
|
|
|
|
tidopexpr = MakeTidOpExpr(opexpr, tidrangestate);
|
|
tidexprs = lappend(tidexprs, tidopexpr);
|
|
}
|
|
|
|
tidrangestate->trss_tidexprs = tidexprs;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* TidRangeEval
|
|
*
|
|
* Compute and set node's block and offset range to scan by evaluating
|
|
* node->trss_tidexprs. Returns false if we detect the range cannot
|
|
* contain any tuples. Returns true if it's possible for the range to
|
|
* contain tuples. We don't bother validating that trss_mintid is less
|
|
* than or equal to trss_maxtid, as the scan_set_tidrange() table AM
|
|
* function will handle that.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
static bool
|
|
TidRangeEval(TidRangeScanState *node)
|
|
{
|
|
ExprContext *econtext = node->ss.ps.ps_ExprContext;
|
|
ItemPointerData lowerBound;
|
|
ItemPointerData upperBound;
|
|
ListCell *l;
|
|
|
|
/*
|
|
* Set the upper and lower bounds to the absolute limits of the range of
|
|
* the ItemPointer type. Below we'll try to narrow this range on either
|
|
* side by looking at the TidOpExprs.
|
|
*/
|
|
ItemPointerSet(&lowerBound, 0, 0);
|
|
ItemPointerSet(&upperBound, InvalidBlockNumber, PG_UINT16_MAX);
|
|
|
|
foreach(l, node->trss_tidexprs)
|
|
{
|
|
TidOpExpr *tidopexpr = (TidOpExpr *) lfirst(l);
|
|
ItemPointer itemptr;
|
|
bool isNull;
|
|
|
|
/* Evaluate this bound. */
|
|
itemptr = (ItemPointer)
|
|
DatumGetPointer(ExecEvalExprSwitchContext(tidopexpr->exprstate,
|
|
econtext,
|
|
&isNull));
|
|
|
|
/* If the bound is NULL, *nothing* matches the qual. */
|
|
if (isNull)
|
|
return false;
|
|
|
|
if (tidopexpr->exprtype == TIDEXPR_LOWER_BOUND)
|
|
{
|
|
ItemPointerData lb;
|
|
|
|
ItemPointerCopy(itemptr, &lb);
|
|
|
|
/*
|
|
* Normalize non-inclusive ranges to become inclusive. The
|
|
* resulting ItemPointer here may not be a valid item pointer.
|
|
*/
|
|
if (!tidopexpr->inclusive)
|
|
ItemPointerInc(&lb);
|
|
|
|
/* Check if we can narrow the range using this qual */
|
|
if (ItemPointerCompare(&lb, &lowerBound) > 0)
|
|
ItemPointerCopy(&lb, &lowerBound);
|
|
}
|
|
|
|
else if (tidopexpr->exprtype == TIDEXPR_UPPER_BOUND)
|
|
{
|
|
ItemPointerData ub;
|
|
|
|
ItemPointerCopy(itemptr, &ub);
|
|
|
|
/*
|
|
* Normalize non-inclusive ranges to become inclusive. The
|
|
* resulting ItemPointer here may not be a valid item pointer.
|
|
*/
|
|
if (!tidopexpr->inclusive)
|
|
ItemPointerDec(&ub);
|
|
|
|
/* Check if we can narrow the range using this qual */
|
|
if (ItemPointerCompare(&ub, &upperBound) < 0)
|
|
ItemPointerCopy(&ub, &upperBound);
|
|
}
|
|
}
|
|
|
|
ItemPointerCopy(&lowerBound, &node->trss_mintid);
|
|
ItemPointerCopy(&upperBound, &node->trss_maxtid);
|
|
|
|
return true;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* TidRangeNext
|
|
*
|
|
* Retrieve a tuple from the TidRangeScan node's currentRelation
|
|
* using the TIDs in the TidRangeScanState information.
|
|
*
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
static TupleTableSlot *
|
|
TidRangeNext(TidRangeScanState *node)
|
|
{
|
|
TableScanDesc scandesc;
|
|
EState *estate;
|
|
ScanDirection direction;
|
|
TupleTableSlot *slot;
|
|
|
|
/*
|
|
* extract necessary information from TID scan node
|
|
*/
|
|
scandesc = node->ss.ss_currentScanDesc;
|
|
estate = node->ss.ps.state;
|
|
slot = node->ss.ss_ScanTupleSlot;
|
|
direction = estate->es_direction;
|
|
|
|
if (!node->trss_inScan)
|
|
{
|
|
/* First time through, compute TID range to scan */
|
|
if (!TidRangeEval(node))
|
|
return NULL;
|
|
|
|
if (scandesc == NULL)
|
|
{
|
|
scandesc = table_beginscan_tidrange(node->ss.ss_currentRelation,
|
|
estate->es_snapshot,
|
|
&node->trss_mintid,
|
|
&node->trss_maxtid);
|
|
node->ss.ss_currentScanDesc = scandesc;
|
|
}
|
|
else
|
|
{
|
|
/* rescan with the updated TID range */
|
|
table_rescan_tidrange(scandesc, &node->trss_mintid,
|
|
&node->trss_maxtid);
|
|
}
|
|
|
|
node->trss_inScan = true;
|
|
}
|
|
|
|
/* Fetch the next tuple. */
|
|
if (!table_scan_getnextslot_tidrange(scandesc, direction, slot))
|
|
{
|
|
node->trss_inScan = false;
|
|
ExecClearTuple(slot);
|
|
}
|
|
|
|
return slot;
|
|
}
|
|
|
|
/*
|
|
* TidRangeRecheck -- access method routine to recheck a tuple in EvalPlanQual
|
|
*/
|
|
static bool
|
|
TidRangeRecheck(TidRangeScanState *node, TupleTableSlot *slot)
|
|
{
|
|
if (!TidRangeEval(node))
|
|
return false;
|
|
|
|
Assert(ItemPointerIsValid(&slot->tts_tid));
|
|
|
|
/* Recheck the ctid is still within range */
|
|
if (ItemPointerCompare(&slot->tts_tid, &node->trss_mintid) < 0 ||
|
|
ItemPointerCompare(&slot->tts_tid, &node->trss_maxtid) > 0)
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecTidRangeScan(node)
|
|
*
|
|
* Scans the relation using tids and returns the next qualifying tuple.
|
|
* We call the ExecScan() routine and pass it the appropriate
|
|
* access method functions.
|
|
*
|
|
* Conditions:
|
|
* -- the "cursor" maintained by the AMI is positioned at the tuple
|
|
* returned previously.
|
|
*
|
|
* Initial States:
|
|
* -- the relation indicated is opened for TID range scanning.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
static TupleTableSlot *
|
|
ExecTidRangeScan(PlanState *pstate)
|
|
{
|
|
TidRangeScanState *node = castNode(TidRangeScanState, pstate);
|
|
|
|
return ExecScan(&node->ss,
|
|
(ExecScanAccessMtd) TidRangeNext,
|
|
(ExecScanRecheckMtd) TidRangeRecheck);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecReScanTidRangeScan(node)
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void
|
|
ExecReScanTidRangeScan(TidRangeScanState *node)
|
|
{
|
|
/* mark scan as not in progress, and tid range list as not computed yet */
|
|
node->trss_inScan = false;
|
|
|
|
/*
|
|
* We must wait until TidRangeNext before calling table_rescan_tidrange.
|
|
*/
|
|
ExecScanReScan(&node->ss);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecEndTidRangeScan
|
|
*
|
|
* Releases any storage allocated through C routines.
|
|
* Returns nothing.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void
|
|
ExecEndTidRangeScan(TidRangeScanState *node)
|
|
{
|
|
TableScanDesc scan = node->ss.ss_currentScanDesc;
|
|
|
|
if (scan != NULL)
|
|
table_endscan(scan);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecInitTidRangeScan
|
|
*
|
|
* Initializes the tid range scan's state information, creates
|
|
* scan keys, and opens the scan relation.
|
|
*
|
|
* Parameters:
|
|
* node: TidRangeScan node produced by the planner.
|
|
* estate: the execution state initialized in InitPlan.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
TidRangeScanState *
|
|
ExecInitTidRangeScan(TidRangeScan *node, EState *estate, int eflags)
|
|
{
|
|
TidRangeScanState *tidrangestate;
|
|
Relation currentRelation;
|
|
|
|
/*
|
|
* create state structure
|
|
*/
|
|
tidrangestate = makeNode(TidRangeScanState);
|
|
tidrangestate->ss.ps.plan = (Plan *) node;
|
|
tidrangestate->ss.ps.state = estate;
|
|
tidrangestate->ss.ps.ExecProcNode = ExecTidRangeScan;
|
|
|
|
/*
|
|
* Miscellaneous initialization
|
|
*
|
|
* create expression context for node
|
|
*/
|
|
ExecAssignExprContext(estate, &tidrangestate->ss.ps);
|
|
|
|
/*
|
|
* mark scan as not in progress, and TID range as not computed yet
|
|
*/
|
|
tidrangestate->trss_inScan = false;
|
|
|
|
/*
|
|
* open the scan relation
|
|
*/
|
|
currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
|
|
|
|
tidrangestate->ss.ss_currentRelation = currentRelation;
|
|
tidrangestate->ss.ss_currentScanDesc = NULL; /* no table scan here */
|
|
|
|
/*
|
|
* get the scan type from the relation descriptor.
|
|
*/
|
|
ExecInitScanTupleSlot(estate, &tidrangestate->ss,
|
|
RelationGetDescr(currentRelation),
|
|
table_slot_callbacks(currentRelation),
|
|
TTS_FLAG_OBEYS_NOT_NULL_CONSTRAINTS);
|
|
|
|
/*
|
|
* Initialize result type and projection.
|
|
*/
|
|
ExecInitResultTypeTL(&tidrangestate->ss.ps);
|
|
ExecAssignScanProjectionInfo(&tidrangestate->ss);
|
|
|
|
/*
|
|
* initialize child expressions
|
|
*/
|
|
tidrangestate->ss.ps.qual =
|
|
ExecInitQual(node->scan.plan.qual, (PlanState *) tidrangestate);
|
|
|
|
TidExprListCreate(tidrangestate);
|
|
|
|
/*
|
|
* all done.
|
|
*/
|
|
return tidrangestate;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* Parallel Scan Support
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecTidRangeScanEstimate
|
|
*
|
|
* Compute the amount of space we'll need in the parallel
|
|
* query DSM, and inform pcxt->estimator about our needs.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void
|
|
ExecTidRangeScanEstimate(TidRangeScanState *node, ParallelContext *pcxt)
|
|
{
|
|
EState *estate = node->ss.ps.state;
|
|
|
|
node->trss_pscanlen =
|
|
table_parallelscan_estimate(node->ss.ss_currentRelation,
|
|
estate->es_snapshot);
|
|
shm_toc_estimate_chunk(&pcxt->estimator, node->trss_pscanlen);
|
|
shm_toc_estimate_keys(&pcxt->estimator, 1);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecTidRangeScanInitializeDSM
|
|
*
|
|
* Set up a parallel TID range scan descriptor.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void
|
|
ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt)
|
|
{
|
|
EState *estate = node->ss.ps.state;
|
|
ParallelTableScanDesc pscan;
|
|
|
|
pscan = shm_toc_allocate(pcxt->toc, node->trss_pscanlen);
|
|
table_parallelscan_initialize(node->ss.ss_currentRelation,
|
|
pscan,
|
|
estate->es_snapshot);
|
|
shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan);
|
|
node->ss.ss_currentScanDesc =
|
|
table_beginscan_parallel_tidrange(node->ss.ss_currentRelation,
|
|
pscan);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecTidRangeScanReInitializeDSM
|
|
*
|
|
* Reset shared state before beginning a fresh scan.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void
|
|
ExecTidRangeScanReInitializeDSM(TidRangeScanState *node,
|
|
ParallelContext *pcxt)
|
|
{
|
|
ParallelTableScanDesc pscan;
|
|
|
|
pscan = node->ss.ss_currentScanDesc->rs_parallel;
|
|
table_parallelscan_reinitialize(node->ss.ss_currentRelation, pscan);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecTidRangeScanInitializeWorker
|
|
*
|
|
* Copy relevant information from TOC into planstate.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void
|
|
ExecTidRangeScanInitializeWorker(TidRangeScanState *node,
|
|
ParallelWorkerContext *pwcxt)
|
|
{
|
|
ParallelTableScanDesc pscan;
|
|
|
|
pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
|
|
node->ss.ss_currentScanDesc =
|
|
table_beginscan_parallel_tidrange(node->ss.ss_currentRelation,
|
|
pscan);
|
|
}
|