postgresql/src/backend/executor/nodeBitmapIndexscan.c


/*-------------------------------------------------------------------------
*
* nodeBitmapIndexscan.c
* Routines to support bitmapped index scans of relations
*
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/executor/nodeBitmapIndexscan.c
*
*-------------------------------------------------------------------------
*/
/*
* INTERFACE ROUTINES
 *		MultiExecBitmapIndexScan	scans a relation using an index.
* ExecInitBitmapIndexScan creates and initializes state info.
* ExecReScanBitmapIndexScan prepares to rescan the plan.
* ExecEndBitmapIndexScan releases all storage.
*/
#include "postgres.h"
#include "access/genam.h"
#include "executor/executor.h"
#include "executor/nodeBitmapIndexscan.h"
#include "executor/nodeIndexscan.h"
#include "miscadmin.h"
/* ----------------------------------------------------------------
* ExecBitmapIndexScan
*
* stub for pro forma compliance
* ----------------------------------------------------------------
*/
static TupleTableSlot *
ExecBitmapIndexScan(PlanState *pstate)
{
elog(ERROR, "BitmapIndexScan node does not support ExecProcNode call convention");
return NULL;
}
/* ----------------------------------------------------------------
* MultiExecBitmapIndexScan(node)
* ----------------------------------------------------------------
*/
Node *
MultiExecBitmapIndexScan(BitmapIndexScanState *node)
{
TIDBitmap *tbm;
IndexScanDesc scandesc;
double nTuples = 0;
bool doscan;
/* must provide our own instrumentation support */
if (node->ss.ps.instrument)
InstrStartNode(node->ss.ps.instrument);
/*
* extract necessary information from index scan node
*/
scandesc = node->biss_ScanDesc;
/*
* If we have runtime keys and they've not already been set up, do it now.
* Array keys are also treated as runtime keys; note that if ExecReScan
* returns with biss_RuntimeKeysReady still false, then there is an empty
* array key so we should do nothing.
*/
if (!node->biss_RuntimeKeysReady &&
(node->biss_NumRuntimeKeys != 0 || node->biss_NumArrayKeys != 0))
{
ExecReScan((PlanState *) node);
doscan = node->biss_RuntimeKeysReady;
}
else
doscan = true;
/*
* Prepare the result bitmap. Normally we just create a new one to pass
* back; however, our parent node is allowed to store a pre-made one into
* node->biss_result, in which case we just OR our tuple IDs into the
* existing bitmap. (This saves needing explicit UNION steps.)
*/
if (node->biss_result)
{
tbm = node->biss_result;
node->biss_result = NULL; /* reset for next time */
}
else
{
/* XXX should we use less than work_mem for this? */
tbm = tbm_create(work_mem * (Size) 1024,
((BitmapIndexScan *) node->ss.ps.plan)->isshared ?
node->ss.ps.state->es_query_dsa : NULL);
}
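
	/*
	 * A note on the tbm_create() call above: work_mem is measured in
	 * kilobytes, so it is multiplied by 1024 to get a byte budget, and the
	 * (Size) cast makes that multiplication happen in a type wide enough to
	 * avoid integer overflow with very large work_mem settings.  Passing the
	 * query's DSA area rather than NULL makes the bitmap live in shared
	 * memory, as required when its contents must be visible to parallel
	 * workers.
	 */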
/*
* Get TIDs from index and insert into bitmap
*/
while (doscan)
{
nTuples += (double) index_getbitmap(scandesc, tbm);
CHECK_FOR_INTERRUPTS();
doscan = ExecIndexAdvanceArrayKeys(node->biss_ArrayKeys,
node->biss_NumArrayKeys);
if (doscan) /* reset index scan */
index_rescan(node->biss_ScanDesc,
node->biss_ScanKeys, node->biss_NumScanKeys,
NULL, 0);
}
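
	/*
	 * For illustration: given a qual such as WHERE x = ANY ('{1,2,3}')
	 * against an index AM without native ScalarArrayOpExpr support, the loop
	 * above runs one primitive index scan per array element, with
	 * ExecIndexAdvanceArrayKeys() stepping the scan keys to the next element
	 * each time; all resulting TIDs are simply OR'ed into the one bitmap.
	 */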
/* must provide our own instrumentation support */
if (node->ss.ps.instrument)
InstrStopNode(node->ss.ps.instrument, nTuples);
return (Node *) tbm;
}
/* ----------------------------------------------------------------
* ExecReScanBitmapIndexScan(node)
*
* Recalculates the values of any scan keys whose value depends on
* information known at runtime, then rescans the indexed relation.
* ----------------------------------------------------------------
*/
void
ExecReScanBitmapIndexScan(BitmapIndexScanState *node)
{
ExprContext *econtext = node->biss_RuntimeContext;
/*
* Reset the runtime-key context so we don't leak memory as each outer
* tuple is scanned. Note this assumes that we will recalculate *all*
* runtime keys on each call.
*/
if (econtext)
ResetExprContext(econtext);
/*
* If we are doing runtime key calculations (ie, any of the index key
* values weren't simple Consts), compute the new key values.
*
* Array keys are also treated as runtime keys; note that if we return
* with biss_RuntimeKeysReady still false, then there is an empty array
* key so no index scan is needed.
*/
if (node->biss_NumRuntimeKeys != 0)
ExecIndexEvalRuntimeKeys(econtext,
node->biss_RuntimeKeys,
node->biss_NumRuntimeKeys);
if (node->biss_NumArrayKeys != 0)
node->biss_RuntimeKeysReady =
ExecIndexEvalArrayKeys(econtext,
node->biss_ArrayKeys,
node->biss_NumArrayKeys);
else
node->biss_RuntimeKeysReady = true;
/* reset index scan */
if (node->biss_RuntimeKeysReady)
index_rescan(node->biss_ScanDesc,
node->biss_ScanKeys, node->biss_NumScanKeys,
NULL, 0);
}
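
/*
 * Worked example of the "empty array key" case mentioned above: given a qual
 * like WHERE x = ANY ('{}'::int[]), ExecIndexEvalArrayKeys() returns false,
 * biss_RuntimeKeysReady stays false, and MultiExecBitmapIndexScan therefore
 * skips the index scan entirely, correctly producing an empty bitmap.
 */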
/* ----------------------------------------------------------------
* ExecEndBitmapIndexScan
* ----------------------------------------------------------------
*/
void
ExecEndBitmapIndexScan(BitmapIndexScanState *node)
{
Relation indexRelationDesc;
IndexScanDesc indexScanDesc;
/*
* extract information from the node
*/
indexRelationDesc = node->biss_RelationDesc;
indexScanDesc = node->biss_ScanDesc;
/*
 * When ending a parallel worker, copy the statistics gathered by the
 * worker back into shared memory so that they can be picked up by the
 * main process and reported in EXPLAIN ANALYZE
*/
if (node->biss_SharedInfo != NULL && IsParallelWorker())
{
IndexScanInstrumentation *winstrument;
Assert(ParallelWorkerNumber <= node->biss_SharedInfo->num_workers);
winstrument = &node->biss_SharedInfo->winstrument[ParallelWorkerNumber];
/*
* We have to accumulate the stats rather than performing a memcpy.
 * When a Gather/GatherMerge node finishes it will perform executor
* shutdown on the workers. On rescan it will spin up new workers
* which will have a new BitmapIndexScanState and zeroed stats.
*/
winstrument->nsearches += node->biss_Instrument.nsearches;
}
/*
* close the index relation (no-op if we didn't open it)
*/
if (indexScanDesc)
index_endscan(indexScanDesc);
if (indexRelationDesc)
index_close(indexRelationDesc, NoLock);
}
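
/*
 * Note: passing NoLock to index_close() above does not release the lock
 * acquired in ExecInitBitmapIndexScan; per usual executor practice, that
 * lock is held until end of transaction.
 */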
/* ----------------------------------------------------------------
* ExecInitBitmapIndexScan
*
* Initializes the index scan's state information.
* ----------------------------------------------------------------
*/
BitmapIndexScanState *
ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags)
{
BitmapIndexScanState *indexstate;
LOCKMODE lockmode;
/* check for unsupported flags */
Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
/*
* create state structure
*/
indexstate = makeNode(BitmapIndexScanState);
indexstate->ss.ps.plan = (Plan *) node;
indexstate->ss.ps.state = estate;
indexstate->ss.ps.ExecProcNode = ExecBitmapIndexScan;
/* normally we don't make the result bitmap till runtime */
indexstate->biss_result = NULL;
/*
* We do not open or lock the base relation here. We assume that an
* ancestor BitmapHeapScan node is holding AccessShareLock (or better) on
* the heap relation throughout the execution of the plan tree.
*/
indexstate->ss.ss_currentRelation = NULL;
indexstate->ss.ss_currentScanDesc = NULL;
/*
* Miscellaneous initialization
*
* We do not need a standard exprcontext for this node, though we may
* decide below to create a runtime-key exprcontext
*/
/*
* initialize child expressions
*
 * We don't need to initialize targetlist or qual since neither is used.
*
* Note: we don't initialize all of the indexqual expression, only the
* sub-parts corresponding to runtime keys (see below).
*/
/*
* If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
* here. This allows an index-advisor plugin to EXPLAIN a plan containing
* references to nonexistent indexes.
*/
if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
return indexstate;
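
	/*
	 * For example, a hypothetical-index advisor can inject an indexid that
	 * matches no real catalog entry; so long as the plan is only EXPLAINed,
	 * we return before the index_open() below and never notice the
	 * difference.
	 */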
/* Open the index relation. */
lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode;
indexstate->biss_RelationDesc = index_open(node->indexid, lockmode);
/*
* Initialize index-specific scan state
*/
indexstate->biss_RuntimeKeysReady = false;
indexstate->biss_RuntimeKeys = NULL;
indexstate->biss_NumRuntimeKeys = 0;
/*
* build the index scan keys from the index qualification
*/
ExecIndexBuildScanKeys((PlanState *) indexstate,
indexstate->biss_RelationDesc,
node->indexqual,
false,
&indexstate->biss_ScanKeys,
&indexstate->biss_NumScanKeys,
&indexstate->biss_RuntimeKeys,
&indexstate->biss_NumRuntimeKeys,
&indexstate->biss_ArrayKeys,
&indexstate->biss_NumArrayKeys);
/*
* If we have runtime keys or array keys, we need an ExprContext to
* evaluate them. We could just create a "standard" plan node exprcontext,
* but to keep the code looking similar to nodeIndexscan.c, it seems
* better to stick with the approach of using a separate ExprContext.
*/
if (indexstate->biss_NumRuntimeKeys != 0 ||
indexstate->biss_NumArrayKeys != 0)
{
ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;
ExecAssignExprContext(estate, &indexstate->ss.ps);
indexstate->biss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
indexstate->ss.ps.ps_ExprContext = stdecontext;
}
else
{
indexstate->biss_RuntimeContext = NULL;
}
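
	/*
	 * Note on the swap above: ExecAssignExprContext() unconditionally stores
	 * its result in ps_ExprContext, so the code saves the existing value
	 * (NULL here, since this node has no standard exprcontext), lets a fresh
	 * context be built for the runtime keys, and then puts the saved value
	 * back.  Keeping the runtime-key context separate lets
	 * ExecReScanBitmapIndexScan reset it for each outer tuple without
	 * affecting anything else.
	 */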
/*
* Initialize scan descriptor.
*/
indexstate->biss_ScanDesc =
index_beginscan_bitmap(indexstate->biss_RelationDesc,
estate->es_snapshot,
&indexstate->biss_Instrument,
indexstate->biss_NumScanKeys);
/*
* If no run-time keys to calculate, go ahead and pass the scankeys to the
* index AM.
*/
if (indexstate->biss_NumRuntimeKeys == 0 &&
indexstate->biss_NumArrayKeys == 0)
index_rescan(indexstate->biss_ScanDesc,
indexstate->biss_ScanKeys, indexstate->biss_NumScanKeys,
NULL, 0);
/*
* all done.
*/
return indexstate;
}
/* ----------------------------------------------------------------
* ExecBitmapIndexScanEstimate
*
* Compute the amount of space we'll need in the parallel
* query DSM, and inform pcxt->estimator about our needs.
* ----------------------------------------------------------------
*/
void
ExecBitmapIndexScanEstimate(BitmapIndexScanState *node, ParallelContext *pcxt)
{
Size size;
/*
* Parallel bitmap index scans are not supported, but we still need to
* store the scan's instrumentation in DSM during parallel query
*/
if (!node->ss.ps.instrument || pcxt->nworkers == 0)
return;
size = offsetof(SharedIndexScanInstrumentation, winstrument) +
pcxt->nworkers * sizeof(IndexScanInstrumentation);
shm_toc_estimate_chunk(&pcxt->estimator, size);
shm_toc_estimate_keys(&pcxt->estimator, 1);
}
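
/*
 * Layout of the chunk estimated above: SharedIndexScanInstrumentation ends
 * in a flexible array member, so the DSM space is a fixed-size header
 * followed by one IndexScanInstrumentation slot per possible worker, e.g.
 * for pcxt->nworkers == 4:
 *
 *		[header | winstrument[0] | winstrument[1] | winstrument[2] | winstrument[3]]
 */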
/* ----------------------------------------------------------------
* ExecBitmapIndexScanInitializeDSM
*
* Set up bitmap index scan shared instrumentation.
* ----------------------------------------------------------------
*/
void
ExecBitmapIndexScanInitializeDSM(BitmapIndexScanState *node,
ParallelContext *pcxt)
{
Size size;
/* don't need this if not instrumenting or no workers */
if (!node->ss.ps.instrument || pcxt->nworkers == 0)
return;
size = offsetof(SharedIndexScanInstrumentation, winstrument) +
pcxt->nworkers * sizeof(IndexScanInstrumentation);
node->biss_SharedInfo =
(SharedIndexScanInstrumentation *) shm_toc_allocate(pcxt->toc,
size);
shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
node->biss_SharedInfo);
/* Each per-worker area must start out as zeroes */
memset(node->biss_SharedInfo, 0, size);
node->biss_SharedInfo->num_workers = pcxt->nworkers;
}
/* ----------------------------------------------------------------
* ExecBitmapIndexScanInitializeWorker
*
* Copy relevant information from TOC into planstate.
* ----------------------------------------------------------------
*/
void
ExecBitmapIndexScanInitializeWorker(BitmapIndexScanState *node,
ParallelWorkerContext *pwcxt)
{
/* don't need this if not instrumenting */
if (!node->ss.ps.instrument)
return;
node->biss_SharedInfo = (SharedIndexScanInstrumentation *)
shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
}
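
/*
 * Note: the "false" last argument to shm_toc_lookup() above means a missing
 * TOC entry raises an error rather than returning NULL.  That is safe here
 * because the leader always creates the entry when instrumentation is
 * enabled and any workers were planned.
 */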
/* ----------------------------------------------------------------
* ExecBitmapIndexScanRetrieveInstrumentation
*
* Transfer bitmap index scan statistics from DSM to private memory.
* ----------------------------------------------------------------
*/
void
ExecBitmapIndexScanRetrieveInstrumentation(BitmapIndexScanState *node)
{
SharedIndexScanInstrumentation *SharedInfo = node->biss_SharedInfo;
size_t size;
if (SharedInfo == NULL)
return;
/* Create a copy of SharedInfo in backend-local memory */
size = offsetof(SharedIndexScanInstrumentation, winstrument) +
SharedInfo->num_workers * sizeof(IndexScanInstrumentation);
node->biss_SharedInfo = palloc(size);
memcpy(node->biss_SharedInfo, SharedInfo, size);
}
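
/*
 * The palloc'd copy above matters: the DSM segment holding SharedInfo is
 * destroyed when the parallel context is torn down, which can happen before
 * EXPLAIN ANALYZE prints the per-worker search counts, so the stats must
 * first be pulled into backend-local memory.
 */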