postgresql/src/backend/executor/nodeLockRows.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

410 lines
11 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* nodeLockRows.c
* Routines to handle FOR UPDATE/FOR SHARE row locking
*
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
2010-09-20 16:08:53 -04:00
* src/backend/executor/nodeLockRows.c
*
*-------------------------------------------------------------------------
*/
/*
* INTERFACE ROUTINES
* ExecLockRows - fetch locked rows
* ExecInitLockRows - initialize node and subnodes..
* ExecEndLockRows - shutdown node and subnodes
*/
#include "postgres.h"
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-23 22:55:57 -04:00
#include "access/tableam.h"
#include "access/xact.h"
#include "executor/executor.h"
#include "executor/nodeLockRows.h"
#include "foreign/fdwapi.h"
#include "miscadmin.h"
#include "utils/rel.h"
/* ----------------------------------------------------------------
* ExecLockRows
* ----------------------------------------------------------------
*/
static TupleTableSlot * /* return: a tuple or NULL */
ExecLockRows(PlanState *pstate)
{
LockRowsState *node = castNode(LockRowsState, pstate);
TupleTableSlot *slot;
EState *estate;
PlanState *outerPlan;
bool epq_needed;
ListCell *lc;
CHECK_FOR_INTERRUPTS();
/*
* get information from the node
*/
estate = node->ps.state;
outerPlan = outerPlanState(node);
/*
* Get next tuple from subplan, if any.
*/
lnext:
slot = ExecProcNode(outerPlan);
if (TupIsNull(slot))
{
/* Release any resources held by EPQ mechanism before exiting */
EvalPlanQualEnd(&node->lr_epqstate);
return NULL;
}
/* We don't need EvalPlanQual unless we get updated tuple version(s) */
epq_needed = false;
/*
* Attempt to lock the source tuple(s). (Note we only have locking
* rowmarks in lr_arowMarks.)
*/
foreach(lc, node->lr_arowMarks)
{
ExecAuxRowMark *aerm = (ExecAuxRowMark *) lfirst(lc);
ExecRowMark *erm = aerm->rowmark;
Datum datum;
bool isNull;
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-23 22:55:57 -04:00
ItemPointerData tid;
TM_FailureData tmfd;
LockTupleMode lockmode;
int lockflags = 0;
TM_Result test;
TupleTableSlot *markSlot;
/* clear any leftover test tuple for this rel */
markSlot = EvalPlanQualSlot(&node->lr_epqstate, erm->relation, erm->rti);
ExecClearTuple(markSlot);
/* if child rel, must check whether it produced this row */
if (erm->rti != erm->prti)
{
Oid tableoid;
datum = ExecGetJunkAttribute(slot,
aerm->toidAttNo,
&isNull);
/* shouldn't ever get a null result... */
if (isNull)
elog(ERROR, "tableoid is NULL");
tableoid = DatumGetObjectId(datum);
Allow foreign tables to participate in inheritance. Foreign tables can now be inheritance children, or parents. Much of the system was already ready for this, but we had to fix a few things of course, mostly in the area of planner and executor handling of row locks. As side effects of this, allow foreign tables to have NOT VALID CHECK constraints (and hence to accept ALTER ... VALIDATE CONSTRAINT), and to accept ALTER SET STORAGE and ALTER SET WITH/WITHOUT OIDS. Continuing to disallow these things would've required bizarre and inconsistent special cases in inheritance behavior. Since foreign tables don't enforce CHECK constraints anyway, a NOT VALID one is a complete no-op, but that doesn't mean we shouldn't allow it. And it's possible that some FDWs might have use for SET STORAGE or SET WITH OIDS, though doubtless they will be no-ops for most. An additional change in support of this is that when a ModifyTable node has multiple target tables, they will all now be explicitly identified in EXPLAIN output, for example: Update on pt1 (cost=0.00..321.05 rows=3541 width=46) Update on pt1 Foreign Update on ft1 Foreign Update on ft2 Update on child3 -> Seq Scan on pt1 (cost=0.00..0.00 rows=1 width=46) -> Foreign Scan on ft1 (cost=100.00..148.03 rows=1170 width=46) -> Foreign Scan on ft2 (cost=100.00..148.03 rows=1170 width=46) -> Seq Scan on child3 (cost=0.00..25.00 rows=1200 width=46) This was done mainly to provide an unambiguous place to attach "Remote SQL" fields, but it is useful for inherited updates even when no foreign tables are involved. Shigeru Hanada and Etsuro Fujita, reviewed by Ashutosh Bapat and Kyotaro Horiguchi, some additional hacking by me
2015-03-22 13:53:11 -04:00
Assert(OidIsValid(erm->relid));
if (tableoid != erm->relid)
{
/* this child is inactive right now */
erm->ermActive = false;
ItemPointerSetInvalid(&(erm->curCtid));
continue;
}
}
erm->ermActive = true;
/* fetch the tuple's ctid */
datum = ExecGetJunkAttribute(slot,
aerm->ctidAttNo,
&isNull);
/* shouldn't ever get a null result... */
if (isNull)
elog(ERROR, "ctid is NULL");
/* requests for foreign tables must be passed to their FDW */
if (erm->relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
{
FdwRoutine *fdwroutine;
bool updated = false;
fdwroutine = GetFdwRoutineForRelation(erm->relation, false);
/* this should have been checked already, but let's be safe */
if (fdwroutine->RefetchForeignRow == NULL)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot lock rows in foreign table \"%s\"",
RelationGetRelationName(erm->relation))));
fdwroutine->RefetchForeignRow(estate,
erm,
datum,
markSlot,
&updated);
if (TupIsNull(markSlot))
{
/* couldn't get the lock, so skip this row */
goto lnext;
}
/*
* if FDW says tuple was updated before getting locked, we need to
* perform EPQ testing to see if quals are still satisfied
*/
if (updated)
epq_needed = true;
continue;
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-23 22:55:57 -04:00
/* okay, try to lock (and fetch) the tuple */
tid = *((ItemPointer) DatumGetPointer(datum));
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 10:04:59 -05:00
switch (erm->markType)
{
case ROW_MARK_EXCLUSIVE:
lockmode = LockTupleExclusive;
break;
case ROW_MARK_NOKEYEXCLUSIVE:
lockmode = LockTupleNoKeyExclusive;
break;
case ROW_MARK_SHARE:
lockmode = LockTupleShare;
break;
case ROW_MARK_KEYSHARE:
lockmode = LockTupleKeyShare;
break;
default:
elog(ERROR, "unsupported rowmark type");
lockmode = LockTupleNoKeyExclusive; /* keep compiler quiet */
break;
}
lockflags = TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS;
if (!IsolationUsesXactSnapshot())
lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION;
test = table_tuple_lock(erm->relation, &tid, estate->es_snapshot,
markSlot, estate->es_output_cid,
lockmode, erm->waitPolicy,
lockflags,
&tmfd);
switch (test)
{
case TM_WouldBlock:
/* couldn't lock tuple in SKIP LOCKED mode */
goto lnext;
case TM_SelfModified:
/*
* The target tuple was already updated or deleted by the
* current command, or by a later command in the current
* transaction. We *must* ignore the tuple in the former
* case, so as to avoid the "Halloween problem" of repeated
* update attempts. In the latter case it might be sensible
* to fetch the updated tuple instead, but doing so would
* require changing heap_update and heap_delete to not
* complain about updating "invisible" tuples, which seems
* pretty scary (table_tuple_lock will not complain, but few
* callers expect TM_Invisible, and we're not one of them). So
* for now, treat the tuple as deleted and do not process.
*/
goto lnext;
case TM_Ok:
/*
* Got the lock successfully, the locked tuple saved in
* markSlot for, if needed, EvalPlanQual testing below.
*/
if (tmfd.traversed)
epq_needed = true;
break;
case TM_Updated:
if (IsolationUsesXactSnapshot())
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
elog(ERROR, "unexpected table_tuple_lock status: %u",
test);
break;
case TM_Deleted:
if (IsolationUsesXactSnapshot())
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
/* tuple was deleted so don't return it */
goto lnext;
case TM_Invisible:
elog(ERROR, "attempted to lock invisible tuple");
break;
default:
elog(ERROR, "unrecognized table_tuple_lock status: %u",
test);
}
/* Remember locked tuple's TID for EPQ testing and WHERE CURRENT OF */
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-23 22:55:57 -04:00
erm->curCtid = tid;
}
/*
* If we need to do EvalPlanQual testing, do so.
*/
if (epq_needed)
{
Reorder EPQ work, to fix rowmark related bugs and improve efficiency. In ad0bda5d24ea I changed the EvalPlanQual machinery to store substitution tuples in slot, instead of using plain HeapTuples. The main motivation for that was that using HeapTuples will be inefficient for future tableams. But it turns out that that conversion was buggy for non-locking rowmarks - the wrong tuple descriptor was used to create the slot. As a secondary issue 5db6df0c0 changed ExecLockRows() to begin EPQ earlier, to allow to fetch the locked rows directly into the EPQ slots, instead of having to copy tuples around. Unfortunately, as Tom complained, that forces some expensive initialization to happen earlier. As a third issue, the test coverage for EPQ was clearly insufficient. Fixing the first issue is unfortunately not trivial: Non-locked row marks were fetched at the start of EPQ, and we don't have the type information for the rowmarks available at that point. While we could change that, it's not easy. It might be worthwhile to change that at some point, but to fix this bug, it seems better to delay fetching non-locking rowmarks when they're actually needed, rather than eagerly. They're referenced at most once, and in cases where EPQ fails, might never be referenced. Fetching them when needed also increases locality a bit. To be able to fetch rowmarks during execution, rather than initialization, we need to be able to access the active EPQState, as that contains necessary data. To do so move EPQ related data from EState to EPQState, and, only for EStates creates as part of EPQ, reference the associated EPQState from EState. To fix the second issue, change EPQ initialization to allow use of EvalPlanQualSlot() to be used before EvalPlanQualBegin() (but obviously still requiring EvalPlanQualInit() to have been done). As these changes made struct EState harder to understand, e.g. by adding multiple EStates, significantly reorder the members, and add a lot more comments. Also add a few more EPQ tests, including one that fails for the first issue above. More is needed. Reported-By: yi huang Author: Andres Freund Reviewed-By: Tom Lane Discussion: https://postgr.es/m/CAHU7rYZo_C4ULsAx_LAj8az9zqgrD8WDd4hTegDTMM1LMqrBsg@mail.gmail.com https://postgr.es/m/24530.1562686693@sss.pgh.pa.us Backpatch: 12-, where the EPQ changes were introduced
2019-09-05 16:00:20 -04:00
/* Initialize EPQ machinery */
EvalPlanQualBegin(&node->lr_epqstate);
/*
Reorder EPQ work, to fix rowmark related bugs and improve efficiency. In ad0bda5d24ea I changed the EvalPlanQual machinery to store substitution tuples in slot, instead of using plain HeapTuples. The main motivation for that was that using HeapTuples will be inefficient for future tableams. But it turns out that that conversion was buggy for non-locking rowmarks - the wrong tuple descriptor was used to create the slot. As a secondary issue 5db6df0c0 changed ExecLockRows() to begin EPQ earlier, to allow to fetch the locked rows directly into the EPQ slots, instead of having to copy tuples around. Unfortunately, as Tom complained, that forces some expensive initialization to happen earlier. As a third issue, the test coverage for EPQ was clearly insufficient. Fixing the first issue is unfortunately not trivial: Non-locked row marks were fetched at the start of EPQ, and we don't have the type information for the rowmarks available at that point. While we could change that, it's not easy. It might be worthwhile to change that at some point, but to fix this bug, it seems better to delay fetching non-locking rowmarks when they're actually needed, rather than eagerly. They're referenced at most once, and in cases where EPQ fails, might never be referenced. Fetching them when needed also increases locality a bit. To be able to fetch rowmarks during execution, rather than initialization, we need to be able to access the active EPQState, as that contains necessary data. To do so move EPQ related data from EState to EPQState, and, only for EStates creates as part of EPQ, reference the associated EPQState from EState. To fix the second issue, change EPQ initialization to allow use of EvalPlanQualSlot() to be used before EvalPlanQualBegin() (but obviously still requiring EvalPlanQualInit() to have been done). As these changes made struct EState harder to understand, e.g. by adding multiple EStates, significantly reorder the members, and add a lot more comments. Also add a few more EPQ tests, including one that fails for the first issue above. More is needed. Reported-By: yi huang Author: Andres Freund Reviewed-By: Tom Lane Discussion: https://postgr.es/m/CAHU7rYZo_C4ULsAx_LAj8az9zqgrD8WDd4hTegDTMM1LMqrBsg@mail.gmail.com https://postgr.es/m/24530.1562686693@sss.pgh.pa.us Backpatch: 12-, where the EPQ changes were introduced
2019-09-05 16:00:20 -04:00
* To fetch non-locked source rows the EPQ logic needs to access junk
* columns from the tuple being tested.
*/
EvalPlanQualSetSlot(&node->lr_epqstate, slot);
2010-02-25 21:01:40 -05:00
/*
* And finally we can re-evaluate the tuple.
*/
slot = EvalPlanQualNext(&node->lr_epqstate);
if (TupIsNull(slot))
{
/* Updated tuple fails qual, so ignore it and go on */
goto lnext;
}
}
/* Got all locks, so return the current tuple */
return slot;
}
/* ----------------------------------------------------------------
* ExecInitLockRows
*
* This initializes the LockRows node state structures and
* the node's subplan.
* ----------------------------------------------------------------
*/
LockRowsState *
ExecInitLockRows(LockRows *node, EState *estate, int eflags)
{
LockRowsState *lrstate;
Plan *outerPlan = outerPlan(node);
List *epq_arowmarks;
ListCell *lc;
/* check for unsupported flags */
Assert(!(eflags & EXEC_FLAG_MARK));
/*
* create state structure
*/
lrstate = makeNode(LockRowsState);
lrstate->ps.plan = (Plan *) node;
lrstate->ps.state = estate;
lrstate->ps.ExecProcNode = ExecLockRows;
/*
* Miscellaneous initialization
*
* LockRows nodes never call ExecQual or ExecProject, therefore no
* ExprContext is needed.
*/
/*
Don't require return slots for nodes without projection. In a lot of nodes the return slot is not required. That can either be because the node doesn't do any projection (say an Append node), or because the node does perform projections but the projection is optimized away because the projection would yield an identical row. Slots aren't that small, especially for wide rows, so it's worthwhile to avoid creating them. It's not possible to just skip creating the slot - it's currently used to determine the tuple descriptor returned by ExecGetResultType(). So separate the determination of the result type from the slot creation. The work previously done internally ExecInitResultTupleSlotTL() can now also be done separately with ExecInitResultTypeTL() and ExecInitResultSlot(). That way nodes that aren't guaranteed to need a result slot, can use ExecInitResultTypeTL() to determine the result type of the node, and ExecAssignScanProjectionInfo() (via ExecConditionalAssignProjectionInfo()) determines that a result slot is needed, it is created with ExecInitResultSlot(). Besides the advantage of avoiding to create slots that then are unused, this is necessary preparation for later patches around tuple table slot abstraction. In particular separating the return descriptor and slot is a prerequisite to allow JITing of tuple deforming with knowledge of the underlying tuple format, and to avoid unnecessarily creating JITed tuple deforming for virtual slots. This commit removes a redundant argument from ExecInitResultTupleSlotTL(). While this commit touches a lot of the relevant lines anyway, it'd normally still not worthwhile to cause breakage, except that aforementioned later commits will touch *all* ExecInitResultTupleSlotTL() callers anyway (but fits worse thematically). Author: Andres Freund Discussion: https://postgr.es/m/20181105210039.hh4vvi4vwoq5ba2q@alap3.anarazel.de
2018-11-09 20:19:39 -05:00
* Initialize result type.
*/
Don't require return slots for nodes without projection. In a lot of nodes the return slot is not required. That can either be because the node doesn't do any projection (say an Append node), or because the node does perform projections but the projection is optimized away because the projection would yield an identical row. Slots aren't that small, especially for wide rows, so it's worthwhile to avoid creating them. It's not possible to just skip creating the slot - it's currently used to determine the tuple descriptor returned by ExecGetResultType(). So separate the determination of the result type from the slot creation. The work previously done internally ExecInitResultTupleSlotTL() can now also be done separately with ExecInitResultTypeTL() and ExecInitResultSlot(). That way nodes that aren't guaranteed to need a result slot, can use ExecInitResultTypeTL() to determine the result type of the node, and ExecAssignScanProjectionInfo() (via ExecConditionalAssignProjectionInfo()) determines that a result slot is needed, it is created with ExecInitResultSlot(). Besides the advantage of avoiding to create slots that then are unused, this is necessary preparation for later patches around tuple table slot abstraction. In particular separating the return descriptor and slot is a prerequisite to allow JITing of tuple deforming with knowledge of the underlying tuple format, and to avoid unnecessarily creating JITed tuple deforming for virtual slots. This commit removes a redundant argument from ExecInitResultTupleSlotTL(). While this commit touches a lot of the relevant lines anyway, it'd normally still not worthwhile to cause breakage, except that aforementioned later commits will touch *all* ExecInitResultTupleSlotTL() callers anyway (but fits worse thematically). Author: Andres Freund Discussion: https://postgr.es/m/20181105210039.hh4vvi4vwoq5ba2q@alap3.anarazel.de
2018-11-09 20:19:39 -05:00
ExecInitResultTypeTL(&lrstate->ps);
/*
* then initialize outer plan
*/
outerPlanState(lrstate) = ExecInitNode(outerPlan, estate, eflags);
Introduce notion of different types of slots (without implementing them). Upcoming work intends to allow pluggable ways to introduce new ways of storing table data. Accessing those table access methods from the executor requires TupleTableSlots to be carry tuples in the native format of such storage methods; otherwise there'll be a significant conversion overhead. Different access methods will require different data to store tuples efficiently (just like virtual, minimal, heap already require fields in TupleTableSlot). To allow that without requiring additional pointer indirections, we want to have different structs (embedding TupleTableSlot) for different types of slots. Thus different types of slots are needed, which requires adapting creators of slots. The slot that most efficiently can represent a type of tuple in an executor node will often depend on the type of slot a child node uses. Therefore we need to track the type of slot is returned by nodes, so parent slots can create slots based on that. Relatedly, JIT compilation of tuple deforming needs to know which type of slot a certain expression refers to, so it can create an appropriate deforming function for the type of tuple in the slot. But not all nodes will only return one type of slot, e.g. an append node will potentially return different types of slots for each of its subplans. Therefore add function that allows to query the type of a node's result slot, and whether it'll always be the same type (whether it's fixed). This can be queried using ExecGetResultSlotOps(). The scan, result, inner, outer type of slots are automatically inferred from ExecInitScanTupleSlot(), ExecInitResultSlot(), left/right subtrees respectively. If that's not correct for a node, that can be overwritten using new fields in PlanState. This commit does not introduce the actually abstracted implementation of different kind of TupleTableSlots, that will be left for a followup commit. The different types of slots introduced will, for now, still use the same backing implementation. While this already partially invalidates the big comment in tuptable.h, it seems to make more sense to update it later, when the different TupleTableSlot implementations actually exist. Author: Ashutosh Bapat and Andres Freund, with changes by Amit Khandekar Discussion: https://postgr.es/m/20181105210039.hh4vvi4vwoq5ba2q@alap3.anarazel.de
2018-11-16 01:00:30 -05:00
/* node returns unmodified slots from the outer plan */
lrstate->ps.resultopsset = true;
lrstate->ps.resultops = ExecGetResultSlotOps(outerPlanState(lrstate),
&lrstate->ps.resultopsfixed);
/*
* LockRows nodes do no projections, so initialize projection info for
* this node appropriately
*/
lrstate->ps.ps_ProjInfo = NULL;
/*
* Locate the ExecRowMark(s) that this node is responsible for, and
* construct ExecAuxRowMarks for them. (InitPlan should already have
* built the global list of ExecRowMarks.)
*/
lrstate->lr_arowMarks = NIL;
epq_arowmarks = NIL;
foreach(lc, node->rowMarks)
{
PlanRowMark *rc = lfirst_node(PlanRowMark, lc);
ExecRowMark *erm;
ExecAuxRowMark *aerm;
Track unpruned relids to avoid processing pruned relations This commit introduces changes to track unpruned relations explicitly, making it possible for top-level plan nodes, such as ModifyTable and LockRows, to avoid processing partitions pruned during initial pruning. Scan-level nodes, such as Append and MergeAppend, already avoid the unnecessary processing by accessing partition pruning results directly via part_prune_index. In contrast, top-level nodes cannot access pruning results directly and need to determine which partitions remain unpruned. To address this, this commit introduces a new bitmapset field, es_unpruned_relids, which the executor uses to track the set of unpruned relations. This field is referenced during plan initialization to skip initializing certain nodes for pruned partitions. It is initialized with PlannedStmt.unprunableRelids, a new field that the planner populates with RT indexes of relations that cannot be pruned during runtime pruning. These include relations not subject to partition pruning and those required for execution regardless of pruning. PlannedStmt.unprunableRelids is computed during set_plan_refs() by removing the RT indexes of runtime-prunable relations, identified from PartitionPruneInfos, from the full set of relation RT indexes. ExecDoInitialPruning() then updates es_unpruned_relids by adding partitions that survive initial pruning. To support this, PartitionedRelPruneInfo and PartitionedRelPruningData now include a leafpart_rti_map[] array that maps partition indexes to their corresponding RT indexes. The former is used in set_plan_refs() when constructing unprunableRelids, while the latter is used in ExecDoInitialPruning() to convert partition indexes returned by get_matching_partitions() into RT indexes, which are then added to es_unpruned_relids. These changes make it possible for ModifyTable and LockRows nodes to process only relations that remain unpruned after initial pruning. ExecInitModifyTable() trims lists, such as resultRelations, withCheckOptionLists, returningLists, and updateColnosLists, to consider only unpruned partitions. It also creates ResultRelInfo structs only for these partitions. Similarly, child RowMarks for pruned relations are skipped. By avoiding unnecessary initialization of structures for pruned partitions, these changes improve the performance of updates and deletes on partitioned tables during initial runtime pruning. Due to ExecInitModifyTable() changes as described above, EXPLAIN on a plan for UPDATE and DELETE that uses runtime initial pruning no longer lists partitions pruned during initial pruning. Reviewed-by: Robert Haas <robertmhaas@gmail.com> (earlier versions) Reviewed-by: Tomas Vondra <tomas@vondra.me> Discussion: https://postgr.es/m/CA+HiwqFGkMSge6TgC9KQzde0ohpAycLQuV7ooitEEpbKB0O_mg@mail.gmail.com
2025-02-07 03:15:09 -05:00
/*
* Ignore "parent" rowmarks, because they are irrelevant at runtime.
* Also ignore the rowmarks belonging to child tables that have been
* pruned in ExecDoInitialPruning().
*/
if (rc->isParent ||
!bms_is_member(rc->rti, estate->es_unpruned_relids))
continue;
/* find ExecRowMark and build ExecAuxRowMark */
erm = ExecFindRowMark(estate, rc->rti, false);
aerm = ExecBuildAuxRowMark(erm, outerPlan->targetlist);
/*
* Only locking rowmarks go into our own list. Non-locking marks are
* passed off to the EvalPlanQual machinery. This is because we don't
* want to bother fetching non-locked rows unless we actually have to
* do an EPQ recheck.
*/
if (RowMarkRequiresRowShareLock(erm->markType))
lrstate->lr_arowMarks = lappend(lrstate->lr_arowMarks, aerm);
else
epq_arowmarks = lappend(epq_arowmarks, aerm);
}
/* Now we have the info needed to set up EPQ state */
EvalPlanQualInit(&lrstate->lr_epqstate, estate,
Fix misbehavior of EvalPlanQual checks with multiple result relations. The idea of EvalPlanQual is that we replace the query's scan of the result relation with a single injected tuple, and see if we get a tuple out, thereby implying that the injected tuple still passes the query quals. (In join cases, other relations in the query are still scanned normally.) This logic was not updated when commit 86dc90056 made it possible for a single DML query plan to have multiple result relations, when the query target relation has inheritance or partition children. We replaced the output for the current result relation successfully, but other result relations were still scanned normally; thus, if any other result relation contained a tuple satisfying the quals, we'd think the EPQ check passed, even if it did not pass for the injected tuple itself. This would lead to update or delete actions getting performed when they should have been skipped due to a conflicting concurrent update in READ COMMITTED isolation mode. Fix by blocking all sibling result relations from emitting tuples during an EvalPlanQual recheck. In the back branches, the fix is complicated a bit by the need to not change the size of struct EPQState (else we'd have ABI-breaking changes in offsets in struct ModifyTableState). Like the back-patches of 3f7836ff6 and 4b3e37993, add a separately palloc'd struct to avoid that. The logic is the same as in HEAD otherwise. This is only a live bug back to v14 where 86dc90056 came in. However, I chose to back-patch the test cases further, on the grounds that this whole area is none too well tested. I skipped doing so in v11 though because none of the test applied cleanly, and it didn't quite seem worth extra work for a branch with only six months to live. Per report from Ante Krešić (via Aleksander Alekseev) Discussion: https://postgr.es/m/CAJ7c6TMBTN3rcz4=AjYhLPD_w3FFT0Wq_C15jxCDn8U4tZnH1g@mail.gmail.com
2023-05-19 14:26:34 -04:00
outerPlan, epq_arowmarks, node->epqParam, NIL);
return lrstate;
}
/* ----------------------------------------------------------------
* ExecEndLockRows
*
* This shuts down the subplan and frees resources allocated
* to this node.
* ----------------------------------------------------------------
*/
void
ExecEndLockRows(LockRowsState *node)
{
/* We may have shut down EPQ already, but no harm in another call */
EvalPlanQualEnd(&node->lr_epqstate);
ExecEndNode(outerPlanState(node));
}
void
ExecReScanLockRows(LockRowsState *node)
{
PlanState *outerPlan = outerPlanState(node);
/*
* if chgParam of subnode is not null then plan will be re-scanned by
* first ExecProcNode.
*/
if (outerPlan->chgParam == NULL)
ExecReScan(outerPlan);
}