postgresql/src/backend/executor/instrument.c
Andres Freund 294520c444 instrumentation: Use Time-Stamp Counter on x86-64 to lower overhead
This allows the direct use of the Time-Stamp Counter (TSC) value retrieved
from the CPU using RDTSC/RDTSCP instructions, instead of APIs like
clock_gettime() on POSIX systems.

This reduces the overhead of EXPLAIN with ANALYZE and TIMING ON. Tests showed
that the overhead on top of actual runtime when instrumenting queries moving
lots of rows through the plan can be reduced from 2x as slow to 1.2x as slow
compared to the actual runtime. More complex workloads such as TPCH queries
have also shown ~20% gains when instrumented compared to before.

To control use of the TSC, the new "timing_clock_source" GUC is introduced,
whose default ("auto") automatically uses the TSC when reliable, for example
when running on modern Intel CPUs, or when running on Linux and the system
clocksource is reported as "tsc". The use of the operating system clock source
can be enforced by setting "system", or on x86-64 architectures the use of TSC
can be enforced by explicitly setting "tsc".

In order to use the TSC the frequency is first determined by use of CPUID, and
if not available, by running a short calibration loop at program start,
falling back to the system clock source if TSC values are not stable.

Note, that we split TSC usage into the RDTSC CPU instruction which does not
wait for out-of-order execution (faster, less precise) and the RDTSCP
instruction, which waits for outstanding instructions to retire. RDTSCP is
deemed to have little benefit in the typical InstrStartNode() /
InstrStopNode() use case of EXPLAIN, and can be up to twice as slow. To
separate these use cases, the new macro INSTR_TIME_SET_CURRENT_FAST() is
introduced, which uses RDTSC.

The original macro INSTR_TIME_SET_CURRENT() uses RDTSCP and is supposed to be
used when precision is more important than performance. When the system timing
clock source is used both of these macros instead utilize the system
APIs (clock_gettime / QueryPerformanceCounter) like before.

Additional users of interval timing, such as track_io_timing and
track_wal_io_timing could also benefit from being converted to use
INSTR_TIME_SET_CURRENT_FAST() but are left for future changes.

Author: Lukas Fittl <lukas@fittl.com>
Author: Andres Freund <andres@anarazel.de>
Author: David Geier <geidav.pg@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: David Geier <geidav.pg@gmail.com>
Reviewed-by: Lukas Fittl <lukas@fittl.com>
Reviewed-by: Zsolt Parragi <zsolt.parragi@percona.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com> (in an earlier version)
Reviewed-by: Maciek Sakrejda <m.sakrejda@gmail.com> (in an earlier version)
Reviewed-by: Robert Haas <robertmhaas@gmail.com> (in an earlier version)
Reviewed-by: Jakub Wartak <jakub.wartak@enterprisedb.com> (in an earlier version)
Discussion: https://postgr.es/m/20200612232810.f46nbqkdhbutzqdg@alap3.anarazel.de
2026-04-07 13:00:24 -04:00

421 lines
12 KiB
C

/*-------------------------------------------------------------------------
*
* instrument.c
* functions for instrumentation of plan execution
*
*
* Copyright (c) 2001-2026, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/executor/instrument.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <unistd.h>
#include "executor/instrument.h"
#include "portability/instr_time.h"
#include "utils/guc_hooks.h"
BufferUsage pgBufferUsage;
static BufferUsage save_pgBufferUsage;
WalUsage pgWalUsage;
static WalUsage save_pgWalUsage;
static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
static void WalUsageAdd(WalUsage *dst, WalUsage *add);
/* General purpose instrumentation handling */
Instrumentation *
InstrAlloc(int instrument_options)
{
Instrumentation *instr = palloc0_object(Instrumentation);
InstrInitOptions(instr, instrument_options);
return instr;
}
void
InstrInitOptions(Instrumentation *instr, int instrument_options)
{
instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0;
instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0;
instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
}
void
InstrStart(Instrumentation *instr)
{
if (instr->need_timer)
{
if (!INSTR_TIME_IS_ZERO(instr->starttime))
elog(ERROR, "InstrStart called twice in a row");
else
INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
}
/* save buffer usage totals at start, if needed */
if (instr->need_bufusage)
instr->bufusage_start = pgBufferUsage;
if (instr->need_walusage)
instr->walusage_start = pgWalUsage;
}
/*
* Helper for InstrStop() and InstrStopNode(), to avoid code duplication
* despite slightly different needs about how time is accumulated.
*/
static inline void
InstrStopCommon(Instrumentation *instr, instr_time *accum_time)
{
instr_time endtime;
/* update the time only if the timer was requested */
if (instr->need_timer)
{
if (INSTR_TIME_IS_ZERO(instr->starttime))
elog(ERROR, "InstrStop called without start");
INSTR_TIME_SET_CURRENT_FAST(endtime);
INSTR_TIME_ACCUM_DIFF(*accum_time, endtime, instr->starttime);
INSTR_TIME_SET_ZERO(instr->starttime);
}
/* Add delta of buffer usage since InstrStart to the totals */
if (instr->need_bufusage)
BufferUsageAccumDiff(&instr->bufusage,
&pgBufferUsage, &instr->bufusage_start);
if (instr->need_walusage)
WalUsageAccumDiff(&instr->walusage,
&pgWalUsage, &instr->walusage_start);
}
void
InstrStop(Instrumentation *instr)
{
InstrStopCommon(instr, &instr->total);
}
/* Node instrumentation handling */
/* Allocate new node instrumentation structure */
NodeInstrumentation *
InstrAllocNode(int instrument_options, bool async_mode)
{
NodeInstrumentation *instr = palloc_object(NodeInstrumentation);
InstrInitNode(instr, instrument_options, async_mode);
return instr;
}
/* Initialize a pre-allocated instrumentation structure. */
void
InstrInitNode(NodeInstrumentation *instr, int instrument_options, bool async_mode)
{
memset(instr, 0, sizeof(NodeInstrumentation));
InstrInitOptions(&instr->instr, instrument_options);
instr->async_mode = async_mode;
}
/* Entry to a plan node */
void
InstrStartNode(NodeInstrumentation *instr)
{
InstrStart(&instr->instr);
}
/* Exit from a plan node */
void
InstrStopNode(NodeInstrumentation *instr, double nTuples)
{
double save_tuplecount = instr->tuplecount;
/* count the returned tuples */
instr->tuplecount += nTuples;
/*
* Note that in contrast to InstrStop() the time is accumulated into
* NodeInstrumentation->counter, with total only getting updated in
* InstrEndLoop. We need the separate counter variable because we need to
* calculate start-up time for the first tuple in each cycle, and then
* accumulate it together.
*/
InstrStopCommon(&instr->instr, &instr->counter);
/* Is this the first tuple of this cycle? */
if (!instr->running)
{
instr->running = true;
instr->firsttuple = instr->counter;
}
else
{
/*
* In async mode, if the plan node hadn't emitted any tuples before,
* this might be the first tuple
*/
if (instr->async_mode && save_tuplecount < 1.0)
instr->firsttuple = instr->counter;
}
}
/* Update tuple count */
void
InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples)
{
/* count the returned tuples */
instr->tuplecount += nTuples;
}
/* Finish a run cycle for a plan node */
void
InstrEndLoop(NodeInstrumentation *instr)
{
/* Skip if nothing has happened, or already shut down */
if (!instr->running)
return;
if (!INSTR_TIME_IS_ZERO(instr->instr.starttime))
elog(ERROR, "InstrEndLoop called on running node");
/* Accumulate per-cycle statistics into totals */
INSTR_TIME_ADD(instr->startup, instr->firsttuple);
INSTR_TIME_ADD(instr->instr.total, instr->counter);
instr->ntuples += instr->tuplecount;
instr->nloops += 1;
/* Reset for next cycle (if any) */
instr->running = false;
INSTR_TIME_SET_ZERO(instr->instr.starttime);
INSTR_TIME_SET_ZERO(instr->counter);
INSTR_TIME_SET_ZERO(instr->firsttuple);
instr->tuplecount = 0;
}
/*
* Aggregate instrumentation from parallel workers. Must be called after
* InstrEndLoop.
*/
void
InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add)
{
Assert(!add->running);
INSTR_TIME_ADD(dst->startup, add->startup);
INSTR_TIME_ADD(dst->instr.total, add->instr.total);
dst->ntuples += add->ntuples;
dst->ntuples2 += add->ntuples2;
dst->nloops += add->nloops;
dst->nfiltered1 += add->nfiltered1;
dst->nfiltered2 += add->nfiltered2;
if (dst->instr.need_bufusage)
BufferUsageAdd(&dst->instr.bufusage, &add->instr.bufusage);
if (dst->instr.need_walusage)
WalUsageAdd(&dst->instr.walusage, &add->instr.walusage);
}
/* Trigger instrumentation handling */
TriggerInstrumentation *
InstrAllocTrigger(int n, int instrument_options)
{
TriggerInstrumentation *tginstr = palloc0_array(TriggerInstrumentation, n);
int i;
for (i = 0; i < n; i++)
InstrInitOptions(&tginstr[i].instr, instrument_options);
return tginstr;
}
void
InstrStartTrigger(TriggerInstrumentation *tginstr)
{
InstrStart(&tginstr->instr);
}
void
InstrStopTrigger(TriggerInstrumentation *tginstr, int64 firings)
{
InstrStop(&tginstr->instr);
tginstr->firings += firings;
}
/* note current values during parallel executor startup */
void
InstrStartParallelQuery(void)
{
save_pgBufferUsage = pgBufferUsage;
save_pgWalUsage = pgWalUsage;
}
/* report usage after parallel executor shutdown */
void
InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
{
memset(bufusage, 0, sizeof(BufferUsage));
BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage);
memset(walusage, 0, sizeof(WalUsage));
WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage);
}
/* accumulate work done by workers in leader's stats */
void
InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
{
BufferUsageAdd(&pgBufferUsage, bufusage);
WalUsageAdd(&pgWalUsage, walusage);
}
/* dst += add */
static void
BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
{
dst->shared_blks_hit += add->shared_blks_hit;
dst->shared_blks_read += add->shared_blks_read;
dst->shared_blks_dirtied += add->shared_blks_dirtied;
dst->shared_blks_written += add->shared_blks_written;
dst->local_blks_hit += add->local_blks_hit;
dst->local_blks_read += add->local_blks_read;
dst->local_blks_dirtied += add->local_blks_dirtied;
dst->local_blks_written += add->local_blks_written;
dst->temp_blks_read += add->temp_blks_read;
dst->temp_blks_written += add->temp_blks_written;
INSTR_TIME_ADD(dst->shared_blk_read_time, add->shared_blk_read_time);
INSTR_TIME_ADD(dst->shared_blk_write_time, add->shared_blk_write_time);
INSTR_TIME_ADD(dst->local_blk_read_time, add->local_blk_read_time);
INSTR_TIME_ADD(dst->local_blk_write_time, add->local_blk_write_time);
INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time);
INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time);
}
/* dst += add - sub */
void
BufferUsageAccumDiff(BufferUsage *dst,
const BufferUsage *add,
const BufferUsage *sub)
{
dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit;
dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read;
dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied;
dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written;
dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit;
dst->local_blks_read += add->local_blks_read - sub->local_blks_read;
dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied;
dst->local_blks_written += add->local_blks_written - sub->local_blks_written;
dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read;
dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written;
INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time,
add->shared_blk_read_time, sub->shared_blk_read_time);
INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time,
add->shared_blk_write_time, sub->shared_blk_write_time);
INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time,
add->local_blk_read_time, sub->local_blk_read_time);
INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time,
add->local_blk_write_time, sub->local_blk_write_time);
INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time,
add->temp_blk_read_time, sub->temp_blk_read_time);
INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time,
add->temp_blk_write_time, sub->temp_blk_write_time);
}
/* helper functions for WAL usage accumulation */
static void
WalUsageAdd(WalUsage *dst, WalUsage *add)
{
dst->wal_bytes += add->wal_bytes;
dst->wal_records += add->wal_records;
dst->wal_fpi += add->wal_fpi;
dst->wal_fpi_bytes += add->wal_fpi_bytes;
dst->wal_buffers_full += add->wal_buffers_full;
}
void
WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub)
{
dst->wal_bytes += add->wal_bytes - sub->wal_bytes;
dst->wal_records += add->wal_records - sub->wal_records;
dst->wal_fpi += add->wal_fpi - sub->wal_fpi;
dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes;
dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full;
}
/* GUC hooks for timing_clock_source */
bool
check_timing_clock_source(int *newval, void **extra, GucSource source)
{
/*
* Do nothing if timing is not initialized. This is only expected on child
* processes in EXEC_BACKEND builds, as GUC hooks can be called during
* InitializeGUCOptions() before InitProcessGlobals() has had a chance to
* run pg_initialize_timing(). Instead, TSC will be initialized via
* restore_backend_variables.
*/
#ifdef EXEC_BACKEND
if (!timing_initialized)
return true;
#else
Assert(timing_initialized);
#endif
#if PG_INSTR_TSC_CLOCK
pg_initialize_timing_tsc();
if (*newval == TIMING_CLOCK_SOURCE_TSC && timing_tsc_frequency_khz <= 0)
{
GUC_check_errdetail("TSC is not supported as timing clock source");
return false;
}
#endif
return true;
}
void
assign_timing_clock_source(int newval, void *extra)
{
#ifdef EXEC_BACKEND
if (!timing_initialized)
return;
#else
Assert(timing_initialized);
#endif
/*
* Ignore the return code since the check hook already verified TSC is
* usable if it's explicitly requested.
*/
pg_set_timing_clock_source(newval);
}
const char *
show_timing_clock_source(void)
{
switch (timing_clock_source)
{
case TIMING_CLOCK_SOURCE_AUTO:
#if PG_INSTR_TSC_CLOCK
if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC)
return "auto (tsc)";
#endif
return "auto (system)";
case TIMING_CLOCK_SOURCE_SYSTEM:
return "system";
#if PG_INSTR_TSC_CLOCK
case TIMING_CLOCK_SOURCE_TSC:
return "tsc";
#endif
}
/* unreachable */
return "?";
}