mirror of
https://github.com/postgres/postgres.git
synced 2026-05-16 19:39:34 -04:00
This allows the direct use of the Time-Stamp Counter (TSC) value retrieved
from the CPU using RDTSC/RDTSCP instructions, instead of APIs like
clock_gettime() on POSIX systems.
This reduces the overhead of EXPLAIN with ANALYZE and TIMING ON. Tests showed
that the overhead on top of actual runtime when instrumenting queries moving
lots of rows through the plan can be reduced from 2x as slow to 1.2x as slow
compared to the actual runtime. More complex workloads such as TPCH queries
have also shown ~20% gains when instrumented compared to before.
To control use of the TSC, the new "timing_clock_source" GUC is introduced,
whose default ("auto") automatically uses the TSC when reliable, for example
when running on modern Intel CPUs, or when running on Linux and the system
clocksource is reported as "tsc". The use of the operating system clock source
can be enforced by setting "system", or on x86-64 architectures the use of TSC
can be enforced by explicitly setting "tsc".
In order to use the TSC the frequency is first determined by use of CPUID, and
if not available, by running a short calibration loop at program start,
falling back to the system clock source if TSC values are not stable.
Note that we split TSC usage into the RDTSC CPU instruction which does not
wait for out-of-order execution (faster, less precise) and the RDTSCP
instruction, which waits for outstanding instructions to retire. RDTSCP is
deemed to have little benefit in the typical InstrStartNode() /
InstrStopNode() use case of EXPLAIN, and can be up to twice as slow. To
separate these use cases, the new macro INSTR_TIME_SET_CURRENT_FAST() is
introduced, which uses RDTSC.
The original macro INSTR_TIME_SET_CURRENT() uses RDTSCP and is supposed to be
used when precision is more important than performance. When the system timing
clock source is used both of these macros instead utilize the system
APIs (clock_gettime / QueryPerformanceCounter) like before.
Additional users of interval timing, such as track_io_timing and
track_wal_io_timing could also benefit from being converted to use
INSTR_TIME_SET_CURRENT_FAST() but are left for future changes.
Author: Lukas Fittl <lukas@fittl.com>
Author: Andres Freund <andres@anarazel.de>
Author: David Geier <geidav.pg@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: David Geier <geidav.pg@gmail.com>
Reviewed-by: Lukas Fittl <lukas@fittl.com>
Reviewed-by: Zsolt Parragi <zsolt.parragi@percona.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com> (in an earlier version)
Reviewed-by: Maciek Sakrejda <m.sakrejda@gmail.com> (in an earlier version)
Reviewed-by: Robert Haas <robertmhaas@gmail.com> (in an earlier version)
Reviewed-by: Jakub Wartak <jakub.wartak@enterprisedb.com> (in an earlier version)
Discussion: https://postgr.es/m/20200612232810.f46nbqkdhbutzqdg@alap3.anarazel.de
421 lines
12 KiB
C
421 lines
12 KiB
C
/*-------------------------------------------------------------------------
 *
 * instrument.c
 *	 functions for instrumentation of plan execution
 *
 *
 * Copyright (c) 2001-2026, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/executor/instrument.c
 *
 *-------------------------------------------------------------------------
 */
|
|
#include "postgres.h"

#include <unistd.h>

#include "executor/instrument.h"
#include "portability/instr_time.h"
#include "utils/guc_hooks.h"

/* Backend-wide buffer usage totals, updated by buffer-access code */
BufferUsage pgBufferUsage;
/* snapshot of pgBufferUsage taken in InstrStartParallelQuery() */
static BufferUsage save_pgBufferUsage;
/* Backend-wide WAL usage totals, updated by WAL-emitting code */
WalUsage	pgWalUsage;
/* snapshot of pgWalUsage taken in InstrStartParallelQuery() */
static WalUsage save_pgWalUsage;

static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
static void WalUsageAdd(WalUsage *dst, WalUsage *add);
|
|
|
|
|
|
/* General purpose instrumentation handling */
|
|
Instrumentation *
|
|
InstrAlloc(int instrument_options)
|
|
{
|
|
Instrumentation *instr = palloc0_object(Instrumentation);
|
|
|
|
InstrInitOptions(instr, instrument_options);
|
|
return instr;
|
|
}
|
|
|
|
void
|
|
InstrInitOptions(Instrumentation *instr, int instrument_options)
|
|
{
|
|
instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0;
|
|
instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0;
|
|
instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
|
|
}
|
|
|
|
void
|
|
InstrStart(Instrumentation *instr)
|
|
{
|
|
if (instr->need_timer)
|
|
{
|
|
if (!INSTR_TIME_IS_ZERO(instr->starttime))
|
|
elog(ERROR, "InstrStart called twice in a row");
|
|
else
|
|
INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
|
|
}
|
|
|
|
/* save buffer usage totals at start, if needed */
|
|
if (instr->need_bufusage)
|
|
instr->bufusage_start = pgBufferUsage;
|
|
|
|
if (instr->need_walusage)
|
|
instr->walusage_start = pgWalUsage;
|
|
}
|
|
|
|
/*
|
|
* Helper for InstrStop() and InstrStopNode(), to avoid code duplication
|
|
* despite slightly different needs about how time is accumulated.
|
|
*/
|
|
static inline void
|
|
InstrStopCommon(Instrumentation *instr, instr_time *accum_time)
|
|
{
|
|
instr_time endtime;
|
|
|
|
/* update the time only if the timer was requested */
|
|
if (instr->need_timer)
|
|
{
|
|
if (INSTR_TIME_IS_ZERO(instr->starttime))
|
|
elog(ERROR, "InstrStop called without start");
|
|
|
|
INSTR_TIME_SET_CURRENT_FAST(endtime);
|
|
INSTR_TIME_ACCUM_DIFF(*accum_time, endtime, instr->starttime);
|
|
|
|
INSTR_TIME_SET_ZERO(instr->starttime);
|
|
}
|
|
|
|
/* Add delta of buffer usage since InstrStart to the totals */
|
|
if (instr->need_bufusage)
|
|
BufferUsageAccumDiff(&instr->bufusage,
|
|
&pgBufferUsage, &instr->bufusage_start);
|
|
|
|
if (instr->need_walusage)
|
|
WalUsageAccumDiff(&instr->walusage,
|
|
&pgWalUsage, &instr->walusage_start);
|
|
}
|
|
|
|
/*
 * End an instrumented interval started by InstrStart(), accumulating the
 * elapsed time (and buffer/WAL deltas) directly into instr->total.
 */
void
InstrStop(Instrumentation *instr)
{
	InstrStopCommon(instr, &instr->total);
}
|
|
|
|
/* Node instrumentation handling */
|
|
|
|
/* Allocate new node instrumentation structure */
|
|
NodeInstrumentation *
|
|
InstrAllocNode(int instrument_options, bool async_mode)
|
|
{
|
|
NodeInstrumentation *instr = palloc_object(NodeInstrumentation);
|
|
|
|
InstrInitNode(instr, instrument_options, async_mode);
|
|
|
|
return instr;
|
|
}
|
|
|
|
/* Initialize a pre-allocated instrumentation structure. */
|
|
void
|
|
InstrInitNode(NodeInstrumentation *instr, int instrument_options, bool async_mode)
|
|
{
|
|
memset(instr, 0, sizeof(NodeInstrumentation));
|
|
InstrInitOptions(&instr->instr, instrument_options);
|
|
instr->async_mode = async_mode;
|
|
}
|
|
|
|
/* Entry to a plan node */
void
InstrStartNode(NodeInstrumentation *instr)
{
	/* delegate timing/usage-snapshot work to the general-purpose layer */
	InstrStart(&instr->instr);
}
|
|
|
|
/* Exit from a plan node */
|
|
void
|
|
InstrStopNode(NodeInstrumentation *instr, double nTuples)
|
|
{
|
|
double save_tuplecount = instr->tuplecount;
|
|
|
|
/* count the returned tuples */
|
|
instr->tuplecount += nTuples;
|
|
|
|
/*
|
|
* Note that in contrast to InstrStop() the time is accumulated into
|
|
* NodeInstrumentation->counter, with total only getting updated in
|
|
* InstrEndLoop. We need the separate counter variable because we need to
|
|
* calculate start-up time for the first tuple in each cycle, and then
|
|
* accumulate it together.
|
|
*/
|
|
InstrStopCommon(&instr->instr, &instr->counter);
|
|
|
|
/* Is this the first tuple of this cycle? */
|
|
if (!instr->running)
|
|
{
|
|
instr->running = true;
|
|
instr->firsttuple = instr->counter;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* In async mode, if the plan node hadn't emitted any tuples before,
|
|
* this might be the first tuple
|
|
*/
|
|
if (instr->async_mode && save_tuplecount < 1.0)
|
|
instr->firsttuple = instr->counter;
|
|
}
|
|
}
|
|
|
|
/*
 * Update tuple count without touching timing state; used when tuples are
 * accounted for outside the normal InstrStart/InstrStopNode cycle.
 */
void
InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples)
{
	/* count the returned tuples */
	instr->tuplecount += nTuples;
}
|
|
|
|
/* Finish a run cycle for a plan node */
|
|
void
|
|
InstrEndLoop(NodeInstrumentation *instr)
|
|
{
|
|
/* Skip if nothing has happened, or already shut down */
|
|
if (!instr->running)
|
|
return;
|
|
|
|
if (!INSTR_TIME_IS_ZERO(instr->instr.starttime))
|
|
elog(ERROR, "InstrEndLoop called on running node");
|
|
|
|
/* Accumulate per-cycle statistics into totals */
|
|
INSTR_TIME_ADD(instr->startup, instr->firsttuple);
|
|
INSTR_TIME_ADD(instr->instr.total, instr->counter);
|
|
instr->ntuples += instr->tuplecount;
|
|
instr->nloops += 1;
|
|
|
|
/* Reset for next cycle (if any) */
|
|
instr->running = false;
|
|
INSTR_TIME_SET_ZERO(instr->instr.starttime);
|
|
INSTR_TIME_SET_ZERO(instr->counter);
|
|
INSTR_TIME_SET_ZERO(instr->firsttuple);
|
|
instr->tuplecount = 0;
|
|
}
|
|
|
|
/*
|
|
* Aggregate instrumentation from parallel workers. Must be called after
|
|
* InstrEndLoop.
|
|
*/
|
|
void
|
|
InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add)
|
|
{
|
|
Assert(!add->running);
|
|
|
|
INSTR_TIME_ADD(dst->startup, add->startup);
|
|
INSTR_TIME_ADD(dst->instr.total, add->instr.total);
|
|
dst->ntuples += add->ntuples;
|
|
dst->ntuples2 += add->ntuples2;
|
|
dst->nloops += add->nloops;
|
|
dst->nfiltered1 += add->nfiltered1;
|
|
dst->nfiltered2 += add->nfiltered2;
|
|
|
|
if (dst->instr.need_bufusage)
|
|
BufferUsageAdd(&dst->instr.bufusage, &add->instr.bufusage);
|
|
|
|
if (dst->instr.need_walusage)
|
|
WalUsageAdd(&dst->instr.walusage, &add->instr.walusage);
|
|
}
|
|
|
|
/* Trigger instrumentation handling */
|
|
TriggerInstrumentation *
|
|
InstrAllocTrigger(int n, int instrument_options)
|
|
{
|
|
TriggerInstrumentation *tginstr = palloc0_array(TriggerInstrumentation, n);
|
|
int i;
|
|
|
|
for (i = 0; i < n; i++)
|
|
InstrInitOptions(&tginstr[i].instr, instrument_options);
|
|
|
|
return tginstr;
|
|
}
|
|
|
|
/* Start timing/usage collection for one trigger invocation */
void
InstrStartTrigger(TriggerInstrumentation *tginstr)
{
	InstrStart(&tginstr->instr);
}
|
|
|
|
/*
 * Stop timing/usage collection for a trigger invocation and add the number
 * of times the trigger fired during it.
 */
void
InstrStopTrigger(TriggerInstrumentation *tginstr, int64 firings)
{
	InstrStop(&tginstr->instr);
	tginstr->firings += firings;
}
|
|
|
|
/* note current values during parallel executor startup */
void
InstrStartParallelQuery(void)
{
	/* snapshots consumed later by InstrEndParallelQuery() */
	save_pgBufferUsage = pgBufferUsage;
	save_pgWalUsage = pgWalUsage;
}
|
|
|
|
/* report usage after parallel executor shutdown */
|
|
void
|
|
InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
|
|
{
|
|
memset(bufusage, 0, sizeof(BufferUsage));
|
|
BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage);
|
|
memset(walusage, 0, sizeof(WalUsage));
|
|
WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage);
|
|
}
|
|
|
|
/* accumulate work done by workers in leader's stats */
void
InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
{
	/* fold worker totals into this backend's global counters */
	BufferUsageAdd(&pgBufferUsage, bufusage);
	WalUsageAdd(&pgWalUsage, walusage);
}
|
|
|
|
/* dst += add */
|
|
static void
|
|
BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
|
|
{
|
|
dst->shared_blks_hit += add->shared_blks_hit;
|
|
dst->shared_blks_read += add->shared_blks_read;
|
|
dst->shared_blks_dirtied += add->shared_blks_dirtied;
|
|
dst->shared_blks_written += add->shared_blks_written;
|
|
dst->local_blks_hit += add->local_blks_hit;
|
|
dst->local_blks_read += add->local_blks_read;
|
|
dst->local_blks_dirtied += add->local_blks_dirtied;
|
|
dst->local_blks_written += add->local_blks_written;
|
|
dst->temp_blks_read += add->temp_blks_read;
|
|
dst->temp_blks_written += add->temp_blks_written;
|
|
INSTR_TIME_ADD(dst->shared_blk_read_time, add->shared_blk_read_time);
|
|
INSTR_TIME_ADD(dst->shared_blk_write_time, add->shared_blk_write_time);
|
|
INSTR_TIME_ADD(dst->local_blk_read_time, add->local_blk_read_time);
|
|
INSTR_TIME_ADD(dst->local_blk_write_time, add->local_blk_write_time);
|
|
INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time);
|
|
INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time);
|
|
}
|
|
|
|
/*
 * dst += add - sub
 *
 * Accumulate the difference between two BufferUsage snapshots into dst.
 * Typically "add" is the current global counters and "sub" is the snapshot
 * taken at interval start, so the delta charged is the activity in between.
 */
void
BufferUsageAccumDiff(BufferUsage *dst,
					 const BufferUsage *add,
					 const BufferUsage *sub)
{
	/* block counts */
	dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit;
	dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read;
	dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied;
	dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written;
	dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit;
	dst->local_blks_read += add->local_blks_read - sub->local_blks_read;
	dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied;
	dst->local_blks_written += add->local_blks_written - sub->local_blks_written;
	dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read;
	dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written;

	/* I/O timing accumulators */
	INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time,
						  add->shared_blk_read_time, sub->shared_blk_read_time);
	INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time,
						  add->shared_blk_write_time, sub->shared_blk_write_time);
	INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time,
						  add->local_blk_read_time, sub->local_blk_read_time);
	INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time,
						  add->local_blk_write_time, sub->local_blk_write_time);
	INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time,
						  add->temp_blk_read_time, sub->temp_blk_read_time);
	INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time,
						  add->temp_blk_write_time, sub->temp_blk_write_time);
}
|
|
|
|
/* helper functions for WAL usage accumulation */

/*
 * dst += add
 *
 * NOTE(review): "add" could be const-qualified to match BufferUsageAdd(),
 * but that would also require updating the forward declaration above.
 */
static void
WalUsageAdd(WalUsage *dst, WalUsage *add)
{
	dst->wal_bytes += add->wal_bytes;
	dst->wal_records += add->wal_records;
	dst->wal_fpi += add->wal_fpi;
	dst->wal_fpi_bytes += add->wal_fpi_bytes;
	dst->wal_buffers_full += add->wal_buffers_full;
}
|
|
|
|
void
|
|
WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub)
|
|
{
|
|
dst->wal_bytes += add->wal_bytes - sub->wal_bytes;
|
|
dst->wal_records += add->wal_records - sub->wal_records;
|
|
dst->wal_fpi += add->wal_fpi - sub->wal_fpi;
|
|
dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes;
|
|
dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full;
|
|
}
|
|
|
|
/* GUC hooks for timing_clock_source */

/*
 * GUC check hook: verify the requested clock source is usable.
 *
 * Rejects an explicit "tsc" setting when TSC frequency detection failed;
 * "auto" and "system" are always accepted.
 */
bool
check_timing_clock_source(int *newval, void **extra, GucSource source)
{
	/*
	 * Do nothing if timing is not initialized. This is only expected on child
	 * processes in EXEC_BACKEND builds, as GUC hooks can be called during
	 * InitializeGUCOptions() before InitProcessGlobals() has had a chance to
	 * run pg_initialize_timing(). Instead, TSC will be initialized via
	 * restore_backend_variables.
	 */
#ifdef EXEC_BACKEND
	if (!timing_initialized)
		return true;
#else
	Assert(timing_initialized);
#endif

#if PG_INSTR_TSC_CLOCK
	/* ensure TSC calibration has run before consulting its frequency */
	pg_initialize_timing_tsc();

	/* an explicit "tsc" request needs a usable TSC frequency */
	if (*newval == TIMING_CLOCK_SOURCE_TSC && timing_tsc_frequency_khz <= 0)
	{
		GUC_check_errdetail("TSC is not supported as timing clock source");
		return false;
	}
#endif

	return true;
}
|
|
|
|
/*
 * GUC assign hook: activate the chosen timing clock source.
 */
void
assign_timing_clock_source(int newval, void *extra)
{
	/* see check_timing_clock_source() for why this can be uninitialized */
#ifdef EXEC_BACKEND
	if (!timing_initialized)
		return;
#else
	Assert(timing_initialized);
#endif

	/*
	 * Ignore the return code since the check hook already verified TSC is
	 * usable if it's explicitly requested.
	 */
	pg_set_timing_clock_source(newval);
}
|
|
|
|
/*
 * GUC show hook: display the configured clock source; for "auto", also
 * report which source was actually selected at runtime.
 */
const char *
show_timing_clock_source(void)
{
	switch (timing_clock_source)
	{
		case TIMING_CLOCK_SOURCE_AUTO:
#if PG_INSTR_TSC_CLOCK
			/* reveal what "auto" resolved to on this system */
			if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC)
				return "auto (tsc)";
#endif
			return "auto (system)";
		case TIMING_CLOCK_SOURCE_SYSTEM:
			return "system";
#if PG_INSTR_TSC_CLOCK
		case TIMING_CLOCK_SOURCE_TSC:
			return "tsc";
#endif
	}

	/* unreachable */
	return "?";
}
|