mirror of
https://github.com/postgres/postgres.git
synced 2026-02-10 06:13:34 -05:00
Performing AIO using io_uring can be considerably faster than io_method=worker, particularly when lots of small IOs are issued, as a) the context-switch overhead for worker based AIO becomes more significant and b) the number of IO workers can become limiting. io_uring, however, is Linux specific and requires an additional compile-time dependency (liburing). This implementation is fairly simple and there are substantial optimization opportunities. The description of the existing AIO_IO_COMPLETION wait event is updated to make the difference between it and the new AIO_IO_URING_EXECUTION clearer. Reviewed-by: Noah Misch <noah@leadboat.com> Reviewed-by: Jakub Wartak <jakub.wartak@enterprisedb.com> Discussion: https://postgr.es/m/uvrtrknj4kdytuboidbhwclo4gxhswwcpgadptsjvjqcluzmah%40brqs62irg4dt Discussion: https://postgr.es/m/20210223100344.llw5an2aklengrmn@alap3.anarazel.de Discussion: https://postgr.es/m/stj36ea6yyhoxtqkhpieia2z4krnam7qyetc57rfezgk4zgapf@gcnactj4z56m
358 lines
10 KiB
C
358 lines
10 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* aio.h
|
|
* Main AIO interface
|
|
*
|
|
* This is the header to include when actually issuing AIO. When just
|
|
* declaring functions involving an AIO related type, it might suffice to
|
|
* include aio_types.h. Initialization related functions are in the dedicated
|
|
* aio_init.h.
|
|
*
|
|
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
* src/include/storage/aio.h
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#ifndef AIO_H
|
|
#define AIO_H
|
|
|
|
#include "storage/aio_types.h"
|
|
#include "storage/procnumber.h"
|
|
|
|
|
|
/* io_uring is incompatible with EXEC_BACKEND */
#if defined(USE_LIBURING) && !defined(EXEC_BACKEND)
#define IOMETHOD_IO_URING_ENABLED
#endif


/*
 * Enum for io_method GUC.
 *
 * IOMETHOD_IO_URING is only part of the enum when IOMETHOD_IO_URING_ENABLED
 * is defined, i.e. when USE_LIBURING is defined and EXEC_BACKEND is not.
 */
typedef enum IoMethod
{
	IOMETHOD_SYNC = 0,
	IOMETHOD_WORKER,
#ifdef IOMETHOD_IO_URING_ENABLED
	IOMETHOD_IO_URING,
#endif
} IoMethod;

/* We'll default to worker based execution. */
#define DEFAULT_IO_METHOD IOMETHOD_WORKER
|
|
|
|
|
|
/*
 * Flags for an IO that can be set with pgaio_io_set_flag().
 *
 * Each flag occupies a distinct bit, so flags can be combined.
 */
typedef enum PgAioHandleFlags
{
	/*
	 * The IO references backend local memory.
	 *
	 * This needs to be set on an IO whenever the IO references process-local
	 * memory. Some IO methods do not support executing IO that references
	 * process local memory and thus need to fall back to executing IO
	 * synchronously for IOs with this flag set.
	 *
	 * Required for correctness.
	 */
	PGAIO_HF_REFERENCES_LOCAL = 1 << 1,

	/*
	 * Hint that IO will be executed synchronously.
	 *
	 * This can make it a bit cheaper to execute synchronous IO via the AIO
	 * interface, to avoid needing an AIO and non-AIO version of code.
	 *
	 * Advantageous to set, if applicable, but not required for correctness.
	 */
	PGAIO_HF_SYNCHRONOUS = 1 << 0,

	/*
	 * IO is using buffered IO, used to control heuristic in some IO methods.
	 *
	 * Advantageous to set, if applicable, but not required for correctness.
	 */
	PGAIO_HF_BUFFERED = 1 << 2,
} PgAioHandleFlags;
|
|
|
|
/*
 * The IO operations supported by the AIO subsystem.
 *
 * This could be in aio_internal.h, as it is not publicly referenced, but
 * PgAioOpData currently *does* need to be public, therefore keeping this
 * public seems to make sense.
 */
typedef enum PgAioOp
{
	/* intentionally the zero value, to help catch zeroed memory etc */
	PGAIO_OP_INVALID = 0,

	PGAIO_OP_READV,
	PGAIO_OP_WRITEV,

	/*
	 * In the near term we'll need at least:
	 * - fsync / fdatasync
	 * - flush_range
	 *
	 * Eventually we'll additionally want at least:
	 * - send
	 * - recv
	 * - accept
	 */
} PgAioOp;

/* Number of PgAioOp values; must track the last enumerator above. */
#define PGAIO_OP_COUNT	(PGAIO_OP_WRITEV + 1)
|
|
|
|
|
|
/*
 * On what is IO being performed?
 *
 * PgAioTargetID specific behaviour should be implemented in
 * aio_target.c.
 */
typedef enum PgAioTargetID
{
	/* intentionally the zero value, to help catch zeroed memory etc */
	PGAIO_TID_INVALID = 0,
} PgAioTargetID;

/* Number of PgAioTargetID values; must track the last enumerator above. */
#define PGAIO_TID_COUNT (PGAIO_TID_INVALID + 1)
|
|
|
|
|
|
/*
|
|
* Data necessary for support IO operations (see PgAioOp).
|
|
*
|
|
* NB: Note that the FDs in here may *not* be relied upon for re-issuing
|
|
* requests (e.g. for partial reads/writes or in an IO worker) - the FD might
|
|
* be from another process, or closed since. That's not a problem for staged
|
|
* IOs, as all staged IOs are submitted when closing an FD.
|
|
*/
|
|
typedef union
|
|
{
|
|
struct
|
|
{
|
|
int fd;
|
|
uint16 iov_length;
|
|
uint64 offset;
|
|
} read;
|
|
|
|
struct
|
|
{
|
|
int fd;
|
|
uint16 iov_length;
|
|
uint64 offset;
|
|
} write;
|
|
} PgAioOpData;
|
|
|
|
|
|
/*
|
|
* Information the object that IO is executed on. Mostly callbacks that
|
|
* operate on PgAioTargetData.
|
|
*
|
|
* typedef is in aio_types.h
|
|
*/
|
|
struct PgAioTargetInfo
|
|
{
|
|
/*
|
|
* To support executing using worker processes, the file descriptor for an
|
|
* IO may need to be be reopened in a different process.
|
|
*/
|
|
void (*reopen) (PgAioHandle *ioh);
|
|
|
|
/* describe the target of the IO, used for log messages and views */
|
|
char *(*describe_identity) (const PgAioTargetData *sd);
|
|
|
|
/* name of the target, used in log messages / views */
|
|
const char *name;
|
|
};
|
|
|
|
|
|
/*
 * IDs for callbacks that can be registered on an IO.
 *
 * Callbacks are identified by an ID rather than a function pointer. There are
 * two main reasons:
 *
 * 1) Memory within PgAioHandle is precious, due to the number of PgAioHandle
 *    structs in pre-allocated shared memory.
 *
 * 2) Due to EXEC_BACKEND function pointers are not necessarily stable between
 *    different backends, therefore function pointers cannot directly be in
 *    shared memory.
 *
 * Without 2), we could fairly easily allow to add new callbacks, by filling a
 * ID->pointer mapping table on demand. In the presence of 2) that's still
 * doable, but harder, because every process has to re-register the pointers
 * so that a local ID->"backend local pointer" mapping can be maintained.
 */
typedef enum PgAioHandleCallbackID
{
	/* explicitly zero, matching the other *_INVALID values in this header */
	PGAIO_HCB_INVALID = 0,
} PgAioHandleCallbackID;
|
|
|
|
|
|
typedef void (*PgAioHandleCallbackStage) (PgAioHandle *ioh, uint8 cb_flags);
|
|
typedef PgAioResult (*PgAioHandleCallbackComplete) (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_flags);
|
|
typedef void (*PgAioHandleCallbackReport) (PgAioResult result, const PgAioTargetData *target_data, int elevel);
|
|
|
|
/* typedef is in aio_types.h */
|
|
struct PgAioHandleCallbacks
|
|
{
|
|
/*
|
|
* Prepare resources affected by the IO for execution. This could e.g.
|
|
* include moving ownership of buffer pins to the AIO subsystem.
|
|
*/
|
|
PgAioHandleCallbackStage stage;
|
|
|
|
/*
|
|
* Update the state of resources affected by the IO to reflect completion
|
|
* of the IO. This could e.g. include updating shared buffer state to
|
|
* signal the IO has finished.
|
|
*
|
|
* The _shared suffix indicates that this is executed by the backend that
|
|
* completed the IO, which may or may not be the backend that issued the
|
|
* IO. Obviously the callback thus can only modify resources in shared
|
|
* memory.
|
|
*
|
|
* The latest registered callback is called first. This allows
|
|
* higher-level code to register callbacks that can rely on callbacks
|
|
* registered by lower-level code to already have been executed.
|
|
*
|
|
* NB: This is called in a critical section. Errors can be signalled by
|
|
* the callback's return value, it's the responsibility of the IO's issuer
|
|
* to react appropriately.
|
|
*/
|
|
PgAioHandleCallbackComplete complete_shared;
|
|
|
|
/*
|
|
* Like complete_shared, except called in the issuing backend.
|
|
*
|
|
* This variant of the completion callback is useful when backend-local
|
|
* state has to be updated to reflect the IO's completion. E.g. a
|
|
* temporary buffer's BufferDesc isn't accessible in complete_shared.
|
|
*
|
|
* Local callbacks are only called after complete_shared for all
|
|
* registered callbacks has been called.
|
|
*/
|
|
PgAioHandleCallbackComplete complete_local;
|
|
|
|
/*
|
|
* Report the result of an IO operation. This is e.g. used to raise an
|
|
* error after an IO failed at the appropriate time (i.e. not when the IO
|
|
* failed, but under control of the code that issued the IO).
|
|
*/
|
|
PgAioHandleCallbackReport report;
|
|
};
|
|
|
|
|
|
|
|
/*
 * How many callbacks can be registered for one IO handle. Currently we only
 * need two, but it's not hard to imagine needing a few more.
 */
#define PGAIO_HANDLE_MAX_CALLBACKS	4
|
|
|
|
|
|
|
|
/* --------------------------------------------------------------------------------
|
|
* IO Handles
|
|
* --------------------------------------------------------------------------------
|
|
*/
|
|
|
|
/* functions in aio.c */
|
|
struct ResourceOwnerData;
|
|
extern PgAioHandle *pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret);
|
|
extern PgAioHandle *pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret);
|
|
|
|
extern void pgaio_io_release(PgAioHandle *ioh);
|
|
struct dlist_node;
|
|
extern void pgaio_io_release_resowner(struct dlist_node *ioh_node, bool on_error);
|
|
|
|
extern void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag);
|
|
|
|
extern int pgaio_io_get_id(PgAioHandle *ioh);
|
|
extern ProcNumber pgaio_io_get_owner(PgAioHandle *ioh);
|
|
|
|
extern void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow);
|
|
|
|
/* functions in aio_io.c */
|
|
struct iovec;
|
|
extern int pgaio_io_get_iovec(PgAioHandle *ioh, struct iovec **iov);
|
|
|
|
extern PgAioOp pgaio_io_get_op(PgAioHandle *ioh);
|
|
extern PgAioOpData *pgaio_io_get_op_data(PgAioHandle *ioh);
|
|
|
|
extern void pgaio_io_start_readv(PgAioHandle *ioh,
|
|
int fd, int iovcnt, uint64 offset);
|
|
extern void pgaio_io_start_writev(PgAioHandle *ioh,
|
|
int fd, int iovcnt, uint64 offset);
|
|
|
|
/* functions in aio_target.c */
|
|
extern void pgaio_io_set_target(PgAioHandle *ioh, PgAioTargetID targetid);
|
|
extern bool pgaio_io_has_target(PgAioHandle *ioh);
|
|
extern PgAioTargetData *pgaio_io_get_target_data(PgAioHandle *ioh);
|
|
extern char *pgaio_io_get_target_description(PgAioHandle *ioh);
|
|
|
|
/* functions in aio_callback.c */
|
|
extern void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id,
|
|
uint8 cb_data);
|
|
extern void pgaio_io_set_handle_data_64(PgAioHandle *ioh, uint64 *data, uint8 len);
|
|
extern void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len);
|
|
extern uint64 *pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len);
|
|
|
|
|
|
|
|
/* --------------------------------------------------------------------------------
|
|
* IO Wait References
|
|
* --------------------------------------------------------------------------------
|
|
*/
|
|
|
|
extern void pgaio_wref_clear(PgAioWaitRef *iow);
|
|
extern bool pgaio_wref_valid(PgAioWaitRef *iow);
|
|
extern int pgaio_wref_get_id(PgAioWaitRef *iow);
|
|
|
|
extern void pgaio_wref_wait(PgAioWaitRef *iow);
|
|
extern bool pgaio_wref_check_done(PgAioWaitRef *iow);
|
|
|
|
|
|
|
|
/* --------------------------------------------------------------------------------
|
|
* IO Result
|
|
* --------------------------------------------------------------------------------
|
|
*/
|
|
|
|
extern void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data,
|
|
int elevel);
|
|
|
|
|
|
|
|
/* --------------------------------------------------------------------------------
 * Actions on multiple IOs.
 * --------------------------------------------------------------------------------
 */

extern void pgaio_enter_batchmode(void);
extern void pgaio_exit_batchmode(void);
extern void pgaio_submit_staged(void);
extern bool pgaio_have_staged(void);
|
|
|
|
|
|
|
|
/* --------------------------------------------------------------------------------
 * Other
 * --------------------------------------------------------------------------------
 */

extern void pgaio_closing_fd(int fd);
|
|
|
|
|
|
|
|
/* GUCs */
|
|
extern PGDLLIMPORT int io_method;
|
|
extern PGDLLIMPORT int io_max_concurrency;
|
|
|
|
|
|
#endif /* AIO_H */
|