postgresql/src/backend/bootstrap/bootstrap.c

1146 lines
29 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* bootstrap.c
* routines to support running postgres in 'bootstrap' mode
* bootstrap mode is used to create the initial template database
*
* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
2010-09-20 16:08:53 -04:00
* src/backend/bootstrap/bootstrap.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
1999-07-15 23:14:30 -04:00
#include <unistd.h>
#include <signal.h>
#include "access/htup_details.h"
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "bootstrap/bootstrap.h"
#include "catalog/index.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
#include "libpq/pqsignal.h"
1999-07-16 01:00:38 -04:00
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "pg_getopt.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "postmaster/startup.h"
#include "postmaster/walwriter.h"
#include "replication/walreceiver.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/condition_variable.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/relmapper.h"
#include "utils/tqual.h"
Phase 2 of pgindent updates. Change pg_bsd_indent to follow upstream rules for placement of comments to the right of code, and remove pgindent hack that caused comments following #endif to not obey the general rule. Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using the published version of pg_bsd_indent, but a hacked-up version that tried to minimize the amount of movement of comments to the right of code. The situation of interest is where such a comment has to be moved to the right of its default placement at column 33 because there's code there. BSD indent has always moved right in units of tab stops in such cases --- but in the previous incarnation, indent was working in 8-space tab stops, while now it knows we use 4-space tabs. So the net result is that in about half the cases, such comments are placed one tab stop left of before. This is better all around: it leaves more room on the line for comment text, and it means that in such cases the comment uniformly starts at the next 4-space tab stop after the code, rather than sometimes one and sometimes two tabs after. Also, ensure that comments following #endif are indented the same as comments following other preprocessor commands such as #else. That inconsistency turns out to have been self-inflicted damage from a poorly-thought-through post-indent "fixup" in pgindent. This patch is much less interesting than the first round of indent changes, but also bulkier, so I thought it best to separate the effects. Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 15:18:54 -04:00
uint32 bootstrap_data_checksum_version = 0; /* No checksum */
1996-10-21 04:31:23 -04:00
#define ALLOC(t, c) \
((t *) MemoryContextAllocZero(TopMemoryContext, (unsigned)(c) * sizeof(t)))
static void CheckerModeMain(void);
static void BootstrapModeMain(void);
static void bootstrap_signals(void);
static void ShutdownAuxiliaryProcess(int code, Datum arg);
1998-08-31 23:29:17 -04:00
static Form_pg_attribute AllocateAttribute(void);
static Oid gettype(char *type);
static void cleanup(void);
/* ----------------
* global variables
* ----------------
*/
AuxProcType MyAuxProcType = NotAnAuxProcess; /* declared in miscadmin.h */
Fix management of pendingOpsTable in auxiliary processes. mdinit() was misusing IsBootstrapProcessingMode() to decide whether to create an fsync pending-operations table in the current process. This led to creating a table not only in the startup and checkpointer processes as intended, but also in the bgwriter process, not to mention other auxiliary processes such as walwriter and walreceiver. Creation of the table in the bgwriter is fatal, because it absorbs fsync requests that should have gone to the checkpointer; instead they just sit in bgwriter local memory and are never acted on. So writes performed by the bgwriter were not being fsync'd which could result in data loss after an OS crash. I think there is no live bug with respect to walwriter and walreceiver because those never perform any writes of shared buffers; but the potential is there for future breakage in those processes too. To fix, make AuxiliaryProcessMain() export the current process's AuxProcType as a global variable, and then make mdinit() test directly for the types of aux process that should have a pendingOpsTable. Having done that, we might as well also get rid of the random bool flags such as am_walreceiver that some of the aux processes had grown. (Note that we could not have fixed the bug by examining those variables in mdinit(), because it's called from BaseInit() which is run by AuxiliaryProcessMain() before entering any of the process-type-specific code.) Back-patch to 9.2, where the problem was introduced by the split-up of bgwriter and checkpointer processes. The bogus pendingOpsTable exists in walwriter and walreceiver processes in earlier branches, but absent any evidence that it causes actual problems there, I'll leave the older branches alone.
2012-07-18 15:28:10 -04:00
Relation boot_reldesc; /* current relation descriptor */
Form_pg_attribute attrtypes[MAXATTR]; /* points to attribute info */
int numattr; /* number of attributes for cur. rel */
/*
* Basic information associated with each type. This is used before
* pg_type is filled, so it has to cover the datatypes used as column types
* in the core "bootstrapped" catalogs.
*
* XXX several of these input/output functions do catalog scans
* (e.g., F_REGPROCIN scans pg_proc). this obviously creates some
* order dependencies in the catalog creation process.
*/
struct typinfo
{
char name[NAMEDATALEN];
Oid oid;
Oid elem;
int16 len;
bool byval;
char align;
char storage;
Oid collation;
Oid inproc;
Oid outproc;
};
static const struct typinfo TypInfo[] = {
{"bool", BOOLOID, 0, 1, true, 'c', 'p', InvalidOid,
2004-08-29 01:07:03 -04:00
F_BOOLIN, F_BOOLOUT},
{"bytea", BYTEAOID, 0, -1, false, 'i', 'x', InvalidOid,
2004-08-29 01:07:03 -04:00
F_BYTEAIN, F_BYTEAOUT},
{"char", CHAROID, 0, 1, true, 'c', 'p', InvalidOid,
2004-08-29 01:07:03 -04:00
F_CHARIN, F_CHAROUT},
{"int2", INT2OID, 0, 2, true, 's', 'p', InvalidOid,
2004-08-29 01:07:03 -04:00
F_INT2IN, F_INT2OUT},
{"int4", INT4OID, 0, 4, true, 'i', 'p', InvalidOid,
2004-08-29 01:07:03 -04:00
F_INT4IN, F_INT4OUT},
{"float4", FLOAT4OID, 0, 4, FLOAT4PASSBYVAL, 'i', 'p', InvalidOid,
F_FLOAT4IN, F_FLOAT4OUT},
{"name", NAMEOID, CHAROID, NAMEDATALEN, false, 'c', 'p', InvalidOid,
F_NAMEIN, F_NAMEOUT},
{"regclass", REGCLASSOID, 0, 4, true, 'i', 'p', InvalidOid,
2004-08-29 01:07:03 -04:00
F_REGCLASSIN, F_REGCLASSOUT},
{"regproc", REGPROCOID, 0, 4, true, 'i', 'p', InvalidOid,
F_REGPROCIN, F_REGPROCOUT},
{"regtype", REGTYPEOID, 0, 4, true, 'i', 'p', InvalidOid,
2004-08-29 01:07:03 -04:00
F_REGTYPEIN, F_REGTYPEOUT},
{"regrole", REGROLEOID, 0, 4, true, 'i', 'p', InvalidOid,
F_REGROLEIN, F_REGROLEOUT},
{"regnamespace", REGNAMESPACEOID, 0, 4, true, 'i', 'p', InvalidOid,
F_REGNAMESPACEIN, F_REGNAMESPACEOUT},
{"text", TEXTOID, 0, -1, false, 'i', 'x', DEFAULT_COLLATION_OID,
2004-08-29 01:07:03 -04:00
F_TEXTIN, F_TEXTOUT},
{"oid", OIDOID, 0, 4, true, 'i', 'p', InvalidOid,
2004-08-29 01:07:03 -04:00
F_OIDIN, F_OIDOUT},
{"tid", TIDOID, 0, 6, false, 's', 'p', InvalidOid,
2004-08-29 01:07:03 -04:00
F_TIDIN, F_TIDOUT},
{"xid", XIDOID, 0, 4, true, 'i', 'p', InvalidOid,
2004-08-29 01:07:03 -04:00
F_XIDIN, F_XIDOUT},
{"cid", CIDOID, 0, 4, true, 'i', 'p', InvalidOid,
2004-08-29 01:07:03 -04:00
F_CIDIN, F_CIDOUT},
{"pg_node_tree", PGNODETREEOID, 0, -1, false, 'i', 'x', DEFAULT_COLLATION_OID,
F_PG_NODE_TREE_IN, F_PG_NODE_TREE_OUT},
{"int2vector", INT2VECTOROID, INT2OID, -1, false, 'i', 'p', InvalidOid,
2004-08-29 01:07:03 -04:00
F_INT2VECTORIN, F_INT2VECTOROUT},
{"oidvector", OIDVECTOROID, OIDOID, -1, false, 'i', 'p', InvalidOid,
2004-08-29 01:07:03 -04:00
F_OIDVECTORIN, F_OIDVECTOROUT},
{"_int4", INT4ARRAYOID, INT4OID, -1, false, 'i', 'x', InvalidOid,
2004-08-29 01:07:03 -04:00
F_ARRAY_IN, F_ARRAY_OUT},
{"_text", 1009, TEXTOID, -1, false, 'i', 'x', DEFAULT_COLLATION_OID,
2004-08-29 01:07:03 -04:00
F_ARRAY_IN, F_ARRAY_OUT},
{"_oid", 1028, OIDOID, -1, false, 'i', 'x', InvalidOid,
F_ARRAY_IN, F_ARRAY_OUT},
{"_char", 1002, CHAROID, -1, false, 'i', 'x', InvalidOid,
F_ARRAY_IN, F_ARRAY_OUT},
{"_aclitem", 1034, ACLITEMOID, -1, false, 'i', 'x', InvalidOid,
2004-08-29 01:07:03 -04:00
F_ARRAY_IN, F_ARRAY_OUT}
};
2004-08-29 01:07:03 -04:00
static const int n_types = sizeof(TypInfo) / sizeof(struct typinfo);
struct typmap
{ /* a hack */
Oid am_oid;
1998-08-31 23:29:17 -04:00
FormData_pg_type am_typ;
};
static struct typmap **Typ = NULL;
static struct typmap *Ap = NULL;
static Datum values[MAXATTR]; /* current row's attribute values */
static bool Nulls[MAXATTR];
Phase 2 of pgindent updates. Change pg_bsd_indent to follow upstream rules for placement of comments to the right of code, and remove pgindent hack that caused comments following #endif to not obey the general rule. Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using the published version of pg_bsd_indent, but a hacked-up version that tried to minimize the amount of movement of comments to the right of code. The situation of interest is where such a comment has to be moved to the right of its default placement at column 33 because there's code there. BSD indent has always moved right in units of tab stops in such cases --- but in the previous incarnation, indent was working in 8-space tab stops, while now it knows we use 4-space tabs. So the net result is that in about half the cases, such comments are placed one tab stop left of before. This is better all around: it leaves more room on the line for comment text, and it means that in such cases the comment uniformly starts at the next 4-space tab stop after the code, rather than sometimes one and sometimes two tabs after. Also, ensure that comments following #endif are indented the same as comments following other preprocessor commands such as #else. That inconsistency turns out to have been self-inflicted damage from a poorly-thought-through post-indent "fixup" in pgindent. This patch is much less interesting than the first round of indent changes, but also bulkier, so I thought it best to separate the effects. Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 15:18:54 -04:00
static MemoryContext nogc = NULL; /* special no-gc mem context */
/*
* At bootstrap time, we first declare all the indices to be built, and
* then build them. The IndexList structure stores enough information
* to allow us to build the indices after they've been declared.
*/
typedef struct _IndexList
{
Oid il_heap;
Oid il_ind;
IndexInfo *il_info;
struct _IndexList *il_next;
} IndexList;
static IndexList *ILHead = NULL;
/*
* AuxiliaryProcessMain
*
* The main entry point for auxiliary processes, such as the bgwriter,
* walwriter, walreceiver, bootstrapper and the shared memory checker code.
*
* This code is here just because of historical reasons.
*/
void
AuxiliaryProcessMain(int argc, char *argv[])
{
char *progname = argv[0];
int flag;
char *userDoption = NULL;
/*
* Initialize process environment (already done if under postmaster, but
* not if standalone).
*/
if (!IsUnderPostmaster)
InitStandaloneProcess(argv[0]);
/*
* process command arguments
*/
/* Set defaults, to be overridden by explicit options below */
if (!IsUnderPostmaster)
InitializeGUCOptions();
/* Ignore the initial --boot argument, if present */
if (argc > 1 && strcmp(argv[1], "--boot") == 0)
{
argv++;
argc--;
}
Fix management of pendingOpsTable in auxiliary processes. mdinit() was misusing IsBootstrapProcessingMode() to decide whether to create an fsync pending-operations table in the current process. This led to creating a table not only in the startup and checkpointer processes as intended, but also in the bgwriter process, not to mention other auxiliary processes such as walwriter and walreceiver. Creation of the table in the bgwriter is fatal, because it absorbs fsync requests that should have gone to the checkpointer; instead they just sit in bgwriter local memory and are never acted on. So writes performed by the bgwriter were not being fsync'd which could result in data loss after an OS crash. I think there is no live bug with respect to walwriter and walreceiver because those never perform any writes of shared buffers; but the potential is there for future breakage in those processes too. To fix, make AuxiliaryProcessMain() export the current process's AuxProcType as a global variable, and then make mdinit() test directly for the types of aux process that should have a pendingOpsTable. Having done that, we might as well also get rid of the random bool flags such as am_walreceiver that some of the aux processes had grown. (Note that we could not have fixed the bug by examining those variables in mdinit(), because it's called from BaseInit() which is run by AuxiliaryProcessMain() before entering any of the process-type-specific code.) Back-patch to 9.2, where the problem was introduced by the split-up of bgwriter and checkpointer processes. The bogus pendingOpsTable exists in walwriter and walreceiver processes in earlier branches, but absent any evidence that it causes actual problems there, I'll leave the older branches alone.
2012-07-18 15:28:10 -04:00
/* If no -x argument, we are a CheckerProcess */
MyAuxProcType = CheckerProcess;
while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:x:X:-:")) != -1)
{
switch (flag)
{
case 'B':
SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV);
break;
case 'D':
userDoption = pstrdup(optarg);
break;
case 'd':
2002-09-04 16:31:48 -04:00
{
/* Turn on debugging for the bootstrap process. */
char *debugstr;
2002-09-04 16:31:48 -04:00
debugstr = psprintf("debug%s", optarg);
SetConfigOption("log_min_messages", debugstr,
2002-09-04 16:31:48 -04:00
PGC_POSTMASTER, PGC_S_ARGV);
SetConfigOption("client_min_messages", debugstr,
PGC_POSTMASTER, PGC_S_ARGV);
pfree(debugstr);
}
break;
case 'F':
SetConfigOption("fsync", "false", PGC_POSTMASTER, PGC_S_ARGV);
break;
case 'k':
bootstrap_data_checksum_version = PG_DATA_CHECKSUM_VERSION;
break;
case 'r':
2007-02-10 09:58:55 -05:00
strlcpy(OutputFileName, optarg, MAXPGPATH);
break;
case 'x':
Fix management of pendingOpsTable in auxiliary processes. mdinit() was misusing IsBootstrapProcessingMode() to decide whether to create an fsync pending-operations table in the current process. This led to creating a table not only in the startup and checkpointer processes as intended, but also in the bgwriter process, not to mention other auxiliary processes such as walwriter and walreceiver. Creation of the table in the bgwriter is fatal, because it absorbs fsync requests that should have gone to the checkpointer; instead they just sit in bgwriter local memory and are never acted on. So writes performed by the bgwriter were not being fsync'd which could result in data loss after an OS crash. I think there is no live bug with respect to walwriter and walreceiver because those never perform any writes of shared buffers; but the potential is there for future breakage in those processes too. To fix, make AuxiliaryProcessMain() export the current process's AuxProcType as a global variable, and then make mdinit() test directly for the types of aux process that should have a pendingOpsTable. Having done that, we might as well also get rid of the random bool flags such as am_walreceiver that some of the aux processes had grown. (Note that we could not have fixed the bug by examining those variables in mdinit(), because it's called from BaseInit() which is run by AuxiliaryProcessMain() before entering any of the process-type-specific code.) Back-patch to 9.2, where the problem was introduced by the split-up of bgwriter and checkpointer processes. The bogus pendingOpsTable exists in walwriter and walreceiver processes in earlier branches, but absent any evidence that it causes actual problems there, I'll leave the older branches alone.
2012-07-18 15:28:10 -04:00
MyAuxProcType = atoi(optarg);
break;
case 'X':
{
int WalSegSz = strtoul(optarg, NULL, 0);
if (!IsValidWalSegSize(WalSegSz))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("-X requires a power of 2 value between 1MB and 1GB")));
SetConfigOption("wal_segment_size", optarg, PGC_INTERNAL,
PGC_S_OVERRIDE);
}
break;
case 'c':
case '-':
{
char *name,
*value;
ParseLongOption(optarg, &name, &value);
if (!value)
{
if (flag == '-')
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("--%s requires a value",
optarg)));
else
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("-c %s requires a value",
optarg)));
}
SetConfigOption(name, value, PGC_POSTMASTER, PGC_S_ARGV);
free(name);
if (value)
free(value);
break;
}
default:
write_stderr("Try \"%s --help\" for more information.\n",
progname);
proc_exit(1);
break;
}
}
if (argc != optind)
{
write_stderr("%s: invalid command-line arguments\n", progname);
proc_exit(1);
}
/*
* Identify myself via ps
*/
if (IsUnderPostmaster)
{
const char *statmsg;
Fix management of pendingOpsTable in auxiliary processes. mdinit() was misusing IsBootstrapProcessingMode() to decide whether to create an fsync pending-operations table in the current process. This led to creating a table not only in the startup and checkpointer processes as intended, but also in the bgwriter process, not to mention other auxiliary processes such as walwriter and walreceiver. Creation of the table in the bgwriter is fatal, because it absorbs fsync requests that should have gone to the checkpointer; instead they just sit in bgwriter local memory and are never acted on. So writes performed by the bgwriter were not being fsync'd which could result in data loss after an OS crash. I think there is no live bug with respect to walwriter and walreceiver because those never perform any writes of shared buffers; but the potential is there for future breakage in those processes too. To fix, make AuxiliaryProcessMain() export the current process's AuxProcType as a global variable, and then make mdinit() test directly for the types of aux process that should have a pendingOpsTable. Having done that, we might as well also get rid of the random bool flags such as am_walreceiver that some of the aux processes had grown. (Note that we could not have fixed the bug by examining those variables in mdinit(), because it's called from BaseInit() which is run by AuxiliaryProcessMain() before entering any of the process-type-specific code.) Back-patch to 9.2, where the problem was introduced by the split-up of bgwriter and checkpointer processes. The bogus pendingOpsTable exists in walwriter and walreceiver processes in earlier branches, but absent any evidence that it causes actual problems there, I'll leave the older branches alone.
2012-07-18 15:28:10 -04:00
switch (MyAuxProcType)
{
case StartupProcess:
statmsg = pgstat_get_backend_desc(B_STARTUP);
break;
case BgWriterProcess:
statmsg = pgstat_get_backend_desc(B_BG_WRITER);
break;
case CheckpointerProcess:
statmsg = pgstat_get_backend_desc(B_CHECKPOINTER);
break;
case WalWriterProcess:
statmsg = pgstat_get_backend_desc(B_WAL_WRITER);
break;
case WalReceiverProcess:
statmsg = pgstat_get_backend_desc(B_WAL_RECEIVER);
break;
default:
statmsg = "??? process";
break;
}
init_ps_display(statmsg, "", "", "");
}
/* Acquire configuration parameters, unless inherited from postmaster */
if (!IsUnderPostmaster)
{
if (!SelectConfigFiles(userDoption, progname))
proc_exit(1);
}
/* Validate we have been given a reasonable-looking DataDir */
Assert(DataDir);
ValidatePgVersion(DataDir);
/* Change into DataDir (if under postmaster, should be done already) */
if (!IsUnderPostmaster)
ChangeToDataDir();
/* If standalone, create lockfile for data directory */
if (!IsUnderPostmaster)
CreateDataDirLockFile(false);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-12 20:17:06 -05:00
SetProcessingMode(BootstrapProcessing);
IgnoreSystemIndexes = true;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-12 20:17:06 -05:00
/* Initialize MaxBackends (if under postmaster, was done already) */
if (!IsUnderPostmaster)
InitializeMaxBackends();
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-12 20:17:06 -05:00
BaseInit();
/*
* When we are an auxiliary process, we aren't going to do the full
2006-10-03 20:30:14 -04:00
* InitPostgres pushups, but there are a couple of things that need to get
* lit up even in an auxiliary process.
*/
if (IsUnderPostmaster)
{
/*
* Create a PGPROC so we can use LWLocks. In the EXEC_BACKEND case,
* this was already done by SubPostmasterMain().
*/
#ifndef EXEC_BACKEND
InitAuxiliaryProcess();
#endif
/*
* Assign the ProcSignalSlot for an auxiliary process. Since it
* doesn't have a BackendId, the slot is statically allocated based on
* the auxiliary process type (MyAuxProcType). Backends use slots
Fix management of pendingOpsTable in auxiliary processes. mdinit() was misusing IsBootstrapProcessingMode() to decide whether to create an fsync pending-operations table in the current process. This led to creating a table not only in the startup and checkpointer processes as intended, but also in the bgwriter process, not to mention other auxiliary processes such as walwriter and walreceiver. Creation of the table in the bgwriter is fatal, because it absorbs fsync requests that should have gone to the checkpointer; instead they just sit in bgwriter local memory and are never acted on. So writes performed by the bgwriter were not being fsync'd which could result in data loss after an OS crash. I think there is no live bug with respect to walwriter and walreceiver because those never perform any writes of shared buffers; but the potential is there for future breakage in those processes too. To fix, make AuxiliaryProcessMain() export the current process's AuxProcType as a global variable, and then make mdinit() test directly for the types of aux process that should have a pendingOpsTable. Having done that, we might as well also get rid of the random bool flags such as am_walreceiver that some of the aux processes had grown. (Note that we could not have fixed the bug by examining those variables in mdinit(), because it's called from BaseInit() which is run by AuxiliaryProcessMain() before entering any of the process-type-specific code.) Back-patch to 9.2, where the problem was introduced by the split-up of bgwriter and checkpointer processes. The bogus pendingOpsTable exists in walwriter and walreceiver processes in earlier branches, but absent any evidence that it causes actual problems there, I'll leave the older branches alone.
2012-07-18 15:28:10 -04:00
* indexed in the range from 1 to MaxBackends (inclusive), so we use
* MaxBackends + AuxProcType + 1 as the index of the slot for an
* auxiliary process.
*
* This will need rethinking if we ever want more than one of a
* particular auxiliary process type.
*/
Fix management of pendingOpsTable in auxiliary processes. mdinit() was misusing IsBootstrapProcessingMode() to decide whether to create an fsync pending-operations table in the current process. This led to creating a table not only in the startup and checkpointer processes as intended, but also in the bgwriter process, not to mention other auxiliary processes such as walwriter and walreceiver. Creation of the table in the bgwriter is fatal, because it absorbs fsync requests that should have gone to the checkpointer; instead they just sit in bgwriter local memory and are never acted on. So writes performed by the bgwriter were not being fsync'd which could result in data loss after an OS crash. I think there is no live bug with respect to walwriter and walreceiver because those never perform any writes of shared buffers; but the potential is there for future breakage in those processes too. To fix, make AuxiliaryProcessMain() export the current process's AuxProcType as a global variable, and then make mdinit() test directly for the types of aux process that should have a pendingOpsTable. Having done that, we might as well also get rid of the random bool flags such as am_walreceiver that some of the aux processes had grown. (Note that we could not have fixed the bug by examining those variables in mdinit(), because it's called from BaseInit() which is run by AuxiliaryProcessMain() before entering any of the process-type-specific code.) Back-patch to 9.2, where the problem was introduced by the split-up of bgwriter and checkpointer processes. The bogus pendingOpsTable exists in walwriter and walreceiver processes in earlier branches, but absent any evidence that it causes actual problems there, I'll leave the older branches alone.
2012-07-18 15:28:10 -04:00
ProcSignalInit(MaxBackends + MyAuxProcType + 1);
/* finish setting up bufmgr.c */
InitBufferPoolBackend();
/* Initialize backend status information */
pgstat_initialize();
pgstat_bestart();
/* register a before-shutdown callback for LWLock cleanup */
before_shmem_exit(ShutdownAuxiliaryProcess, 0);
}
/*
2000-11-09 06:26:00 -05:00
* XLOG operations
*/
2000-11-21 04:39:57 -05:00
SetProcessingMode(NormalProcessing);
Fix management of pendingOpsTable in auxiliary processes. mdinit() was misusing IsBootstrapProcessingMode() to decide whether to create an fsync pending-operations table in the current process. This led to creating a table not only in the startup and checkpointer processes as intended, but also in the bgwriter process, not to mention other auxiliary processes such as walwriter and walreceiver. Creation of the table in the bgwriter is fatal, because it absorbs fsync requests that should have gone to the checkpointer; instead they just sit in bgwriter local memory and are never acted on. So writes performed by the bgwriter were not being fsync'd which could result in data loss after an OS crash. I think there is no live bug with respect to walwriter and walreceiver because those never perform any writes of shared buffers; but the potential is there for future breakage in those processes too. To fix, make AuxiliaryProcessMain() export the current process's AuxProcType as a global variable, and then make mdinit() test directly for the types of aux process that should have a pendingOpsTable. Having done that, we might as well also get rid of the random bool flags such as am_walreceiver that some of the aux processes had grown. (Note that we could not have fixed the bug by examining those variables in mdinit(), because it's called from BaseInit() which is run by AuxiliaryProcessMain() before entering any of the process-type-specific code.) Back-patch to 9.2, where the problem was introduced by the split-up of bgwriter and checkpointer processes. The bogus pendingOpsTable exists in walwriter and walreceiver processes in earlier branches, but absent any evidence that it causes actual problems there, I'll leave the older branches alone.
2012-07-18 15:28:10 -04:00
switch (MyAuxProcType)
2000-11-21 04:39:57 -05:00
{
case CheckerProcess:
/* don't set signals, they're useless here */
CheckerModeMain();
proc_exit(1); /* should never return */
case BootstrapProcess:
2015-05-23 21:35:49 -04:00
Protect against multixact members wraparound Multixact member files are subject to early wraparound overflow and removal: if the average multixact size is above a certain threshold (see note below) the protections against offset overflow are not enough: during multixact truncation at checkpoint time, some pg_multixact/members files would be removed because the server considers them to be old and not needed anymore. This leads to loss of files that are critical to interpret existing tuples's Xmax values. To protect against this, since we don't have enough info in pg_control and we can't modify it in old branches, we maintain shared memory state about the oldest value that we need to keep; we use this during new multixact creation to abort if an old still-needed file would get overwritten. This value is kept up to date by checkpoints, which makes it not completely accurate but should be good enough. We start emitting warnings sometime earlier, so that the eventual multixact-shutdown doesn't take DBAs completely by surprise (more precisely: once 20 members SLRU segments are remaining before shutdown.) On troublesome average multixact size: The threshold size depends on the multixact freeze parameters. The oldest age is related to the greater of multixact_freeze_table_age and multixact_freeze_min_age: anything older than that should be removed promptly by autovacuum. If autovacuum is keeping up with multixact freezing, the troublesome multixact average size is (2^32-1) / Max(freeze table age, freeze min age) or around 28 members per multixact. Having an average multixact size larger than that will eventually cause new multixact data to overwrite the data area for older multixacts. (If autovacuum is not able to keep up, or there are errors in vacuuming, the actual maximum is multixact_freeeze_max_age instead, at which point multixact generation is stopped completely. The default value for this limit is 400 million, which means that the multixact size that would cause trouble is about 10 members). Initial bug report by Timothy Garnett, bug #12990 Backpatch to 9.3, where the problem was introduced. Authors: Álvaro Herrera, Thomas Munro Reviews: Thomas Munro, Amit Kapila, Robert Haas, Kevin Grittner
2015-04-28 10:32:53 -04:00
/*
* There was a brief instant during which mode was Normal; this is
* okay. We need to be in bootstrap mode during BootStrapXLOG for
* the sake of multixact initialization.
*/
SetProcessingMode(BootstrapProcessing);
bootstrap_signals();
BootStrapXLOG();
BootstrapModeMain();
proc_exit(1); /* should never return */
case StartupProcess:
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 10:58:41 -05:00
/* don't set signals, startup process has its own agenda */
StartupProcessMain();
proc_exit(1); /* should never return */
case BgWriterProcess:
/* don't set signals, bgwriter has its own agenda */
BackgroundWriterMain();
proc_exit(1); /* should never return */
case CheckpointerProcess:
/* don't set signals, checkpointer has its own agenda */
CheckpointerMain();
proc_exit(1); /* should never return */
case WalWriterProcess:
/* don't set signals, walwriter has its own agenda */
InitXLOGAccess();
WalWriterMain();
proc_exit(1); /* should never return */
2007-11-15 16:14:46 -05:00
case WalReceiverProcess:
/* don't set signals, walreceiver has its own agenda */
WalReceiverMain();
proc_exit(1); /* should never return */
default:
Fix management of pendingOpsTable in auxiliary processes. mdinit() was misusing IsBootstrapProcessingMode() to decide whether to create an fsync pending-operations table in the current process. This led to creating a table not only in the startup and checkpointer processes as intended, but also in the bgwriter process, not to mention other auxiliary processes such as walwriter and walreceiver. Creation of the table in the bgwriter is fatal, because it absorbs fsync requests that should have gone to the checkpointer; instead they just sit in bgwriter local memory and are never acted on. So writes performed by the bgwriter were not being fsync'd which could result in data loss after an OS crash. I think there is no live bug with respect to walwriter and walreceiver because those never perform any writes of shared buffers; but the potential is there for future breakage in those processes too. To fix, make AuxiliaryProcessMain() export the current process's AuxProcType as a global variable, and then make mdinit() test directly for the types of aux process that should have a pendingOpsTable. Having done that, we might as well also get rid of the random bool flags such as am_walreceiver that some of the aux processes had grown. (Note that we could not have fixed the bug by examining those variables in mdinit(), because it's called from BaseInit() which is run by AuxiliaryProcessMain() before entering any of the process-type-specific code.) Back-patch to 9.2, where the problem was introduced by the split-up of bgwriter and checkpointer processes. The bogus pendingOpsTable exists in walwriter and walreceiver processes in earlier branches, but absent any evidence that it causes actual problems there, I'll leave the older branches alone.
2012-07-18 15:28:10 -04:00
elog(PANIC, "unrecognized process type: %d", (int) MyAuxProcType);
proc_exit(1);
}
}
/*
* In shared memory checker mode, all we really want to do is create shared
* memory and semaphores (just to prove we can do it with the current GUC
* settings). Since, in fact, that was already done by BaseInit(),
* we have nothing more to do here.
*/
static void
CheckerModeMain(void)
{
proc_exit(0);
}
/*
* The main entry point for running the backend in bootstrap mode
*
* The bootstrap mode is used to initialize the template database.
* The bootstrap backend doesn't speak SQL, but instead expects
* commands in a special bootstrap language.
*/
static void
BootstrapModeMain(void)
{
int i;
Assert(!IsUnderPostmaster);
Protect against multixact members wraparound Multixact member files are subject to early wraparound overflow and removal: if the average multixact size is above a certain threshold (see note below) the protections against offset overflow are not enough: during multixact truncation at checkpoint time, some pg_multixact/members files would be removed because the server considers them to be old and not needed anymore. This leads to loss of files that are critical to interpret existing tuples's Xmax values. To protect against this, since we don't have enough info in pg_control and we can't modify it in old branches, we maintain shared memory state about the oldest value that we need to keep; we use this during new multixact creation to abort if an old still-needed file would get overwritten. This value is kept up to date by checkpoints, which makes it not completely accurate but should be good enough. We start emitting warnings sometime earlier, so that the eventual multixact-shutdown doesn't take DBAs completely by surprise (more precisely: once 20 members SLRU segments are remaining before shutdown.) On troublesome average multixact size: The threshold size depends on the multixact freeze parameters. The oldest age is related to the greater of multixact_freeze_table_age and multixact_freeze_min_age: anything older than that should be removed promptly by autovacuum. If autovacuum is keeping up with multixact freezing, the troublesome multixact average size is (2^32-1) / Max(freeze table age, freeze min age) or around 28 members per multixact. Having an average multixact size larger than that will eventually cause new multixact data to overwrite the data area for older multixacts. (If autovacuum is not able to keep up, or there are errors in vacuuming, the actual maximum is multixact_freeeze_max_age instead, at which point multixact generation is stopped completely. The default value for this limit is 400 million, which means that the multixact size that would cause trouble is about 10 members). Initial bug report by Timothy Garnett, bug #12990 Backpatch to 9.3, where the problem was introduced. Authors: Álvaro Herrera, Thomas Munro Reviews: Thomas Munro, Amit Kapila, Robert Haas, Kevin Grittner
2015-04-28 10:32:53 -04:00
Assert(IsBootstrapProcessingMode());
/*
* Do backend-like initialization for bootstrap mode
*/
InitProcess();
InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL);
/* Initialize stuff for bootstrap-file processing */
for (i = 0; i < MAXATTR; i++)
{
attrtypes[i] = NULL;
Nulls[i] = false;
}
/*
* Process bootstrap input.
*/
StartTransactionCommand();
boot_yyparse();
CommitTransactionCommand();
/*
2010-02-25 21:01:40 -05:00
* We should now know about all mapped relations, so it's okay to write
* out the initial relation mapping files.
*/
RelationMapFinishBootstrap();
/* Clean up and exit */
cleanup();
proc_exit(0);
}
/* ----------------------------------------------------------------
* misc functions
* ----------------------------------------------------------------
*/
/*
* Set up signal handling for a bootstrap process
*/
static void
bootstrap_signals(void)
{
Assert(!IsUnderPostmaster);
/* Set up appropriately for interactive use */
pqsignal(SIGHUP, die);
pqsignal(SIGINT, die);
pqsignal(SIGTERM, die);
pqsignal(SIGQUIT, die);
}
/*
* Begin shutdown of an auxiliary process. This is approximately the equivalent
* of ShutdownPostgres() in postinit.c. We can't run transactions in an
* auxiliary process, so most of the work of AbortTransaction() is not needed,
* but we do need to make sure we've released any LWLocks we are holding.
* (This is only critical during an error exit.)
*/
static void
ShutdownAuxiliaryProcess(int code, Datum arg)
{
LWLockReleaseAll();
ConditionVariableCancelSleep();
pgstat_report_wait_end();
}
/* ----------------------------------------------------------------
* MANUAL BACKEND INTERACTIVE INTERFACE COMMANDS
* ----------------------------------------------------------------
*/
/* ----------------
* boot_openrel
* ----------------
*/
void
boot_openrel(char *relname)
{
int i;
struct typmap **app;
Relation rel;
HeapScanDesc scan;
HeapTuple tup;
if (strlen(relname) >= NAMEDATALEN)
relname[NAMEDATALEN - 1] = '\0';
if (Typ == NULL)
{
/* We can now load the pg_type data */
rel = heap_open(TypeRelationId, NoLock);
scan = heap_beginscan_catalog(rel, 0, NULL);
i = 0;
while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
++i;
heap_endscan(scan);
app = Typ = ALLOC(struct typmap *, i + 1);
while (i-- > 0)
*app++ = ALLOC(struct typmap, 1);
*app = NULL;
scan = heap_beginscan_catalog(rel, 0, NULL);
app = Typ;
while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
(*app)->am_oid = HeapTupleGetOid(tup);
memcpy((char *) &(*app)->am_typ,
(char *) GETSTRUCT(tup),
sizeof((*app)->am_typ));
app++;
}
heap_endscan(scan);
heap_close(rel, NoLock);
}
if (boot_reldesc != NULL)
closerel(NULL);
elog(DEBUG4, "open relation %s, attrsize %d",
relname, (int) ATTRIBUTE_FIXED_PART_SIZE);
boot_reldesc = heap_openrv(makeRangeVar(NULL, relname, -1), NoLock);
numattr = boot_reldesc->rd_rel->relnatts;
for (i = 0; i < numattr; i++)
{
if (attrtypes[i] == NULL)
attrtypes[i] = AllocateAttribute();
memmove((char *) attrtypes[i],
(char *) TupleDescAttr(boot_reldesc->rd_att, i),
ATTRIBUTE_FIXED_PART_SIZE);
{
1998-08-31 23:29:17 -04:00
Form_pg_attribute at = attrtypes[i];
elog(DEBUG4, "create attribute %d name %s len %d num %d type %u",
i, NameStr(at->attname), at->attlen, at->attnum,
at->atttypid);
}
}
}
/* ----------------
* closerel
* ----------------
*/
void
closerel(char *name)
{
if (name)
{
if (boot_reldesc)
{
if (strcmp(RelationGetRelationName(boot_reldesc), name) != 0)
elog(ERROR, "close of %s when %s was expected",
name, RelationGetRelationName(boot_reldesc));
}
else
elog(ERROR, "close of %s before any relation was opened",
name);
}
if (boot_reldesc == NULL)
elog(ERROR, "no open relation to close");
else
{
elog(DEBUG4, "close relation %s",
RelationGetRelationName(boot_reldesc));
heap_close(boot_reldesc, NoLock);
boot_reldesc = NULL;
}
}
/* ----------------
* DEFINEATTR()
*
* define a <field,type> pair
* if there are n fields in a relation to be created, this routine
* will be called n times
* ----------------
*/
void
DefineAttr(char *name, char *type, int attnum, int nullness)
{
Oid typeoid;
if (boot_reldesc != NULL)
{
elog(WARNING, "no open relations allowed with CREATE command");
closerel(NULL);
}
if (attrtypes[attnum] == NULL)
attrtypes[attnum] = AllocateAttribute();
MemSet(attrtypes[attnum], 0, ATTRIBUTE_FIXED_PART_SIZE);
namestrcpy(&attrtypes[attnum]->attname, name);
elog(DEBUG4, "column %s %s", NameStr(attrtypes[attnum]->attname), type);
Phase 2 of pgindent updates. Change pg_bsd_indent to follow upstream rules for placement of comments to the right of code, and remove pgindent hack that caused comments following #endif to not obey the general rule. Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using the published version of pg_bsd_indent, but a hacked-up version that tried to minimize the amount of movement of comments to the right of code. The situation of interest is where such a comment has to be moved to the right of its default placement at column 33 because there's code there. BSD indent has always moved right in units of tab stops in such cases --- but in the previous incarnation, indent was working in 8-space tab stops, while now it knows we use 4-space tabs. So the net result is that in about half the cases, such comments are placed one tab stop left of before. This is better all around: it leaves more room on the line for comment text, and it means that in such cases the comment uniformly starts at the next 4-space tab stop after the code, rather than sometimes one and sometimes two tabs after. Also, ensure that comments following #endif are indented the same as comments following other preprocessor commands such as #else. That inconsistency turns out to have been self-inflicted damage from a poorly-thought-through post-indent "fixup" in pgindent. This patch is much less interesting than the first round of indent changes, but also bulkier, so I thought it best to separate the effects. Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 15:18:54 -04:00
attrtypes[attnum]->attnum = attnum + 1; /* fillatt */
typeoid = gettype(type);
if (Typ != NULL)
{
attrtypes[attnum]->atttypid = Ap->am_oid;
attrtypes[attnum]->attlen = Ap->am_typ.typlen;
attrtypes[attnum]->attbyval = Ap->am_typ.typbyval;
attrtypes[attnum]->attstorage = Ap->am_typ.typstorage;
attrtypes[attnum]->attalign = Ap->am_typ.typalign;
attrtypes[attnum]->attcollation = Ap->am_typ.typcollation;
/* if an array type, assume 1-dimensional attribute */
if (Ap->am_typ.typelem != InvalidOid && Ap->am_typ.typlen < 0)
attrtypes[attnum]->attndims = 1;
else
attrtypes[attnum]->attndims = 0;
}
else
{
attrtypes[attnum]->atttypid = TypInfo[typeoid].oid;
attrtypes[attnum]->attlen = TypInfo[typeoid].len;
attrtypes[attnum]->attbyval = TypInfo[typeoid].byval;
attrtypes[attnum]->attstorage = TypInfo[typeoid].storage;
attrtypes[attnum]->attalign = TypInfo[typeoid].align;
attrtypes[attnum]->attcollation = TypInfo[typeoid].collation;
/* if an array type, assume 1-dimensional attribute */
if (TypInfo[typeoid].elem != InvalidOid &&
attrtypes[attnum]->attlen < 0)
attrtypes[attnum]->attndims = 1;
else
attrtypes[attnum]->attndims = 0;
}
attrtypes[attnum]->attstattarget = -1;
attrtypes[attnum]->attcacheoff = -1;
1998-02-07 01:11:56 -05:00
attrtypes[attnum]->atttypmod = -1;
attrtypes[attnum]->attislocal = true;
2002-09-04 16:31:48 -04:00
if (nullness == BOOTCOL_NULL_FORCE_NOT_NULL)
{
attrtypes[attnum]->attnotnull = true;
}
else if (nullness == BOOTCOL_NULL_FORCE_NULL)
{
attrtypes[attnum]->attnotnull = false;
}
else
{
Assert(nullness == BOOTCOL_NULL_AUTO);
/*
* Mark as "not null" if type is fixed-width and prior columns are
* too. This corresponds to case where column can be accessed
* directly via C struct declaration.
*
* oidvector and int2vector are also treated as not-nullable, even
* though they are no longer fixed-width.
*/
#define MARKNOTNULL(att) \
((att)->attlen > 0 || \
(att)->atttypid == OIDVECTOROID || \
(att)->atttypid == INT2VECTOROID)
if (MARKNOTNULL(attrtypes[attnum]))
{
int i;
/* check earlier attributes */
for (i = 0; i < attnum; i++)
{
if (!attrtypes[i]->attnotnull)
break;
}
if (i == attnum)
attrtypes[attnum]->attnotnull = true;
}
}
}
/* ----------------
* InsertOneTuple
*
* If objectid is not zero, it is a specific OID to assign to the tuple.
* Otherwise, an OID will be assigned (if necessary) by heap_insert.
* ----------------
*/
void
InsertOneTuple(Oid objectid)
{
HeapTuple tuple;
TupleDesc tupDesc;
int i;
elog(DEBUG4, "inserting row oid %u, %d columns", objectid, numattr);
tupDesc = CreateTupleDesc(numattr,
RelationGetForm(boot_reldesc)->relhasoids,
attrtypes);
tuple = heap_form_tuple(tupDesc, values, Nulls);
if (objectid != (Oid) 0)
HeapTupleSetOid(tuple, objectid);
pfree(tupDesc); /* just free's tupDesc, not the attrtypes */
simple_heap_insert(boot_reldesc, tuple);
heap_freetuple(tuple);
elog(DEBUG4, "row inserted");
/*
* Reset null markers for next tuple
*/
for (i = 0; i < numattr; i++)
Nulls[i] = false;
}
/* ----------------
* InsertOneValue
* ----------------
*/
void
InsertOneValue(char *value, int i)
{
Oid typoid;
int16 typlen;
bool typbyval;
char typalign;
char typdelim;
Oid typioparam;
Oid typinput;
Oid typoutput;
AssertArg(i >= 0 && i < MAXATTR);
elog(DEBUG4, "inserting column %d value \"%s\"", i, value);
typoid = TupleDescAttr(boot_reldesc->rd_att, i)->atttypid;
boot_get_type_io_data(typoid,
&typlen, &typbyval, &typalign,
&typdelim, &typioparam,
&typinput, &typoutput);
values[i] = OidInputFunctionCall(typinput, value, typioparam, -1);
Prevent memory leaks from accumulating across printtup() calls. Historically, printtup() has assumed that it could prevent memory leakage by pfree'ing the string result of each output function and manually managing detoasting of toasted values. This amounts to assuming that datatype output functions never leak any memory internally; an assumption we've already decided to be bogus elsewhere, for example in COPY OUT. range_out in particular is known to leak multiple kilobytes per call, as noted in bug #8573 from Godfried Vanluffelen. While we could go in and fix that leak, it wouldn't be very notationally convenient, and in any case there have been and undoubtedly will again be other leaks in other output functions. So what seems like the best solution is to run the output functions in a temporary memory context that can be reset after each row, as we're doing in COPY OUT. Some quick experimentation suggests this is actually a tad faster than the retail pfree's anyway. This patch fixes all the variants of printtup, except for debugtup() which is used in standalone mode. It doesn't seem worth worrying about query-lifespan leaks in standalone mode, and fixing that case would be a bit tedious since debugtup() doesn't currently have any startup or shutdown functions. While at it, remove manual detoast management from several other output-function call sites that had copied it from printtup(). This doesn't make a lot of difference right now, but in view of recent discussions about supporting "non-flattened" Datums, we're going to want that code gone eventually anyway. Back-patch to 9.2 where range_out was introduced. We might eventually decide to back-patch this further, but in the absence of known major leaks in older output functions, I'll refrain for now.
2013-11-03 11:33:05 -05:00
/*
* We use ereport not elog here so that parameters aren't evaluated unless
* the message is going to be printed, which generally it isn't
*/
ereport(DEBUG4,
(errmsg_internal("inserted -> %s",
OidOutputFunctionCall(typoutput, values[i]))));
}
/* ----------------
* InsertOneNull
* ----------------
*/
void
InsertOneNull(int i)
{
elog(DEBUG4, "inserting column %d NULL", i);
Assert(i >= 0 && i < MAXATTR);
if (TupleDescAttr(boot_reldesc->rd_att, i)->attnotnull)
elog(ERROR,
"NULL value specified for not-null column \"%s\" of relation \"%s\"",
NameStr(TupleDescAttr(boot_reldesc->rd_att, i)->attname),
RelationGetRelationName(boot_reldesc));
values[i] = PointerGetDatum(NULL);
Nulls[i] = true;
}
/* ----------------
* cleanup
* ----------------
*/
static void
cleanup(void)
{
if (boot_reldesc != NULL)
closerel(NULL);
}
/* ----------------
* gettype
*
* NB: this is really ugly; it will return an integer index into TypInfo[],
* and not an OID at all, until the first reference to a type not known in
* TypInfo[]. At that point it will read and cache pg_type in the Typ array,
* and subsequently return a real OID (and set the global pointer Ap to
* point at the found row in Typ). So caller must check whether Typ is
* still NULL to determine what the return value is!
* ----------------
*/
static Oid
gettype(char *type)
{
int i;
Relation rel;
HeapScanDesc scan;
HeapTuple tup;
struct typmap **app;
if (Typ != NULL)
{
for (app = Typ; *app != NULL; app++)
{
if (strncmp(NameStr((*app)->am_typ.typname), type, NAMEDATALEN) == 0)
{
Ap = *app;
1998-08-31 23:29:17 -04:00
return (*app)->am_oid;
}
}
}
else
{
for (i = 0; i < n_types; i++)
{
if (strncmp(type, TypInfo[i].name, NAMEDATALEN) == 0)
1998-08-31 23:29:17 -04:00
return i;
}
elog(DEBUG4, "external type: %s", type);
rel = heap_open(TypeRelationId, NoLock);
scan = heap_beginscan_catalog(rel, 0, NULL);
i = 0;
while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
++i;
heap_endscan(scan);
app = Typ = ALLOC(struct typmap *, i + 1);
while (i-- > 0)
*app++ = ALLOC(struct typmap, 1);
*app = NULL;
scan = heap_beginscan_catalog(rel, 0, NULL);
app = Typ;
while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
(*app)->am_oid = HeapTupleGetOid(tup);
memmove((char *) &(*app++)->am_typ,
(char *) GETSTRUCT(tup),
sizeof((*app)->am_typ));
}
heap_endscan(scan);
heap_close(rel, NoLock);
1998-08-31 23:29:17 -04:00
return gettype(type);
}
elog(ERROR, "unrecognized type \"%s\"", type);
/* not reached, here to make compiler happy */
return 0;
}
/* ----------------
* boot_get_type_io_data
*
* Obtain type I/O information at bootstrap time. This intentionally has
* almost the same API as lsyscache.c's get_type_io_data, except that
* we only support obtaining the typinput and typoutput routines, not
* the binary I/O routines. It is exported so that array_in and array_out
* can be made to work during early bootstrap.
* ----------------
*/
void
boot_get_type_io_data(Oid typid,
int16 *typlen,
bool *typbyval,
char *typalign,
char *typdelim,
Oid *typioparam,
Oid *typinput,
Oid *typoutput)
{
if (Typ != NULL)
{
/* We have the boot-time contents of pg_type, so use it */
struct typmap **app;
struct typmap *ap;
app = Typ;
while (*app && (*app)->am_oid != typid)
++app;
ap = *app;
if (ap == NULL)
elog(ERROR, "type OID %u not found in Typ list", typid);
*typlen = ap->am_typ.typlen;
*typbyval = ap->am_typ.typbyval;
*typalign = ap->am_typ.typalign;
*typdelim = ap->am_typ.typdelim;
/* XXX this logic must match getTypeIOParam() */
if (OidIsValid(ap->am_typ.typelem))
*typioparam = ap->am_typ.typelem;
else
*typioparam = typid;
*typinput = ap->am_typ.typinput;
*typoutput = ap->am_typ.typoutput;
}
else
{
/* We don't have pg_type yet, so use the hard-wired TypInfo array */
int typeindex;
for (typeindex = 0; typeindex < n_types; typeindex++)
{
if (TypInfo[typeindex].oid == typid)
break;
}
if (typeindex >= n_types)
elog(ERROR, "type OID %u not found in TypInfo", typid);
*typlen = TypInfo[typeindex].len;
*typbyval = TypInfo[typeindex].byval;
*typalign = TypInfo[typeindex].align;
/* We assume typdelim is ',' for all boot-time types */
*typdelim = ',';
/* XXX this logic must match getTypeIOParam() */
if (OidIsValid(TypInfo[typeindex].elem))
*typioparam = TypInfo[typeindex].elem;
else
*typioparam = typid;
*typinput = TypInfo[typeindex].inproc;
*typoutput = TypInfo[typeindex].outproc;
}
}
/* ----------------
* AllocateAttribute
*
* Note: bootstrap never sets any per-column ACLs, so we only need
* ATTRIBUTE_FIXED_PART_SIZE space per attribute.
* ----------------
*/
static Form_pg_attribute
AllocateAttribute(void)
{
return (Form_pg_attribute)
MemoryContextAllocZero(TopMemoryContext, ATTRIBUTE_FIXED_PART_SIZE);
}
/*
* MapArrayTypeName
*
* Given a type name, produce the corresponding array type name by prepending
* '_' and truncating as needed to fit in NAMEDATALEN-1 bytes. This is only
* used in bootstrap mode, so we can get away with assuming that the input is
* ASCII and we don't need multibyte-aware truncation.
*
* The given string normally ends with '[]' or '[digits]'; we discard that.
*
* The result is a palloc'd string.
*/
char *
MapArrayTypeName(const char *s)
{
int i,
j;
char newStr[NAMEDATALEN];
newStr[0] = '_';
j = 1;
for (i = 0; i < NAMEDATALEN - 2 && s[i] != '['; i++, j++)
newStr[j] = s[i];
newStr[j] = '\0';
return pstrdup(newStr);
}
/*
* index_register() -- record an index that has been set up for building
* later.
*
* At bootstrap time, we define a bunch of indexes on system catalogs.
* We postpone actually building the indexes until just before we're
* finished with initialization, however. This is because the indexes
* themselves have catalog entries, and those have to be included in the
* indexes on those catalogs. Doing it in two phases is the simplest
* way of making sure the indexes have the right contents at the end.
*/
void
index_register(Oid heap,
Oid ind,
IndexInfo *indexInfo)
{
IndexList *newind;
MemoryContext oldcxt;
/*
* XXX mao 10/31/92 -- don't gc index reldescs, associated info at
* bootstrap time. we'll declare the indexes now, but want to create them
2005-10-14 22:49:52 -04:00
* later.
*/
if (nogc == NULL)
nogc = AllocSetContextCreate(NULL,
"BootstrapNoGC",
Add macros to make AllocSetContextCreate() calls simpler and safer. I found that half a dozen (nearly 5%) of our AllocSetContextCreate calls had typos in the context-sizing parameters. While none of these led to especially significant problems, they did create minor inefficiencies, and it's now clear that expecting people to copy-and-paste those calls accurately is not a great idea. Let's reduce the risk of future errors by introducing single macros that encapsulate the common use-cases. Three such macros are enough to cover all but two special-purpose contexts; those two calls can be left as-is, I think. While this patch doesn't in itself improve matters for third-party extensions, it doesn't break anything for them either, and they can gradually adopt the simplified notation over time. In passing, change TopMemoryContext to use the default allocation parameters. Formerly it could only be extended 8K at a time. That was probably reasonable when this code was written; but nowadays we create many more contexts than we did then, so that it's not unusual to have a couple hundred K in TopMemoryContext, even without considering various dubious code that sticks other things there. There seems no good reason not to let it use growing blocks like most other contexts. Back-patch to 9.6, mostly because that's still close enough to HEAD that it's easy to do so, and keeping the branches in sync can be expected to avoid some future back-patching pain. The bugs fixed by these changes don't seem to be significant enough to justify fixing them further back. Discussion: <21072.1472321324@sss.pgh.pa.us>
2016-08-27 17:50:38 -04:00
ALLOCSET_DEFAULT_SIZES);
oldcxt = MemoryContextSwitchTo(nogc);
newind = (IndexList *) palloc(sizeof(IndexList));
newind->il_heap = heap;
newind->il_ind = ind;
newind->il_info = (IndexInfo *) palloc(sizeof(IndexInfo));
memcpy(newind->il_info, indexInfo, sizeof(IndexInfo));
/* expressions will likely be null, but may as well copy it */
newind->il_info->ii_Expressions =
copyObject(indexInfo->ii_Expressions);
newind->il_info->ii_ExpressionsState = NIL;
/* predicate will likely be null, but may as well copy it */
newind->il_info->ii_Predicate =
copyObject(indexInfo->ii_Predicate);
Faster expression evaluation and targetlist projection. This replaces the old, recursive tree-walk based evaluation, with non-recursive, opcode dispatch based, expression evaluation. Projection is now implemented as part of expression evaluation. This both leads to significant performance improvements, and makes future just-in-time compilation of expressions easier. The speed gains primarily come from: - non-recursive implementation reduces stack usage / overhead - simple sub-expressions are implemented with a single jump, without function calls - sharing some state between different sub-expressions - reduced amount of indirect/hard to predict memory accesses by laying out operation metadata sequentially; including the avoidance of nearly all of the previously used linked lists - more code has been moved to expression initialization, avoiding constant re-checks at evaluation time Future just-in-time compilation (JIT) has become easier, as demonstrated by released patches intended to be merged in a later release, for primarily two reasons: Firstly, due to a stricter split between expression initialization and evaluation, less code has to be handled by the JIT. Secondly, due to the non-recursive nature of the generated "instructions", less performance-critical code-paths can easily be shared between interpreted and compiled evaluation. The new framework allows for significant future optimizations. E.g.: - basic infrastructure for to later reduce the per executor-startup overhead of expression evaluation, by caching state in prepared statements. That'd be helpful in OLTPish scenarios where initialization overhead is measurable. - optimizing the generated "code". A number of proposals for potential work has already been made. - optimizing the interpreter. Similarly a number of proposals have been made here too. The move of logic into the expression initialization step leads to some backward-incompatible changes: - Function permission checks are now done during expression initialization, whereas previously they were done during execution. In edge cases this can lead to errors being raised that previously wouldn't have been, e.g. a NULL array being coerced to a different array type previously didn't perform checks. - The set of domain constraints to be checked, is now evaluated once during expression initialization, previously it was re-built every time a domain check was evaluated. For normal queries this doesn't change much, but e.g. for plpgsql functions, which caches ExprStates, the old set could stick around longer. The behavior around might still change. Author: Andres Freund, with significant changes by Tom Lane, changes by Heikki Linnakangas Reviewed-By: Tom Lane, Heikki Linnakangas Discussion: https://postgr.es/m/20161206034955.bh33paeralxbtluv@alap3.anarazel.de
2017-03-14 18:45:36 -04:00
newind->il_info->ii_PredicateState = NULL;
/* no exclusion constraints at bootstrap time, so no need to copy */
Assert(indexInfo->ii_ExclusionOps == NULL);
Assert(indexInfo->ii_ExclusionProcs == NULL);
Assert(indexInfo->ii_ExclusionStrats == NULL);
newind->il_next = ILHead;
ILHead = newind;
MemoryContextSwitchTo(oldcxt);
}
/*
* build_indices -- fill in all the indexes registered earlier
*/
void
build_indices(void)
{
for (; ILHead != NULL; ILHead = ILHead->il_next)
{
Relation heap;
Relation ind;
/* need not bother with locks during bootstrap */
heap = heap_open(ILHead->il_heap, NoLock);
ind = index_open(ILHead->il_ind, NoLock);
index_build(heap, ind, ILHead->il_info, false, false, false);
index_close(ind, NoLock);
heap_close(heap, NoLock);
}
}