Allocate all parts of shmem hash table from a single contiguous area

Previously, the shared header (HASHHDR) and the directory were
allocated by the caller, and passed to hash_create(), while the actual
elements were allocated separately with ShmemAlloc(). After this
commit, all the memory needed by the header, the directory, and all
the elements is allocated using a single ShmemInitStruct() call, and
the different parts are carved out of that allocation. This way the
ShmemIndex entries (and thus pg_shmem_allocations) reflect the size of
the whole hash table, rather than just the directories.

Commit f5930f9a98 attempted this earlier, but it had to be reverted.
The new strategy is to let dynahash.c perform all the allocations with
the alloc function, but have the alloc function carve out the parts
from the one larger allocation. The shared header and the directory
are now also allocated with alloc calls, instead of passing the area
for those directly from the caller.

Reviewed-by: Tomas Vondra <tomas@vondra.me>
Discussion: https://www.postgresql.org/message-id/01ab1d41-3eda-4705-8bbd-af898f5007f1@iki.fi
This commit is contained in:
Heikki Linnakangas 2026-04-04 02:40:25 +03:00
parent 999e9ebb51
commit 9fe9ecd516
4 changed files with 111 additions and 75 deletions

View file

@@ -90,11 +90,14 @@ typedef struct ShmemAllocatorData
slock_t shmem_lock;
HASHHDR *index; /* location of ShmemIndex */
size_t index_size; /* size of shmem region holding ShmemIndex */
LWLock index_lock; /* protects ShmemIndex */
} ShmemAllocatorData;
#define ShmemIndexLock (&ShmemAllocator->index_lock)
static HTAB *shmem_hash_create(void *location, size_t size, bool found,
const char *name, int64 nelems, HASHCTL *infoP, int hash_flags);
static void *ShmemHashAlloc(Size size, void *alloc_arg);
static void *ShmemAllocRaw(Size size, Size *allocated_size);
@@ -112,6 +115,16 @@ static bool firstNumaTouch = true;
Datum pg_numa_available(PG_FUNCTION_ARGS);
/*
* A very simple allocator used to carve out different parts of a hash table
* from a previously allocated contiguous shared memory area.
*/
typedef struct shmem_hash_allocator
{
char *next; /* start of free space in the area */
char *end; /* end of the shmem area */
} shmem_hash_allocator;
/*
* InitShmemAllocator() --- set up basic pointers to shared memory.
*
@@ -126,7 +139,6 @@ InitShmemAllocator(PGShmemHeader *seghdr)
Size offset;
HASHCTL info;
int hash_flags;
size_t size = 0;
#ifndef EXEC_BACKEND
Assert(!IsUnderPostmaster);
@@ -179,19 +191,18 @@ InitShmemAllocator(PGShmemHeader *seghdr)
*/
info.keysize = SHMEM_INDEX_KEYSIZE;
info.entrysize = sizeof(ShmemIndexEnt);
info.dsize = info.max_dsize = hash_select_dirsize(SHMEM_INDEX_SIZE);
info.alloc = ShmemHashAlloc;
info.alloc_arg = NULL;
hash_flags = HASH_ELEM | HASH_STRINGS | HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE | HASH_FIXED_SIZE;
hash_flags = HASH_ELEM | HASH_STRINGS | HASH_FIXED_SIZE;
if (!IsUnderPostmaster)
{
size = hash_get_shared_size(&info, hash_flags);
ShmemAllocator->index = (HASHHDR *) ShmemAlloc(size);
ShmemAllocator->index_size = hash_estimate_size(SHMEM_INDEX_SIZE, info.entrysize);
ShmemAllocator->index = (HASHHDR *) ShmemAlloc(ShmemAllocator->index_size);
}
else
hash_flags |= HASH_ATTACH;
info.hctl = ShmemAllocator->index;
ShmemIndex = hash_create("ShmemIndex", SHMEM_INDEX_SIZE, &info, hash_flags);
ShmemIndex = shmem_hash_create(ShmemAllocator->index,
ShmemAllocator->index_size,
IsUnderPostmaster,
"ShmemIndex", SHMEM_INDEX_SIZE,
&info, hash_flags);
Assert(ShmemIndex != NULL);
/*
@@ -205,8 +216,8 @@ InitShmemAllocator(PGShmemHeader *seghdr)
hash_search(ShmemIndex, "ShmemIndex", HASH_ENTER, &found);
Assert(!found);
result->size = size;
result->allocated_size = size;
result->size = ShmemAllocator->index_size;
result->allocated_size = ShmemAllocator->index_size;
result->location = ShmemAllocator->index;
}
}
@@ -246,13 +257,27 @@ ShmemAllocNoError(Size size)
return ShmemAllocRaw(size, &allocated_size);
}
/* Alloc callback for shared memory hash tables */
/*
* ShmemHashAlloc -- alloc callback for shared memory hash tables
*
* Carve out the allocation from a pre-allocated region. All shared memory
* hash tables are initialized with HASH_FIXED_SIZE, so all the allocations
* happen upfront during initialization and no locking is required.
*/
static void *
ShmemHashAlloc(Size size, void *alloc_arg)
{
Size allocated_size;
shmem_hash_allocator *allocator = (shmem_hash_allocator *) alloc_arg;
void *result;
return ShmemAllocRaw(size, &allocated_size);
size = MAXALIGN(size);
if (allocator->end - allocator->next < size)
return NULL;
result = allocator->next;
allocator->next += size;
return result;
}
/*
@@ -343,13 +368,34 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
int hash_flags) /* info about infoP */
{
bool found;
size_t size;
void *location;
size = hash_estimate_size(nelems, infoP->entrysize);
/* look it up in the shmem index or allocate */
location = ShmemInitStruct(name, size, &found);
return shmem_hash_create(location, size, found,
name, nelems, infoP, hash_flags);
}
/*
* Initialize or attach to a shared hash table in the given shmem region.
*
* This is extracted from ShmemInitHash() to allow InitShmemAllocator() to
* share the logic for bootstrapping the ShmemIndex hash table.
*/
static HTAB *
shmem_hash_create(void *location, size_t size, bool found,
const char *name, int64 nelems, HASHCTL *infoP, int hash_flags)
{
shmem_hash_allocator allocator;
/*
* Hash tables allocated in shared memory have a fixed directory; it can't
* grow or other backends wouldn't be able to find it. So, make sure we
* make it big enough to start with. We also allocate all the buckets
* upfront.
* Hash tables allocated in shared memory have a fixed directory and have
* all elements allocated upfront. We don't support growing because we'd
* need to grow the underlying shmem region with it.
*
* The shared memory allocator must be specified too.
*/
@@ -358,20 +404,22 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
infoP->alloc_arg = NULL;
hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE | HASH_FIXED_SIZE;
/* look it up in the shmem index */
location = ShmemInitStruct(name,
hash_get_shared_size(infoP, hash_flags),
&found);
/*
* if it already exists, attach to it rather than allocate and initialize
* new space
*/
if (found)
if (!found)
{
allocator.next = (char *) location;
allocator.end = (char *) location + size;
infoP->alloc_arg = &allocator;
}
else
{
/* Pass location of hashtable header to hash_create */
infoP->hctl = (HASHHDR *) location;
hash_flags |= HASH_ATTACH;
/* Pass location of hashtable header to hash_create */
infoP->hctl = (HASHHDR *) location;
}
return hash_create(name, nelems, infoP, hash_flags);
}

View file

@@ -195,6 +195,9 @@ struct HASHHDR
int nelem_alloc; /* number of entries to allocate at once */
bool isfixed; /* if true, don't enlarge */
/* Current directory. In shared tables, this doesn't change */
HASHSEGMENT *dir;
#ifdef HASH_STATISTICS
/*
@@ -374,6 +377,8 @@ hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
* hash_destroy very simple. The memory context is made a child of either
* a context specified by the caller, or TopMemoryContext if nothing is
* specified.
*
* Note that HASH_DIRSIZE and HASH_ALLOC had better be set as well.
*/
if (flags & HASH_SHARED_MEM)
{
@@ -485,22 +490,19 @@ hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
if (flags & HASH_SHARED_MEM)
{
/*
* ctl structure and directory are preallocated for shared memory
* tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
* well.
*/
hashp->hctl = info->hctl;
hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
hashp->hcxt = NULL;
hashp->isshared = true;
/* hash table already exists, we're just attaching to it */
if (flags & HASH_ATTACH)
{
/* Caller must pass the pointer to the shared header */
Assert(info->hctl);
hashp->hctl = info->hctl;
/* make local copies of some heavily-used values */
hctl = hashp->hctl;
hashp->keysize = hctl->keysize;
hashp->dir = info->hctl->dir;
hashp->keysize = info->hctl->keysize;
return hashp;
}
@@ -514,14 +516,20 @@ hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
hashp->isshared = false;
}
/*
* Allocate the header structure.
*
* XXX: In case of a shared memory hash table, other processes need the
* pointer to the header to re-find the hash table. There is currently no
* explicit way to pass it back from here, the caller relies on the fact
* that this is the first allocation made with the alloc function. That's
* a little ugly, but works for now.
*/
hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR), hashp->alloc_arg);
if (!hashp->hctl)
{
hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR), hashp->alloc_arg);
if (!hashp->hctl)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
hashp->frozen = false;
@@ -724,25 +732,17 @@ init_htab(HTAB *hashp, int64 nelem)
nsegs = next_pow2_int(nsegs);
/*
* Make sure directory is big enough. If pre-allocated directory is too
* small, choke (caller screwed up).
* Make sure directory is big enough.
*/
if (nsegs > hctl->dsize)
{
if (!(hashp->dir))
hctl->dsize = nsegs;
else
return false;
}
hctl->dsize = nsegs;
/* Allocate a directory */
if (!(hashp->dir))
{
hashp->dir = (HASHSEGMENT *)
hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT), hashp->alloc_arg);
if (!hashp->dir)
return false;
}
hctl->dir = (HASHSEGMENT *)
hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT), hashp->alloc_arg);
if (!hctl->dir)
return false;
hashp->dir = hctl->dir;
/* Allocate initial segments */
for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
@@ -831,19 +831,6 @@ hash_select_dirsize(int64 num_entries)
return nDirEntries;
}
/*
* Compute the required initial memory allocation for a shared-memory
* hashtable with the given parameters. We need space for the HASHHDR
* and for the (non expansible) directory.
*/
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
Assert(flags & HASH_DIRSIZE);
Assert(info->dsize == info->max_dsize);
return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}
/********************** DESTROY ROUTINES ************************/
@@ -1647,6 +1634,7 @@ dir_realloc(HTAB *hashp)
{
memcpy(p, old_p, old_dirsize);
MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
hashp->hctl->dir = p;
hashp->dir = p;
hashp->hctl->dsize = new_dsize;

View file

@@ -82,7 +82,7 @@ typedef struct HASHCTL
void *alloc_arg; /* opaque argument passed to allocator */
/* Used if HASH_CONTEXT flag is set: */
MemoryContext hcxt; /* memory context to use for allocations */
/* Used if HASH_SHARED_MEM flag is set: */
/* Used if HASH_ATTACH flag is set: */
HASHHDR *hctl; /* location of header in shared mem */
} HASHCTL;
@@ -149,7 +149,6 @@ extern void hash_seq_term(HASH_SEQ_STATUS *status);
extern void hash_freeze(HTAB *hashp);
extern Size hash_estimate_size(int64 num_entries, Size entrysize);
extern int64 hash_select_dirsize(int64 num_entries);
extern Size hash_get_shared_size(HASHCTL *info, int flags);
extern void AtEOXact_HashTables(bool isCommit);
extern void AtEOSubXact_HashTables(bool isCommit, int nestDepth);

View file

@@ -4225,6 +4225,7 @@ shm_mq_result
shm_toc
shm_toc_entry
shm_toc_estimator
shmem_hash_allocator
shmem_request_hook_type
shmem_startup_hook_type
sig_atomic_t