bind9/lib/dns/badcache.c
Ondřej Surý aa72c31422 Fix the rbt hashtable and grow it when setting max-cache-size
There were several problems with the rbt hashtable implementation:

1. Our internal hashing function returns a uint64_t value, but it was
   silently truncated to unsigned int in the dns_name_hash() and
   dns_name_fullhash() functions.  As the higher bits of the SipHash
   2-4 output are more random, we need to use the upper half of the
   return value (see the first sketch after this list).

2. The hashtable implementation in rbt.c was using modulo to pick the
   slot number for the hash table.  This has several problems, because
   modulo is: a) slow, and b) oblivious to patterns in the input data.
   This could lead to a very uneven distribution of the hashed data in
   the hashtable.  Combined with the singly-linked lists we use, it
   could severely slow down lookup and removal of nodes from the rbt
   tree[a].  Fibonacci hashing is a much better fit for the hashtable
   function here (see the second sketch after this list).  For a
   longer description, read "Fibonacci Hashing: The Optimization that
   the World Forgot"[b] or just look at the Linux kernel.  Also, this
   will make Diego very happy :).

3. The hashtable would rehash every time the number of nodes in the
   rbt tree exceeded 3 * (hashtable size).  The overcommit makes the
   uneven distribution in the hashtable even worse, but the main
   problem lies in the rehashing itself - every time the database
   grows past the limit, each subsequent rehash becomes slower.  The
   mitigation here is to let the rbt know how big the cache can grow
   and to pre-allocate a hashtable big enough that it never needs to
   rehash (see the third sketch after this list).  This consumes more
   memory at the start, but since the size of the hashtable is capped
   at `1 << 32` slots (i.e. roughly 4.3 billion entries), it will
   consume at most 32 GB of memory for the hashtable in the worst case
   (and max-cache-size would need to be set to more than 4 TB to reach
   that).  Calling dns_db_adjusthashsize() also caps the maximum size
   of the hashtable to the pre-computed number of bits, so it won't
   try to consume more gigabytes of memory than are available to the
   database.

   FIXME: What is the average size of the rbt node that gets hashed?
   I chose the pagesize (4k) as the initial value to precompute the
   size of the hashtable, but that value is based on a gut feeling
   rather than on any real data.
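
A sketch for (1), assuming isc_hash_function() is the SipHash 2-4
backend; the names are illustrative, and the real change lives in
dns_name_hash() and dns_name_fullhash():

    uint64_t h = isc_hash_function(name->ndata, name->length, false);
    /* Keep the upper, more random half of the 64-bit value instead
     * of silently truncating it to the low bits. */
    unsigned int hashval = (unsigned int)(h >> 32);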
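
A sketch for (2), Fibonacci hashing for slot selection in the style of
the Linux kernel's hash_32(); 0x61C88647 is the kernel's 32-bit
golden-ratio constant, and `bits` is the table size in bits, i.e.
size == 1 << bits:

    #define GOLDEN_RATIO_32 0x61C88647

    static inline uint32_t
    hash_32(uint32_t val, unsigned int bits) {
            /* The multiplication spreads patterns in the keys across
             * all bits; the well-mixed high bits select the slot. */
            return (val * GOLDEN_RATIO_32 >> (32 - bits));
    }

Unlike `val % size`, this costs one multiply and one shift, and it is
insensitive to regular patterns in the input keys.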
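
A sketch for (3), pre-computing the hashtable size in bits from
max-cache-size; ASSUMED_NODE_SIZE is the 4k guess from the FIXME
above, and the names are illustrative rather than the actual API:

    #define ASSUMED_NODE_SIZE 4096 /* bytes per hashed node, a guess */

    static unsigned int
    cachesize_to_hashbits(uint64_t maxcachesize) {
            unsigned int bits = 1;
            /* Grow until 2^bits nodes of ~4k would cover the whole
             * cache, capping the table at 2^32 slots. */
            while ((UINT64_C(1) << bits) * ASSUMED_NODE_SIZE <
                           maxcachesize &&
                   bits < 32)
            {
                    bits++;
            }
            return (bits);
    }

A table pre-sized this way never needs to rehash; the trade-off is the
up-front memory for a mostly empty table.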

For future work: there are more places where we use the result of the
hash value modulo some small number; those would also benefit from
Fibonacci hashing to get a better distribution.

Notes:
a. A doubly-linked list should be used here to speed up the removal of
   the entries from the hashtable.
b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/

(cherry picked from commit e24bc324b4)
2020-07-30 11:57:24 +02:00

/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

/*! \file */

#include <inttypes.h>
#include <stdbool.h>

#include <isc/buffer.h>
#include <isc/hash.h>
#include <isc/log.h>
#include <isc/mem.h>
#include <isc/mutex.h>
#include <isc/platform.h>
#include <isc/print.h>
#include <isc/rwlock.h>
#include <isc/string.h>
#include <isc/time.h>
#include <isc/util.h>

#include <dns/badcache.h>
#include <dns/name.h>
#include <dns/rdatatype.h>
#include <dns/types.h>

typedef struct dns_bcentry dns_bcentry_t;

struct dns_badcache {
        unsigned int magic;
        isc_rwlock_t lock;
        isc_mem_t *mctx;
        isc_mutex_t *tlocks;
        dns_bcentry_t **table;
        atomic_uint_fast32_t count;
        atomic_uint_fast32_t sweep;
        unsigned int minsize;
        unsigned int size;
};

#define BADCACHE_MAGIC    ISC_MAGIC('B', 'd', 'C', 'a')
#define VALID_BADCACHE(m) ISC_MAGIC_VALID(m, BADCACHE_MAGIC)

struct dns_bcentry {
        dns_bcentry_t *next;
        dns_rdatatype_t type;
        isc_time_t expire;
        uint32_t flags;
        unsigned int hashval;
        dns_name_t name;
};

static void
badcache_resize(dns_badcache_t *bc, isc_time_t *now);
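
/*
 * Create a badcache with 'size' hash slots and one mutex per slot;
 * 'size' is also remembered as the floor below which the table is
 * never shrunk.
 */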
isc_result_t
dns_badcache_init(isc_mem_t *mctx, unsigned int size, dns_badcache_t **bcp) {
        dns_badcache_t *bc = NULL;
        unsigned int i;

        REQUIRE(bcp != NULL && *bcp == NULL);
        REQUIRE(mctx != NULL);

        bc = isc_mem_get(mctx, sizeof(dns_badcache_t));
        memset(bc, 0, sizeof(dns_badcache_t));

        isc_mem_attach(mctx, &bc->mctx);
        isc_rwlock_init(&bc->lock, 0, 0);

        bc->table = isc_mem_get(bc->mctx, sizeof(*bc->table) * size);
        bc->tlocks = isc_mem_get(bc->mctx, sizeof(isc_mutex_t) * size);
        for (i = 0; i < size; i++) {
                isc_mutex_init(&bc->tlocks[i]);
        }
        bc->size = bc->minsize = size;
        memset(bc->table, 0, bc->size * sizeof(dns_bcentry_t *));

        atomic_init(&bc->count, 0);
        atomic_init(&bc->sweep, 0);
        bc->magic = BADCACHE_MAGIC;

        *bcp = bc;
        return (ISC_R_SUCCESS);
}
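
/*
 * Flush all entries, destroy the per-slot mutexes, and free the
 * table, the locks, and the badcache itself.
 */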
void
dns_badcache_destroy(dns_badcache_t **bcp) {
        dns_badcache_t *bc;
        unsigned int i;

        REQUIRE(bcp != NULL && *bcp != NULL);

        bc = *bcp;
        *bcp = NULL;

        dns_badcache_flush(bc);

        bc->magic = 0;
        isc_rwlock_destroy(&bc->lock);
        for (i = 0; i < bc->size; i++) {
                isc_mutex_destroy(&bc->tlocks[i]);
        }
        isc_mem_put(bc->mctx, bc->table, sizeof(dns_bcentry_t *) * bc->size);
        isc_mem_put(bc->mctx, bc->tlocks, sizeof(isc_mutex_t) * bc->size);
        isc_mem_putanddetach(&bc->mctx, bc, sizeof(dns_badcache_t));
}
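
/*
 * Grow (count > size * 8) or shrink (count < size * 2, but never
 * below the initial size) the hash table, rehashing surviving entries
 * and dropping expired ones along the way.  Called with no locks
 * held; takes the write lock itself.
 */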
static void
badcache_resize(dns_badcache_t *bc, isc_time_t *now) {
        dns_bcentry_t **newtable, *bad, *next;
        isc_mutex_t *newlocks;
        unsigned int newsize, i;
        bool grow;

        RWLOCK(&bc->lock, isc_rwlocktype_write);

        /*
         * XXXWPK we will have a thundering herd problem here,
         * as all threads will wait on the RWLOCK when there's
         * a need to resize badcache.
         * However, it happens so rarely it should not be a
         * performance issue.  This is because we double the
         * size every time we grow it, and we don't shrink
         * unless the number of entries really shrunk.  In a
         * high load situation, the number of badcache entries
         * will eventually stabilize.
         */
        if (atomic_load_relaxed(&bc->count) > bc->size * 8) {
                grow = true;
        } else if (atomic_load_relaxed(&bc->count) < bc->size * 2 &&
                   bc->size > bc->minsize)
        {
                grow = false;
        } else {
                /* Someone resized it already, bail. */
                RWUNLOCK(&bc->lock, isc_rwlocktype_write);
                return;
        }

        if (grow) {
                newsize = bc->size * 2 + 1;
        } else {
                newsize = (bc->size - 1) / 2;
#ifdef __clang_analyzer__
                /*
                 * XXXWPK there's a bug in clang static analyzer -
                 * `value % newsize` is considered undefined even
                 * though we check if newsize is larger than 0.
                 * This helps.
                 */
                newsize += 1;
#endif
        }
        RUNTIME_CHECK(newsize > 0);

        newtable = isc_mem_get(bc->mctx, sizeof(dns_bcentry_t *) * newsize);
        memset(newtable, 0, sizeof(dns_bcentry_t *) * newsize);

        newlocks = isc_mem_get(bc->mctx, sizeof(isc_mutex_t) * newsize);

        /* Copy existing mutexes */
        for (i = 0; i < newsize && i < bc->size; i++) {
                newlocks[i] = bc->tlocks[i];
        }
        /* Initialize additional mutexes if we're growing */
        for (i = bc->size; i < newsize; i++) {
                isc_mutex_init(&newlocks[i]);
        }
        /* Destroy extra mutexes if we're shrinking */
        for (i = newsize; i < bc->size; i++) {
                isc_mutex_destroy(&bc->tlocks[i]);
        }

        for (i = 0; atomic_load_relaxed(&bc->count) > 0 && i < bc->size; i++) {
                for (bad = bc->table[i]; bad != NULL; bad = next) {
                        next = bad->next;
                        if (isc_time_compare(&bad->expire, now) < 0) {
                                isc_mem_put(bc->mctx, bad,
                                            sizeof(*bad) + bad->name.length);
                                atomic_fetch_sub_relaxed(&bc->count, 1);
                        } else {
                                bad->next = newtable[bad->hashval % newsize];
                                newtable[bad->hashval % newsize] = bad;
                        }
                }
                bc->table[i] = NULL;
        }

        isc_mem_put(bc->mctx, bc->tlocks, sizeof(isc_mutex_t) * bc->size);
        bc->tlocks = newlocks;

        isc_mem_put(bc->mctx, bc->table, sizeof(*bc->table) * bc->size);
        bc->size = newsize;
        bc->table = newtable;

        RWUNLOCK(&bc->lock, isc_rwlocktype_write);
}
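
/*
 * Add or update the entry for <name, type>.  Expired entries found in
 * the same hash chain are removed in passing; if the table has grown
 * or shrunk past its thresholds, a resize is performed afterwards.
 */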
void
dns_badcache_add(dns_badcache_t *bc, const dns_name_t *name,
                 dns_rdatatype_t type, bool update, uint32_t flags,
                 isc_time_t *expire) {
        isc_result_t result;
        unsigned int hashval, hash;
        dns_bcentry_t *bad, *prev, *next;
        isc_time_t now;
        bool resize = false;

        REQUIRE(VALID_BADCACHE(bc));
        REQUIRE(name != NULL);
        REQUIRE(expire != NULL);

        RWLOCK(&bc->lock, isc_rwlocktype_read);

        result = isc_time_now(&now);
        if (result != ISC_R_SUCCESS) {
                isc_time_settoepoch(&now);
        }

        hashval = dns_name_hash(name, false);
        hash = hashval % bc->size;
        LOCK(&bc->tlocks[hash]);
        prev = NULL;
        for (bad = bc->table[hash]; bad != NULL; bad = next) {
                next = bad->next;
                if (bad->type == type && dns_name_equal(name, &bad->name)) {
                        if (update) {
                                bad->expire = *expire;
                                bad->flags = flags;
                        }
                        break;
                }
                if (isc_time_compare(&bad->expire, &now) < 0) {
                        if (prev == NULL) {
                                bc->table[hash] = bad->next;
                        } else {
                                prev->next = bad->next;
                        }
                        isc_mem_put(bc->mctx, bad,
                                    sizeof(*bad) + bad->name.length);
                        atomic_fetch_sub_relaxed(&bc->count, 1);
                } else {
                        prev = bad;
                }
        }

        if (bad == NULL) {
                isc_buffer_t buffer;
                bad = isc_mem_get(bc->mctx, sizeof(*bad) + name->length);
                bad->type = type;
                bad->hashval = hashval;
                bad->expire = *expire;
                bad->flags = flags;
                isc_buffer_init(&buffer, bad + 1, name->length);
                dns_name_init(&bad->name, NULL);
                dns_name_copy(name, &bad->name, &buffer);
                bad->next = bc->table[hash];
                bc->table[hash] = bad;
                unsigned int count = atomic_fetch_add_relaxed(&bc->count, 1);
                if ((count > bc->size * 8) ||
                    (count < bc->size * 2 && bc->size > bc->minsize))
                {
                        resize = true;
                }
        } else {
                bad->expire = *expire;
        }

        UNLOCK(&bc->tlocks[hash]);
        RWUNLOCK(&bc->lock, isc_rwlocktype_read);

        if (resize) {
                badcache_resize(bc, &now);
        }
}
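
/*
 * Return true (and the flags, if 'flagp' is non-NULL) if there is an
 * unexpired entry for <name, type>.  Expired entries in the chain are
 * cleaned out in passing, and one extra slot is swept on every call.
 */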
bool
dns_badcache_find(dns_badcache_t *bc, const dns_name_t *name,
                  dns_rdatatype_t type, uint32_t *flagp, isc_time_t *now) {
        dns_bcentry_t *bad, *prev, *next;
        bool answer = false;
        unsigned int i;
        unsigned int hash;

        REQUIRE(VALID_BADCACHE(bc));
        REQUIRE(name != NULL);
        REQUIRE(now != NULL);

        RWLOCK(&bc->lock, isc_rwlocktype_read);

        /*
         * XXXMUKS: dns_name_equal() is expensive as it does an
         * octet-by-octet comparison, and it can be made better in two
         * ways here.  First, lowercase the names (use
         * dns_name_downcase() instead of dns_name_copy() in
         * dns_badcache_add()) so that dns_name_caseequal() can be
         * used, which the compiler will emit as SIMD instructions.
         * Second, don't put multiple copies of the same name in the
         * chain (or multiple names will have to be matched for
         * equality), but use name->link to store the type specific
         * part.
         */

        if (atomic_load_relaxed(&bc->count) == 0) {
                goto skip;
        }

        hash = dns_name_hash(name, false) % bc->size;
        prev = NULL;
        LOCK(&bc->tlocks[hash]);
        for (bad = bc->table[hash]; bad != NULL; bad = next) {
                next = bad->next;
                /*
                 * Search the hash list.  Clean out expired records as
                 * we go.
                 */
                if (isc_time_compare(&bad->expire, now) < 0) {
                        if (prev != NULL) {
                                prev->next = bad->next;
                        } else {
                                bc->table[hash] = bad->next;
                        }
                        isc_mem_put(bc->mctx, bad,
                                    sizeof(*bad) + bad->name.length);
                        atomic_fetch_sub(&bc->count, 1);
                        continue;
                }
                if (bad->type == type && dns_name_equal(name, &bad->name)) {
                        if (flagp != NULL) {
                                *flagp = bad->flags;
                        }
                        answer = true;
                        break;
                }
                prev = bad;
        }
        UNLOCK(&bc->tlocks[hash]);
skip:
        /*
         * Slow sweep to clean out stale records.
         */
        i = atomic_fetch_add(&bc->sweep, 1) % bc->size;
        if (isc_mutex_trylock(&bc->tlocks[i]) == ISC_R_SUCCESS) {
                bad = bc->table[i];
                if (bad != NULL && isc_time_compare(&bad->expire, now) < 0) {
                        bc->table[i] = bad->next;
                        isc_mem_put(bc->mctx, bad,
                                    sizeof(*bad) + bad->name.length);
                        atomic_fetch_sub_relaxed(&bc->count, 1);
                }
                UNLOCK(&bc->tlocks[i]);
        }

        RWUNLOCK(&bc->lock, isc_rwlocktype_read);
        return (answer);
}
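
/*
 * Discard every entry in the cache, leaving the table itself intact.
 */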
void
dns_badcache_flush(dns_badcache_t *bc) {
        dns_bcentry_t *entry, *next;
        unsigned int i;

        RWLOCK(&bc->lock, isc_rwlocktype_write);
        REQUIRE(VALID_BADCACHE(bc));

        for (i = 0; atomic_load_relaxed(&bc->count) > 0 && i < bc->size; i++) {
                for (entry = bc->table[i]; entry != NULL; entry = next) {
                        next = entry->next;
                        isc_mem_put(bc->mctx, entry,
                                    sizeof(*entry) + entry->name.length);
                        atomic_fetch_sub_relaxed(&bc->count, 1);
                }
                bc->table[i] = NULL;
        }
        RWUNLOCK(&bc->lock, isc_rwlocktype_write);
}
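
/*
 * Remove all entries (of any type) matching 'name' exactly, along
 * with any expired entries found in the same hash chain.
 */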
void
dns_badcache_flushname(dns_badcache_t *bc, const dns_name_t *name) {
        dns_bcentry_t *bad, *prev, *next;
        isc_result_t result;
        isc_time_t now;
        unsigned int hash;

        REQUIRE(VALID_BADCACHE(bc));
        REQUIRE(name != NULL);

        RWLOCK(&bc->lock, isc_rwlocktype_read);

        result = isc_time_now(&now);
        if (result != ISC_R_SUCCESS) {
                isc_time_settoepoch(&now);
        }
        hash = dns_name_hash(name, false) % bc->size;
        LOCK(&bc->tlocks[hash]);
        prev = NULL;
        for (bad = bc->table[hash]; bad != NULL; bad = next) {
                int n;
                next = bad->next;
                n = isc_time_compare(&bad->expire, &now);
                if (n < 0 || dns_name_equal(name, &bad->name)) {
                        if (prev == NULL) {
                                bc->table[hash] = bad->next;
                        } else {
                                prev->next = bad->next;
                        }
                        isc_mem_put(bc->mctx, bad,
                                    sizeof(*bad) + bad->name.length);
                        atomic_fetch_sub_relaxed(&bc->count, 1);
                } else {
                        prev = bad;
                }
        }
        UNLOCK(&bc->tlocks[hash]);
        RWUNLOCK(&bc->lock, isc_rwlocktype_read);
}
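
/*
 * Remove all entries at or below 'name', along with any expired
 * entries, scanning the whole table under the write lock.
 */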
void
dns_badcache_flushtree(dns_badcache_t *bc, const dns_name_t *name) {
        dns_bcentry_t *bad, *prev, *next;
        unsigned int i;
        int n;
        isc_time_t now;
        isc_result_t result;

        REQUIRE(VALID_BADCACHE(bc));
        REQUIRE(name != NULL);

        /*
         * We write lock the tree to avoid relocking every node
         * individually.
         */
        RWLOCK(&bc->lock, isc_rwlocktype_write);

        result = isc_time_now(&now);
        if (result != ISC_R_SUCCESS) {
                isc_time_settoepoch(&now);
        }

        for (i = 0; atomic_load_relaxed(&bc->count) > 0 && i < bc->size; i++) {
                prev = NULL;
                for (bad = bc->table[i]; bad != NULL; bad = next) {
                        next = bad->next;
                        n = isc_time_compare(&bad->expire, &now);
                        if (n < 0 || dns_name_issubdomain(&bad->name, name)) {
                                if (prev == NULL) {
                                        bc->table[i] = bad->next;
                                } else {
                                        prev->next = bad->next;
                                }
                                isc_mem_put(bc->mctx, bad,
                                            sizeof(*bad) + bad->name.length);
                                atomic_fetch_sub_relaxed(&bc->count, 1);
                        } else {
                                prev = bad;
                        }
                }
        }

        RWUNLOCK(&bc->lock, isc_rwlocktype_write);
}
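
/*
 * Dump the unexpired contents of the cache to 'fp', pruning expired
 * entries as they are found; the remaining TTL is printed in
 * milliseconds.
 */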
void
dns_badcache_print(dns_badcache_t *bc, const char *cachename, FILE *fp) {
        char namebuf[DNS_NAME_FORMATSIZE];
        char typebuf[DNS_RDATATYPE_FORMATSIZE];
        dns_bcentry_t *bad, *next, *prev;
        isc_time_t now;
        unsigned int i;
        uint64_t t;

        REQUIRE(VALID_BADCACHE(bc));
        REQUIRE(cachename != NULL);
        REQUIRE(fp != NULL);

        /*
         * We write lock the tree to avoid relocking every node
         * individually.
         */
        RWLOCK(&bc->lock, isc_rwlocktype_write);
        fprintf(fp, ";\n; %s\n;\n", cachename);
        TIME_NOW(&now);

        for (i = 0; atomic_load_relaxed(&bc->count) > 0 && i < bc->size; i++) {
                prev = NULL;
                for (bad = bc->table[i]; bad != NULL; bad = next) {
                        next = bad->next;
                        if (isc_time_compare(&bad->expire, &now) < 0) {
                                if (prev != NULL) {
                                        prev->next = bad->next;
                                } else {
                                        bc->table[i] = bad->next;
                                }
                                isc_mem_put(bc->mctx, bad,
                                            sizeof(*bad) + bad->name.length);
                                atomic_fetch_sub_relaxed(&bc->count, 1);
                                continue;
                        }
                        prev = bad;
                        dns_name_format(&bad->name, namebuf, sizeof(namebuf));
                        dns_rdatatype_format(bad->type, typebuf,
                                             sizeof(typebuf));
                        t = isc_time_microdiff(&bad->expire, &now);
                        t /= 1000;
                        fprintf(fp, "; %s/%s [ttl %" PRIu64 "]\n", namebuf,
                                typebuf, t);
                }
        }
        RWUNLOCK(&bc->lock, isc_rwlocktype_write);
}