diff --git a/include/haproxy/backend-t.h b/include/haproxy/backend-t.h
index d7e57a772..f0a9bf354 100644
--- a/include/haproxy/backend-t.h
+++ b/include/haproxy/backend-t.h
@@ -162,12 +162,15 @@ struct lbprm {
 	int wmult;			/* ratio between user weight and effective weight */
 	int wdiv;			/* ratio between effective weight and user weight */
 	int hash_balance_factor;	/* load balancing factor * 100, 0 if disabled */
+	unsigned int lb_free_list_nb;	/* number of elements in the free list */
 	struct sample_expr *expr;	/* sample expression for "balance (log-)hash" */
 	char *arg_str;			/* name of the URL parameter/header/cookie used for hashing */
 	int arg_len;			/* strlen(arg_str), computed only once */
 	int arg_opt1;			/* extra option 1 for the LB algo (algo-specific) */
 	int arg_opt2;			/* extra option 2 for the LB algo (algo-specific) */
 	int arg_opt3;			/* extra option 3 for the LB algo (algo-specific) */
+	uint64_t lb_seq;		/* sequence number for algos that need it */
+	struct mt_list lb_free_list;	/* LB tree elements available */
 	__decl_thread(HA_RWLOCK_T lock);
 
 	struct server *fbck;		/* first backup server when !PR_O_USE_ALL_BK, or NULL */
diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h
index dbb39f4c3..eec78132b 100644
--- a/include/haproxy/defaults.h
+++ b/include/haproxy/defaults.h
@@ -602,4 +602,33 @@
 #define MAX_SELF_USE_QUEUE 9
 #endif
 
+/*
+ * FWLC defines
+ */
+
+/*
+ * How many mt_lists we use per tree element.
+ * The more lists we have, the less likely it
+ * will be that we'll have contention when
+ * inserting/removing an element, but the more
+ * costly it will be to look up servers.
+ */
+#ifndef FWLC_LISTS_NB
+#define FWLC_LISTS_NB 4
+#endif /* FWLC_LISTS_NB */
+
+/*
+ * How many entries we want to keep in the
+ * free list, before trying to use some.
+ * We want to keep some nodes in the tree,
+ * to avoid having to re-allocate one and
+ * modify the tree, which requires the
+ * write lock and is costly, but we
+ * don't want to have too many, to save
+ * memory.
+ */
+#ifndef FWLC_MIN_FREE_ENTRIES
+#define FWLC_MIN_FREE_ENTRIES 500
+#endif /* FWLC_MIN_FREE_ENTRIES */
+
 #endif /* _HAPROXY_DEFAULTS_H */
diff --git a/include/haproxy/server-t.h b/include/haproxy/server-t.h
index fe13318f8..716ac79fb 100644
--- a/include/haproxy/server-t.h
+++ b/include/haproxy/server-t.h
@@ -383,6 +383,12 @@ struct server {
 	union {
 		struct eb32_node lb_node;	/* node used for tree-based load balancing */
 		struct list lb_list;		/* elem used for list-based load balancing */
+		struct {
+			struct fwlc_tree_elt *tree_elt;	/* pointer to the element stored in the tree, protected by lb_lock */
+			struct fwlc_tree_elt *free_elt;	/* a free element, so that we don't have to allocate one, protected by lb_lock */
+			struct mt_list lb_mt_list;	/* elem used for mt_list-based load balancing, protected by lb_lock */
+			int lb_lock;			/* make sure we are the only one updating the server */
+		};
 	};
 
 	struct server *next_full;		/* next server in the temporary full list */
diff --git a/src/lb_fwlc.c b/src/lb_fwlc.c
index bb7e8979c..c647336c5 100644
--- a/src/lb_fwlc.c
+++ b/src/lb_fwlc.c
@@ -16,7 +16,47 @@
 #include
 #include
 #include
+#include
 
+struct fwlc_tree_elt {
+	struct mt_list srv_list[FWLC_LISTS_NB];
+	struct mt_list free_list;
+	struct eb32_node lb_node;
+	unsigned int elements;
+};
+
+DECLARE_STATIC_POOL(pool_head_fwlc_elt, "fwlc_tree_elt", sizeof(struct fwlc_tree_elt));
+
+#define FWLC_LBPRM_SEQ(lbprm)		((lbprm) & 0xffffffff)
+#define FWLC_LBPRM_SMALLEST(lbprm)	((lbprm) >> 32)
+
+/*
+ * Atomically try to update the sequence number, and the smallest key for which there is at least one server.
+ * Returns 1 on success, and 0 on failure.
+ */
+static int fwlc_set_seq_and_smallest(struct lbprm *lbprm, uint64_t current, unsigned int seq, unsigned int smallest)
+{
+#if !defined(HA_CAS_IS_8B) && !defined(HA_HAVE_CAS_DW)
+	__decl_thread(static HA_SPINLOCK_T seq_lock);
+#endif
+	uint64_t dst_nb = seq | ((uint64_t)smallest << 32);
+	int ret;
+
+#if defined(HA_CAS_IS_8B)
+	ret = _HA_ATOMIC_CAS(&lbprm->lb_seq, &current, dst_nb);
+#elif defined(HA_HAVE_CAS_DW)
+	ret = _HA_ATOMIC_DWCAS(&lbprm->lb_seq, &current, &dst_nb);
+#else
+	HA_SPIN_LOCK(OTHER_LOCK, &seq_lock);
+	if (lbprm->lb_seq == current) {
+		lbprm->lb_seq = dst_nb;
+		ret = 1;
+	} else
+		ret = 0;
+	HA_SPIN_UNLOCK(OTHER_LOCK, &seq_lock);
+#endif
+	return ret;
+}
 
 /* Remove a server from a tree. It must have previously been dequeued. This
  * function is meant to be called when a server is going down or has its
@@ -29,13 +69,137 @@ static inline void fwlc_remove_from_tree(struct server *s)
 {
 	s->lb_tree = NULL;
 }
 
+/*
+ * Remove anything allocated by the proxy
+ */
+static void fwlc_proxy_deinit(struct proxy *p)
+{
+	struct fwlc_tree_elt *tree_elt;
+
+	while ((tree_elt = MT_LIST_POP(&p->lbprm.lb_free_list, struct fwlc_tree_elt *, free_list)) != NULL) {
+		pool_free(pool_head_fwlc_elt, tree_elt);
+	}
+}
+
+/*
+ * Remove anything allocated by the server
+ */
+static void fwlc_server_deinit(struct server *s)
+{
+	if (s->free_elt) {
+		pool_free(pool_head_fwlc_elt, s->free_elt);
+		s->free_elt = NULL;
+	}
+}
+
 /* simply removes a server from a tree.
  *
  * The lbprm's lock must be held.
  */
 static inline void fwlc_dequeue_srv(struct server *s)
 {
-	eb32_delete(&s->lb_node);
+	struct fwlc_tree_elt *tree_elt = s->tree_elt;
+	unsigned int elts;
+
+	MT_LIST_DELETE(&s->lb_mt_list);
+	if (tree_elt) {
+		elts = _HA_ATOMIC_FETCH_SUB(&tree_elt->elements, 1);
+		/* We are the last element, we can nuke the node */
+		if (elts == 1) {
+			if (FWLC_LBPRM_SMALLEST(s->proxy->lbprm.lb_seq) == tree_elt->lb_node.key) {
+				/*
+				 * We were the smallest one, and now we're
+				 * gone, reset it
+				 */
+				/*
+				 * We're holding the lbprm lock so this should never fail,
+				 * as nobody should be around to modify it
+				 */
+				do {
+				} while (fwlc_set_seq_and_smallest(&s->proxy->lbprm, s->proxy->lbprm.lb_seq, FWLC_LBPRM_SEQ(s->proxy->lbprm.lb_seq) + 1, 0) == 0 && __ha_cpu_relax());
+
+			}
+			eb32_delete(&tree_elt->lb_node);
+		}
+	}
+	s->tree_elt = NULL;
+	if (s->free_elt) {
+		pool_free(pool_head_fwlc_elt, s->free_elt);
+		s->free_elt = NULL;
+	}
+}
+
+/*
+ * Allocate a tree element, either from the free list, from an element provided, or
+ * by allocating a new one.
+ * Must be called with the wrlock held.
+ */
+static struct fwlc_tree_elt *fwlc_alloc_tree_elt(struct proxy *p, struct fwlc_tree_elt *allocated_elt)
+{
+	struct fwlc_tree_elt *tree_elt = NULL;
+	int i = 0;
+
+	if (p->lbprm.lb_free_list_nb >= FWLC_MIN_FREE_ENTRIES) {
+		while ((tree_elt = MT_LIST_POP(&p->lbprm.lb_free_list, struct fwlc_tree_elt *, free_list)) != NULL) {
+			MT_LIST_APPEND(&p->lbprm.lb_free_list, &tree_elt->free_list);
+			if (tree_elt->elements == 0) {
+				eb32_delete(&tree_elt->lb_node);
+				if (i == 0) {
+					struct fwlc_tree_elt *tmptree;
+
+					tmptree = MT_LIST_POP(&p->lbprm.lb_free_list, struct fwlc_tree_elt *, free_list);
+					/*
+					 * Check if the next element still contains servers, and if not,
+					 * just free it, to do some cleanup.
+					 */
+					if (tmptree && tmptree->elements == 0) {
+						eb32_delete(&tmptree->lb_node);
+						pool_free(pool_head_fwlc_elt, tmptree);
+						p->lbprm.lb_free_list_nb--;
+					} else if (tmptree)
+						MT_LIST_APPEND(&p->lbprm.lb_free_list, &tmptree->free_list);
+				}
+				return tree_elt;
+			}
+			i++;
+			if (i > 3)
+				break;
+		}
+	}
+	if (!allocated_elt)
+		tree_elt = pool_alloc(pool_head_fwlc_elt);
+	else
+		tree_elt = allocated_elt;
+
+	for (i = 0; i < FWLC_LISTS_NB; i++) {
+		MT_LIST_INIT(&tree_elt->srv_list[i]);
+	}
+	MT_LIST_INIT(&tree_elt->free_list);
+	MT_LIST_APPEND(&p->lbprm.lb_free_list, &tree_elt->free_list);
+	p->lbprm.lb_free_list_nb++;
+	tree_elt->elements = 0;
+	return tree_elt;
+}
+
+/*
+ * Return the tree element for the provided key, allocate it first if needed.
+ * Must be called with the lbprm lock held.
+ */
+static struct fwlc_tree_elt *fwlc_get_tree_elt(struct server *s, u32 key)
+{
+	struct eb32_node *node;
+	struct fwlc_tree_elt *tree_elt = NULL;
+
+	node = eb32_lookup(s->lb_tree, key);
+	if (node)
+		tree_elt = container_of(node, struct fwlc_tree_elt, lb_node);
+	if (!tree_elt) {
+		/* No element available, we have to allocate one */
+		tree_elt = fwlc_alloc_tree_elt(s->proxy, NULL);
+		tree_elt->lb_node.key = key;
+		eb32_insert(s->lb_tree, &tree_elt->lb_node);
+	}
+	return tree_elt;
 }
 
 /* Queue a server in its associated tree, assuming the eweight is >0.
@@ -58,10 +222,77 @@ static inline void fwlc_dequeue_srv(struct server *s)
  */
 static inline void fwlc_queue_srv(struct server *s, unsigned int eweight)
 {
+	struct fwlc_tree_elt *tree_elt;
 	unsigned int inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
+	unsigned int list_nb;
+	u32 key;
 
-	s->lb_node.key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / eweight : 0;
-	eb32_insert(s->lb_tree, &s->lb_node);
+	key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / eweight : 0;
+	tree_elt = fwlc_get_tree_elt(s, key);
+	list_nb = statistical_prng_range(FWLC_LISTS_NB);
+	MT_LIST_APPEND(&tree_elt->srv_list[list_nb], &s->lb_mt_list);
+	s->tree_elt = tree_elt;
+	_HA_ATOMIC_INC(&tree_elt->elements);
+	if (FWLC_LBPRM_SMALLEST(s->proxy->lbprm.lb_seq) > key) {
+		/*
+		 * We're holding the lbprm lock so this should never fail,
+		 * as nobody should be around to modify it
+		 */
+		do {
+		} while (fwlc_set_seq_and_smallest(&s->proxy->lbprm, s->proxy->lbprm.lb_seq, FWLC_LBPRM_SEQ(s->proxy->lbprm.lb_seq) + 1, key) == 0);
+	}
+}
+
+/*
+ * Loop across the different lists until we find an unlocked one, and lock it.
+ */
+static __inline struct mt_list fwlc_lock_target_list(struct fwlc_tree_elt *tree_elt)
+{
+	struct mt_list list = {NULL, NULL};
+	int i;
+	int dst_list;
+
+	dst_list = statistical_prng_range(FWLC_LISTS_NB);
+
+	while (list.next == NULL) {
+		for (i = 0; i < FWLC_LISTS_NB; i++) {
+			list = mt_list_try_lock_prev(&tree_elt->srv_list[(dst_list + i) % FWLC_LISTS_NB]);
+			if (list.next != NULL)
+				break;
+		}
+	}
+	return list;
+}
+
+/*
+ * Calculate the key to be used for a given server
+ */
+static inline unsigned int fwlc_get_key(struct server *s)
+{
+	unsigned int inflight;
+	unsigned int eweight;
+	unsigned int new_key;
+
+	inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
+	eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
+	new_key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / (eweight ? eweight : 1) : 0;
+
+	return new_key;
+}
+
+/*
+ * Only one thread will try to update a server position at a given time,
+ * thanks to the lb_lock. However that means that by the time we are done
+ * with the update, a new one might be needed, so check for that and
+ * schedule the tasklet if needed, once we have dropped the lock.
+ */
+static inline void fwlc_check_srv_key(struct server *s, unsigned int expected)
+{
+	unsigned int key = fwlc_get_key(s);
+
+	if (key != expected && s->requeue_tasklet)
+		tasklet_wakeup(s->requeue_tasklet);
 }
 
 /* Re-position the server in the FWLC tree after it has been assigned one
@@ -71,45 +302,204 @@ static inline void fwlc_queue_srv(struct server *s, unsigned int eweight)
  */
 static void fwlc_srv_reposition(struct server *s)
 {
-	unsigned int inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
+	struct mt_list to_unlock;
+	struct fwlc_tree_elt *tree_elt = NULL, *allocated_elt = NULL;
+	struct eb32_node *node;
+	struct mt_list list;
+	uint64_t cur_seq = 0;
 	unsigned int eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
-	unsigned int new_key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / (eweight ? eweight : 1) : 0;
+	unsigned int new_key;
+	unsigned int smallest;
+	int srv_lock;
 
+	HA_RWLOCK_RDLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+	new_key = fwlc_get_key(s);
 	/* some calls will be made for no change (e.g. connect_server() after
 	 * assign_server(). Let's check that first.
 	 */
-	if (s->lb_node.node.leaf_p && eweight && s->lb_node.key == new_key)
-		return;
-
-	if (HA_RWLOCK_TRYWRLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock) != 0) {
-		/* there's already some contention on the tree's lock, there's
-		 * no point insisting. Better wake up the server's tasklet that
-		 * will let this or another thread retry later. For the time
-		 * being, the server's apparent load is slightly inaccurate but
-		 * we don't care, if there is contention, it will self-regulate.
-		 */
-		if (s->requeue_tasklet)
-			tasklet_wakeup(s->requeue_tasklet);
+	if ((s->tree_elt && s->tree_elt->lb_node.node.leaf_p && eweight &&
+	     s->tree_elt->lb_node.key == new_key) || !s->lb_tree) {
+		HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
 		return;
 	}
 
-	/* below we've got the lock */
-	if (s->lb_tree) {
+	srv_lock = HA_ATOMIC_XCHG(&s->lb_lock, 1);
+	/* Somebody else is updating that server, give up */
+	if (srv_lock == 1) {
+		HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+		return;
+	}
+
+	node = eb32_lookup(s->lb_tree, new_key);
+	if (node)
+		tree_elt = container_of(node, struct fwlc_tree_elt, lb_node);
+	/*
+	 * It is possible that s->tree_elt was changed since we checked.
+	 * As s->tree_elt is only changed while holding s->lb_lock,
+	 * check again now that we acquired it, and if we're using
+	 * the right element, do nothing.
+	 */
+	if (tree_elt == s->tree_elt) {
+		HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+		_HA_ATOMIC_STORE(&s->lb_lock, 0);
+		fwlc_check_srv_key(s, new_key);
+		return;
+	}
+	/*
+	 * We have to allocate a new tree element, and/or remove the
+	 * previous element, we will modify the tree, so let's get the write
+	 * lock.
+	 */
+	if (!tree_elt) {
+		unsigned int new_new_key;
+
+		/*
+		 * We don't want to allocate something while holding the lock,
+		 * so make sure we have something allocated before.
+		 */
+		if (s->free_elt != NULL) {
+			allocated_elt = s->free_elt;
+			s->free_elt = NULL;
+		} else
+			allocated_elt = pool_alloc(pool_head_fwlc_elt);
+		if (HA_RWLOCK_TRYRDTOWR(LBPRM_LOCK, &s->proxy->lbprm.lock) != 0) {
+			/* there's already some contention on the tree's lock, there's
+			 * no point insisting. Better wake up the server's tasklet that
+			 * will let this or another thread retry later. For the time
+			 * being, the server's apparent load is slightly inaccurate but
+			 * we don't care, if there is contention, it will self-regulate.
+			 */
+			if (s->requeue_tasklet)
+				tasklet_wakeup(s->requeue_tasklet);
+			HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+			s->free_elt = allocated_elt;
+			_HA_ATOMIC_STORE(&s->lb_lock, 0);
+			return;
+		}
+
 		/* we might have been waiting for a while on the lock above
 		 * so it's worth testing again because other threads are very
 		 * likely to have released a connection or taken one leading
 		 * to our target value (50% of the case in measurements).
 		 */
-		inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
-		eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
-		new_key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / (eweight ? eweight : 1) : 0;
-		if (!s->lb_node.node.leaf_p || s->lb_node.key != new_key) {
-			eb32_delete(&s->lb_node);
-			s->lb_node.key = new_key;
-			eb32_insert(s->lb_tree, &s->lb_node);
+
+		new_new_key = fwlc_get_key(s);
+		if (new_new_key != new_key) {
+			if (s->tree_elt &&
+			    s->tree_elt->lb_node.node.leaf_p &&
+			    eweight && s->tree_elt->lb_node.key == new_new_key) {
+				/* Okay after all we have nothing to do */
+				HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+				s->free_elt = allocated_elt;
+				_HA_ATOMIC_STORE(&s->lb_lock, 0);
+				fwlc_check_srv_key(s, new_new_key);
+				return;
+			}
+			node = eb32_lookup(s->lb_tree, new_new_key);
+			if (node) {
+				tree_elt = container_of(node, struct fwlc_tree_elt, lb_node);
+				HA_RWLOCK_WRTORD(LBPRM_LOCK, &s->proxy->lbprm.lock);
+				s->free_elt = allocated_elt;
+				allocated_elt = NULL;
+			} else
+				tree_elt = NULL;
+			new_key = new_new_key;
 		}
 	}
-	HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+
+	/*
+	 * Now we increment the number of elements in the new tree_elt,
+	 * we change our sequence number and smallest, and we then
+	 * decrement the number of elements in the old tree_elt.
+	 * It is important to keep this sequencing, as fwlc_get_next_server()
+	 * uses the number of elements to know if there is something to look for,
+	 * and we want to make sure we do not miss a server.
+	 */
+	if (!tree_elt) {
+		/*
+		 * There was no tree element matching our key,
+		 * allocate one and insert it into the tree.
+		 */
+		tree_elt = fwlc_alloc_tree_elt(s->proxy, allocated_elt);
+		if (tree_elt == allocated_elt)
+			allocated_elt = NULL;
+		tree_elt->lb_node.key = new_key;
+		tree_elt->elements = 1;
+		__ha_barrier_store();
+		/* If we allocated, then we hold the write lock */
+		eb32_insert(s->lb_tree, &tree_elt->lb_node);
+		HA_RWLOCK_WRTORD(LBPRM_LOCK, &s->proxy->lbprm.lock);
+	} else {
+		_HA_ATOMIC_INC(&tree_elt->elements);
+	}
+
+	__ha_barrier_store();
+	/*
+	 * Update the sequence number, and the smallest if needed.
+	 * We always have to do it, even if we're not actually
+	 * updating the smallest one, otherwise we'll get an
+	 * ABA problem and a server may be missed when looked up.
+	 * The only time we don't have to do it is if another thread
+	 * increased it, and the new smallest element is not
+	 * higher than our new key.
+	 */
+	do {
+		unsigned int tmpsmallest;
+		uint64_t newcurseq = _HA_ATOMIC_LOAD(&s->proxy->lbprm.lb_seq);
+
+		if (cur_seq != 0 && FWLC_LBPRM_SEQ(newcurseq) >
+		    FWLC_LBPRM_SEQ(cur_seq) && new_key >= FWLC_LBPRM_SMALLEST(newcurseq))
+			break;
+
+		cur_seq = newcurseq;
+		tmpsmallest = FWLC_LBPRM_SMALLEST(cur_seq);
+		if (new_key > tmpsmallest)
+			smallest = tmpsmallest;
+		else
+			smallest = new_key;
+
+	} while (fwlc_set_seq_and_smallest(&s->proxy->lbprm, cur_seq, FWLC_LBPRM_SEQ(cur_seq) + 1, smallest) == 0 && __ha_cpu_relax());
+
+	__ha_barrier_store();
+
+	if (s->tree_elt)
+		_HA_ATOMIC_DEC(&s->tree_elt->elements);
+
+	/*
+	 * Now lock the existing element, and its target list.
+	 * To prevent a deadlock, we always lock the one
+	 * with the lowest key first.
+	 */
+	if (new_key < s->tree_elt->lb_node.key) {
+		to_unlock = mt_list_lock_full(&s->lb_mt_list);
+		list = fwlc_lock_target_list(tree_elt);
+	} else {
+		list = fwlc_lock_target_list(tree_elt);
+		to_unlock = mt_list_lock_full(&s->lb_mt_list);
+	}
+
+	/*
+	 * Unlock the old list, the element is now
+	 * no longer in it.
+	 */
+	mt_list_unlock_link(to_unlock);
+
+	/*
+	 * Add the element to the new list, and unlock it.
+	 */
+	mt_list_unlock_full(&s->lb_mt_list, list);
+
+	HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+
+	s->tree_elt = tree_elt;
+
+	if (allocated_elt)
+		s->free_elt = allocated_elt;
+
+	__ha_barrier_store();
+	_HA_ATOMIC_STORE(&s->lb_lock, 0);
+
+	fwlc_check_srv_key(s, new_key);
 }
 
 /* This function updates the server trees according to the server's new
@@ -306,6 +696,8 @@ void fwlc_init_server_tree(struct proxy *p)
 	p->lbprm.server_take_conn = fwlc_srv_reposition;
 	p->lbprm.server_drop_conn = fwlc_srv_reposition;
 	p->lbprm.server_requeue = fwlc_srv_reposition;
+	p->lbprm.server_deinit = fwlc_server_deinit;
+	p->lbprm.proxy_deinit = fwlc_proxy_deinit;
 	p->lbprm.wdiv = BE_WEIGHT_SCALE;
 
 	for (srv = p->srv; srv; srv = srv->next) {
@@ -313,6 +705,8 @@ void fwlc_init_server_tree(struct proxy *p)
 		srv_lb_commit_status(srv);
 	}
 
+	p->lbprm.lb_seq = 0;
+
 	recount_servers(p);
 	update_backend_weight(p);
 
@@ -337,46 +731,128 @@ struct server *fwlc_get_next_server(struct proxy *p, struct server *srvtoavoid)
 {
 	struct server *srv, *avoided;
 	struct eb32_node *node;
+	uint64_t curseq;
+	int found = 0;
 
 	srv = avoided = NULL;
 
 	HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock);
+	curseq = _HA_ATOMIC_LOAD(&p->lbprm.lb_seq);
+redo:
 	if (p->srv_act)
-		node = eb32_first(&p->lbprm.fwlc.act);
+		node = eb32_lookup_ge(&p->lbprm.fwlc.act, FWLC_LBPRM_SMALLEST(curseq));
 	else if (p->lbprm.fbck) {
 		srv = p->lbprm.fbck;
 		goto out;
 	}
 	else if (p->srv_bck)
-		node = eb32_first(&p->lbprm.fwlc.bck);
+		node = eb32_lookup_ge(&p->lbprm.fwlc.bck, FWLC_LBPRM_SMALLEST(curseq));
 	else {
 		srv = NULL;
 		goto out;
 	}
 
 	while (node) {
-		/* OK, we have a server. However, it may be saturated, in which
-		 * case we don't want to reconsider it for now, so we'll simply
-		 * skip it. Same if it's the server we try to avoid, in which
-		 * case we simply remember it for later use if needed.
-		 */
+		struct fwlc_tree_elt *tree_elt;
 		struct server *s;
+		int orig_nb;
+		int i = 0;
 
-		s = eb32_entry(node, struct server, lb_node);
-		if (!s->maxconn || s->served + s->queueslength < srv_dynamic_maxconn(s) + s->maxqueue) {
-			if (s != srvtoavoid) {
-				srv = s;
-				break;
+		tree_elt = eb32_entry(node, struct fwlc_tree_elt, lb_node);
+		orig_nb = statistical_prng_range(FWLC_LISTS_NB);
+
+		while (_HA_ATOMIC_LOAD(&tree_elt->elements) > 0) {
+			struct mt_list mt_list;
+
+			mt_list.next = _HA_ATOMIC_LOAD(&tree_elt->srv_list[(i + orig_nb) % FWLC_LISTS_NB].next);
+
+			if (mt_list.next != &tree_elt->srv_list[(i + orig_nb) % FWLC_LISTS_NB] && mt_list.next != MT_LIST_BUSY) {
+				unsigned int eweight;
+				unsigned int planned_inflight;
+
+				s = container_of(mt_list.next, struct server, lb_mt_list);
+				eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
+
+				planned_inflight = tree_elt->lb_node.key * eweight / SRV_EWGHT_MAX;
+				if (!s->maxconn || s->served + s->queueslength < srv_dynamic_maxconn(s) + s->maxqueue) {
+					if (_HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength) > planned_inflight + 2) {
+						/*
+						 * The server has more requests than expected,
+						 * let's try to reposition it, to avoid too
+						 * many threads using the same server at the
+						 * same time.
+						 */
+						if (i >= FWLC_LISTS_NB) {
+							HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &p->lbprm.lock);
+							fwlc_srv_reposition(s);
+							HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock);
+						}
+						i++;
+						continue;
+					}
+					if (s != srvtoavoid) {
+						srv = s;
+						found = 1;
+						break;
+					}
+					avoided = s;
+				}
+				i++;
+			} else if (mt_list.next == &tree_elt->srv_list[(i + orig_nb) % FWLC_LISTS_NB]) {
+				i++;
+				continue;
+			} else {
+				i++;
+				continue;
+			}
+		}
+		if (found)
+			break;
+
+		do {
+			node = eb32_next(node);
+		} while (node && node->key < FWLC_LBPRM_SMALLEST(curseq));
+
+		if (node) {
+			uint64_t newcurseq = HA_ATOMIC_LOAD(&p->lbprm.lb_seq);
+
+			/*
+			 * If we have a bigger element than the smallest recorded, and we're up to date,
+			 * update the smallest one.
+			 */
+			if (likely(newcurseq == curseq && FWLC_LBPRM_SMALLEST(newcurseq) < node->key)) {
+				if (fwlc_set_seq_and_smallest(&p->lbprm, curseq, FWLC_LBPRM_SEQ(curseq), node->key) != 0) {
+					curseq = FWLC_LBPRM_SEQ(curseq) | ((uint64_t)node->key << 32);
+					__ha_barrier_store();
+					continue;
+				}
+
+			}
+			/*
+			 * Somebody added a new server in a node we already skipped, so retry from the beginning.
+			 */
+			if (unlikely(FWLC_LBPRM_SMALLEST(newcurseq) < node->key && FWLC_LBPRM_SEQ(newcurseq) != FWLC_LBPRM_SEQ(curseq))) {
+				curseq = newcurseq;
+				goto redo;
+			}
+			curseq = newcurseq;
+		} else {
+			uint64_t newcurseq = _HA_ATOMIC_LOAD(&p->lbprm.lb_seq);
+
+			/*
+			 * No more node, but somebody changed the tree, so it's
+			 * worth trying again.
+			 */
+			if (FWLC_LBPRM_SEQ(newcurseq) != FWLC_LBPRM_SEQ(curseq)) {
+				curseq = newcurseq;
+				goto redo;
 			}
-			avoided = s;
 		}
-		node = eb32_next(node);
 	}
 
 	if (!srv)
 		srv = avoided;
  out:
 	HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &p->lbprm.lock);
+
 	return srv;
 }
diff --git a/src/proxy.c b/src/proxy.c
index 9b786b33a..67807d7e7 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -1478,6 +1478,8 @@ void init_new_proxy(struct proxy *p)
 	LIST_INIT(&p->filter_configs);
 	LIST_INIT(&p->tcpcheck_rules.preset_vars);
 
+	MT_LIST_INIT(&p->lbprm.lb_free_list);
+
 	p->defsrv.id = "default-server";
 	p->conf.used_listener_id = EB_ROOT;
 	p->conf.used_server_id = EB_ROOT;
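For reviewers tracing the sequence/smallest-key logic above, here is a small standalone sketch (not part of the patch) of how the patch packs the 32-bit update counter and the 32-bit smallest key into the single 64-bit lb_seq word, and how a compare-and-swap on the whole word detects a concurrent update. It uses plain C11 atomics and hypothetical helper names as stand-ins for HAProxy's _HA_ATOMIC_CAS wrappers and FWLC_LBPRM_* macros.

/* Illustrative sketch only: mirrors the (seq, smallest) packing used by the
 * patch. C11 atomics stand in for HAProxy's _HA_ATOMIC_CAS wrappers; the
 * names SEQ/SMALLEST/set_seq_and_smallest are hypothetical.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define SEQ(v)      ((uint32_t)((v) & 0xffffffff))  /* low 32 bits: update counter */
#define SMALLEST(v) ((uint32_t)((v) >> 32))         /* high 32 bits: smallest key  */

static _Atomic uint64_t lb_seq;

/* Try to publish a new (seq, smallest) pair in one atomic step.
 * Fails (returns 0) if another thread changed lb_seq since 'expected' was read.
 */
static int set_seq_and_smallest(uint64_t expected, uint32_t seq, uint32_t smallest)
{
	uint64_t next = (uint64_t)seq | ((uint64_t)smallest << 32);

	return atomic_compare_exchange_strong(&lb_seq, &expected, next);
}

int main(void)
{
	uint64_t cur = atomic_load(&lb_seq);

	/* A server with key 42 becomes the smallest: the sequence is bumped on
	 * every change, even when the smallest key itself stays the same, so a
	 * reader that already walked past key 42 can detect it raced with this
	 * update and restart its lookup (this is the ABA concern noted in the
	 * patch comments).
	 */
	while (!set_seq_and_smallest(cur, SEQ(cur) + 1, 42))
		cur = atomic_load(&lb_seq);

	cur = atomic_load(&lb_seq);
	printf("seq=%u smallest=%u\n", (unsigned)SEQ(cur), (unsigned)SMALLEST(cur));
	return 0;
}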