blob: 32d7d1d1166f766da968e7c02fe03f99b3c42dc2 [file] [log] [blame]
/*
 * Consistent Hash implementation
 * Please consult this very well detailed article for more information :
 * http://www.spiteful.com/2008/03/17/programmers-toolbox-part-3-consistent-hashing/
 *
 * Our implementation has to support both weighted hashing and weighted round
 * robin because we'll use it to replace the previous map-based implementation
 * which offered both algorithms.
 *
 * Copyright 2000-2010 Willy Tarreau <w@1wt.eu>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */
18
19#include <common/compat.h>
20#include <common/config.h>
21#include <common/debug.h>
Willy Tarreau4c14eaa2010-11-24 14:01:45 +010022#include <common/standard.h>
Willy Tarreau45cb4fb2009-10-26 21:10:04 +010023#include <eb32tree.h>
Willy Tarreau6b2e11b2009-10-01 07:52:15 +020024
25#include <types/global.h>
26#include <types/server.h>
27
28#include <proto/backend.h>
29#include <proto/queue.h>
30
/* Find the successor of <node> in tree <root>, wrapping from the last entry
 * back to the first one. <node> must either be NULL or still be linked in the
 * tree. Returns NULL when <node> is NULL or when the successor found is <node>
 * itself (i.e. it is the only entry). Meant to be called right before deleting
 * <node>, so that the caller can keep a valid position in the tree.
 */
static inline struct eb32_node *chash_skip_node(struct eb_root *root, struct eb32_node *node)
{
	struct eb32_node *succ;

	if (node == NULL)
		return NULL;

	succ = eb32_next(node);
	if (succ == NULL)
		succ = eb32_first(root); /* reached the end, wrap around */

	return (succ != node) ? succ : NULL;
}
49
50/* Remove all of a server's entries from its tree. This may be used when
51 * setting a server down.
52 */
53static inline void chash_dequeue_srv(struct server *s)
54{
55 while (s->lb_nodes_now > 0) {
56 if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
57 s->lb_nodes_now = s->lb_nodes_tot;
58 s->lb_nodes_now--;
59 if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node)
60 s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree, s->proxy->lbprm.chash.last);
61 eb32_delete(&s->lb_nodes[s->lb_nodes_now].node);
62 }
63}
64
65/* Adjust the number of entries of a server in its tree. The server must appear
66 * as many times as its weight indicates it. If it's there too often, we remove
67 * the last occurrences. If it's not there enough, we add more occurrences. To
68 * remove a server from the tree, normally call this with eweight=0.
69 */
70static inline void chash_queue_dequeue_srv(struct server *s)
71{
Emeric Brun52a91d32017-08-31 14:41:55 +020072 while (s->lb_nodes_now > s->next_eweight) {
Willy Tarreau6b2e11b2009-10-01 07:52:15 +020073 if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
74 s->lb_nodes_now = s->lb_nodes_tot;
75 s->lb_nodes_now--;
76 if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node)
77 s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree, s->proxy->lbprm.chash.last);
78 eb32_delete(&s->lb_nodes[s->lb_nodes_now].node);
79 }
80
Olivier Houchardf8eb8d52017-10-17 15:52:59 +020081 /* Attempt to increase the total number of nodes, if the user
82 * increased the weight beyond the original weight
83 */
84 if (s->lb_nodes_tot < s->next_eweight) {
85 struct tree_occ *new_nodes = realloc(s->lb_nodes, s->next_eweight);
86
87 if (new_nodes) {
88 unsigned int j;
89
90 s->lb_nodes = new_nodes;
91 memset(&s->lb_nodes[s->lb_nodes_tot], 0,
92 (s->next_eweight - s->lb_nodes_tot) * sizeof(*s->lb_nodes));
93 for (j = s->lb_nodes_tot; j < s->next_eweight; j++) {
94 s->lb_nodes[j].server = s;
95 s->lb_nodes[j].node.key = full_hash(s->puid * SRV_EWGHT_RANGE + j);
96 }
97 s->lb_nodes_tot = s->next_eweight;
98 }
99 }
Emeric Brun52a91d32017-08-31 14:41:55 +0200100 while (s->lb_nodes_now < s->next_eweight) {
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200101 if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
102 break;
103 if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node)
104 s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree, s->proxy->lbprm.chash.last);
105 eb32_insert(s->lb_tree, &s->lb_nodes[s->lb_nodes_now].node);
106 s->lb_nodes_now++;
107 }
108}
109
110/* This function updates the server trees according to server <srv>'s new
111 * state. It should be called when server <srv>'s status changes to down.
112 * It is not important whether the server was already down or not. It is not
113 * important either that the new state is completely down (the caller may not
114 * know all the variables of a server's state).
115 */
116static void chash_set_server_status_down(struct server *srv)
117{
118 struct proxy *p = srv->proxy;
119
Willy Tarreauc5150da2014-05-13 19:27:31 +0200120 if (!srv_lb_status_changed(srv))
Christopher Faulet5b517552017-06-09 14:17:53 +0200121 return;
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200122
Emeric Brun52a91d32017-08-31 14:41:55 +0200123 if (srv_willbe_usable(srv))
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200124 goto out_update_state;
125
Emeric Brun52a91d32017-08-31 14:41:55 +0200126 if (!srv_currently_usable(srv))
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200127 /* server was already down */
128 goto out_update_backend;
129
Willy Tarreauc93cd162014-05-13 15:54:22 +0200130 if (srv->flags & SRV_F_BACKUP) {
Emeric Brun52a91d32017-08-31 14:41:55 +0200131 p->lbprm.tot_wbck -= srv->cur_eweight;
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200132 p->srv_bck--;
133
134 if (srv == p->lbprm.fbck) {
135 /* we lost the first backup server in a single-backup
136 * configuration, we must search another one.
137 */
138 struct server *srv2 = p->lbprm.fbck;
139 do {
140 srv2 = srv2->next;
141 } while (srv2 &&
Willy Tarreauc93cd162014-05-13 15:54:22 +0200142 !((srv2->flags & SRV_F_BACKUP) &&
Emeric Brun52a91d32017-08-31 14:41:55 +0200143 srv_willbe_usable(srv2)));
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200144 p->lbprm.fbck = srv2;
145 }
146 } else {
Emeric Brun52a91d32017-08-31 14:41:55 +0200147 p->lbprm.tot_wact -= srv->cur_eweight;
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200148 p->srv_act--;
149 }
150
151 chash_dequeue_srv(srv);
152
153out_update_backend:
154 /* check/update tot_used, tot_weight */
155 update_backend_weight(p);
156 out_update_state:
Willy Tarreauc5150da2014-05-13 19:27:31 +0200157 srv_lb_commit_status(srv);
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200158}
159
160/* This function updates the server trees according to server <srv>'s new
161 * state. It should be called when server <srv>'s status changes to up.
162 * It is not important whether the server was already down or not. It is not
163 * important either that the new state is completely UP (the caller may not
164 * know all the variables of a server's state). This function will not change
165 * the weight of a server which was already up.
166 */
167static void chash_set_server_status_up(struct server *srv)
168{
169 struct proxy *p = srv->proxy;
170
Willy Tarreauc5150da2014-05-13 19:27:31 +0200171 if (!srv_lb_status_changed(srv))
Christopher Faulet5b517552017-06-09 14:17:53 +0200172 return;
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200173
Emeric Brun52a91d32017-08-31 14:41:55 +0200174 if (!srv_willbe_usable(srv))
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200175 goto out_update_state;
176
Emeric Brun52a91d32017-08-31 14:41:55 +0200177 if (srv_currently_usable(srv))
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200178 /* server was already up */
179 goto out_update_backend;
180
Willy Tarreauc93cd162014-05-13 15:54:22 +0200181 if (srv->flags & SRV_F_BACKUP) {
Emeric Brun52a91d32017-08-31 14:41:55 +0200182 p->lbprm.tot_wbck += srv->next_eweight;
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200183 p->srv_bck++;
184
185 if (!(p->options & PR_O_USE_ALL_BK)) {
186 if (!p->lbprm.fbck) {
187 /* there was no backup server anymore */
188 p->lbprm.fbck = srv;
189 } else {
190 /* we may have restored a backup server prior to fbck,
191 * in which case it should replace it.
192 */
193 struct server *srv2 = srv;
194 do {
195 srv2 = srv2->next;
196 } while (srv2 && (srv2 != p->lbprm.fbck));
197 if (srv2)
198 p->lbprm.fbck = srv;
199 }
200 }
201 } else {
Emeric Brun52a91d32017-08-31 14:41:55 +0200202 p->lbprm.tot_wact += srv->next_eweight;
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200203 p->srv_act++;
204 }
205
206 /* note that eweight cannot be 0 here */
207 chash_queue_dequeue_srv(srv);
208
209 out_update_backend:
210 /* check/update tot_used, tot_weight */
211 update_backend_weight(p);
212 out_update_state:
Willy Tarreauc5150da2014-05-13 19:27:31 +0200213 srv_lb_commit_status(srv);
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200214}
215
216/* This function must be called after an update to server <srv>'s effective
217 * weight. It may be called after a state change too.
218 */
219static void chash_update_server_weight(struct server *srv)
220{
221 int old_state, new_state;
222 struct proxy *p = srv->proxy;
223
Willy Tarreauc5150da2014-05-13 19:27:31 +0200224 if (!srv_lb_status_changed(srv))
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200225 return;
226
227 /* If changing the server's weight changes its state, we simply apply
228 * the procedures we already have for status change. If the state
229 * remains down, the server is not in any tree, so it's as easy as
230 * updating its values. If the state remains up with different weights,
231 * there are some computations to perform to find a new place and
232 * possibly a new tree for this server.
233 */
234
Emeric Brun52a91d32017-08-31 14:41:55 +0200235 old_state = srv_currently_usable(srv);
236 new_state = srv_willbe_usable(srv);
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200237
238 if (!old_state && !new_state) {
Willy Tarreauc5150da2014-05-13 19:27:31 +0200239 srv_lb_commit_status(srv);
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200240 return;
241 }
242 else if (!old_state && new_state) {
243 chash_set_server_status_up(srv);
244 return;
245 }
246 else if (old_state && !new_state) {
247 chash_set_server_status_down(srv);
248 return;
249 }
250
251 /* only adjust the server's presence in the tree */
252 chash_queue_dequeue_srv(srv);
253
Willy Tarreauc93cd162014-05-13 15:54:22 +0200254 if (srv->flags & SRV_F_BACKUP)
Emeric Brun52a91d32017-08-31 14:41:55 +0200255 p->lbprm.tot_wbck += srv->next_eweight - srv->cur_eweight;
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200256 else
Emeric Brun52a91d32017-08-31 14:41:55 +0200257 p->lbprm.tot_wact += srv->next_eweight - srv->cur_eweight;
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200258
259 update_backend_weight(p);
Willy Tarreauc5150da2014-05-13 19:27:31 +0200260 srv_lb_commit_status(srv);
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200261}
262
263/*
Andrew Rodland4f88c632016-10-25 12:50:37 -0400264 * This function implements the "Consistent Hashing with Bounded Loads" algorithm
265 * of Mirrokni, Thorup, and Zadimoghaddam (arxiv:1608.01350), adapted for use with
266 * unequal server weights.
267 */
268int chash_server_is_eligible(struct server *s)
269{
270 /* The total number of slots to allocate is the total number of outstanding requests
271 * (including the one we're about to make) times the load-balance-factor, rounded up.
272 */
273 unsigned tot_slots = ((s->proxy->served + 1) * s->proxy->lbprm.chash.balance_factor + 99) / 100;
274 unsigned slots_per_weight = tot_slots / s->proxy->lbprm.tot_weight;
275 unsigned remainder = tot_slots % s->proxy->lbprm.tot_weight;
276
277 /* Allocate a whole number of slots per weight unit... */
Emeric Brun52a91d32017-08-31 14:41:55 +0200278 unsigned slots = s->cur_eweight * slots_per_weight;
Andrew Rodland4f88c632016-10-25 12:50:37 -0400279
280 /* And then distribute the rest among servers proportionally to their weight. */
Emeric Brun52a91d32017-08-31 14:41:55 +0200281 slots += ((s->cumulative_weight + s->cur_eweight) * remainder) / s->proxy->lbprm.tot_weight
Andrew Rodland4f88c632016-10-25 12:50:37 -0400282 - (s->cumulative_weight * remainder) / s->proxy->lbprm.tot_weight;
283
284 /* But never leave a server with 0. */
285 if (slots == 0)
286 slots = 1;
287
288 return s->served < slots;
289}
290
291/*
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200292 * This function returns the running server from the CHASH tree, which is at
293 * the closest distance from the value of <hash>. Doing so ensures that even
294 * with a well imbalanced hash, if some servers are close to each other, they
295 * will still both receive traffic. If any server is found, it will be returned.
296 * If no valid server is found, NULL is returned.
297 */
298struct server *chash_get_server_hash(struct proxy *p, unsigned int hash)
299{
300 struct eb32_node *next, *prev;
301 struct server *nsrv, *psrv;
302 struct eb_root *root;
303 unsigned int dn, dp;
Andrew Rodland4f88c632016-10-25 12:50:37 -0400304 int loop;
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200305
306 if (p->srv_act)
307 root = &p->lbprm.chash.act;
308 else if (p->lbprm.fbck)
309 return p->lbprm.fbck;
310 else if (p->srv_bck)
311 root = &p->lbprm.chash.bck;
312 else
313 return NULL;
314
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200315 /* find the node after and the node before */
316 next = eb32_lookup_ge(root, hash);
317 if (!next)
318 next = eb32_first(root);
319 if (!next)
320 return NULL; /* tree is empty */
321
322 prev = eb32_prev(next);
323 if (!prev)
324 prev = eb32_last(root);
325
326 nsrv = eb32_entry(next, struct tree_occ, node)->server;
327 psrv = eb32_entry(prev, struct tree_occ, node)->server;
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200328
Andrew Rodland18330ab2017-04-26 02:57:03 -0400329 /* OK we're located between two servers, let's
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200330 * compare distances between hash and the two servers
331 * and select the closest server.
332 */
333 dp = hash - prev->key;
334 dn = next->key - hash;
335
Andrew Rodland4f88c632016-10-25 12:50:37 -0400336 if (dp <= dn) {
337 next = prev;
338 nsrv = psrv;
339 }
340
341 loop = 0;
342 while (p->lbprm.chash.balance_factor && !chash_server_is_eligible(nsrv)) {
343 next = eb32_next(next);
344 if (!next) {
345 next = eb32_first(root);
346 if (++loop > 1) // protection against accidental loop
347 break;
348 }
349 nsrv = eb32_entry(next, struct tree_occ, node)->server;
350 }
351
352 return nsrv;
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200353}
354
355/* Return next server from the CHASH tree in backend <p>. If the tree is empty,
356 * return NULL. Saturated servers are skipped.
357 */
358struct server *chash_get_next_server(struct proxy *p, struct server *srvtoavoid)
359{
360 struct server *srv, *avoided;
361 struct eb32_node *node, *stop, *avoided_node;
362 struct eb_root *root;
363
364 srv = avoided = NULL;
365 avoided_node = NULL;
366
Christopher Faulet5b517552017-06-09 14:17:53 +0200367 SPIN_LOCK(LBPRM_LOCK, &p->lbprm.lock);
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200368 if (p->srv_act)
369 root = &p->lbprm.chash.act;
Christopher Faulet5b517552017-06-09 14:17:53 +0200370 else if (p->lbprm.fbck) {
371 srv = p->lbprm.fbck;
372 goto out;
373 }
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200374 else if (p->srv_bck)
375 root = &p->lbprm.chash.bck;
Christopher Faulet5b517552017-06-09 14:17:53 +0200376 else {
377 srv = NULL;
378 goto out;
379 }
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200380
381 stop = node = p->lbprm.chash.last;
382 do {
383 struct server *s;
384
385 if (node)
386 node = eb32_next(node);
387 if (!node)
388 node = eb32_first(root);
389
390 p->lbprm.chash.last = node;
391 if (!node)
392 /* no node is available */
393 return NULL;
394
Willy Tarreaud16a1b22013-04-12 14:46:51 +0200395 /* Note: if we came here after a down/up cycle with no last
396 * pointer, and after a redispatch (srvtoavoid is set), we
397 * must set stop to non-null otherwise we can loop forever.
398 */
399 if (!stop)
400 stop = node;
401
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200402 /* OK, we have a server. However, it may be saturated, in which
403 * case we don't want to reconsider it for now, so we'll simply
404 * skip it. Same if it's the server we try to avoid, in which
405 * case we simply remember it for later use if needed.
406 */
407 s = eb32_entry(node, struct tree_occ, node)->server;
408 if (!s->maxconn || (!s->nbpend && s->served < srv_dynamic_maxconn(s))) {
409 if (s != srvtoavoid) {
410 srv = s;
411 break;
412 }
413 avoided = s;
414 avoided_node = node;
415 }
416 } while (node != stop);
417
418 if (!srv) {
419 srv = avoided;
420 p->lbprm.chash.last = avoided_node;
421 }
422
Christopher Faulet5b517552017-06-09 14:17:53 +0200423 out:
424 SPIN_UNLOCK(LBPRM_LOCK, &p->lbprm.lock);
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200425 return srv;
426}
427
428/* This function is responsible for building the active and backup trees for
429 * constistent hashing. The servers receive an array of initialized nodes
430 * with their assigned keys. It also sets p->lbprm.wdiv to the eweight to
431 * uweight ratio.
432 */
433void chash_init_server_tree(struct proxy *p)
434{
435 struct server *srv;
436 struct eb_root init_head = EB_ROOT;
437 int node;
438
439 p->lbprm.set_server_status_up = chash_set_server_status_up;
440 p->lbprm.set_server_status_down = chash_set_server_status_down;
441 p->lbprm.update_server_eweight = chash_update_server_weight;
442 p->lbprm.server_take_conn = NULL;
443 p->lbprm.server_drop_conn = NULL;
444
445 p->lbprm.wdiv = BE_WEIGHT_SCALE;
446 for (srv = p->srv; srv; srv = srv->next) {
Emeric Brun52a91d32017-08-31 14:41:55 +0200447 srv->next_eweight = (srv->uweight * p->lbprm.wdiv + p->lbprm.wmult - 1) / p->lbprm.wmult;
Willy Tarreauc5150da2014-05-13 19:27:31 +0200448 srv_lb_commit_status(srv);
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200449 }
450
451 recount_servers(p);
452 update_backend_weight(p);
453
454 p->lbprm.chash.act = init_head;
455 p->lbprm.chash.bck = init_head;
456 p->lbprm.chash.last = NULL;
457
458 /* queue active and backup servers in two distinct groups */
459 for (srv = p->srv; srv; srv = srv->next) {
Willy Tarreauc93cd162014-05-13 15:54:22 +0200460 srv->lb_tree = (srv->flags & SRV_F_BACKUP) ? &p->lbprm.chash.bck : &p->lbprm.chash.act;
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200461 srv->lb_nodes_tot = srv->uweight * BE_WEIGHT_SCALE;
462 srv->lb_nodes_now = 0;
Vincent Bernat3c2f2f22016-04-03 13:48:42 +0200463 srv->lb_nodes = calloc(srv->lb_nodes_tot, sizeof(struct tree_occ));
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200464
465 for (node = 0; node < srv->lb_nodes_tot; node++) {
466 srv->lb_nodes[node].server = srv;
Willy Tarreau4c14eaa2010-11-24 14:01:45 +0100467 srv->lb_nodes[node].node.key = full_hash(srv->puid * SRV_EWGHT_RANGE + node);
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200468 }
469
Emeric Brun52a91d32017-08-31 14:41:55 +0200470 if (srv_currently_usable(srv))
Willy Tarreau6b2e11b2009-10-01 07:52:15 +0200471 chash_queue_dequeue_srv(srv);
472 }
473}