blob: 58c029a4ad3392bc05c30f6923f91dc6a87f4172 [file] [log] [blame]
Willy Tarreau6b2e11b2009-10-01 07:52:15 +02001/*
2 * Consistent Hash implementation
3 * Please consult this very well detailed article for more information :
4 * http://www.spiteful.com/2008/03/17/programmers-toolbox-part-3-consistent-hashing/
5 *
6 * Our implementation has to support both weighted hashing and weighted round
7 * robin because we'll use it to replace the previous map-based implementation
8 * which offered both algorithms.
9 *
10 * Copyright 2000-2009 Willy Tarreau <w@1wt.eu>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 */
18
19#include <common/compat.h>
20#include <common/config.h>
21#include <common/debug.h>
Willy Tarreau45cb4fb2009-10-26 21:10:04 +010022#include <eb32tree.h>
Willy Tarreau6b2e11b2009-10-01 07:52:15 +020023
24#include <types/global.h>
25#include <types/server.h>
26
27#include <proto/backend.h>
28#include <proto/queue.h>
29
static inline unsigned int chash_hash(unsigned int a)
{
	unsigned int h = a;

	/* Bob Jenkins' 32-bit full-avalanche integer mix: tiny input
	 * variations end up spread over the whole 32-bit range, so a randomly
	 * picked number falls evenly between any two server positions.
	 * Reference: http://burtleburtle.net/bob/hash/integer.html
	 */
	h = (h + 0x7ed55d16) + (h << 12);
	h = (h ^ 0xc761c23c) ^ (h >> 19);
	h = (h + 0x165667b1) + (h << 5);
	h = (h + 0xd3a2646c) ^ (h << 9);
	h = (h + 0xfd7046c5) + (h << 3);
	h = (h ^ 0xb55a4f09) ^ (h >> 16);

	/* a final multiply by a large prime close to 3/4 of 2^32 spreads the
	 * values better all around the tree.
	 */
	return h * 3221225473U;
}
51
52/* Return next tree node after <node> which must still be in the tree, or be
53 * NULL. Lookup wraps around the end to the beginning. If the next node is the
54 * same node, return NULL. This is designed to find a valid next node before
55 * deleting one from the tree.
56 */
57static inline struct eb32_node *chash_skip_node(struct eb_root *root, struct eb32_node *node)
58{
59 struct eb32_node *stop = node;
60
61 if (!node)
62 return NULL;
63 node = eb32_next(node);
64 if (!node)
65 node = eb32_first(root);
66 if (node == stop)
67 return NULL;
68 return node;
69}
70
71/* Remove all of a server's entries from its tree. This may be used when
72 * setting a server down.
73 */
74static inline void chash_dequeue_srv(struct server *s)
75{
76 while (s->lb_nodes_now > 0) {
77 if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
78 s->lb_nodes_now = s->lb_nodes_tot;
79 s->lb_nodes_now--;
80 if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node)
81 s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree, s->proxy->lbprm.chash.last);
82 eb32_delete(&s->lb_nodes[s->lb_nodes_now].node);
83 }
84}
85
86/* Adjust the number of entries of a server in its tree. The server must appear
87 * as many times as its weight indicates it. If it's there too often, we remove
88 * the last occurrences. If it's not there enough, we add more occurrences. To
89 * remove a server from the tree, normally call this with eweight=0.
90 */
91static inline void chash_queue_dequeue_srv(struct server *s)
92{
93 while (s->lb_nodes_now > s->eweight) {
94 if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
95 s->lb_nodes_now = s->lb_nodes_tot;
96 s->lb_nodes_now--;
97 if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node)
98 s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree, s->proxy->lbprm.chash.last);
99 eb32_delete(&s->lb_nodes[s->lb_nodes_now].node);
100 }
101
102 while (s->lb_nodes_now < s->eweight) {
103 if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
104 break;
105 if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node)
106 s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree, s->proxy->lbprm.chash.last);
107 eb32_insert(s->lb_tree, &s->lb_nodes[s->lb_nodes_now].node);
108 s->lb_nodes_now++;
109 }
110}
111
112/* This function updates the server trees according to server <srv>'s new
113 * state. It should be called when server <srv>'s status changes to down.
114 * It is not important whether the server was already down or not. It is not
115 * important either that the new state is completely down (the caller may not
116 * know all the variables of a server's state).
117 */
118static void chash_set_server_status_down(struct server *srv)
119{
120 struct proxy *p = srv->proxy;
121
122 if (srv->state == srv->prev_state &&
123 srv->eweight == srv->prev_eweight)
124 return;
125
126 if (srv_is_usable(srv->state, srv->eweight))
127 goto out_update_state;
128
129 if (!srv_is_usable(srv->prev_state, srv->prev_eweight))
130 /* server was already down */
131 goto out_update_backend;
132
133 if (srv->state & SRV_BACKUP) {
134 p->lbprm.tot_wbck -= srv->prev_eweight;
135 p->srv_bck--;
136
137 if (srv == p->lbprm.fbck) {
138 /* we lost the first backup server in a single-backup
139 * configuration, we must search another one.
140 */
141 struct server *srv2 = p->lbprm.fbck;
142 do {
143 srv2 = srv2->next;
144 } while (srv2 &&
145 !((srv2->state & SRV_BACKUP) &&
146 srv_is_usable(srv2->state, srv2->eweight)));
147 p->lbprm.fbck = srv2;
148 }
149 } else {
150 p->lbprm.tot_wact -= srv->prev_eweight;
151 p->srv_act--;
152 }
153
154 chash_dequeue_srv(srv);
155
156out_update_backend:
157 /* check/update tot_used, tot_weight */
158 update_backend_weight(p);
159 out_update_state:
160 srv->prev_state = srv->state;
161 srv->prev_eweight = srv->eweight;
162}
163
164/* This function updates the server trees according to server <srv>'s new
165 * state. It should be called when server <srv>'s status changes to up.
166 * It is not important whether the server was already down or not. It is not
167 * important either that the new state is completely UP (the caller may not
168 * know all the variables of a server's state). This function will not change
169 * the weight of a server which was already up.
170 */
171static void chash_set_server_status_up(struct server *srv)
172{
173 struct proxy *p = srv->proxy;
174
175 if (srv->state == srv->prev_state &&
176 srv->eweight == srv->prev_eweight)
177 return;
178
179 if (!srv_is_usable(srv->state, srv->eweight))
180 goto out_update_state;
181
182 if (srv_is_usable(srv->prev_state, srv->prev_eweight))
183 /* server was already up */
184 goto out_update_backend;
185
186 if (srv->state & SRV_BACKUP) {
187 p->lbprm.tot_wbck += srv->eweight;
188 p->srv_bck++;
189
190 if (!(p->options & PR_O_USE_ALL_BK)) {
191 if (!p->lbprm.fbck) {
192 /* there was no backup server anymore */
193 p->lbprm.fbck = srv;
194 } else {
195 /* we may have restored a backup server prior to fbck,
196 * in which case it should replace it.
197 */
198 struct server *srv2 = srv;
199 do {
200 srv2 = srv2->next;
201 } while (srv2 && (srv2 != p->lbprm.fbck));
202 if (srv2)
203 p->lbprm.fbck = srv;
204 }
205 }
206 } else {
207 p->lbprm.tot_wact += srv->eweight;
208 p->srv_act++;
209 }
210
211 /* note that eweight cannot be 0 here */
212 chash_queue_dequeue_srv(srv);
213
214 out_update_backend:
215 /* check/update tot_used, tot_weight */
216 update_backend_weight(p);
217 out_update_state:
218 srv->prev_state = srv->state;
219 srv->prev_eweight = srv->eweight;
220}
221
222/* This function must be called after an update to server <srv>'s effective
223 * weight. It may be called after a state change too.
224 */
225static void chash_update_server_weight(struct server *srv)
226{
227 int old_state, new_state;
228 struct proxy *p = srv->proxy;
229
230 if (srv->state == srv->prev_state &&
231 srv->eweight == srv->prev_eweight)
232 return;
233
234 /* If changing the server's weight changes its state, we simply apply
235 * the procedures we already have for status change. If the state
236 * remains down, the server is not in any tree, so it's as easy as
237 * updating its values. If the state remains up with different weights,
238 * there are some computations to perform to find a new place and
239 * possibly a new tree for this server.
240 */
241
242 old_state = srv_is_usable(srv->prev_state, srv->prev_eweight);
243 new_state = srv_is_usable(srv->state, srv->eweight);
244
245 if (!old_state && !new_state) {
246 srv->prev_state = srv->state;
247 srv->prev_eweight = srv->eweight;
248 return;
249 }
250 else if (!old_state && new_state) {
251 chash_set_server_status_up(srv);
252 return;
253 }
254 else if (old_state && !new_state) {
255 chash_set_server_status_down(srv);
256 return;
257 }
258
259 /* only adjust the server's presence in the tree */
260 chash_queue_dequeue_srv(srv);
261
262 if (srv->state & SRV_BACKUP)
263 p->lbprm.tot_wbck += srv->eweight - srv->prev_eweight;
264 else
265 p->lbprm.tot_wact += srv->eweight - srv->prev_eweight;
266
267 update_backend_weight(p);
268 srv->prev_state = srv->state;
269 srv->prev_eweight = srv->eweight;
270}
271
272/*
273 * This function returns the running server from the CHASH tree, which is at
274 * the closest distance from the value of <hash>. Doing so ensures that even
275 * with a well imbalanced hash, if some servers are close to each other, they
276 * will still both receive traffic. If any server is found, it will be returned.
277 * If no valid server is found, NULL is returned.
278 */
279struct server *chash_get_server_hash(struct proxy *p, unsigned int hash)
280{
281 struct eb32_node *next, *prev;
282 struct server *nsrv, *psrv;
283 struct eb_root *root;
284 unsigned int dn, dp;
285
286 if (p->srv_act)
287 root = &p->lbprm.chash.act;
288 else if (p->lbprm.fbck)
289 return p->lbprm.fbck;
290 else if (p->srv_bck)
291 root = &p->lbprm.chash.bck;
292 else
293 return NULL;
294
295 hash = chash_hash(hash);
296
297 /* find the node after and the node before */
298 next = eb32_lookup_ge(root, hash);
299 if (!next)
300 next = eb32_first(root);
301 if (!next)
302 return NULL; /* tree is empty */
303
304 prev = eb32_prev(next);
305 if (!prev)
306 prev = eb32_last(root);
307
308 nsrv = eb32_entry(next, struct tree_occ, node)->server;
309 psrv = eb32_entry(prev, struct tree_occ, node)->server;
310 if (nsrv == psrv)
311 return nsrv;
312
313 /* OK we're located between two distinct servers, let's
314 * compare distances between hash and the two servers
315 * and select the closest server.
316 */
317 dp = hash - prev->key;
318 dn = next->key - hash;
319
320 return (dp <= dn) ? psrv : nsrv;
321}
322
323/* Return next server from the CHASH tree in backend <p>. If the tree is empty,
324 * return NULL. Saturated servers are skipped.
325 */
/* Return next server from the CHASH tree in backend <p>. If the tree is empty,
 * return NULL. Saturated servers are skipped.
 */
struct server *chash_get_next_server(struct proxy *p, struct server *srvtoavoid)
{
	struct server *srv, *avoided;
	struct eb32_node *node, *stop, *avoided_node;
	struct eb_root *root;

	srv = avoided = NULL;
	avoided_node = NULL;

	/* select the tree to walk: active servers if any, otherwise the
	 * designated first backup, otherwise the backup tree.
	 */
	if (p->srv_act)
		root = &p->lbprm.chash.act;
	else if (p->lbprm.fbck)
		return p->lbprm.fbck;
	else if (p->srv_bck)
		root = &p->lbprm.chash.bck;
	else
		return NULL;

	/* resume the walk right after the node served last time, wrapping
	 * around the tree; <stop> marks the point where a full turn has been
	 * completed without finding an acceptable server.
	 */
	stop = node = p->lbprm.chash.last;
	do {
		struct server *s;

		if (node)
			node = eb32_next(node);
		if (!node)
			node = eb32_first(root);

		p->lbprm.chash.last = node;
		if (!node)
			/* no node is available */
			return NULL;

		/* OK, we have a server. However, it may be saturated, in which
		 * case we don't want to reconsider it for now, so we'll simply
		 * skip it. Same if it's the server we try to avoid, in which
		 * case we simply remember it for later use if needed.
		 */
		s = eb32_entry(node, struct tree_occ, node)->server;
		if (!s->maxconn || (!s->nbpend && s->served < srv_dynamic_maxconn(s))) {
			if (s != srvtoavoid) {
				srv = s;
				break;
			}
			avoided = s;
			avoided_node = node;
		}
	} while (node != stop);

	/* every acceptable node belonged to the avoided server: fall back to
	 * it and park the walk pointer on its node (both NULL if none at all).
	 */
	if (!srv) {
		srv = avoided;
		p->lbprm.chash.last = avoided_node;
	}

	return srv;
}
381
382/* This function is responsible for building the active and backup trees for
383 * constistent hashing. The servers receive an array of initialized nodes
384 * with their assigned keys. It also sets p->lbprm.wdiv to the eweight to
385 * uweight ratio.
386 */
387void chash_init_server_tree(struct proxy *p)
388{
389 struct server *srv;
390 struct eb_root init_head = EB_ROOT;
391 int node;
392
393 p->lbprm.set_server_status_up = chash_set_server_status_up;
394 p->lbprm.set_server_status_down = chash_set_server_status_down;
395 p->lbprm.update_server_eweight = chash_update_server_weight;
396 p->lbprm.server_take_conn = NULL;
397 p->lbprm.server_drop_conn = NULL;
398
399 p->lbprm.wdiv = BE_WEIGHT_SCALE;
400 for (srv = p->srv; srv; srv = srv->next) {
401 srv->prev_eweight = srv->eweight = srv->uweight * BE_WEIGHT_SCALE;
402 srv->prev_state = srv->state;
403 }
404
405 recount_servers(p);
406 update_backend_weight(p);
407
408 p->lbprm.chash.act = init_head;
409 p->lbprm.chash.bck = init_head;
410 p->lbprm.chash.last = NULL;
411
412 /* queue active and backup servers in two distinct groups */
413 for (srv = p->srv; srv; srv = srv->next) {
414 srv->lb_tree = (srv->state & SRV_BACKUP) ? &p->lbprm.chash.bck : &p->lbprm.chash.act;
415 srv->lb_nodes_tot = srv->uweight * BE_WEIGHT_SCALE;
416 srv->lb_nodes_now = 0;
417 srv->lb_nodes = (struct tree_occ *)calloc(srv->lb_nodes_tot, sizeof(struct tree_occ));
418
419 for (node = 0; node < srv->lb_nodes_tot; node++) {
420 srv->lb_nodes[node].server = srv;
421 srv->lb_nodes[node].node.key = chash_hash(srv->puid * SRV_EWGHT_RANGE + node);
422 }
423
424 if (srv_is_usable(srv->state, srv->eweight))
425 chash_queue_dequeue_srv(srv);
426 }
427}