Willy Tarreau | 18baa9d | 2007-11-26 01:24:50 +0100 | [diff] [blame] | 1 | /* |
| 2 | * experimental weighted round robin scheduler - (c) 2007 willy tarreau. |
| 3 | * |
| 4 | * This filling algorithm is excellent at spreading the servers, as it also |
| 5 | * takes care of keeping the most uniform distance between occurences of each |
| 6 | * server, by maximizing this distance. It reduces the number of variables |
| 7 | * and expensive operations. |
| 8 | */ |
| 9 | |
| 10 | #include <stdio.h> |
| 11 | #include <stdlib.h> |
| 12 | #include "eb32tree.h" |
| 13 | |
| 14 | struct srv { |
| 15 | struct eb32_node node; |
| 16 | struct eb_root *tree; // we want to know where the server is |
| 17 | int num; |
| 18 | int w; /* weight */ |
| 19 | int next, last; |
| 20 | int rem; |
| 21 | } *srv; |
| 22 | |
| 23 | /* those trees represent a sliding window of 3 time frames */ |
| 24 | struct eb_root tree_0 = EB_ROOT; |
| 25 | struct eb_root tree_1 = EB_ROOT; |
| 26 | struct eb_root tree_2 = EB_ROOT; |
| 27 | |
| 28 | struct eb_root *init_tree; /* receives positions 0..sw-1 */ |
| 29 | struct eb_root *next_tree; /* receives positions >= 2sw */ |
| 30 | |
| 31 | int nsrv; /* # of servers */ |
| 32 | int nsw, sw; /* sum of weights */ |
| 33 | int p; /* current position, between sw..2sw-1 */ |
| 34 | |
| 35 | /* queue a server in the weights tree */ |
| 36 | void queue_by_weight(struct eb_root *root, struct srv *s) { |
| 37 | s->node.key = 255 - s->w; |
Willy Tarreau | b698f0f | 2007-12-02 11:01:23 +0100 | [diff] [blame] | 38 | eb32_insert(root, &s->node); |
Willy Tarreau | 18baa9d | 2007-11-26 01:24:50 +0100 | [diff] [blame] | 39 | s->tree = root; |
| 40 | } |
| 41 | |
| 42 | /* queue a server in the weight tree <root>, except if its weight is 0 */ |
| 43 | void queue_by_weight_0(struct eb_root *root, struct srv *s) { |
| 44 | if (s->w) { |
| 45 | s->node.key = 255 - s->w; |
Willy Tarreau | b698f0f | 2007-12-02 11:01:23 +0100 | [diff] [blame] | 46 | eb32_insert(root, &s->node); |
Willy Tarreau | 18baa9d | 2007-11-26 01:24:50 +0100 | [diff] [blame] | 47 | s->tree = root; |
| 48 | } else { |
| 49 | s->tree = NULL; |
| 50 | } |
| 51 | } |
| 52 | |
| 53 | static inline void dequeue_srv(struct srv *s) { |
| 54 | eb32_delete(&s->node); |
| 55 | } |
| 56 | |
| 57 | /* queues a server into the correct tree depending on ->next */ |
| 58 | void put_srv(struct srv *s) { |
| 59 | if (s->w <= 0 || |
| 60 | s->next >= 2*sw || /* delay everything which does not fit into the window */ |
| 61 | s->next >= sw+nsw) { /* and everything which does not fit into the theorical new window */ |
| 62 | /* put into next tree */ |
| 63 | s->next -= sw; // readjust next in case we could finally take this back to current. |
| 64 | queue_by_weight_0(next_tree, s); |
Willy Tarreau | b698f0f | 2007-12-02 11:01:23 +0100 | [diff] [blame] | 65 | } else { |
| 66 | // The overflow problem is caused by the scale we want to apply to user weight |
| 67 | // to turn it into effective weight. Since this is only used to provide a smooth |
| 68 | // slowstart on very low weights (1), it is a pure waste. Thus, we just have to |
| 69 | // apply a small scaling factor and warn the user that slowstart is not very smooth |
| 70 | // on low weights. |
| 71 | // The max key is about ((scale*maxw)*(scale*maxw)*nbsrv)/ratio (where the ratio is |
| 72 | // the arbitrary divide we perform in the examples above). Assuming that ratio==scale, |
| 73 | // this translates to maxkey=scale*maxw^2*nbsrv, so |
| 74 | // max_nbsrv=2^32/255^2/scale ~= 66051/scale |
| 75 | // Using a scale of 16 is enough to support 4000 servers without overflow, providing |
| 76 | // 6% steps during slowstart. |
| 77 | |
| 78 | s->node.key = 256 * s->next + (16*255 + s->rem - s->w) / 16; |
| 79 | |
| 80 | /* check for overflows */ |
| 81 | if ((int)s->node.key < 0) |
| 82 | printf(" OV: srv=%p w=%d rem=%d next=%d key=%d", s, s->w, s->rem, s->next, s->node.key); |
| 83 | eb32_insert(&tree_0, &s->node); |
Willy Tarreau | 18baa9d | 2007-11-26 01:24:50 +0100 | [diff] [blame] | 84 | s->tree = &tree_0; |
| 85 | } |
| 86 | } |
| 87 | |
| 88 | /* prepares a server when extracting it from the init tree */ |
| 89 | static inline void get_srv_init(struct srv *s) { |
| 90 | s->next = s->rem = 0; |
| 91 | } |
| 92 | |
| 93 | /* prepares a server when extracting it from the next tree */ |
| 94 | static inline void get_srv_next(struct srv *s) { |
| 95 | s->next += sw; |
| 96 | } |
| 97 | |
| 98 | /* prepares a server when extracting it from the next tree */ |
| 99 | static inline void get_srv_down(struct srv *s) { |
| 100 | s->next = p; |
| 101 | } |
| 102 | |
| 103 | /* prepares a server when extracting it from its tree */ |
| 104 | void get_srv(struct srv *s) { |
| 105 | if (s->tree == init_tree) { |
| 106 | get_srv_init(s); |
| 107 | } |
| 108 | else if (s->tree == next_tree) { |
| 109 | get_srv_next(s); |
| 110 | } |
| 111 | else if (s->tree == NULL) { |
| 112 | get_srv_down(s); |
| 113 | } |
| 114 | } |
| 115 | |
| 116 | |
| 117 | /* return next server from the current tree, or a server from the init tree |
| 118 | * if appropriate. If both trees are empty, return NULL. |
| 119 | */ |
| 120 | struct srv *get_next_server() { |
| 121 | struct eb32_node *node; |
| 122 | struct srv *s; |
| 123 | |
| 124 | node = eb32_first(&tree_0); |
| 125 | s = eb32_entry(node, struct srv, node); |
| 126 | |
| 127 | if (!node || s->next > p) { |
| 128 | /* either we have no server left, or we have a hole */ |
| 129 | struct eb32_node *node2; |
| 130 | node2 = eb32_first(init_tree); |
| 131 | if (node2) { |
| 132 | node = node2; |
| 133 | s = eb32_entry(node, struct srv, node); |
| 134 | get_srv_init(s); |
| 135 | if (s->w == 0) |
| 136 | node = NULL; |
Willy Tarreau | b698f0f | 2007-12-02 11:01:23 +0100 | [diff] [blame] | 137 | s->node.key = 0; // do not display random values |
Willy Tarreau | 18baa9d | 2007-11-26 01:24:50 +0100 | [diff] [blame] | 138 | } |
| 139 | } |
| 140 | if (node) |
| 141 | return s; |
| 142 | else |
| 143 | return NULL; |
| 144 | } |
| 145 | |
| 146 | void update_position(struct srv *s) { |
| 147 | //if (s->tree == init_tree) { |
| 148 | if (!s->next) { |
| 149 | // first time ever for this server |
| 150 | s->last = p; |
| 151 | s->next = p + nsw / s->w; |
| 152 | s->rem += nsw % s->w; |
| 153 | |
| 154 | if (s->rem >= s->w) { |
| 155 | s->rem -= s->w; |
| 156 | s->next++; |
| 157 | } |
| 158 | } else { |
| 159 | s->last = s->next; // or p ? |
| 160 | //s->next += sw / s->w; |
| 161 | //s->rem += sw % s->w; |
| 162 | s->next += nsw / s->w; |
| 163 | s->rem += nsw % s->w; |
| 164 | |
| 165 | if (s->rem >= s->w) { |
| 166 | s->rem -= s->w; |
| 167 | s->next++; |
| 168 | } |
| 169 | } |
| 170 | } |
| 171 | |
| 172 | |
| 173 | /* switches trees init_tree and next_tree. init_tree should be empty when |
| 174 | * this happens, and next_tree filled with servers sorted by weights. |
| 175 | */ |
| 176 | void switch_trees() { |
| 177 | struct eb_root *swap; |
| 178 | swap = init_tree; |
| 179 | init_tree = next_tree; |
| 180 | next_tree = swap; |
| 181 | sw = nsw; |
| 182 | p = sw; |
| 183 | } |
| 184 | |
| 185 | main(int argc, char **argv) { |
| 186 | int conns; |
| 187 | int i; |
| 188 | |
| 189 | struct srv *s; |
| 190 | |
| 191 | argc--; argv++; |
| 192 | nsrv = argc; |
| 193 | |
| 194 | if (!nsrv) |
| 195 | exit(1); |
| 196 | |
| 197 | srv = (struct srv *)calloc(nsrv, sizeof(struct srv)); |
| 198 | |
| 199 | sw = 0; |
| 200 | for (i = 0; i < nsrv; i++) { |
| 201 | s = &srv[i]; |
| 202 | s->num = i; |
| 203 | s->w = atol(argv[i]); |
| 204 | sw += s->w; |
| 205 | } |
| 206 | |
| 207 | nsw = sw; |
| 208 | |
| 209 | init_tree = &tree_1; |
| 210 | next_tree = &tree_2; |
| 211 | |
| 212 | /* and insert all the servers in the PREV tree */ |
| 213 | /* note that it is required to insert them according to |
| 214 | * the reverse order of their weights. |
| 215 | */ |
| 216 | printf("---------------:"); |
| 217 | for (i = 0; i < nsrv; i++) { |
| 218 | s = &srv[i]; |
| 219 | queue_by_weight_0(init_tree, s); |
| 220 | printf("%2d", s->w); |
| 221 | } |
| 222 | printf("\n"); |
| 223 | |
| 224 | p = sw; // time base of current tree |
| 225 | conns = 0; |
| 226 | while (1) { |
| 227 | struct eb32_node *node; |
| 228 | |
| 229 | printf("%08d|%06d: ", conns, p); |
| 230 | |
| 231 | /* if we have en empty tree, let's first try to collect weights |
| 232 | * which might have changed. |
| 233 | */ |
| 234 | if (!sw) { |
| 235 | if (nsw) { |
| 236 | sw = nsw; |
| 237 | p = sw; |
| 238 | /* do not switch trees, otherwise new servers (from init) |
| 239 | * would end up in next. |
| 240 | */ |
| 241 | //switch_trees(); |
| 242 | //printf("bla\n"); |
| 243 | } |
| 244 | else |
| 245 | goto next_iteration; |
| 246 | } |
| 247 | |
| 248 | s = get_next_server(); |
| 249 | if (!s) { |
| 250 | printf("----------- switch (empty) -- sw=%d -> %d ---------\n", sw, nsw); |
| 251 | switch_trees(); |
| 252 | s = get_next_server(); |
| 253 | printf("%08d|%06d: ", conns, p); |
| 254 | |
| 255 | if (!s) |
| 256 | goto next_iteration; |
| 257 | } |
| 258 | else if (s->next >= 2*sw) { |
| 259 | printf("ARGGGGG! s[%d].next=%d, max=%d\n", s->num, s->next, 2*sw-1); |
| 260 | } |
| 261 | |
| 262 | /* now we have THE server we want to put at this position */ |
| 263 | for (i = 0; i < s->num; i++) { |
| 264 | if (srv[i].w > 0) |
| 265 | printf(". "); |
| 266 | else |
| 267 | printf("_ "); |
| 268 | } |
| 269 | printf("# "); |
| 270 | for (i = s->num + 1; i < nsrv; i++) { |
| 271 | if (srv[i].w > 0) |
| 272 | printf(". "); |
| 273 | else |
| 274 | printf("_ "); |
| 275 | } |
| 276 | printf(" : "); |
| 277 | |
| 278 | printf("s=%02d v=%04d w=%03d n=%03d r=%03d ", |
| 279 | s->num, s->node.key, s->w, s->next, s->rem); |
| 280 | |
| 281 | update_position(s); |
| 282 | printf(" | next=%03d, rem=%03d ", s->next, s->rem); |
| 283 | |
| 284 | if (s->next >= sw * 2) { |
| 285 | dequeue_srv(s); |
| 286 | //queue_by_weight(next_tree, s); |
| 287 | put_srv(s); |
| 288 | printf(" => next (w=%d, n=%d) ", s->w, s->next); |
| 289 | } |
| 290 | else { |
| 291 | printf(" => curr "); |
| 292 | |
| 293 | //s->node.key = s->next; |
| 294 | /* we want to ensure that in case of conflicts, servers with |
| 295 | * the highest weights will get served first. Also, we still |
| 296 | * have the remainder to see where the entry expected to be |
| 297 | * inserted. |
| 298 | */ |
| 299 | //s->node.key = 256 * s->next + 255 - s->w; |
| 300 | //s->node.key = sw * s->next + sw / s->w; |
| 301 | //s->node.key = sw * s->next + s->rem; /// seems best (check with filltab15) ! |
| 302 | |
| 303 | //s->node.key = (2 * sw * s->next) + s->rem + sw / s->w; |
| 304 | |
| 305 | /* FIXME: must be optimized */ |
| 306 | dequeue_srv(s); |
| 307 | put_srv(s); |
| 308 | //eb32i_insert(&tree_0, &s->node); |
| 309 | //s->tree = &tree_0; |
| 310 | } |
| 311 | |
| 312 | next_iteration: |
| 313 | p++; |
| 314 | conns++; |
Willy Tarreau | b698f0f | 2007-12-02 11:01:23 +0100 | [diff] [blame] | 315 | if (/*conns == 30*/ /**/random()%100 == 0/**/) { |
| 316 | int w = /*20*//**/random()%4096/**/; |
Willy Tarreau | 18baa9d | 2007-11-26 01:24:50 +0100 | [diff] [blame] | 317 | int num = /*1*//**/random()%nsrv/**/; |
| 318 | struct srv *s = &srv[num]; |
| 319 | |
| 320 | nsw = nsw - s->w + w; |
| 321 | //sw=nsw; |
| 322 | |
| 323 | if (s->tree == init_tree) { |
| 324 | printf(" -- chgwght1(%d): %d->%d, n=%d --", s->num, s->w, w, s->next); |
| 325 | printf("(init)"); |
| 326 | s->w = w; |
| 327 | dequeue_srv(s); |
| 328 | queue_by_weight_0(s->tree, s); |
| 329 | } |
| 330 | else if (s->tree == NULL) { |
| 331 | printf(" -- chgwght2(%d): %d->%d, n=%d --", s->num, s->w, w, s->next); |
| 332 | printf("(down)"); |
| 333 | s->w = w; |
| 334 | dequeue_srv(s); |
| 335 | //queue_by_weight_0(init_tree, s); |
| 336 | get_srv(s); |
| 337 | s->next = p + (nsw + sw - p) / s->w; |
| 338 | put_srv(s); |
| 339 | } |
| 340 | else { |
| 341 | int oldnext; |
| 342 | |
| 343 | /* the server is either active or in the next queue */ |
| 344 | get_srv(s); |
| 345 | printf(" -- chgwght3(%d): %d->%d, n=%d, sw=%d, nsw=%d --", s->num, s->w, w, s->next, sw, nsw); |
| 346 | |
| 347 | oldnext = s->next; |
| 348 | s->w = w; |
| 349 | |
| 350 | /* we must measure how far we are from the end of the current window |
| 351 | * and try to fit their as many entries as should theorically be. |
| 352 | */ |
| 353 | |
| 354 | //s->w = s->w * (2*sw - p) / sw; |
| 355 | if (s->w > 0) { |
| 356 | int step = (nsw /*+ sw - p*/) / s->w; |
| 357 | s->next = s->last + step; |
| 358 | s->rem = 0; |
| 359 | if (s->next > oldnext) { |
| 360 | s->next = oldnext; |
| 361 | printf(" aaaaaaa "); |
| 362 | } |
| 363 | |
| 364 | if (s->next < p + 2) { |
| 365 | s->next = p + step; |
| 366 | printf(" bbbbbb "); |
| 367 | } |
| 368 | } else { |
| 369 | printf(" push -- "); |
| 370 | /* push it into the next tree */ |
| 371 | s->w = 0; |
| 372 | s->next = p + sw; |
| 373 | } |
| 374 | |
| 375 | |
| 376 | dequeue_srv(s); |
| 377 | printf(" n=%d", s->next); |
| 378 | put_srv(s); |
| 379 | } |
| 380 | } |
| 381 | |
| 382 | printf("\n"); |
| 383 | |
| 384 | if (0 && conns % 50000 == 0) { |
| 385 | printf("-------- %-5d : changing all weights ----\n", conns); |
| 386 | |
| 387 | for (i = 0; i < nsrv; i++) { |
| 388 | int w = i + 1; |
| 389 | s = &srv[i]; |
| 390 | nsw = nsw - s->w + w; |
| 391 | s->w = w; |
| 392 | dequeue_srv(s); |
| 393 | queue_by_weight_0(next_tree, s); // or init_tree ? |
| 394 | } |
| 395 | } |
| 396 | |
| 397 | } |
| 398 | } |
| 399 | |