blob: 9b0f0aa0019a9744c5e3e83b6fa58026316a04bb [file] [log] [blame]
Willy Tarreaubaaee002006-06-26 02:48:02 +02001/*
2 * Backend variables and functions.
3 *
Willy Tarreaud825eef2007-05-12 22:35:00 +02004 * Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
Willy Tarreaubaaee002006-06-26 02:48:02 +02005 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13#include <errno.h>
14#include <fcntl.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <syslog.h>
Willy Tarreauf19cf372006-11-14 15:40:51 +010018#include <string.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020019
Willy Tarreau2dd0d472006-06-29 17:53:05 +020020#include <common/compat.h>
Willy Tarreaue3ba5f02006-06-29 18:54:54 +020021#include <common/config.h>
Willy Tarreaub625a082007-11-26 01:15:43 +010022#include <common/eb32tree.h>
Willy Tarreau2dd0d472006-06-29 17:53:05 +020023#include <common/time.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020024
25#include <types/buffers.h>
26#include <types/global.h>
27#include <types/polling.h>
28#include <types/proxy.h>
29#include <types/server.h>
30#include <types/session.h>
31
32#include <proto/backend.h>
Willy Tarreau14c8aac2007-05-08 19:46:30 +020033#include <proto/client.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020034#include <proto/fd.h>
Willy Tarreau80587432006-12-24 17:47:20 +010035#include <proto/httperr.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020036#include <proto/log.h>
37#include <proto/proto_http.h>
38#include <proto/queue.h>
39#include <proto/stream_sock.h>
40#include <proto/task.h>
41
Willy Tarreau77074d52006-11-12 23:57:19 +010042#ifdef CONFIG_HAP_CTTPROXY
43#include <import/ip_tproxy.h>
44#endif
Willy Tarreaubaaee002006-06-26 02:48:02 +020045
Willy Tarreau6d1a9882007-01-07 02:03:04 +010046#ifdef CONFIG_HAP_TCPSPLICE
47#include <libtcpsplice.h>
48#endif
49
/* Forward declarations of the FWRR helpers which are referenced by the
 * status/weight update functions before their definitions appear below.
 */
static void fwrr_get_srv(struct server *s);
static void fwrr_queue_srv(struct server *s);
static inline void fwrr_dequeue_srv(struct server *s);
static inline void fwrr_remove_from_tree(struct server *s);
static inline void fwrr_queue_by_weight(struct eb_root *root, struct server *s);
55
56/* This function returns non-zero if a server with the given weight and state
57 * is usable for LB, otherwise zero.
58 */
59static inline int srv_is_usable(int state, int weight)
60{
61 if (!weight)
62 return 0;
63 if (!(state & SRV_RUNNING))
64 return 0;
65 return 1;
66}
67
Willy Tarreaubaaee002006-06-26 02:48:02 +020068/*
69 * This function recounts the number of usable active and backup servers for
70 * proxy <p>. These numbers are returned into the p->srv_act and p->srv_bck.
Willy Tarreaub625a082007-11-26 01:15:43 +010071 * This function also recomputes the total active and backup weights. However,
72 * it does nout update tot_weight nor tot_used. Use update_backend_weight() for
73 * this.
Willy Tarreaubaaee002006-06-26 02:48:02 +020074 */
Willy Tarreaub625a082007-11-26 01:15:43 +010075static void recount_servers(struct proxy *px)
Willy Tarreaubaaee002006-06-26 02:48:02 +020076{
77 struct server *srv;
78
Willy Tarreau20697042007-11-15 23:26:18 +010079 px->srv_act = px->srv_bck = 0;
80 px->lbprm.tot_wact = px->lbprm.tot_wbck = 0;
Willy Tarreaub625a082007-11-26 01:15:43 +010081 px->lbprm.fbck = NULL;
Willy Tarreaubaaee002006-06-26 02:48:02 +020082 for (srv = px->srv; srv != NULL; srv = srv->next) {
Willy Tarreaub625a082007-11-26 01:15:43 +010083 if (!srv_is_usable(srv->state, srv->eweight))
84 continue;
85
86 if (srv->state & SRV_BACKUP) {
87 if (!px->srv_bck &&
88 !(px->options & PR_O_USE_ALL_BK))
89 px->lbprm.fbck = srv;
90 px->srv_bck++;
91 px->lbprm.tot_wbck += srv->eweight;
92 } else {
93 px->srv_act++;
94 px->lbprm.tot_wact += srv->eweight;
Willy Tarreaubaaee002006-06-26 02:48:02 +020095 }
96 }
Willy Tarreaub625a082007-11-26 01:15:43 +010097}
Willy Tarreau20697042007-11-15 23:26:18 +010098
Willy Tarreaub625a082007-11-26 01:15:43 +010099/* This function simply updates the backend's tot_weight and tot_used values
100 * after servers weights have been updated. It is designed to be used after
101 * recount_servers() or equivalent.
102 */
103static void update_backend_weight(struct proxy *px)
104{
Willy Tarreau20697042007-11-15 23:26:18 +0100105 if (px->srv_act) {
106 px->lbprm.tot_weight = px->lbprm.tot_wact;
107 px->lbprm.tot_used = px->srv_act;
108 }
Willy Tarreaub625a082007-11-26 01:15:43 +0100109 else if (px->lbprm.fbck) {
110 /* use only the first backup server */
111 px->lbprm.tot_weight = px->lbprm.fbck->eweight;
112 px->lbprm.tot_used = 1;
Willy Tarreau20697042007-11-15 23:26:18 +0100113 }
114 else {
Willy Tarreaub625a082007-11-26 01:15:43 +0100115 px->lbprm.tot_weight = px->lbprm.tot_wbck;
116 px->lbprm.tot_used = px->srv_bck;
Willy Tarreau20697042007-11-15 23:26:18 +0100117 }
Willy Tarreaub625a082007-11-26 01:15:43 +0100118}
119
120/* this function updates the map according to server <srv>'s new state */
121static void map_set_server_status_down(struct server *srv)
122{
123 struct proxy *p = srv->proxy;
124
125 if (srv->state == srv->prev_state &&
126 srv->eweight == srv->prev_eweight)
127 return;
128
129 /* FIXME: could be optimized since we know what changed */
130 recount_servers(p);
131 update_backend_weight(p);
132 srv->prev_state = srv->state;
133 srv->prev_eweight = srv->eweight;
134 p->lbprm.map.state |= PR_MAP_RECALC;
Willy Tarreau20697042007-11-15 23:26:18 +0100135
Willy Tarreaubaaee002006-06-26 02:48:02 +0200136}
137
Willy Tarreaub625a082007-11-26 01:15:43 +0100138/* this function updates the map according to server <srv>'s new state */
139static void map_set_server_status_up(struct server *srv)
140{
141 struct proxy *p = srv->proxy;
142
143 if (srv->state == srv->prev_state &&
144 srv->eweight == srv->prev_eweight)
145 return;
146
147 /* FIXME: could be optimized since we know what changed */
148 recount_servers(p);
149 update_backend_weight(p);
150 srv->prev_state = srv->state;
151 srv->prev_eweight = srv->eweight;
152 p->lbprm.map.state |= PR_MAP_RECALC;
153}
154
Willy Tarreau20697042007-11-15 23:26:18 +0100155/* This function recomputes the server map for proxy px. It relies on
156 * px->lbprm.tot_wact, tot_wbck, tot_used, tot_weight, so it must be
157 * called after recount_servers(). It also expects px->lbprm.map.srv
158 * to be allocated with the largest size needed. It updates tot_weight.
Willy Tarreaubaaee002006-06-26 02:48:02 +0200159 */
160void recalc_server_map(struct proxy *px)
161{
162 int o, tot, flag;
163 struct server *cur, *best;
164
Willy Tarreau20697042007-11-15 23:26:18 +0100165 switch (px->lbprm.tot_used) {
166 case 0: /* no server */
167 px->lbprm.map.state &= ~PR_MAP_RECALC;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200168 return;
Willy Tarreau20697042007-11-15 23:26:18 +0100169 case 1: /* only one server, just fill first entry */
170 tot = 1;
171 break;
172 default:
173 tot = px->lbprm.tot_weight;
174 break;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200175 }
176
Willy Tarreau20697042007-11-15 23:26:18 +0100177 /* here we *know* that we have some servers */
178 if (px->srv_act)
179 flag = SRV_RUNNING;
180 else
181 flag = SRV_RUNNING | SRV_BACKUP;
182
Willy Tarreaubaaee002006-06-26 02:48:02 +0200183 /* this algorithm gives priority to the first server, which means that
184 * it will respect the declaration order for equivalent weights, and
185 * that whatever the weights, the first server called will always be
Willy Tarreau20697042007-11-15 23:26:18 +0100186 * the first declared. This is an important asumption for the backup
Willy Tarreaubaaee002006-06-26 02:48:02 +0200187 * case, where we want the first server only.
188 */
189 for (cur = px->srv; cur; cur = cur->next)
190 cur->wscore = 0;
191
192 for (o = 0; o < tot; o++) {
193 int max = 0;
194 best = NULL;
195 for (cur = px->srv; cur; cur = cur->next) {
196 if ((cur->state & (SRV_RUNNING | SRV_BACKUP)) == flag) {
197 int v;
198
199 /* If we are forced to return only one server, we don't want to
200 * go further, because we would return the wrong one due to
201 * divide overflow.
202 */
203 if (tot == 1) {
204 best = cur;
Willy Tarreau20697042007-11-15 23:26:18 +0100205 /* note that best->wscore will be wrong but we don't care */
Willy Tarreaubaaee002006-06-26 02:48:02 +0200206 break;
207 }
208
Willy Tarreau417fae02007-03-25 21:16:40 +0200209 cur->wscore += cur->eweight;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200210 v = (cur->wscore + tot) / tot; /* result between 0 and 3 */
211 if (best == NULL || v > max) {
212 max = v;
213 best = cur;
214 }
215 }
216 }
Willy Tarreau20697042007-11-15 23:26:18 +0100217 px->lbprm.map.srv[o] = best;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200218 best->wscore -= tot;
219 }
Willy Tarreau20697042007-11-15 23:26:18 +0100220 px->lbprm.map.state &= ~PR_MAP_RECALC;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200221}
222
Willy Tarreau5dc2fa62007-11-19 19:10:18 +0100223/* This function is responsible of building the server MAP for map-based LB
224 * algorithms, allocating the map, and setting p->lbprm.wmult to the GCD of the
225 * weights if applicable. It should be called only once per proxy, at config
226 * time.
227 */
228void init_server_map(struct proxy *p)
229{
230 struct server *srv;
231 int pgcd;
232 int act, bck;
233
Willy Tarreaub625a082007-11-26 01:15:43 +0100234 p->lbprm.set_server_status_up = map_set_server_status_up;
235 p->lbprm.set_server_status_down = map_set_server_status_down;
236 p->lbprm.update_server_eweight = NULL;
237
Willy Tarreau5dc2fa62007-11-19 19:10:18 +0100238 if (!p->srv)
239 return;
240
241 /* We will factor the weights to reduce the table,
242 * using Euclide's largest common divisor algorithm
243 */
244 pgcd = p->srv->uweight;
245 for (srv = p->srv->next; srv && pgcd > 1; srv = srv->next) {
246 int w = srv->uweight;
247 while (w) {
248 int t = pgcd % w;
249 pgcd = w;
250 w = t;
251 }
252 }
253
254 /* It is sometimes useful to know what factor to apply
255 * to the backend's effective weight to know its real
256 * weight.
257 */
258 p->lbprm.wmult = pgcd;
259
260 act = bck = 0;
261 for (srv = p->srv; srv; srv = srv->next) {
262 srv->eweight = srv->uweight / pgcd;
Willy Tarreaub625a082007-11-26 01:15:43 +0100263 srv->prev_eweight = srv->eweight;
264 srv->prev_state = srv->state;
Willy Tarreau5dc2fa62007-11-19 19:10:18 +0100265 if (srv->state & SRV_BACKUP)
266 bck += srv->eweight;
267 else
268 act += srv->eweight;
269 }
270
271 /* this is the largest map we will ever need for this servers list */
272 if (act < bck)
273 act = bck;
274
275 p->lbprm.map.srv = (struct server **)calloc(act, sizeof(struct server *));
276 /* recounts servers and their weights */
277 p->lbprm.map.state = PR_MAP_RECALC;
278 recount_servers(p);
Willy Tarreaub625a082007-11-26 01:15:43 +0100279 update_backend_weight(p);
Willy Tarreau5dc2fa62007-11-19 19:10:18 +0100280 recalc_server_map(p);
281}
282
Willy Tarreaub625a082007-11-26 01:15:43 +0100283/* This function updates the server trees according to server <srv>'s new
284 * state. It should be called when server <srv>'s status changes to down.
285 * It is not important whether the server was already down or not. However,
286 * it is mandatory that the new state be down.
287 */
288static void fwrr_set_server_status_down(struct server *srv)
289{
290 struct proxy *p = srv->proxy;
291 struct fwrr_group *grp;
292
293 if (srv->state == srv->prev_state &&
294 srv->eweight == srv->prev_eweight)
295 return;
296
297 if (!srv_is_usable(srv->prev_state, srv->prev_eweight))
298 /* server was already down */
299 goto out_update_backend;
300
301 grp = (srv->state & SRV_BACKUP) ? &p->lbprm.fwrr.bck : &p->lbprm.fwrr.act;
302 grp->next_weight -= srv->prev_eweight;
303
304 if (srv->state & SRV_BACKUP) {
305 p->lbprm.tot_wbck = p->lbprm.fwrr.bck.next_weight;
306 p->srv_bck--;
307
308 if (srv == p->lbprm.fbck) {
309 /* we lost the first backup server in a single-backup
310 * configuration, we must search another one.
311 */
312 struct server *srv2 = p->lbprm.fbck;
313 do {
314 srv2 = srv2->next;
315 } while (srv2 &&
316 !((srv2->state & SRV_BACKUP) &&
317 srv_is_usable(srv2->state, srv2->eweight)));
318 p->lbprm.fbck = srv2;
319 }
320 } else {
321 p->lbprm.tot_wact = p->lbprm.fwrr.act.next_weight;
322 p->srv_act--;
323 }
324
325 fwrr_dequeue_srv(srv);
326 fwrr_remove_from_tree(srv);
327
328out_update_backend:
329 /* check/update tot_used, tot_weight */
330 update_backend_weight(p);
331 srv->prev_state = srv->state;
332 srv->prev_eweight = srv->eweight;
333
334}
335
336/* This function updates the server trees according to server <srv>'s new
337 * state. It should be called when server <srv>'s status changes to up.
338 * It is not important whether the server was already down or not. However,
339 * it is mandatory that the new state be up. This function will not change
340 * the weight of a server which was already up.
341 */
342static void fwrr_set_server_status_up(struct server *srv)
343{
344 struct proxy *p = srv->proxy;
345 struct fwrr_group *grp;
346
347 if (srv->state == srv->prev_state &&
348 srv->eweight == srv->prev_eweight)
349 return;
350
351 if (srv_is_usable(srv->prev_state, srv->prev_eweight))
352 /* server was already up */
353 goto out_update_backend;
354
355 grp = (srv->state & SRV_BACKUP) ? &p->lbprm.fwrr.bck : &p->lbprm.fwrr.act;
356 grp->next_weight += srv->eweight;
357
358 if (srv->state & SRV_BACKUP) {
359 p->lbprm.tot_wbck = p->lbprm.fwrr.bck.next_weight;
360 p->srv_bck++;
361
362 if (p->lbprm.fbck) {
363 /* we may have restored a backup server prior to fbck,
364 * in which case it should replace it.
365 */
366 struct server *srv2 = srv;
367 do {
368 srv2 = srv2->next;
369 } while (srv2 && (srv2 != p->lbprm.fbck));
370 if (srv2)
371 p->lbprm.fbck = srv;
372 }
373 } else {
374 p->lbprm.tot_wact = p->lbprm.fwrr.act.next_weight;
375 p->srv_act++;
376 }
377
378 /* note that eweight cannot be 0 here */
379 fwrr_get_srv(srv);
380 srv->npos = grp->curr_pos + (grp->next_weight + grp->curr_weight - grp->curr_pos) / srv->eweight;
381 fwrr_queue_srv(srv);
382
383out_update_backend:
384 /* check/update tot_used, tot_weight */
385 update_backend_weight(p);
386 srv->prev_state = srv->state;
387 srv->prev_eweight = srv->eweight;
388}
389
390/* This function must be called after an update to server <srv>'s effective
391 * weight. It may be called after a state change too.
392 */
393static void fwrr_update_server_weight(struct server *srv)
394{
395 int old_state, new_state;
396 struct proxy *p = srv->proxy;
397 struct fwrr_group *grp;
398
399 if (srv->state == srv->prev_state &&
400 srv->eweight == srv->prev_eweight)
401 return;
402
403 /* If changing the server's weight changes its state, we simply apply
404 * the procedures we already have for status change. If the state
405 * remains down, the server is not in any tree, so it's as easy as
406 * updating its values. If the state remains up with different weights,
407 * there are some computations to perform to find a new place and
408 * possibly a new tree for this server.
409 */
410
411 old_state = srv_is_usable(srv->prev_state, srv->prev_eweight);
412 new_state = srv_is_usable(srv->state, srv->eweight);
413
414 if (!old_state && !new_state) {
415 srv->prev_state = srv->state;
416 srv->prev_eweight = srv->eweight;
417 return;
418 }
419 else if (!old_state && new_state) {
420 fwrr_set_server_status_up(srv);
421 return;
422 }
423 else if (old_state && !new_state) {
424 fwrr_set_server_status_down(srv);
425 return;
426 }
427
428 grp = (srv->state & SRV_BACKUP) ? &p->lbprm.fwrr.bck : &p->lbprm.fwrr.act;
429 grp->next_weight = grp->next_weight - srv->prev_eweight + srv->eweight;
430
431 p->lbprm.tot_wact = p->lbprm.fwrr.act.next_weight;
432 p->lbprm.tot_wbck = p->lbprm.fwrr.bck.next_weight;
433
434 if (srv->lb_tree == grp->init) {
435 fwrr_dequeue_srv(srv);
436 fwrr_queue_by_weight(grp->init, srv);
437 }
438 else if (!srv->lb_tree) {
439 /* FIXME: server was down. This is not possible right now but
440 * may be needed soon for slowstart or graceful shutdown.
441 */
442 fwrr_dequeue_srv(srv);
443 fwrr_get_srv(srv);
444 srv->npos = grp->curr_pos + (grp->next_weight + grp->curr_weight - grp->curr_pos) / srv->eweight;
445 fwrr_queue_srv(srv);
446 } else {
447 /* The server is either active or in the next queue. If it's
448 * still in the active queue and it has not consumed all of its
449 * places, let's adjust its next position.
450 */
451 fwrr_get_srv(srv);
452
453 if (srv->eweight > 0) {
454 int prev_next = srv->npos;
455 int step = grp->next_weight / srv->eweight;
456
457 srv->npos = srv->lpos + step;
458 srv->rweight = 0;
459
460 if (srv->npos > prev_next)
461 srv->npos = prev_next;
462 if (srv->npos < grp->curr_pos + 2)
463 srv->npos = grp->curr_pos + step;
464 } else {
465 /* push it into the next tree */
466 srv->npos = grp->curr_pos + grp->curr_weight;
467 }
468
469 fwrr_dequeue_srv(srv);
470 fwrr_queue_srv(srv);
471 }
472
473 update_backend_weight(p);
474 srv->prev_state = srv->state;
475 srv->prev_eweight = srv->eweight;
476}
477
478/* Remove a server from a tree. It must have previously been dequeued. This
479 * function is meant to be called when a server is going down or has its
480 * weight disabled.
481 */
482static inline void fwrr_remove_from_tree(struct server *s)
483{
484 s->lb_tree = NULL;
485}
486
487/* Queue a server in the weight tree <root>, assuming the weight is >0.
488 * We want to sort them by inverted weights, because we need to place
489 * heavy servers first in order to get a smooth distribution.
490 */
491static inline void fwrr_queue_by_weight(struct eb_root *root, struct server *s)
492{
493 /* eweight can be as high as 256*255 */
494 s->lb_node.key = BE_WEIGHT_SCALE*255 - s->eweight;
495 eb32_insert(root, &s->lb_node);
496 s->lb_tree = root;
497}
498
499/* This function is responsible for building the weight trees in case of fast
500 * weighted round-robin. It also sets p->lbprm.wdiv to the eweight to uweight
501 * ratio. Both active and backup groups are initialized.
502 */
503void fwrr_init_server_groups(struct proxy *p)
504{
505 struct server *srv;
506 struct eb_root init_head = EB_ROOT;
507
508 p->lbprm.set_server_status_up = fwrr_set_server_status_up;
509 p->lbprm.set_server_status_down = fwrr_set_server_status_down;
510 p->lbprm.update_server_eweight = fwrr_update_server_weight;
511
512 p->lbprm.wdiv = BE_WEIGHT_SCALE;
513 for (srv = p->srv; srv; srv = srv->next) {
514 srv->prev_eweight = srv->eweight = srv->uweight * BE_WEIGHT_SCALE;
515 srv->prev_state = srv->state;
516 }
517
518 recount_servers(p);
519 update_backend_weight(p);
520
521 /* prepare the active servers group */
522 p->lbprm.fwrr.act.curr_pos = p->lbprm.fwrr.act.curr_weight =
523 p->lbprm.fwrr.act.next_weight = p->lbprm.tot_wact;
524 p->lbprm.fwrr.act.curr = p->lbprm.fwrr.act.t0 =
525 p->lbprm.fwrr.act.t1 = init_head;
526 p->lbprm.fwrr.act.init = &p->lbprm.fwrr.act.t0;
527 p->lbprm.fwrr.act.next = &p->lbprm.fwrr.act.t1;
528
529 /* prepare the backup servers group */
530 p->lbprm.fwrr.bck.curr_pos = p->lbprm.fwrr.bck.curr_weight =
531 p->lbprm.fwrr.bck.next_weight = p->lbprm.tot_wbck;
532 p->lbprm.fwrr.bck.curr = p->lbprm.fwrr.bck.t0 =
533 p->lbprm.fwrr.bck.t1 = init_head;
534 p->lbprm.fwrr.bck.init = &p->lbprm.fwrr.bck.t0;
535 p->lbprm.fwrr.bck.next = &p->lbprm.fwrr.bck.t1;
536
537 /* queue active and backup servers in two distinct groups */
538 for (srv = p->srv; srv; srv = srv->next) {
539 if (!srv_is_usable(srv->state, srv->eweight))
540 continue;
541 fwrr_queue_by_weight((srv->state & SRV_BACKUP) ?
542 p->lbprm.fwrr.bck.init :
543 p->lbprm.fwrr.act.init,
544 srv);
545 }
546}
547
548/* simply removes a server from a weight tree */
549static inline void fwrr_dequeue_srv(struct server *s)
550{
551 eb32_delete(&s->lb_node);
552}
553
554/* queues a server into the appropriate group and tree depending on its
555 * backup status, and ->npos. If the server is disabled, simply assign
556 * it to the NULL tree.
557 */
558static void fwrr_queue_srv(struct server *s)
559{
560 struct proxy *p = s->proxy;
561 struct fwrr_group *grp;
562
563 grp = (s->state & SRV_BACKUP) ? &p->lbprm.fwrr.bck : &p->lbprm.fwrr.act;
564
565 /* Delay everything which does not fit into the window and everything
566 * which does not fit into the theorical new window.
567 */
568 if (!srv_is_usable(s->state, s->eweight)) {
569 fwrr_remove_from_tree(s);
570 }
571 else if (s->eweight <= 0 ||
572 s->npos >= 2 * grp->curr_weight ||
573 s->npos >= grp->curr_weight + grp->next_weight) {
574 /* put into next tree, and readjust npos in case we could
575 * finally take this back to current. */
576 s->npos -= grp->curr_weight;
577 fwrr_queue_by_weight(grp->next, s);
578 }
579 else {
580 /* FIXME: we want to multiply by a constant to avoid overrides
581 * after weight changes, but this can easily overflow on 32-bit
582 * values. We need to change this for a 64-bit tree, and keep
583 * the 65536 factor for optimal smoothness (both rweight and
584 * eweight are 16 bit entities). s->npos is bound by the number
585 * of servers times the maximum eweight (~= nsrv << 16).
586 */
587 //s->lb_node.key = grp->curr_weight * s->npos + s->rweight - s->eweight;
588 //s->lb_node.key = 65536 * s->npos + s->rweight - s->eweight;
589 s->lb_node.key = 16 * s->npos + (s->rweight - s->eweight) / 4096;
590 eb32i_insert(&grp->curr, &s->lb_node);
591 s->lb_tree = &grp->curr;
592 }
593}
594
595/* prepares a server when extracting it from the "init" tree */
596static inline void fwrr_get_srv_init(struct server *s)
597{
598 s->npos = s->rweight = 0;
599}
600
601/* prepares a server when extracting it from the "next" tree */
602static inline void fwrr_get_srv_next(struct server *s)
603{
604 struct fwrr_group *grp = (s->state & SRV_BACKUP) ?
605 &s->proxy->lbprm.fwrr.bck :
606 &s->proxy->lbprm.fwrr.act;
607
608 s->npos += grp->curr_weight;
609}
610
611/* prepares a server when it was marked down */
612static inline void fwrr_get_srv_down(struct server *s)
613{
614 struct fwrr_group *grp = (s->state & SRV_BACKUP) ?
615 &s->proxy->lbprm.fwrr.bck :
616 &s->proxy->lbprm.fwrr.act;
617
618 s->npos = grp->curr_pos;
619}
620
621/* prepares a server when extracting it from its tree */
622static void fwrr_get_srv(struct server *s)
623{
624 struct proxy *p = s->proxy;
625 struct fwrr_group *grp = (s->state & SRV_BACKUP) ?
626 &p->lbprm.fwrr.bck :
627 &p->lbprm.fwrr.act;
628
629 if (s->lb_tree == grp->init) {
630 fwrr_get_srv_init(s);
631 }
632 else if (s->lb_tree == grp->next) {
633 fwrr_get_srv_next(s);
634 }
635 else if (s->lb_tree == NULL) {
636 fwrr_get_srv_down(s);
637 }
638}
639
640/* switches trees "init" and "next" for FWRR group <grp>. "init" should be empty
641 * when this happens, and "next" filled with servers sorted by weights.
642 */
643static inline void fwrr_switch_trees(struct fwrr_group *grp)
644{
645 struct eb_root *swap;
646 swap = grp->init;
647 grp->init = grp->next;
648 grp->next = swap;
649 grp->curr_weight = grp->next_weight;
650 grp->curr_pos = grp->curr_weight;
651}
652
653/* return next server from the current tree in FWRR group <grp>, or a server
654 * from the "init" tree if appropriate. If both trees are empty, return NULL.
655 */
656static struct server *fwrr_get_server_from_group(struct fwrr_group *grp)
657{
658 struct eb32_node *node;
659 struct server *s;
660
661 node = eb32_first(&grp->curr);
662 s = eb32_entry(node, struct server, lb_node);
663
664 if (!node || s->npos > grp->curr_pos) {
665 /* either we have no server left, or we have a hole */
666 struct eb32_node *node2;
667 node2 = eb32_first(grp->init);
668 if (node2) {
669 node = node2;
670 s = eb32_entry(node, struct server, lb_node);
671 fwrr_get_srv_init(s);
672 if (s->eweight == 0) /* FIXME: is it possible at all ? */
673 node = NULL;
674 }
675 }
676 if (node)
677 return s;
678 else
679 return NULL;
680}
681
682/* Computes next position of server <s> in the group. It is mandatory for <s>
683 * to have a non-zero, positive eweight.
684*/
685static inline void fwrr_update_position(struct fwrr_group *grp, struct server *s)
686{
687 if (!s->npos) {
688 /* first time ever for this server */
689 s->lpos = grp->curr_pos;
690 s->npos = grp->curr_pos + grp->next_weight / s->eweight;
691 s->rweight += grp->next_weight % s->eweight;
692
693 if (s->rweight >= s->eweight) {
694 s->rweight -= s->eweight;
695 s->npos++;
696 }
697 } else {
698 s->lpos = s->npos;
699 s->npos += grp->next_weight / s->eweight;
700 s->rweight += grp->next_weight % s->eweight;
701
702 if (s->rweight >= s->eweight) {
703 s->rweight -= s->eweight;
704 s->npos++;
705 }
706 }
707}
708
709/* Return next server from the current tree in backend <p>, or a server from
710 * the init tree if appropriate. If both trees are empty, return NULL.
711 * Saturated servers are skipped and requeued.
712 */
713static struct server *fwrr_get_next_server(struct proxy *p)
714{
715 struct server *srv;
716 struct fwrr_group *grp;
717 struct server *full;
718 int switched;
719
720 if (p->srv_act)
721 grp = &p->lbprm.fwrr.act;
722 else if (p->lbprm.fbck)
723 return p->lbprm.fbck;
724 else if (p->srv_bck)
725 grp = &p->lbprm.fwrr.bck;
726 else
727 return NULL;
728
729 switched = 0;
730 full = NULL; /* NULL-terminated list of saturated servers */
731 while (1) {
732 /* if we see an empty group, let's first try to collect weights
733 * which might have recently changed.
734 */
735 if (!grp->curr_weight)
736 grp->curr_pos = grp->curr_weight = grp->next_weight;
737
738 /* get first server from the "current" tree. When the end of
739 * the tree is reached, we may have to switch, but only once.
740 */
741 while (1) {
742 srv = fwrr_get_server_from_group(grp);
743 if (srv)
744 break;
745 if (switched)
746 goto requeue_servers;
747 switched = 1;
748 fwrr_switch_trees(grp);
749
750 }
751
752 /* OK, we have a server. However, it may be saturated, in which
753 * case we don't want to reconsider it for now. We'll update
754 * its position and dequeue it anyway, so that we can move it
755 * to a better place afterwards.
756 */
757 fwrr_update_position(grp, srv);
758 fwrr_dequeue_srv(srv);
759 grp->curr_pos++;
760 if (!srv->maxconn || srv->cur_sess < srv_dynamic_maxconn(srv))
761 break;
762
763 /* the server is saturated, let's chain it for later reinsertion */
764 srv->next_full = full;
765 full = srv;
766 }
767
768 /* OK, we got the best server, let's update it */
769 fwrr_queue_srv(srv);
770
771 requeue_servers:
772 if (unlikely(full)) {
773 if (switched) {
774 /* the tree has switched, requeue all extracted servers
775 * into "init", because their place was lost, and only
776 * their weight matters.
777 */
778 do {
779 fwrr_queue_by_weight(grp->init, full);
780 full = full->next_full;
781 } while (full);
782 } else {
783 /* requeue all extracted servers just as if they were consumed
784 * so that they regain their expected place.
785 */
786 do {
787 fwrr_queue_srv(full);
788 full = full->next_full;
789 } while (full);
790 }
791 }
792 return srv;
793}
794
Willy Tarreau01732802007-11-01 22:48:15 +0100795/*
796 * This function tries to find a running server for the proxy <px> following
797 * the URL parameter hash method. It looks for a specific parameter in the
798 * URL and hashes it to compute the server ID. This is useful to optimize
799 * performance by avoiding bounces between servers in contexts where sessions
800 * are shared but cookies are not usable. If the parameter is not found, NULL
801 * is returned. If any server is found, it will be returned. If no valid server
802 * is found, NULL is returned.
803 *
804 */
805struct server *get_server_ph(struct proxy *px, const char *uri, int uri_len)
806{
807 unsigned long hash = 0;
808 char *p;
809 int plen;
810
Willy Tarreau20697042007-11-15 23:26:18 +0100811 if (px->lbprm.tot_weight == 0)
Willy Tarreau01732802007-11-01 22:48:15 +0100812 return NULL;
813
Willy Tarreau20697042007-11-15 23:26:18 +0100814 if (px->lbprm.map.state & PR_MAP_RECALC)
815 recalc_server_map(px);
816
Willy Tarreau01732802007-11-01 22:48:15 +0100817 p = memchr(uri, '?', uri_len);
818 if (!p)
819 return NULL;
820 p++;
821
822 uri_len -= (p - uri);
823 plen = px->url_param_len;
824
825 if (uri_len <= plen)
826 return NULL;
827
828 while (uri_len > plen) {
829 /* Look for the parameter name followed by an equal symbol */
830 if (p[plen] == '=') {
831 /* skip the equal symbol */
832 uri = p;
833 p += plen + 1;
834 uri_len -= plen + 1;
835 if (memcmp(uri, px->url_param_name, plen) == 0) {
836 /* OK, we have the parameter here at <uri>, and
837 * the value after the equal sign, at <p>
838 */
839 while (uri_len && *p != '&') {
840 hash = *p + (hash << 6) + (hash << 16) - hash;
841 uri_len--;
842 p++;
843 }
Willy Tarreau20697042007-11-15 23:26:18 +0100844 return px->lbprm.map.srv[hash % px->lbprm.tot_weight];
Willy Tarreau01732802007-11-01 22:48:15 +0100845 }
846 }
847
848 /* skip to next parameter */
849 uri = p;
850 p = memchr(uri, '&', uri_len);
851 if (!p)
852 return NULL;
853 p++;
854 uri_len -= (p - uri);
855 }
856 return NULL;
857}
Willy Tarreaubaaee002006-06-26 02:48:02 +0200858
859/*
860 * This function marks the session as 'assigned' in direct or dispatch modes,
861 * or tries to assign one in balance mode, according to the algorithm. It does
862 * nothing if the session had already been assigned a server.
863 *
864 * It may return :
865 * SRV_STATUS_OK if everything is OK. s->srv will be valid.
866 * SRV_STATUS_NOSRV if no server is available. s->srv = NULL.
867 * SRV_STATUS_FULL if all servers are saturated. s->srv = NULL.
868 * SRV_STATUS_INTERNAL for other unrecoverable errors.
869 *
870 * Upon successful return, the session flag SN_ASSIGNED to indicate that it does
871 * not need to be called anymore. This usually means that s->srv can be trusted
872 * in balance and direct modes. This flag is not cleared, so it's to the caller
873 * to clear it if required (eg: redispatch).
874 *
875 */
876
877int assign_server(struct session *s)
878{
879#ifdef DEBUG_FULL
880 fprintf(stderr,"assign_server : s=%p\n",s);
881#endif
882
883 if (s->pend_pos)
884 return SRV_STATUS_INTERNAL;
885
886 if (!(s->flags & SN_ASSIGNED)) {
Willy Tarreaue2e27a52007-04-01 00:01:37 +0200887 if (s->be->options & PR_O_BALANCE) {
Willy Tarreau1a20a5d2007-11-01 21:08:19 +0100888 int len;
889
Willy Tarreau5d65bbb2007-01-21 12:47:26 +0100890 if (s->flags & SN_DIRECT) {
891 s->flags |= SN_ASSIGNED;
892 return SRV_STATUS_OK;
893 }
Willy Tarreau1a20a5d2007-11-01 21:08:19 +0100894
Willy Tarreaub625a082007-11-26 01:15:43 +0100895 if (!s->be->lbprm.tot_weight)
Willy Tarreaubaaee002006-06-26 02:48:02 +0200896 return SRV_STATUS_NOSRV;
897
Willy Tarreau1a20a5d2007-11-01 21:08:19 +0100898 switch (s->be->options & PR_O_BALANCE) {
899 case PR_O_BALANCE_RR:
Willy Tarreaub625a082007-11-26 01:15:43 +0100900 s->srv = fwrr_get_next_server(s->be);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200901 if (!s->srv)
902 return SRV_STATUS_FULL;
Willy Tarreau1a20a5d2007-11-01 21:08:19 +0100903 break;
904 case PR_O_BALANCE_SH:
Willy Tarreaubaaee002006-06-26 02:48:02 +0200905 if (s->cli_addr.ss_family == AF_INET)
906 len = 4;
907 else if (s->cli_addr.ss_family == AF_INET6)
908 len = 16;
909 else /* unknown IP family */
910 return SRV_STATUS_INTERNAL;
911
Willy Tarreaue2e27a52007-04-01 00:01:37 +0200912 s->srv = get_server_sh(s->be,
Willy Tarreaubaaee002006-06-26 02:48:02 +0200913 (void *)&((struct sockaddr_in *)&s->cli_addr)->sin_addr,
914 len);
Willy Tarreau1a20a5d2007-11-01 21:08:19 +0100915 break;
916 case PR_O_BALANCE_UH:
Willy Tarreau2fcb5002007-05-08 13:35:26 +0200917 /* URI hashing */
918 s->srv = get_server_uh(s->be,
919 s->txn.req.sol + s->txn.req.sl.rq.u,
920 s->txn.req.sl.rq.u_l);
Willy Tarreau01732802007-11-01 22:48:15 +0100921 break;
922 case PR_O_BALANCE_PH:
923 /* URL Parameter hashing */
924 s->srv = get_server_ph(s->be,
925 s->txn.req.sol + s->txn.req.sl.rq.u,
926 s->txn.req.sl.rq.u_l);
927 if (!s->srv) {
Willy Tarreaub625a082007-11-26 01:15:43 +0100928 /* parameter not found, fall back to round robin on the map */
Willy Tarreau01732802007-11-01 22:48:15 +0100929 s->srv = get_server_rr_with_conns(s->be);
930 if (!s->srv)
931 return SRV_STATUS_FULL;
932 }
Willy Tarreau1a20a5d2007-11-01 21:08:19 +0100933 break;
934 default:
935 /* unknown balancing algorithm */
Willy Tarreaubaaee002006-06-26 02:48:02 +0200936 return SRV_STATUS_INTERNAL;
Willy Tarreau1a20a5d2007-11-01 21:08:19 +0100937 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200938 }
Willy Tarreaue2e27a52007-04-01 00:01:37 +0200939 else if (!*(int *)&s->be->dispatch_addr.sin_addr &&
Willy Tarreau5d65bbb2007-01-21 12:47:26 +0100940 !(s->fe->options & PR_O_TRANSP)) {
Willy Tarreau1a1158b2007-01-20 11:07:46 +0100941 return SRV_STATUS_NOSRV;
Willy Tarreau5d65bbb2007-01-21 12:47:26 +0100942 }
943 s->flags |= SN_ASSIGNED;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200944 }
945 return SRV_STATUS_OK;
946}
947
948
949/*
950 * This function assigns a server address to a session, and sets SN_ADDR_SET.
951 * The address is taken from the currently assigned server, or from the
952 * dispatch or transparent address.
953 *
954 * It may return :
955 * SRV_STATUS_OK if everything is OK.
956 * SRV_STATUS_INTERNAL for other unrecoverable errors.
957 *
958 * Upon successful return, the session flag SN_ADDR_SET is set. This flag is
959 * not cleared, so it's to the caller to clear it if required.
960 *
961 */
962int assign_server_address(struct session *s)
963{
964#ifdef DEBUG_FULL
965 fprintf(stderr,"assign_server_address : s=%p\n",s);
966#endif
967
Willy Tarreaue2e27a52007-04-01 00:01:37 +0200968 if ((s->flags & SN_DIRECT) || (s->be->options & PR_O_BALANCE)) {
Willy Tarreaubaaee002006-06-26 02:48:02 +0200969 /* A server is necessarily known for this session */
970 if (!(s->flags & SN_ASSIGNED))
971 return SRV_STATUS_INTERNAL;
972
973 s->srv_addr = s->srv->addr;
974
975 /* if this server remaps proxied ports, we'll use
976 * the port the client connected to with an offset. */
977 if (s->srv->state & SRV_MAPPORTS) {
Willy Tarreau14c8aac2007-05-08 19:46:30 +0200978 if (!(s->fe->options & PR_O_TRANSP) && !(s->flags & SN_FRT_ADDR_SET))
979 get_frt_addr(s);
980 if (s->frt_addr.ss_family == AF_INET) {
981 s->srv_addr.sin_port = htons(ntohs(s->srv_addr.sin_port) +
982 ntohs(((struct sockaddr_in *)&s->frt_addr)->sin_port));
983 } else {
984 s->srv_addr.sin_port = htons(ntohs(s->srv_addr.sin_port) +
985 ntohs(((struct sockaddr_in6 *)&s->frt_addr)->sin6_port));
986 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200987 }
988 }
Willy Tarreaue2e27a52007-04-01 00:01:37 +0200989 else if (*(int *)&s->be->dispatch_addr.sin_addr) {
Willy Tarreaubaaee002006-06-26 02:48:02 +0200990 /* connect to the defined dispatch addr */
Willy Tarreaue2e27a52007-04-01 00:01:37 +0200991 s->srv_addr = s->be->dispatch_addr;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200992 }
Willy Tarreau73de9892006-11-30 11:40:23 +0100993 else if (s->fe->options & PR_O_TRANSP) {
Willy Tarreaubaaee002006-06-26 02:48:02 +0200994 /* in transparent mode, use the original dest addr if no dispatch specified */
995 socklen_t salen = sizeof(s->srv_addr);
996
997 if (get_original_dst(s->cli_fd, &s->srv_addr, &salen) == -1) {
998 qfprintf(stderr, "Cannot get original server address.\n");
999 return SRV_STATUS_INTERNAL;
1000 }
1001 }
Willy Tarreau1a1158b2007-01-20 11:07:46 +01001002 else {
1003 /* no server and no LB algorithm ! */
1004 return SRV_STATUS_INTERNAL;
1005 }
Willy Tarreaubaaee002006-06-26 02:48:02 +02001006
1007 s->flags |= SN_ADDR_SET;
1008 return SRV_STATUS_OK;
1009}
1010
1011
1012/* This function assigns a server to session <s> if required, and can add the
1013 * connection to either the assigned server's queue or to the proxy's queue.
1014 *
1015 * Returns :
1016 *
1017 * SRV_STATUS_OK if everything is OK.
1018 * SRV_STATUS_NOSRV if no server is available. s->srv = NULL.
1019 * SRV_STATUS_QUEUED if the connection has been queued.
1020 * SRV_STATUS_FULL if the server(s) is/are saturated and the
1021 * connection could not be queued.
1022 * SRV_STATUS_INTERNAL for other unrecoverable errors.
1023 *
1024 */
1025int assign_server_and_queue(struct session *s)
1026{
1027 struct pendconn *p;
1028 int err;
1029
1030 if (s->pend_pos)
1031 return SRV_STATUS_INTERNAL;
1032
1033 if (s->flags & SN_ASSIGNED) {
Elijah Epifanovacafc5f2007-10-25 20:15:38 +02001034 if (s->srv && s->srv->maxqueue > 0 && s->srv->nbpend >= s->srv->maxqueue) {
1035 s->flags &= ~(SN_DIRECT | SN_ASSIGNED | SN_ADDR_SET);
1036 s->srv = NULL;
1037 http_flush_cookie_flags(&s->txn);
1038 } else {
1039 /* a server does not need to be assigned, perhaps because we're in
1040 * direct mode, or in dispatch or transparent modes where the server
1041 * is not needed.
1042 */
1043 if (s->srv &&
1044 s->srv->maxconn && s->srv->cur_sess >= srv_dynamic_maxconn(s->srv)) {
1045 p = pendconn_add(s);
1046 if (p)
1047 return SRV_STATUS_QUEUED;
1048 else
1049 return SRV_STATUS_FULL;
1050 }
1051 return SRV_STATUS_OK;
Willy Tarreaubaaee002006-06-26 02:48:02 +02001052 }
Willy Tarreaubaaee002006-06-26 02:48:02 +02001053 }
1054
1055 /* a server needs to be assigned */
1056 err = assign_server(s);
1057 switch (err) {
1058 case SRV_STATUS_OK:
1059 /* in balance mode, we might have servers with connection limits */
1060 if (s->srv &&
1061 s->srv->maxconn && s->srv->cur_sess >= srv_dynamic_maxconn(s->srv)) {
1062 p = pendconn_add(s);
1063 if (p)
1064 return SRV_STATUS_QUEUED;
1065 else
1066 return SRV_STATUS_FULL;
1067 }
1068 return SRV_STATUS_OK;
1069
1070 case SRV_STATUS_FULL:
1071 /* queue this session into the proxy's queue */
1072 p = pendconn_add(s);
1073 if (p)
1074 return SRV_STATUS_QUEUED;
1075 else
1076 return SRV_STATUS_FULL;
1077
1078 case SRV_STATUS_NOSRV:
1079 case SRV_STATUS_INTERNAL:
1080 return err;
1081 default:
1082 return SRV_STATUS_INTERNAL;
1083 }
1084}
1085
1086
1087/*
1088 * This function initiates a connection to the server assigned to this session
1089 * (s->srv, s->srv_addr). It will assign a server if none is assigned yet.
1090 * It can return one of :
1091 * - SN_ERR_NONE if everything's OK
1092 * - SN_ERR_SRVTO if there are no more servers
1093 * - SN_ERR_SRVCL if the connection was refused by the server
1094 * - SN_ERR_PRXCOND if the connection has been limited by the proxy (maxconn)
1095 * - SN_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports, ...)
1096 * - SN_ERR_INTERNAL for any other purely internal errors
1097 * Additionnally, in the case of SN_ERR_RESOURCE, an emergency log will be emitted.
1098 */
1099int connect_server(struct session *s)
1100{
1101 int fd, err;
1102
1103 if (!(s->flags & SN_ADDR_SET)) {
1104 err = assign_server_address(s);
1105 if (err != SRV_STATUS_OK)
1106 return SN_ERR_INTERNAL;
1107 }
1108
1109 if ((fd = s->srv_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == -1) {
1110 qfprintf(stderr, "Cannot get a server socket.\n");
1111
1112 if (errno == ENFILE)
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001113 send_log(s->be, LOG_EMERG,
Willy Tarreaubaaee002006-06-26 02:48:02 +02001114 "Proxy %s reached system FD limit at %d. Please check system tunables.\n",
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001115 s->be->id, maxfd);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001116 else if (errno == EMFILE)
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001117 send_log(s->be, LOG_EMERG,
Willy Tarreaubaaee002006-06-26 02:48:02 +02001118 "Proxy %s reached process FD limit at %d. Please check 'ulimit-n' and restart.\n",
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001119 s->be->id, maxfd);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001120 else if (errno == ENOBUFS || errno == ENOMEM)
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001121 send_log(s->be, LOG_EMERG,
Willy Tarreaubaaee002006-06-26 02:48:02 +02001122 "Proxy %s reached system memory limit at %d sockets. Please check system tunables.\n",
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001123 s->be->id, maxfd);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001124 /* this is a resource error */
1125 return SN_ERR_RESOURCE;
1126 }
1127
1128 if (fd >= global.maxsock) {
1129 /* do not log anything there, it's a normal condition when this option
1130 * is used to serialize connections to a server !
1131 */
1132 Alert("socket(): not enough free sockets. Raise -n argument. Giving up.\n");
1133 close(fd);
1134 return SN_ERR_PRXCOND; /* it is a configuration limit */
1135 }
1136
Willy Tarreau6d1a9882007-01-07 02:03:04 +01001137#ifdef CONFIG_HAP_TCPSPLICE
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001138 if ((s->fe->options & s->be->options) & PR_O_TCPSPLICE) {
Willy Tarreau6d1a9882007-01-07 02:03:04 +01001139 /* TCP splicing supported by both FE and BE */
1140 tcp_splice_initfd(s->cli_fd, fd);
1141 }
1142#endif
1143
Willy Tarreaubaaee002006-06-26 02:48:02 +02001144 if ((fcntl(fd, F_SETFL, O_NONBLOCK)==-1) ||
1145 (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) == -1)) {
1146 qfprintf(stderr,"Cannot set client socket to non blocking mode.\n");
1147 close(fd);
1148 return SN_ERR_INTERNAL;
1149 }
1150
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001151 if (s->be->options & PR_O_TCP_SRV_KA)
Willy Tarreaubaaee002006-06-26 02:48:02 +02001152 setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, (char *) &one, sizeof(one));
1153
Alexandre Cassen87ea5482007-10-11 20:48:58 +02001154 if (s->be->options & PR_O_TCP_NOLING)
1155 setsockopt(fd, SOL_SOCKET, SO_LINGER, (struct linger *) &nolinger, sizeof(struct linger));
1156
Willy Tarreaubaaee002006-06-26 02:48:02 +02001157 /* allow specific binding :
1158 * - server-specific at first
1159 * - proxy-specific next
1160 */
1161 if (s->srv != NULL && s->srv->state & SRV_BIND_SRC) {
1162 setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *) &one, sizeof(one));
1163 if (bind(fd, (struct sockaddr *)&s->srv->source_addr, sizeof(s->srv->source_addr)) == -1) {
1164 Alert("Cannot bind to source address before connect() for server %s/%s. Aborting.\n",
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001165 s->be->id, s->srv->id);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001166 close(fd);
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001167 send_log(s->be, LOG_EMERG,
Willy Tarreaubaaee002006-06-26 02:48:02 +02001168 "Cannot bind to source address before connect() for server %s/%s.\n",
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001169 s->be->id, s->srv->id);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001170 return SN_ERR_RESOURCE;
1171 }
Willy Tarreau77074d52006-11-12 23:57:19 +01001172#ifdef CONFIG_HAP_CTTPROXY
1173 if (s->srv->state & SRV_TPROXY_MASK) {
1174 struct in_tproxy itp1, itp2;
1175 memset(&itp1, 0, sizeof(itp1));
1176
1177 itp1.op = TPROXY_ASSIGN;
1178 switch (s->srv->state & SRV_TPROXY_MASK) {
1179 case SRV_TPROXY_ADDR:
1180 itp1.v.addr.faddr = s->srv->tproxy_addr.sin_addr;
1181 itp1.v.addr.fport = s->srv->tproxy_addr.sin_port;
1182 break;
1183 case SRV_TPROXY_CLI:
1184 itp1.v.addr.fport = ((struct sockaddr_in *)&s->cli_addr)->sin_port;
1185 /* fall through */
1186 case SRV_TPROXY_CIP:
1187 /* FIXME: what can we do if the client connects in IPv6 ? */
1188 itp1.v.addr.faddr = ((struct sockaddr_in *)&s->cli_addr)->sin_addr;
1189 break;
1190 }
1191
1192 /* set connect flag on socket */
1193 itp2.op = TPROXY_FLAGS;
1194 itp2.v.flags = ITP_CONNECT | ITP_ONCE;
1195
1196 if (setsockopt(fd, SOL_IP, IP_TPROXY, &itp1, sizeof(itp1)) == -1 ||
1197 setsockopt(fd, SOL_IP, IP_TPROXY, &itp2, sizeof(itp2)) == -1) {
1198 Alert("Cannot bind to tproxy source address before connect() for server %s/%s. Aborting.\n",
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001199 s->be->id, s->srv->id);
Willy Tarreau77074d52006-11-12 23:57:19 +01001200 close(fd);
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001201 send_log(s->be, LOG_EMERG,
Willy Tarreau77074d52006-11-12 23:57:19 +01001202 "Cannot bind to tproxy source address before connect() for server %s/%s.\n",
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001203 s->be->id, s->srv->id);
Willy Tarreau77074d52006-11-12 23:57:19 +01001204 return SN_ERR_RESOURCE;
1205 }
1206 }
1207#endif
Willy Tarreaubaaee002006-06-26 02:48:02 +02001208 }
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001209 else if (s->be->options & PR_O_BIND_SRC) {
Willy Tarreaubaaee002006-06-26 02:48:02 +02001210 setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *) &one, sizeof(one));
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001211 if (bind(fd, (struct sockaddr *)&s->be->source_addr, sizeof(s->be->source_addr)) == -1) {
1212 Alert("Cannot bind to source address before connect() for proxy %s. Aborting.\n", s->be->id);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001213 close(fd);
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001214 send_log(s->be, LOG_EMERG,
Willy Tarreaubaaee002006-06-26 02:48:02 +02001215 "Cannot bind to source address before connect() for server %s/%s.\n",
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001216 s->be->id, s->srv->id);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001217 return SN_ERR_RESOURCE;
1218 }
Willy Tarreau77074d52006-11-12 23:57:19 +01001219#ifdef CONFIG_HAP_CTTPROXY
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001220 if (s->be->options & PR_O_TPXY_MASK) {
Willy Tarreau77074d52006-11-12 23:57:19 +01001221 struct in_tproxy itp1, itp2;
1222 memset(&itp1, 0, sizeof(itp1));
1223
1224 itp1.op = TPROXY_ASSIGN;
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001225 switch (s->be->options & PR_O_TPXY_MASK) {
Willy Tarreau77074d52006-11-12 23:57:19 +01001226 case PR_O_TPXY_ADDR:
1227 itp1.v.addr.faddr = s->srv->tproxy_addr.sin_addr;
1228 itp1.v.addr.fport = s->srv->tproxy_addr.sin_port;
1229 break;
1230 case PR_O_TPXY_CLI:
1231 itp1.v.addr.fport = ((struct sockaddr_in *)&s->cli_addr)->sin_port;
1232 /* fall through */
1233 case PR_O_TPXY_CIP:
1234 /* FIXME: what can we do if the client connects in IPv6 ? */
1235 itp1.v.addr.faddr = ((struct sockaddr_in *)&s->cli_addr)->sin_addr;
1236 break;
1237 }
1238
1239 /* set connect flag on socket */
1240 itp2.op = TPROXY_FLAGS;
1241 itp2.v.flags = ITP_CONNECT | ITP_ONCE;
1242
1243 if (setsockopt(fd, SOL_IP, IP_TPROXY, &itp1, sizeof(itp1)) == -1 ||
1244 setsockopt(fd, SOL_IP, IP_TPROXY, &itp2, sizeof(itp2)) == -1) {
1245 Alert("Cannot bind to tproxy source address before connect() for proxy %s. Aborting.\n",
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001246 s->be->id);
Willy Tarreau77074d52006-11-12 23:57:19 +01001247 close(fd);
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001248 send_log(s->be, LOG_EMERG,
Willy Tarreau77074d52006-11-12 23:57:19 +01001249 "Cannot bind to tproxy source address before connect() for server %s/%s.\n",
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001250 s->be->id, s->srv->id);
Willy Tarreau77074d52006-11-12 23:57:19 +01001251 return SN_ERR_RESOURCE;
1252 }
1253 }
1254#endif
Willy Tarreaubaaee002006-06-26 02:48:02 +02001255 }
1256
1257 if ((connect(fd, (struct sockaddr *)&s->srv_addr, sizeof(s->srv_addr)) == -1) &&
1258 (errno != EINPROGRESS) && (errno != EALREADY) && (errno != EISCONN)) {
1259
1260 if (errno == EAGAIN || errno == EADDRINUSE) {
1261 char *msg;
1262 if (errno == EAGAIN) /* no free ports left, try again later */
1263 msg = "no free ports";
1264 else
1265 msg = "local address already in use";
1266
1267 qfprintf(stderr,"Cannot connect: %s.\n",msg);
1268 close(fd);
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001269 send_log(s->be, LOG_EMERG,
Willy Tarreaubaaee002006-06-26 02:48:02 +02001270 "Connect() failed for server %s/%s: %s.\n",
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001271 s->be->id, s->srv->id, msg);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001272 return SN_ERR_RESOURCE;
1273 } else if (errno == ETIMEDOUT) {
1274 //qfprintf(stderr,"Connect(): ETIMEDOUT");
1275 close(fd);
1276 return SN_ERR_SRVTO;
1277 } else {
1278 // (errno == ECONNREFUSED || errno == ENETUNREACH || errno == EACCES || errno == EPERM)
1279 //qfprintf(stderr,"Connect(): %d", errno);
1280 close(fd);
1281 return SN_ERR_SRVCL;
1282 }
1283 }
1284
1285 fdtab[fd].owner = s->task;
Willy Tarreaubaaee002006-06-26 02:48:02 +02001286 fdtab[fd].state = FD_STCONN; /* connection in progress */
Willy Tarreaud7971282006-07-29 18:36:34 +02001287 fdtab[fd].cb[DIR_RD].f = &stream_sock_read;
Willy Tarreau54469402006-07-29 16:59:06 +02001288 fdtab[fd].cb[DIR_RD].b = s->rep;
Willy Tarreauf8306d52006-07-29 19:01:31 +02001289 fdtab[fd].cb[DIR_WR].f = &stream_sock_write;
Willy Tarreau54469402006-07-29 16:59:06 +02001290 fdtab[fd].cb[DIR_WR].b = s->req;
Willy Tarreaue94ebd02007-10-09 17:14:37 +02001291
1292 fdtab[fd].peeraddr = (struct sockaddr *)&s->srv_addr;
1293 fdtab[fd].peerlen = sizeof(s->srv_addr);
1294
Willy Tarreauf161a342007-04-08 16:59:42 +02001295 EV_FD_SET(fd, DIR_WR); /* for connect status */
Willy Tarreaubaaee002006-06-26 02:48:02 +02001296
1297 fd_insert(fd);
1298 if (s->srv) {
1299 s->srv->cur_sess++;
1300 if (s->srv->cur_sess > s->srv->cur_sess_max)
1301 s->srv->cur_sess_max = s->srv->cur_sess;
1302 }
1303
Willy Tarreaua8b55e32007-05-13 16:08:19 +02001304 if (!tv_add_ifset(&s->req->cex, &now, &s->be->contimeout))
Willy Tarreaud7971282006-07-29 18:36:34 +02001305 tv_eternity(&s->req->cex);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001306 return SN_ERR_NONE; /* connection is OK */
1307}
1308
1309
1310/*
1311 * This function checks the retry count during the connect() job.
1312 * It updates the session's srv_state and retries, so that the caller knows
1313 * what it has to do. It uses the last connection error to set the log when
1314 * it expires. It returns 1 when it has expired, and 0 otherwise.
1315 */
1316int srv_count_retry_down(struct session *t, int conn_err)
1317{
1318 /* we are in front of a retryable error */
1319 t->conn_retries--;
Krzysztof Oledzki1cf36ba2007-10-18 19:12:30 +02001320 if (t->srv)
1321 t->srv->retries++;
1322 t->be->retries++;
1323
Willy Tarreaubaaee002006-06-26 02:48:02 +02001324 if (t->conn_retries < 0) {
1325 /* if not retryable anymore, let's abort */
Willy Tarreaud7971282006-07-29 18:36:34 +02001326 tv_eternity(&t->req->cex);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001327 srv_close_with_err(t, conn_err, SN_FINST_C,
Willy Tarreau80587432006-12-24 17:47:20 +01001328 503, error_message(t, HTTP_ERR_503));
Willy Tarreaubaaee002006-06-26 02:48:02 +02001329 if (t->srv)
1330 t->srv->failed_conns++;
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001331 t->be->failed_conns++;
Willy Tarreaubaaee002006-06-26 02:48:02 +02001332
1333 /* We used to have a free connection slot. Since we'll never use it,
1334 * we have to inform the server that it may be used by another session.
1335 */
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001336 if (may_dequeue_tasks(t->srv, t->be))
Willy Tarreau96bcfd72007-04-29 10:41:56 +02001337 task_wakeup(t->srv->queue_mgt);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001338 return 1;
1339 }
1340 return 0;
1341}
1342
1343
1344/*
1345 * This function performs the retryable part of the connect() job.
1346 * It updates the session's srv_state and retries, so that the caller knows
1347 * what it has to do. It returns 1 when it breaks out of the loop, or 0 if
1348 * it needs to redispatch.
1349 */
1350int srv_retryable_connect(struct session *t)
1351{
1352 int conn_err;
1353
1354 /* This loop ensures that we stop before the last retry in case of a
1355 * redispatchable server.
1356 */
1357 do {
1358 /* initiate a connection to the server */
1359 conn_err = connect_server(t);
1360 switch (conn_err) {
1361
1362 case SN_ERR_NONE:
1363 //fprintf(stderr,"0: c=%d, s=%d\n", c, s);
1364 t->srv_state = SV_STCONN;
1365 return 1;
1366
1367 case SN_ERR_INTERNAL:
Willy Tarreaud7971282006-07-29 18:36:34 +02001368 tv_eternity(&t->req->cex);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001369 srv_close_with_err(t, SN_ERR_INTERNAL, SN_FINST_C,
Willy Tarreau80587432006-12-24 17:47:20 +01001370 500, error_message(t, HTTP_ERR_500));
Willy Tarreaubaaee002006-06-26 02:48:02 +02001371 if (t->srv)
1372 t->srv->failed_conns++;
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001373 t->be->failed_conns++;
Willy Tarreaubaaee002006-06-26 02:48:02 +02001374 /* release other sessions waiting for this server */
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001375 if (may_dequeue_tasks(t->srv, t->be))
Willy Tarreau96bcfd72007-04-29 10:41:56 +02001376 task_wakeup(t->srv->queue_mgt);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001377 return 1;
1378 }
1379 /* ensure that we have enough retries left */
1380 if (srv_count_retry_down(t, conn_err)) {
Willy Tarreaubaaee002006-06-26 02:48:02 +02001381 return 1;
1382 }
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001383 } while (t->srv == NULL || t->conn_retries > 0 || !(t->be->options & PR_O_REDISP));
Willy Tarreaubaaee002006-06-26 02:48:02 +02001384
1385 /* We're on our last chance, and the REDISP option was specified.
1386 * We will ignore cookie and force to balance or use the dispatcher.
1387 */
1388 /* let's try to offer this slot to anybody */
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001389 if (may_dequeue_tasks(t->srv, t->be))
Willy Tarreau96bcfd72007-04-29 10:41:56 +02001390 task_wakeup(t->srv->queue_mgt);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001391
1392 if (t->srv)
1393 t->srv->failed_conns++;
Krzysztof Oledzki1cf36ba2007-10-18 19:12:30 +02001394 t->be->redispatches++;
Willy Tarreaubaaee002006-06-26 02:48:02 +02001395
1396 t->flags &= ~(SN_DIRECT | SN_ASSIGNED | SN_ADDR_SET);
1397 t->srv = NULL; /* it's left to the dispatcher to choose a server */
Willy Tarreau3d300592007-03-18 18:34:41 +01001398 http_flush_cookie_flags(&t->txn);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001399 return 0;
1400}
1401
1402
1403/* This function performs the "redispatch" part of a connection attempt. It
1404 * will assign a server if required, queue the connection if required, and
1405 * handle errors that might arise at this level. It can change the server
1406 * state. It will return 1 if it encounters an error, switches the server
1407 * state, or has to queue a connection. Otherwise, it will return 0 indicating
1408 * that the connection is ready to use.
1409 */
1410
1411int srv_redispatch_connect(struct session *t)
1412{
1413 int conn_err;
1414
1415 /* We know that we don't have any connection pending, so we will
1416 * try to get a new one, and wait in this state if it's queued
1417 */
1418 conn_err = assign_server_and_queue(t);
1419 switch (conn_err) {
1420 case SRV_STATUS_OK:
1421 break;
1422
1423 case SRV_STATUS_NOSRV:
1424 /* note: it is guaranteed that t->srv == NULL here */
Willy Tarreaud7971282006-07-29 18:36:34 +02001425 tv_eternity(&t->req->cex);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001426 srv_close_with_err(t, SN_ERR_SRVTO, SN_FINST_C,
Willy Tarreau80587432006-12-24 17:47:20 +01001427 503, error_message(t, HTTP_ERR_503));
Willy Tarreaubaaee002006-06-26 02:48:02 +02001428 if (t->srv)
1429 t->srv->failed_conns++;
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001430 t->be->failed_conns++;
Willy Tarreaubaaee002006-06-26 02:48:02 +02001431
1432 return 1;
1433
1434 case SRV_STATUS_QUEUED:
1435 /* FIXME-20060503 : we should use the queue timeout instead */
Willy Tarreaua8b55e32007-05-13 16:08:19 +02001436 if (!tv_add_ifset(&t->req->cex, &now, &t->be->contimeout))
Willy Tarreaud7971282006-07-29 18:36:34 +02001437 tv_eternity(&t->req->cex);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001438 t->srv_state = SV_STIDLE;
1439 /* do nothing else and do not wake any other session up */
1440 return 1;
1441
1442 case SRV_STATUS_FULL:
1443 case SRV_STATUS_INTERNAL:
1444 default:
Willy Tarreaud7971282006-07-29 18:36:34 +02001445 tv_eternity(&t->req->cex);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001446 srv_close_with_err(t, SN_ERR_INTERNAL, SN_FINST_C,
Willy Tarreau80587432006-12-24 17:47:20 +01001447 500, error_message(t, HTTP_ERR_500));
Willy Tarreaubaaee002006-06-26 02:48:02 +02001448 if (t->srv)
1449 t->srv->failed_conns++;
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001450 t->be->failed_conns++;
Willy Tarreaubaaee002006-06-26 02:48:02 +02001451
1452 /* release other sessions waiting for this server */
Willy Tarreaue2e27a52007-04-01 00:01:37 +02001453 if (may_dequeue_tasks(t->srv, t->be))
Willy Tarreau96bcfd72007-04-29 10:41:56 +02001454 task_wakeup(t->srv->queue_mgt);
Willy Tarreaubaaee002006-06-26 02:48:02 +02001455 return 1;
1456 }
1457 /* if we get here, it's because we got SRV_STATUS_OK, which also
1458 * means that the connection has not been queued.
1459 */
1460 return 0;
1461}
1462
Krzysztof Oledzki85130942007-10-22 16:21:10 +02001463int be_downtime(struct proxy *px) {
Willy Tarreaub625a082007-11-26 01:15:43 +01001464 if (px->lbprm.tot_weight && px->last_change < now.tv_sec) // ignore negative time
Krzysztof Oledzki85130942007-10-22 16:21:10 +02001465 return px->down_time;
1466
1467 return now.tv_sec - px->last_change + px->down_time;
1468}
Willy Tarreaubaaee002006-06-26 02:48:02 +02001469
Willy Tarreaua0cbda62007-11-01 21:39:54 +01001470/* This function parses a "balance" statement in a backend section describing
1471 * <curproxy>. It returns -1 if there is any error, otherwise zero. If it
1472 * returns -1, it may write an error message into ther <err> buffer, for at
1473 * most <errlen> bytes, trailing zero included. The trailing '\n' will not be
1474 * written. The function must be called with <args> pointing to the first word
1475 * after "balance".
1476 */
1477int backend_parse_balance(const char **args, char *err, int errlen, struct proxy *curproxy)
1478{
1479 if (!*(args[0])) {
1480 /* if no option is set, use round-robin by default */
1481 curproxy->options &= ~PR_O_BALANCE;
1482 curproxy->options |= PR_O_BALANCE_RR;
1483 return 0;
1484 }
1485
1486 if (!strcmp(args[0], "roundrobin")) {
1487 curproxy->options &= ~PR_O_BALANCE;
1488 curproxy->options |= PR_O_BALANCE_RR;
1489 }
1490 else if (!strcmp(args[0], "source")) {
1491 curproxy->options &= ~PR_O_BALANCE;
1492 curproxy->options |= PR_O_BALANCE_SH;
1493 }
1494 else if (!strcmp(args[0], "uri")) {
1495 curproxy->options &= ~PR_O_BALANCE;
1496 curproxy->options |= PR_O_BALANCE_UH;
1497 }
Willy Tarreau01732802007-11-01 22:48:15 +01001498 else if (!strcmp(args[0], "url_param")) {
1499 if (!*args[1]) {
1500 snprintf(err, errlen, "'balance url_param' requires an URL parameter name.");
1501 return -1;
1502 }
1503 curproxy->options &= ~PR_O_BALANCE;
1504 curproxy->options |= PR_O_BALANCE_PH;
1505 if (curproxy->url_param_name)
1506 free(curproxy->url_param_name);
1507 curproxy->url_param_name = strdup(args[1]);
1508 curproxy->url_param_len = strlen(args[1]);
1509 }
Willy Tarreaua0cbda62007-11-01 21:39:54 +01001510 else {
Willy Tarreau01732802007-11-01 22:48:15 +01001511 snprintf(err, errlen, "'balance' only supports 'roundrobin', 'source', 'uri' and 'url_param' options.");
Willy Tarreaua0cbda62007-11-01 21:39:54 +01001512 return -1;
1513 }
1514 return 0;
1515}
1516
Willy Tarreaubaaee002006-06-26 02:48:02 +02001517/*
1518 * Local variables:
1519 * c-indent-level: 8
1520 * c-basic-offset: 8
1521 * End:
1522 */