[MAJOR] implementation of the "leastconn" load balancing algorithm

The new "leastconn" LB algorithm selects the server which has the
least established or pending connections. The weights are considered,
so that a server with a weight of 20 will get twice as many connections
as the server with a weight of 10.

The algorithm respects the minconn/maxconn settings, as well as the
slowstart since it is a dynamic algorithm. It also correctly supports
backup servers (one and all).

It is generally suited for protocols with long sessions (such as remote
terminals and databases), as it will ensure that upon restart, a server
with no connection will take all new ones until its load is balanced
with others.

A test configuration has been added in order to ease regression testing.
diff --git a/include/proto/backend.h b/include/proto/backend.h
index 1720105..03fa518 100644
--- a/include/proto/backend.h
+++ b/include/proto/backend.h
@@ -43,6 +43,7 @@
 int be_downtime(struct proxy *px);
 void init_server_map(struct proxy *p);
 void fwrr_init_server_groups(struct proxy *p);
+void fwlc_init_server_tree(struct proxy *p);
 
 /*
  * This function tries to find a running server with free connection slots for
diff --git a/include/types/backend.h b/include/types/backend.h
index 2b35860..2d62722 100644
--- a/include/types/backend.h
+++ b/include/types/backend.h
@@ -2,7 +2,7 @@
   include/types/backend.h
   This file rassembles definitions for backends
 
-  Copyright (C) 2000-2007 Willy Tarreau - w@1wt.eu
+  Copyright (C) 2000-2008 Willy Tarreau - w@1wt.eu
   
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -42,6 +42,7 @@
 #define BE_LB_ALGO_SH	(BE_LB_PROP_L4  | 0x02) /* balance on source IP hash */
 #define BE_LB_ALGO_UH	(BE_LB_PROP_L7  | 0x03) /* balance on URI hash */
 #define BE_LB_ALGO_PH	(BE_LB_PROP_L7  | 0x04) /* balance on URL parameter hash */
+#define BE_LB_ALGO_LC	(BE_LB_PROP_DYN | 0x05) /* fast weighted least-conns mode (dynamic) */
 
 /* various constants */
 
diff --git a/include/types/proxy.h b/include/types/proxy.h
index a2238cb..98baf53 100644
--- a/include/types/proxy.h
+++ b/include/types/proxy.h
@@ -150,9 +150,15 @@
 			struct fwrr_group act;	/* weighted round robin on the active servers */
 			struct fwrr_group bck;	/* weighted round robin on the backup servers */
 		} fwrr;
+		struct {
+			struct eb_root act;	/* weighted least conns on the active servers */
+			struct eb_root bck;	/* weighted least conns on the backup servers */
+		} fwlc;
 		void (*update_server_eweight)(struct server *);/* if non-NULL, to be called after eweight change */
 		void (*set_server_status_up)(struct server *);/* to be called after status changes to UP */
 		void (*set_server_status_down)(struct server *);/* to be called after status changes to DOWN */
+		void (*server_take_conn)(struct server *);/* to be called when connection is assigned */
+		void (*server_drop_conn)(struct server *);/* to be called when connection is dropped */
 	} lbprm;				/* LB parameters for all algorithms */
 
 	char *cookie_name;			/* name of the cookie to look for */
diff --git a/src/backend.c b/src/backend.c
index 666c218..436a4d9 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -829,6 +829,291 @@
 	return srv;
 }
 
+/* Remove a server from a tree. It must have previously been dequeued. This
+ * function is meant to be called when a server is going down or has its
+ * weight disabled.
+ */
+static inline void fwlc_remove_from_tree(struct server *s)
+{
+	s->lb_tree = NULL;
+}
+
+/* simply removes a server from a tree */
+static inline void fwlc_dequeue_srv(struct server *s)
+{
+	eb32_delete(&s->lb_node);
+}
+
+/* Queue a server in its associated tree, assuming the weight is >0.
+ * Servers are sorted by #conns/weight. To ensure maximum accuracy,
+ * we use #conns*SRV_EWGHT_MAX/eweight as the sorting key.
+ */
+static inline void fwlc_queue_srv(struct server *s)
+{
+	s->lb_node.key = s->cur_sess * SRV_EWGHT_MAX / s->eweight;
+	eb32_insert(s->lb_tree, &s->lb_node);
+}
+
+/* Re-position the server in the FWLC tree after it has been assigned one
+ * connection or after it has released one. Note that it is possible that
+ * the server has been moved out of the tree due to failed health-checks.
+ */
+static void fwlc_srv_reposition(struct server *s)
+{
+	if (!s->lb_tree)
+		return;
+	fwlc_dequeue_srv(s);
+	fwlc_queue_srv(s);
+}
+
+/* This function updates the server trees according to server <srv>'s new
+ * state. It should be called when server <srv>'s status changes to down.
+ * It is not important whether the server was already down or not. It is not
+ * important either that the new state is completely down (the caller may not
+ * know all the variables of a server's state).
+ */
+static void fwlc_set_server_status_down(struct server *srv)
+{
+	struct proxy *p = srv->proxy;
+
+	if (srv->state == srv->prev_state &&
+	    srv->eweight == srv->prev_eweight)
+		return;
+
+	if (srv_is_usable(srv->state, srv->eweight))
+		goto out_update_state;
+
+	if (!srv_is_usable(srv->prev_state, srv->prev_eweight))
+		/* server was already down */
+		goto out_update_backend;
+
+	if (srv->state & SRV_BACKUP) {
+		p->lbprm.tot_wbck -= srv->prev_eweight;
+		p->srv_bck--;
+
+		if (srv == p->lbprm.fbck) {
+			/* we lost the first backup server in a single-backup
+			 * configuration, we must search another one.
+			 */
+			struct server *srv2 = p->lbprm.fbck;
+			do {
+				srv2 = srv2->next;
+			} while (srv2 &&
+				 !((srv2->state & SRV_BACKUP) &&
+				   srv_is_usable(srv2->state, srv2->eweight)));
+			p->lbprm.fbck = srv2;
+		}
+	} else {
+		p->lbprm.tot_wact -= srv->prev_eweight;
+		p->srv_act--;
+	}
+
+	fwlc_dequeue_srv(srv);
+	fwlc_remove_from_tree(srv);
+
+out_update_backend:
+	/* check/update tot_used, tot_weight */
+	update_backend_weight(p);
+ out_update_state:
+	srv->prev_state = srv->state;
+	srv->prev_eweight = srv->eweight;
+}
+
+/* This function updates the server trees according to server <srv>'s new
+ * state. It should be called when server <srv>'s status changes to up.
+ * It is not important whether the server was already down or not. It is not
+ * important either that the new state is completely UP (the caller may not
+ * know all the variables of a server's state). This function will not change
+ * the weight of a server which was already up.
+ */
+static void fwlc_set_server_status_up(struct server *srv)
+{
+	struct proxy *p = srv->proxy;
+
+	if (srv->state == srv->prev_state &&
+	    srv->eweight == srv->prev_eweight)
+		return;
+
+	if (!srv_is_usable(srv->state, srv->eweight))
+		goto out_update_state;
+
+	if (srv_is_usable(srv->prev_state, srv->prev_eweight))
+		/* server was already up */
+		goto out_update_backend;
+
+	if (srv->state & SRV_BACKUP) {
+		srv->lb_tree = &p->lbprm.fwlc.bck;
+		p->lbprm.tot_wbck += srv->eweight;
+		p->srv_bck++;
+
+		if (!(p->options & PR_O_USE_ALL_BK)) {
+			if (!p->lbprm.fbck) {
+				/* there was no backup server anymore */
+				p->lbprm.fbck = srv;
+			} else {
+				/* we may have restored a backup server prior to fbck,
+				 * in which case it should replace it.
+				 */
+				struct server *srv2 = srv;
+				do {
+					srv2 = srv2->next;
+				} while (srv2 && (srv2 != p->lbprm.fbck));
+				if (srv2)
+					p->lbprm.fbck = srv;
+			}
+		}
+	} else {
+		srv->lb_tree = &p->lbprm.fwlc.act;
+		p->lbprm.tot_wact += srv->eweight;
+		p->srv_act++;
+	}
+
+	/* note that eweight cannot be 0 here */
+	fwlc_queue_srv(srv);
+
+ out_update_backend:
+	/* check/update tot_used, tot_weight */
+	update_backend_weight(p);
+ out_update_state:
+	srv->prev_state = srv->state;
+	srv->prev_eweight = srv->eweight;
+}
+
+/* This function must be called after an update to server <srv>'s effective
+ * weight. It may be called after a state change too.
+ */
+static void fwlc_update_server_weight(struct server *srv)
+{
+	int old_state, new_state;
+	struct proxy *p = srv->proxy;
+
+	if (srv->state == srv->prev_state &&
+	    srv->eweight == srv->prev_eweight)
+		return;
+
+	/* If changing the server's weight changes its state, we simply apply
+	 * the procedures we already have for status change. If the state
+	 * remains down, the server is not in any tree, so it's as easy as
+	 * updating its values. If the state remains up with different weights,
+	 * there are some computations to perform to find a new place and
+	 * possibly a new tree for this server.
+	 */
+	 
+	old_state = srv_is_usable(srv->prev_state, srv->prev_eweight);
+	new_state = srv_is_usable(srv->state, srv->eweight);
+
+	if (!old_state && !new_state) {
+		srv->prev_state = srv->state;
+		srv->prev_eweight = srv->eweight;
+		return;
+	}
+	else if (!old_state && new_state) {
+		fwlc_set_server_status_up(srv);
+		return;
+	}
+	else if (old_state && !new_state) {
+		fwlc_set_server_status_down(srv);
+		return;
+	}
+
+	if (srv->lb_tree)
+		fwlc_dequeue_srv(srv);
+
+	if (srv->state & SRV_BACKUP) {
+		p->lbprm.tot_wbck += srv->eweight - srv->prev_eweight;
+		srv->lb_tree = &p->lbprm.fwlc.bck;
+	} else {
+		p->lbprm.tot_wact += srv->eweight - srv->prev_eweight;
+		srv->lb_tree = &p->lbprm.fwlc.act;
+	}
+
+	fwlc_queue_srv(srv);
+
+	update_backend_weight(p);
+	srv->prev_state = srv->state;
+	srv->prev_eweight = srv->eweight;
+}
+
+/* This function is responsible for building the trees in case of fast
+ * weighted least-conns. It also sets p->lbprm.wdiv to the eweight to
+ * uweight ratio. Both active and backup groups are initialized.
+ */
+void fwlc_init_server_tree(struct proxy *p)
+{
+	struct server *srv;
+	struct eb_root init_head = EB_ROOT;
+
+	p->lbprm.set_server_status_up   = fwlc_set_server_status_up;
+	p->lbprm.set_server_status_down = fwlc_set_server_status_down;
+	p->lbprm.update_server_eweight  = fwlc_update_server_weight;
+	p->lbprm.server_take_conn = fwlc_srv_reposition;
+	p->lbprm.server_drop_conn = fwlc_srv_reposition;
+
+	p->lbprm.wdiv = BE_WEIGHT_SCALE;
+	for (srv = p->srv; srv; srv = srv->next) {
+		srv->prev_eweight = srv->eweight = srv->uweight * BE_WEIGHT_SCALE;
+		srv->prev_state = srv->state;
+	}
+
+	recount_servers(p);
+	update_backend_weight(p);
+
+	p->lbprm.fwlc.act = init_head;
+	p->lbprm.fwlc.bck = init_head;
+
+	/* queue active and backup servers in two distinct groups */
+	for (srv = p->srv; srv; srv = srv->next) {
+		if (!srv_is_usable(srv->state, srv->eweight))
+			continue;
+		srv->lb_tree = (srv->state & SRV_BACKUP) ? &p->lbprm.fwlc.bck : &p->lbprm.fwlc.act;
+		fwlc_queue_srv(srv);
+	}
+}
+
+/* Return next server from the FWLC tree in backend <p>. If the tree is empty,
+ * return NULL. Saturated servers are skipped.
+ */
+static struct server *fwlc_get_next_server(struct proxy *p, struct server *srvtoavoid)
+{
+	struct server *srv, *avoided;
+	struct eb32_node *node;
+
+	srv = avoided = NULL;
+
+	if (p->srv_act)
+		node = eb32_first(&p->lbprm.fwlc.act);
+	else if (p->lbprm.fbck)
+		return p->lbprm.fbck;
+	else if (p->srv_bck)
+		node = eb32_first(&p->lbprm.fwlc.bck);
+	else
+		return NULL;
+
+	while (node) {
+		/* OK, we have a server. However, it may be saturated, in which
+		 * case we don't want to reconsider it for now, so we'll simply
+		 * skip it. Same if it's the server we try to avoid, in which
+		 * case we simply remember it for later use if needed.
+		 */
+		struct server *s;
+
+		s = eb32_entry(node, struct server, lb_node);
+		if (!s->maxconn || s->cur_sess < srv_dynamic_maxconn(s)) {
+			if (s != srvtoavoid) {
+				srv = s;
+				break;
+			}
+			avoided = s;
+		}
+		node = eb32_next(node);
+	}
+
+	if (!srv)
+		srv = avoided;
+
+	return srv;
+}
+
 /* 
  * This function tries to find a running server for the proxy <px> following
  * the URL parameter hash method. It looks for a specific parameter in the
@@ -944,6 +1229,11 @@
 				if (!s->srv)
 					return SRV_STATUS_FULL;
 				break;
+			case BE_LB_ALGO_LC:
+				s->srv = fwlc_get_next_server(s->be, srvtoavoid);
+				if (!s->srv)
+					return SRV_STATUS_FULL;
+				break;
 			case BE_LB_ALGO_SH:
 				if (s->cli_addr.ss_family == AF_INET)
 					len = 4;
@@ -1376,6 +1666,8 @@
 		s->srv->cur_sess++;
 		if (s->srv->cur_sess > s->srv->cur_sess_max)
 			s->srv->cur_sess_max = s->srv->cur_sess;
+		if (s->be->lbprm.server_take_conn)
+			s->be->lbprm.server_take_conn(s->srv);
 	}
 
 	if (!tv_add_ifset(&s->req->cex, &now, &s->be->timeout.connect))
@@ -1564,6 +1856,10 @@
 		curproxy->lbprm.algo &= ~BE_LB_ALGO;
 		curproxy->lbprm.algo |= BE_LB_ALGO_RR;
 	}
+	else if (!strcmp(args[0], "leastconn")) {
+		curproxy->lbprm.algo &= ~BE_LB_ALGO;
+		curproxy->lbprm.algo |= BE_LB_ALGO_LC;
+	}
 	else if (!strcmp(args[0], "source")) {
 		curproxy->lbprm.algo &= ~BE_LB_ALGO;
 		curproxy->lbprm.algo |= BE_LB_ALGO_SH;
@@ -1585,7 +1881,7 @@
 		curproxy->url_param_len = strlen(args[1]);
 	}
 	else {
-		snprintf(err, errlen, "'balance' only supports 'roundrobin', 'source', 'uri' and 'url_param' options.");
+		snprintf(err, errlen, "'balance' only supports 'roundrobin', 'leastconn', 'source', 'uri' and 'url_param' options.");
 		return -1;
 	}
 	return 0;
diff --git a/src/cfgparse.c b/src/cfgparse.c
index 0b1a0ba..7dbadd1 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -2964,6 +2964,8 @@
 		/* round robin relies on a weight tree */
 		if ((curproxy->lbprm.algo & BE_LB_ALGO) == BE_LB_ALGO_RR)
 			fwrr_init_server_groups(curproxy);
+		else if ((curproxy->lbprm.algo & BE_LB_ALGO) == BE_LB_ALGO_LC)
+			fwlc_init_server_tree(curproxy);
 		else
 			init_server_map(curproxy);
 
diff --git a/src/proto_http.c b/src/proto_http.c
index f685201..d83e601 100644
--- a/src/proto_http.c
+++ b/src/proto_http.c
@@ -2569,8 +2569,11 @@
 		      t->be->options & PR_O_ABRT_CLOSE))) { /* give up */
 			tv_eternity(&req->cex);
 			fd_delete(t->srv_fd);
-			if (t->srv)
+			if (t->srv) {
 				t->srv->cur_sess--;
+				if (t->srv->proxy->lbprm.server_drop_conn)
+					t->srv->proxy->lbprm.server_drop_conn(t->srv);
+			}
 
 			/* note that this must not return any error because it would be able to
 			 * overwrite the client_retnclose() output.
@@ -2594,8 +2597,11 @@
 			}
 			else {
 				fd_delete(t->srv_fd);
-				if (t->srv)
+				if (t->srv) {
 					t->srv->cur_sess--;
+					if (t->srv->proxy->lbprm.server_drop_conn)
+						t->srv->proxy->lbprm.server_drop_conn(t->srv);
+				}
 
 				if (!(req->flags & BF_WRITE_STATUS))
 					conn_err = SN_ERR_SRVTO; // it was a connect timeout.
@@ -2788,6 +2794,8 @@
 				if (t->srv) {
 					t->srv->cur_sess--;
 					t->srv->failed_resp++;
+					if (t->srv->proxy->lbprm.server_drop_conn)
+						t->srv->proxy->lbprm.server_drop_conn(t->srv);
 				}
 				t->be->failed_resp++;
 				t->srv_state = SV_STCLOSE;
@@ -2830,6 +2838,8 @@
 				if (t->srv) {
 					t->srv->cur_sess--;
 					t->srv->failed_resp++;
+					if (t->srv->proxy->lbprm.server_drop_conn)
+						t->srv->proxy->lbprm.server_drop_conn(t->srv);
 				}
 				t->be->failed_resp++;
 				t->srv_state = SV_STCLOSE;
@@ -2998,6 +3008,8 @@
 					if (t->srv) {
 						t->srv->cur_sess--;
 						t->srv->failed_resp++;
+						if (t->srv->proxy->lbprm.server_drop_conn)
+							t->srv->proxy->lbprm.server_drop_conn(t->srv);
 					}
 					cur_proxy->failed_resp++;
 				return_srv_prx_502:
@@ -3025,6 +3037,8 @@
 				if (t->srv) {
 					t->srv->cur_sess--;
 					t->srv->failed_secu++;
+					if (t->srv->proxy->lbprm.server_drop_conn)
+						t->srv->proxy->lbprm.server_drop_conn(t->srv);
 				}
 				cur_proxy->denied_resp++;
 				goto return_srv_prx_502;
@@ -3160,6 +3174,8 @@
 			if (t->srv) {
 				t->srv->cur_sess--;
 				t->srv->failed_secu++;
+				if (t->srv->proxy->lbprm.server_drop_conn)
+					t->srv->proxy->lbprm.server_drop_conn(t->srv);
 			}
 			t->be->denied_resp++;
 
@@ -3247,6 +3263,8 @@
 			if (t->srv) {
 				t->srv->cur_sess--;
 				t->srv->failed_resp++;
+				if (t->srv->proxy->lbprm.server_drop_conn)
+					t->srv->proxy->lbprm.server_drop_conn(t->srv);
 			}
 			t->be->failed_resp++;
 			t->srv_state = SV_STCLOSE;
@@ -3354,6 +3372,8 @@
 			if (t->srv) {
 				t->srv->cur_sess--;
 				t->srv->failed_resp++;
+				if (t->srv->proxy->lbprm.server_drop_conn)
+					t->srv->proxy->lbprm.server_drop_conn(t->srv);
 			}
 			t->be->failed_resp++;
 			//close(t->srv_fd);
@@ -3374,8 +3394,11 @@
 			//EV_FD_CLR(t->srv_fd, DIR_WR);
 			buffer_shutw(req);
 			fd_delete(t->srv_fd);
-			if (t->srv)
+			if (t->srv) {
 				t->srv->cur_sess--;
+				if (t->srv->proxy->lbprm.server_drop_conn)
+					t->srv->proxy->lbprm.server_drop_conn(t->srv);
+			}
 			//close(t->srv_fd);
 			t->srv_state = SV_STCLOSE;
 			/* We used to have a free connection slot. Since we'll never use it,
@@ -3390,8 +3413,11 @@
 			//EV_FD_CLR(t->srv_fd, DIR_WR);
 			buffer_shutw(req);
 			fd_delete(t->srv_fd);
-			if (t->srv)
+			if (t->srv) {
 				t->srv->cur_sess--;
+				if (t->srv->proxy->lbprm.server_drop_conn)
+					t->srv->proxy->lbprm.server_drop_conn(t->srv);
+			}
 			//close(t->srv_fd);
 			t->srv_state = SV_STCLOSE;
 			if (!(t->flags & SN_ERR_MASK))
@@ -3429,6 +3455,8 @@
 			if (t->srv) {
 				t->srv->cur_sess--;
 				t->srv->failed_resp++;
+				if (t->srv->proxy->lbprm.server_drop_conn)
+					t->srv->proxy->lbprm.server_drop_conn(t->srv);
 			}
 			t->be->failed_resp++;
 			//close(t->srv_fd);
@@ -3449,8 +3477,11 @@
 			//EV_FD_CLR(t->srv_fd, DIR_RD);
 			buffer_shutr(rep);
 			fd_delete(t->srv_fd);
-			if (t->srv)
+			if (t->srv) {
 				t->srv->cur_sess--;
+				if (t->srv->proxy->lbprm.server_drop_conn)
+					t->srv->proxy->lbprm.server_drop_conn(t->srv);
+			}
 			//close(t->srv_fd);
 			t->srv_state = SV_STCLOSE;
 			/* We used to have a free connection slot. Since we'll never use it,
@@ -3465,8 +3496,11 @@
 			//EV_FD_CLR(t->srv_fd, DIR_RD);
 			buffer_shutr(rep);
 			fd_delete(t->srv_fd);
-			if (t->srv)
+			if (t->srv) {
 				t->srv->cur_sess--;
+				if (t->srv->proxy->lbprm.server_drop_conn)
+					t->srv->proxy->lbprm.server_drop_conn(t->srv);
+			}
 			//close(t->srv_fd);
 			t->srv_state = SV_STCLOSE;
 			if (!(t->flags & SN_ERR_MASK))
diff --git a/tests/test-fwlc.cfg b/tests/test-fwlc.cfg
new file mode 100644
index 0000000..53a82df
--- /dev/null
+++ b/tests/test-fwlc.cfg
@@ -0,0 +1,61 @@
+# This is a test configuration.
+# It makes use of a farm built from 4 active servers and 4 backup servers,
+# all listening to different IP addresses on port 80. Health-checks are
+# TCP only on port 81 so that iptables rules permit easy selection of which
+# servers are enabled or disabled.
+#
+# Create statistics counters this way :
+#
+#   iptables -N http
+#   iptables -A OUTPUT -p tcp --syn --dport 80 -j http
+#   for i in $(seq 1 8); do iptables -A http -d 127.0.0.$i; done
+#   iptables -A http -d 127.0.0.0/24
+#
+# Consult the statistics using iptables this way:
+#
+#   iptables --line-numbers -nxvL http
+#   iptables -Z http
+# 
+#
+# Block individual servers like this :
+#   iptables -I INPUT -p tcp --dport 81 -d 127.0.0.1 -j DROP
+#
+
+global
+	maxconn    1000
+        stats socket /tmp/sock1 mode 600
+        stats timeout 3000
+        stats maxconn 2000
+
+listen  sample1
+        mode       tcp
+        retries    1
+        redispatch
+        contimeout 1000
+        clitimeout 120000
+        srvtimeout 120000
+        maxconn    40000
+        bind       :8080
+        balance    leastconn
+	option     allbackups
+        server     act1 127.0.0.1:80 weight 10 maxconn 200 check inter 1000 fall 1
+        server     act2 127.0.0.2:80 weight 20 maxconn 200 check inter 1000 fall 1
+        server     act3 127.0.0.3:80 weight 30 maxconn 200 check inter 1000 fall 1
+        server     act4 127.0.0.4:80 weight 40 maxconn 200 check inter 1000 fall 1
+        server     bck1 127.0.0.5:80 weight 10 check inter 1000 fall 1 backup
+        server     bck2 127.0.0.6:80 weight 20 check inter 1000 fall 1 backup
+        server     bck3 127.0.0.7:80 weight 30 check inter 1000 fall 1 backup
+        server     bck4 127.0.0.8:80 weight 40 check inter 1000 fall 1 backup
+        option     httpclose
+
+listen  sample2
+        mode       http
+        contimeout 1000
+        clitimeout 50000
+        srvtimeout 50000
+        maxconn    40000
+        bind       :8081
+        balance    leastconn
+        option     httpclose
+	stats      uri /stats
+	stats      refresh 5