[MEDIUM] Spread health checks even more

When one server appears at the same position in multiple backends, it
receives all the checks from all the backends exactly at the same time
because the health-checks are only spread within a backend but not
globally.

Attached patch implements per-server start delay in a different way.
Checks are now spread globally - not locally to one backend. It also makes
them start faster - IMHO there is no need to add a 'server->inter' when
calculating first execution. Calculations were moved from cfgparse.c to
checks.c. There is a new function start_checks() and now it is not called
when haproxy is started in MODE_CHECK.

With this patch it is also possible to set a global 'spread-checks'
parameter. It takes a percentage value (1..50, probably something near
5..10 is a good idea) so haproxy adds or removes that many percent to the
original interval after each check. My test shows that with 18 backends,
54 servers total and 10000ms/5% it takes about 45m to mix them completely.

I decided to use the rand/srand pseudo-random number generator. I am aware
it is not recommended when good randomness is needed, but a) we do not need
a good random generator here and b) it is probably the most portable one.
diff --git a/include/proto/checks.h b/include/proto/checks.h
index 839af55..8499175 100644
--- a/include/proto/checks.h
+++ b/include/proto/checks.h
@@ -26,6 +26,7 @@
 #include <common/config.h>
 
 void process_chk(struct task *t, struct timeval *next);
+int start_checks();
 
 #endif /* _PROTO_CHECKS_H */
 
diff --git a/include/types/global.h b/include/types/global.h
index f2de0d9..340b583 100644
--- a/include/types/global.h
+++ b/include/types/global.h
@@ -55,6 +55,7 @@
 	int rlimit_memmax;	/* default ulimit-d in megs value : 0=unset */
 	int mode;
 	int last_checks;
+	int spread_checks;
 	char *chroot;
 	char *pidfile;
 	int logfac1, logfac2;
diff --git a/src/cfgparse.c b/src/cfgparse.c
index 19b2ee7..43ed8aa 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -451,7 +451,21 @@
 			Alert("parsing [%s:%d] : too many syslog servers\n", file, linenum);
 			return -1;
 		}
-	
+	}
+	else if (!strcmp(args[0], "spread-checks")) {  /* random time between checks (0-50) */
+		if (global.spread_checks != 0) {
+			Alert("parsing [%s:%d]: spread-checks already specified. Continuing.\n", file, linenum);
+			return 0;
+		}
+		if (*(args[1]) == 0) {
+			Alert("parsing [%s:%d]: '%s' expects an integer argument (0..50).\n", file, linenum, args[0]);
+			return -1;
+		}
+		global.spread_checks = atol(args[1]);
+		if (global.spread_checks < 0 || global.spread_checks > 50) {
+			Alert("parsing [%s:%d]: 'spread-checks' needs a positive value in range 0..50.\n", file, linenum);
+			return -1;
+		}
 	}
 	else {
 		Alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section\n", file, linenum, args[0], "global");
@@ -2261,7 +2275,6 @@
 	char *args[MAX_LINE_ARGS + 1];
 	int arg;
 	int cfgerr = 0;
-	int nbchk, mininter;
 	int confsect = CFG_NONE;
 
 	struct proxy *curproxy = NULL;
@@ -2708,56 +2721,6 @@
 			newsrv = newsrv->next;
 		}
 
-		/* now we'll start this proxy's health checks if any */
-		/* 1- count the checkers to run simultaneously */
-		nbchk = 0;
-		mininter = 0;
-		newsrv = curproxy->srv;
-		while (newsrv != NULL) {
-			if (newsrv->state & SRV_CHECKED) {
-				if (!mininter || mininter > newsrv->inter)
-					mininter = newsrv->inter;
-				nbchk++;
-			}
-			newsrv = newsrv->next;
-		}
-
-		/* 2- start them as far as possible from each others while respecting
-		 * their own intervals. For this, we will start them after their own
-		 * interval added to the min interval divided by the number of servers,
-		 * weighted by the server's position in the list.
-		 */
-		if (nbchk > 0) {
-			struct task *t;
-			int srvpos;
-
-			newsrv = curproxy->srv;
-			srvpos = 0;
-			while (newsrv != NULL) {
-				/* should this server be checked ? */
-				if (newsrv->state & SRV_CHECKED) {
-					if ((t = pool_alloc2(pool2_task)) == NULL) {
-						Alert("parsing [%s:%d] : out of memory.\n", file, linenum);
-						return -1;
-					}
-		
-					t->wq = NULL;
-					t->qlist.p = NULL;
-					t->state = TASK_IDLE;
-					t->process = process_chk;
-					t->context = newsrv;
-		
-					/* check this every ms */
-					tv_ms_add(&t->expire, &now,
-						  newsrv->inter + mininter * srvpos / nbchk);
-					task_queue(t);
-					//task_wakeup(&rq, t);
-					srvpos++;
-				}
-				newsrv = newsrv->next;
-			}
-		}
-
 		curproxy = curproxy->next;
 	}
 	if (cfgerr > 0) {
diff --git a/src/checks.c b/src/checks.c
index 62f0c2c..1d8f2b3 100644
--- a/src/checks.c
+++ b/src/checks.c
@@ -13,7 +13,9 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
+#include <time.h>
 #include <unistd.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
@@ -281,6 +283,7 @@
 	struct server *s = t->context;
 	struct sockaddr_in sa;
 	int fd;
+	int rv;
 
 	//fprintf(stderr, "process_chk: task=%p\n", t);
 
@@ -503,8 +506,15 @@
 				set_server_down(s);
 			s->curfd = -1;
 			fd_delete(fd);
+
+			rv = 0;
+			if (global.spread_checks > 0) {
+				rv = s->inter * global.spread_checks / 100;
+				rv -= (int) (2 * rv * (rand() / (RAND_MAX + 1.0)));
+				//fprintf(stderr, "process_chk: (%d+/-%d%%) random=%d\n", s->inter, global.spread_checks, rv);
+			}
 			while (tv_isle(&t->expire, &now))
-				tv_ms_add(&t->expire, &t->expire, s->inter);
+				tv_ms_add(&t->expire, &t->expire, s->inter + rv);
 			goto new_chk;
 		}
 		/* if result is 0 and there's no timeout, we have to wait again */
@@ -517,6 +527,65 @@
 	return;
 }
 
+/*
+ * Create and queue one check task for every server flagged SRV_CHECKED, over all proxies.
+ * Returns 0 if OK (also when no server is checked), -1 if a task allocation fails (alert printed).
+ */
+int start_checks() {
+
+	struct proxy *px;
+	struct server *s;
+	struct task *t;
+	int nbchk=0, mininter=0, srvpos=0;
+
+	/* 1- count the checked servers over all proxies and record their smallest 'inter' */
+	for (px = proxy; px; px = px->next) {
+		for (s = px->srv; s; s = s->next) {
+			if (!(s->state & SRV_CHECKED))
+				continue;
+
+			if (!mininter || mininter > s->inter)
+				mininter = s->inter;
+
+			nbchk++;
+		}
+	}
+
+	if (!nbchk)
+		return 0;
+
+	srand((unsigned)time(NULL));	/* seed for the rand() call in process_chk() */
+
+	/*
+	 * 2- start them as far as possible from each other. Each task's first
+	 * expiration is 'now' plus mininter * srvpos / nbchk, where srvpos is the
+	 * server's rank among all checked servers, counted across every proxy.
+	 */
+	for (px = proxy; px; px = px->next) {
+		for (s = px->srv; s; s = s->next) {
+			if (!(s->state & SRV_CHECKED))
+				continue;
+
+			if ((t = pool_alloc2(pool2_task)) == NULL) {
+				Alert("Starting [%s:%s] check: out of memory.\n", px->id, s->id);
+				return -1;
+			}
+
+			t->wq = NULL;
+			t->qlist.p = NULL;
+			t->state = TASK_IDLE;
+			t->process = process_chk;
+			t->context = s;
+
+			/* first run only: offset from 'now' by this server's share of mininter */
+			tv_ms_add(&t->expire, &now, mininter * srvpos / nbchk);
+			task_queue(t);
+
+			srvpos++;
+		}
+	}
+	return 0;
+}
 
 /*
  * Local variables:
diff --git a/src/haproxy.c b/src/haproxy.c
index 4437d45..7b7a691 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -81,6 +81,7 @@
 #include <proto/acl.h>
 #include <proto/backend.h>
 #include <proto/buffers.h>
+#include <proto/checks.h>
 #include <proto/client.h>
 #include <proto/fd.h>
 #include <proto/log.h>
@@ -506,6 +507,7 @@
 		Alert("Error reading configuration file : %s\n", cfg_cfgfile);
 		exit(1);
 	}
+
 	if (have_appsession)
 		appsession_init();
 
@@ -514,6 +516,9 @@
 		exit(0);
 	}
 
+	if (start_checks() < 0)
+		exit(1);
+
 	if (cfg_maxconn > 0)
 		global.maxconn = cfg_maxconn;