[MEDIUM] Spread health checks even more When one server appears at the same position in multiple backends, it receives all the checks from all the backends at exactly the same time, because the health checks are only spread within a backend but not globally. The attached patch implements the per-server start delay in a different way. Checks are now spread globally - not locally to one backend. It also makes them start faster - IMHO there is no need to add 'server->inter' when calculating the first execution time. The calculations were moved from cfgparse.c to checks.c. There is a new function, start_checks(), which is not called when haproxy is started in MODE_CHECK. With this patch it is also possible to set a global 'spread-checks' parameter. It takes a percentage value (1..50, probably something near 5..10 is a good idea), and haproxy adds or removes up to that many percent of the original interval after each check. My test shows that with 18 backends, 54 servers total and 10000ms/5% it takes about 45 minutes to mix them completely. I decided to use the rand/srand pseudo-random number generator. I am aware it is not recommended for good randomness, but a) we do not need a good random generator here and b) it is probably the most portable one.

commit: b304dc7fd78e097f69ad2dcdaa0bc3e66cac3ada [log] [tgz]
author: Krzysztof Oledzki <ole@ans.pl> Sun Oct 14 23:40:01 2007 +0200
committer: Willy Tarreau <w@1wt.eu> Mon Oct 15 09:33:10 2007 +0200
tree: e761b23bb04a8c267e93a0adf5b46dff453ad114
parent: 87ea5483138a07b9a731be9f0d3e9868083be052 [diff]
diff --git a/include/proto/checks.h b/include/proto/checks.h
index 839af55..8499175 100644
--- a/include/proto/checks.h
+++ b/include/proto/checks.h

@@ -26,6 +26,7 @@
 #include <common/config.h>
 
 void process_chk(struct task *t, struct timeval *next);
+int start_checks();
 
 #endif /* _PROTO_CHECKS_H */
 

diff --git a/include/types/global.h b/include/types/global.h
index f2de0d9..340b583 100644
--- a/include/types/global.h
+++ b/include/types/global.h

@@ -55,6 +55,7 @@
 	int rlimit_memmax;	/* default ulimit-d in megs value : 0=unset */
 	int mode;
 	int last_checks;
+	int spread_checks;
 	char *chroot;
 	char *pidfile;
 	int logfac1, logfac2;

diff --git a/src/cfgparse.c b/src/cfgparse.c
index 19b2ee7..43ed8aa 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c

@@ -451,7 +451,21 @@
 			Alert("parsing [%s:%d] : too many syslog servers\n", file, linenum);
 			return -1;
 		}
-	
+	}
+	else if (!strcmp(args[0], "spread-checks")) {  /* random time between checks (0-50) */
+		if (global.spread_checks != 0) {
+			Alert("parsing [%s:%d]: spread-checks already specified. Continuing.\n", file, linenum);
+			return 0;
+		}
+		if (*(args[1]) == 0) {
+			Alert("parsing [%s:%d]: '%s' expects an integer argument (0..50).\n", file, linenum, args[0]);
+			return -1;
+		}
+		global.spread_checks = atol(args[1]);
+		if (global.spread_checks < 0 || global.spread_checks > 50) {
+			Alert("parsing [%s:%d]: 'spread-checks' needs a positive value in range 0..50.\n", file, linenum);
+			return -1;
+		}
 	}
 	else {
 		Alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section\n", file, linenum, args[0], "global");
@@ -2261,7 +2275,6 @@
 	char *args[MAX_LINE_ARGS + 1];
 	int arg;
 	int cfgerr = 0;
-	int nbchk, mininter;
 	int confsect = CFG_NONE;
 
 	struct proxy *curproxy = NULL;
@@ -2708,56 +2721,6 @@
 			newsrv = newsrv->next;
 		}
 
-		/* now we'll start this proxy's health checks if any */
-		/* 1- count the checkers to run simultaneously */
-		nbchk = 0;
-		mininter = 0;
-		newsrv = curproxy->srv;
-		while (newsrv != NULL) {
-			if (newsrv->state & SRV_CHECKED) {
-				if (!mininter || mininter > newsrv->inter)
-					mininter = newsrv->inter;
-				nbchk++;
-			}
-			newsrv = newsrv->next;
-		}
-
-		/* 2- start them as far as possible from each others while respecting
-		 * their own intervals. For this, we will start them after their own
-		 * interval added to the min interval divided by the number of servers,
-		 * weighted by the server's position in the list.
-		 */
-		if (nbchk > 0) {
-			struct task *t;
-			int srvpos;
-
-			newsrv = curproxy->srv;
-			srvpos = 0;
-			while (newsrv != NULL) {
-				/* should this server be checked ? */
-				if (newsrv->state & SRV_CHECKED) {
-					if ((t = pool_alloc2(pool2_task)) == NULL) {
-						Alert("parsing [%s:%d] : out of memory.\n", file, linenum);
-						return -1;
-					}
-		
-					t->wq = NULL;
-					t->qlist.p = NULL;
-					t->state = TASK_IDLE;
-					t->process = process_chk;
-					t->context = newsrv;
-		
-					/* check this every ms */
-					tv_ms_add(&t->expire, &now,
-						  newsrv->inter + mininter * srvpos / nbchk);
-					task_queue(t);
-					//task_wakeup(&rq, t);
-					srvpos++;
-				}
-				newsrv = newsrv->next;
-			}
-		}
-
 		curproxy = curproxy->next;
 	}
 	if (cfgerr > 0) {

diff --git a/src/checks.c b/src/checks.c
index 62f0c2c..1d8f2b3 100644
--- a/src/checks.c
+++ b/src/checks.c

@@ -13,7 +13,9 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
+#include <time.h>
 #include <unistd.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
@@ -281,6 +283,7 @@
 	struct server *s = t->context;
 	struct sockaddr_in sa;
 	int fd;
+	int rv;
 
 	//fprintf(stderr, "process_chk: task=%p\n", t);
 
@@ -503,8 +506,15 @@
 				set_server_down(s);
 			s->curfd = -1;
 			fd_delete(fd);
+
+			rv = 0;
+			if (global.spread_checks > 0) {
+				rv = s->inter * global.spread_checks / 100;
+				rv -= (int) (2 * rv * (rand() / (RAND_MAX + 1.0)));
+				//fprintf(stderr, "process_chk: (%d+/-%d%%) random=%d\n", s->inter, global.spread_checks, rv);
+			}
 			while (tv_isle(&t->expire, &now))
-				tv_ms_add(&t->expire, &t->expire, s->inter);
+				tv_ms_add(&t->expire, &t->expire, s->inter + rv);
 			goto new_chk;
 		}
 		/* if result is 0 and there's no timeout, we have to wait again */
@@ -517,6 +527,65 @@
 	return;
 }
 
+/*
+ * Start health-check.
+ * Returns 0 if OK, -1 if error, and prints the error in this case.
+ */
+int start_checks() {
+
+	struct proxy *px;
+	struct server *s;
+	struct task *t;
+	int nbchk=0, mininter=0, srvpos=0;
+
+	/* 1- count the checkers to run simultaneously */
+	for (px = proxy; px; px = px->next) {
+		for (s = px->srv; s; s = s->next) {
+			if (!(s->state & SRV_CHECKED))
+				continue;
+
+			if (!mininter || mininter > s->inter)
+				mininter = s->inter;
+
+			nbchk++;
+		}
+	}
+
+	if (!nbchk)
+		return 0;
+
+	srand((unsigned)time(NULL));
+
+	/*
+	 * 2- start them as far as possible from each others. For this, we will
+	 * start them after their interval set to the min interval divided by
+	 * the number of servers, weighted by the server's position in the list.
+	 */
+	for (px = proxy; px; px = px->next) {
+		for (s = px->srv; s; s = s->next) {
+			if (!(s->state & SRV_CHECKED))
+				continue;
+
+			if ((t = pool_alloc2(pool2_task)) == NULL) {
+				Alert("Starting [%s:%s] check: out of memory.\n", px->id, s->id);
+				return -1;
+			}
+
+			t->wq = NULL;
+			t->qlist.p = NULL;
+			t->state = TASK_IDLE;
+			t->process = process_chk;
+			t->context = s;
+
+			/* check this every ms */
+			tv_ms_add(&t->expire, &now, mininter * srvpos / nbchk);
+			task_queue(t);
+
+			srvpos++;
+		}
+	}
+	return 0;
+}
 
 /*
  * Local variables:

diff --git a/src/haproxy.c b/src/haproxy.c
index 4437d45..7b7a691 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c

@@ -81,6 +81,7 @@
 #include <proto/acl.h>
 #include <proto/backend.h>
 #include <proto/buffers.h>
+#include <proto/checks.h>
 #include <proto/client.h>
 #include <proto/fd.h>
 #include <proto/log.h>
@@ -506,6 +507,7 @@
 		Alert("Error reading configuration file : %s\n", cfg_cfgfile);
 		exit(1);
 	}
+
 	if (have_appsession)
 		appsession_init();
 
@@ -514,6 +516,9 @@
 		exit(0);
 	}
 
+	if (start_checks() < 0)
+		exit(1);
+
 	if (cfg_maxconn > 0)
 		global.maxconn = cfg_maxconn;
commit	b304dc7fd78e097f69ad2dcdaa0bc3e66cac3ada	[log] [tgz]
author	Krzysztof Oledzki <ole@ans.pl>	Sun Oct 14 23:40:01 2007 +0200
committer	Willy Tarreau <w@1wt.eu>	Mon Oct 15 09:33:10 2007 +0200
tree	e761b23bb04a8c267e93a0adf5b46dff453ad114
parent	87ea5483138a07b9a731be9f0d3e9868083be052 [diff]