[MEDIUM] Health check reporting code rework + health logging, v3

This patch adds health logging so it possible to check what
was happening before a crash. Failed healt checks are logged if
server is UP and succeeded healt checks if server is DOWN,
so the amount of additional information is limited.

I also reworked the code a little:

 - check_status_description[] and check_status_info[] is now
   joined into check_statuses[]

 - set_server_check_status updates not only s->check_status and
   s->check_duration but also s->result making the code simpler

Changes in v3:
 - for now calculate and use local versions of health/rise/fall/state,
   it is a slow path, no harm should be done. One day we may centralize
   processing of the checks and remove the duplicated code.
 - also log checks that are restoring current state
 - use "conditionally succeeded" for 404 with disable-on-404
diff --git a/doc/configuration.txt b/doc/configuration.txt
index e455368..2055a09 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -731,6 +731,8 @@
 [no] option http_proxy      X          X         X         X
 [no] option log-separate-
             errors          X          X         X         -
+[no] option log-health-     X          -         X         X
+            checks
 [no] option logasap         X          X         X         -
 [no] option nolinger        X          X         X         X
 option originalto           X          X         X         X
@@ -2464,6 +2466,23 @@
              logging.
 
 
+log-health-checks
+no log-health-checks
+  Enable or disable logging of health checks
+  May be used in sections :   defaults | frontend | listen | backend
+                                 yes   |    no    |   yes  |  yes
+  Arguments : none
+
+  Enable health checks logging so it possible to check for example what
+  was happening before a server crash. Failed health check are logged if
+  server is UP and succeeded health checks if server is DOWN, so the amount
+  of additional information is limited.
+
+  If health check logging is enabled no health check status is printed
+  when servers is set up UP/DOWN/ENABLED/DISABLED.
+
+  See also: "log" and section 8 about logging.
+
 option logasap
 no option logasap
   Enable or disable early logging of HTTP requests
diff --git a/include/types/checks.h b/include/types/checks.h
index 87ff2c8..1b04608 100644
--- a/include/types/checks.h
+++ b/include/types/checks.h
@@ -14,8 +14,9 @@
 enum {
 	HCHK_STATUS_UNKNOWN	 = 0,	/* Unknown */
 	HCHK_STATUS_INI,		/* Initializing */
+	HCHK_STATUS_START,		/* Check started - SPECIAL STATUS */
 
-	/* Below we have check finished */
+	/* Below we have finished checks */
 	HCHK_STATUS_CHECKED,		/* DUMMY STATUS */
 	HCHK_STATUS_SOCKERR,		/* Socket error */
 
@@ -39,3 +40,9 @@
 
 	HCHK_STATUS_SIZE
 };
+
+struct check_status {
+	short result;			/* one of SRV_CHK_* */
+	char *info;			/* human readable short info */
+	char *desc;			/* long description */
+};
diff --git a/include/types/proxy.h b/include/types/proxy.h
index c2323f6..8c93e55 100644
--- a/include/types/proxy.h
+++ b/include/types/proxy.h
@@ -121,6 +121,7 @@
 #define PR_O2_SMARTCON 	0x00000100      /* don't immediately send empty ACK after connect */
 #define PR_O2_RDPC_PRST	0x00000200      /* Actvate rdp cookie analyser */
 #define PR_O2_CLFLOG	0x00000400      /* log into clf format */
+#define PR_O2_LOGHCHKS	0x00000800	/* log health checks */
 
 /* This structure is used to apply fast weighted round robin on a server group */
 struct fwrr_group {
diff --git a/src/cfgparse.c b/src/cfgparse.c
index 69fd00f..a86ed0e 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -135,6 +135,7 @@
 	{ "accept-invalid-http-response", PR_O2_RSPBUG_OK, PR_CAP_BE, 0 },
 	{ "dontlog-normal",               PR_O2_NOLOGNORM, PR_CAP_FE, 0 },
 	{ "log-separate-errors",          PR_O2_LOGERRORS, PR_CAP_FE, 0 },
+	{ "log-health-checks",            PR_O2_LOGHCHKS,  PR_CAP_BE, 0 },
 	{ "tcp-smart-accept",             PR_O2_SMARTACC,  PR_CAP_FE, 0 },
 	{ "tcp-smart-connect",            PR_O2_SMARTCON,  PR_CAP_BE, 0 },
 	{ NULL, 0, 0, 0 }
diff --git a/src/checks.c b/src/checks.c
index 3ee0808..aca3577 100644
--- a/src/checks.c
+++ b/src/checks.c
@@ -47,53 +47,31 @@
 #include <proto/server.h>
 #include <proto/task.h>
 
-const char *check_status_description[HCHK_STATUS_SIZE] = {
-	[HCHK_STATUS_UNKNOWN]	= "Unknown",
+const struct check_status check_statuses[HCHK_STATUS_SIZE] = {
+	[HCHK_STATUS_UNKNOWN]	= { SRV_CHK_UNKNOWN,                   "UNK",     "Unknown" },
+	[HCHK_STATUS_INI]	= { SRV_CHK_UNKNOWN,                   "INI",     "Initializing" },
+	[HCHK_STATUS_START]	= { /* SPECIAL STATUS*/ },
 
-	[HCHK_STATUS_INI]	= "Initializing",
+	[HCHK_STATUS_SOCKERR]	= { SRV_CHK_ERROR,                     "SOCKERR", "Socket error" },
 
-	[HCHK_STATUS_SOCKERR]	= "Socket error",
+	[HCHK_STATUS_L4OK]	= { SRV_CHK_RUNNING,                   "L4OK",    "Layer4 check passed" },
+	[HCHK_STATUS_L4TOUT]	= { SRV_CHK_ERROR,                     "L4TOUT",  "Layer4 timeout" },
+	[HCHK_STATUS_L4CON]	= { SRV_CHK_ERROR,                     "L4CON",   "Layer4 connection problem" },
 
-	[HCHK_STATUS_L4OK]	= "Layer4 check passed",
-	[HCHK_STATUS_L4TOUT]	= "Layer4 timeout",
-	[HCHK_STATUS_L4CON]	= "Layer4 connection problem",
+	[HCHK_STATUS_L6OK]	= { SRV_CHK_RUNNING,                   "L6OK",    "Layer6 check passed" },
+	[HCHK_STATUS_L6TOUT]	= { SRV_CHK_ERROR,                     "L6TOUT",  "Layer6 timeout" },
+	[HCHK_STATUS_L6RSP]	= { SRV_CHK_ERROR,                     "L6RSP",   "Layer6 invalid response" },
 
-	[HCHK_STATUS_L6OK]	= "Layer6 check passed",
-	[HCHK_STATUS_L6TOUT]	= "Layer6 timeout",
-	[HCHK_STATUS_L6RSP]	= "Layer6 invalid response",
+	[HCHK_STATUS_L7TOUT]	= { SRV_CHK_ERROR,                     "L7TOUT",  "Layer7 timeout" },
+	[HCHK_STATUS_L7RSP]	= { SRV_CHK_ERROR,                     "L7RSP",   "Layer7 invalid response" },
 
-	[HCHK_STATUS_L7TOUT]	= "Layer7 timeout",
-	[HCHK_STATUS_L7RSP]	= "Layer7 invalid response",
+	[HCHK_STATUS_L57DATA]	= { /* DUMMY STATUS */ },
 
-	[HCHK_STATUS_L7OKD]	= "Layer7 check passed",
-	[HCHK_STATUS_L7OKCD]	= "Layer7 check conditionally passed",
-	[HCHK_STATUS_L7STS]	= "Layer7 wrong status",
+	[HCHK_STATUS_L7OKD]	= { SRV_CHK_RUNNING,                   "L7OK",    "Layer7 check passed" },
+	[HCHK_STATUS_L7OKCD]	= { SRV_CHK_RUNNING | SRV_CHK_DISABLE, "L7OKC",   "Layer7 check conditionally passed" },
+	[HCHK_STATUS_L7STS]	= { SRV_CHK_ERROR,                     "L7STS",   "Layer7 wrong status" },
 };
 
-
-const char *check_status_info[HCHK_STATUS_SIZE] = {
-	[HCHK_STATUS_UNKNOWN]	= "UNK",
-
-	[HCHK_STATUS_INI]	= "INI",
-
-	[HCHK_STATUS_SOCKERR]	= "SOCKERR",
-
-	[HCHK_STATUS_L4OK]	= "L4OK",
-	[HCHK_STATUS_L4TOUT]	= "L4TOUT",
-	[HCHK_STATUS_L4CON]	= "L4CON",
-
-	[HCHK_STATUS_L6OK]	= "L6OK",
-	[HCHK_STATUS_L6TOUT]	= "L6TOUT",
-	[HCHK_STATUS_L6RSP]	= "L6RSP",
-
-	[HCHK_STATUS_L7TOUT]	= "L7TOUT",
-	[HCHK_STATUS_L7RSP]	= "L7RSP",
-
-	[HCHK_STATUS_L7OKD]	= "L7OK",
-	[HCHK_STATUS_L7OKCD]	= "L7OKC",
-	[HCHK_STATUS_L7STS]	= "L7STS",
-};
-
 /*
  * Convert check_status code to description
  */
@@ -102,14 +80,14 @@
 	const char *desc;
 
 	if (check_status < HCHK_STATUS_SIZE)
-		desc = check_status_description[check_status];
+		desc = check_statuses[check_status].desc;
 	else
 		desc = NULL;
 
 	if (desc && *desc)
 		return desc;
 	else
-		return check_status_description[HCHK_STATUS_UNKNOWN];
+		return check_statuses[HCHK_STATUS_UNKNOWN].desc;
 }
 
 /*
@@ -120,27 +98,108 @@
 	const char *info;
 
 	if (check_status < HCHK_STATUS_SIZE)
-		info = check_status_info[check_status];
+		info = check_statuses[check_status].info;
 	else
 		info = NULL;
 
 	if (info && *info)
 		return info;
 	else
-		return check_status_info[HCHK_STATUS_UNKNOWN];
+		return check_statuses[HCHK_STATUS_UNKNOWN].info;
 }
 
 /*
- * Set check_status and update check_duration
+ * Set s->check_status, update s->check_duration and fill s->result with
+ * an adequate SRV_CHK_* value.
+ *
+ * Show information in logs about failed health check if server is UP
+ * or succeeded health checks if server is DOWN.
  */
 static void set_server_check_status(struct server *s, short status) {
 
+	struct chunk msg;
+
+	if (status == HCHK_STATUS_START) {
+		s->result = SRV_CHK_UNKNOWN;	/* no result yet */
+		s->check_start = now;
+		return;
+	}
+
+	if (!s->check_status)
+		return;
+
-		if (tv_iszero(&s->check_start))
-			return;
+	s->check_status = status;
+	if (check_statuses[status].result)
+		s->result |= check_statuses[status].result;
 
-		s->check_status = status;
+	if (!tv_iszero(&s->check_start)) {
+		/* set_server_check_status() may be called more than once */
 		s->check_duration = tv_ms_elapsed(&s->check_start, &now);
 		tv_zero(&s->check_start);
+	}
+
+	if (s->proxy->options2 & PR_O2_LOGHCHKS &&
+	(((s->health != 0) && (s->result & SRV_CHK_ERROR)) ||
+	    ((s->health != s->rise + s->fall - 1) && (s->result & SRV_CHK_RUNNING)) ||
+	    ((s->state & SRV_GOINGDOWN) && !(s->result & SRV_CHK_DISABLE)) ||
+	    (!(s->state & SRV_GOINGDOWN) && (s->result & SRV_CHK_DISABLE)))) {
+
+		int health, rise, fall, state;
+
+		chunk_init(&msg, trash, sizeof(trash));
+
+		/* FIXME begin: calculate local version of the health/rise/fall/state */
+		health = s->health;
+		rise   = s->rise;
+		fall   = s->fall;
+		state  = s->state;
+
+		if (s->result & SRV_CHK_ERROR) {
+			if (health > rise) {
+				health--; /* still good */
+			} else {
+				if (health == rise)
+					state &= ~(SRV_RUNNING | SRV_GOINGDOWN);
+
+				health = 0;
+			}
+		}
+
+		if (s->result & SRV_CHK_RUNNING) {
+			if (health < rise + fall - 1) {
+				health++; /* was bad, stays for a while */
+
+				if (health == rise)
+					state |= SRV_RUNNING;
+
+				if (health >= rise)
+					health = rise + fall - 1; /* OK now */
+			}
+		}
+		/* FIXME end: calculate local version of the health/rise/fall/state */
+
+		chunk_printf(&msg,
+			"Health check for %sserver %s/%s %s%s",
+			s->state & SRV_BACKUP ? "backup " : "",
+			s->proxy->id, s->id,
+			(s->result & SRV_CHK_DISABLE)?"conditionally ":"",
+			(s->result & SRV_CHK_RUNNING)?"succeeded":"failed");
+
+		chunk_printf(&msg, ", reason: %s", get_check_status_description(s->check_status));
+
+		if (s->check_status >= HCHK_STATUS_L57DATA)
+			chunk_printf(&msg, ", code: %d", s->check_code);
+
+		chunk_printf(&msg, ", check duration: %lums", s->check_duration);
+
+		chunk_printf(&msg, ", status: %d/%d %s.\n",
+			(state & SRV_RUNNING) ? (health - rise + 1) : (health),
+			(state & SRV_RUNNING) ? (fall) : (rise),
+			(state & SRV_RUNNING)?"UP":"DOWN");
+
+		Warning("%s", trash);
+		send_log(s->proxy, LOG_NOTICE, "%s", trash);
+	}
 }
 
 /* sends a log message when a backend goes down, and also sets last
@@ -242,11 +301,13 @@
 			chunk_printf(&msg, " via %s/%s",
 				s->tracked->proxy->id, s->tracked->id);
 
-		chunk_printf(&msg, ", reason: %s", get_check_status_description(s->check_status));
-		if (s->check_status >= HCHK_STATUS_L57DATA)
-			chunk_printf(&msg, ", code: %d", s->check_code);
+		if (!s->tracked && !(s->proxy->options2 & PR_O2_LOGHCHKS)) {
+			chunk_printf(&msg, ", reason: %s", get_check_status_description(s->check_status));
+			if (s->check_status >= HCHK_STATUS_L57DATA)
+				chunk_printf(&msg, ", code: %d", s->check_code);
 
-		chunk_printf(&msg, ", check duration: %lums", s->check_duration);
+			chunk_printf(&msg, ", check duration: %lums", s->check_duration);
+		}
 
 		chunk_printf(&msg, ". %d active and %d backup servers left.%s"
 			" %d sessions active, %d requeued, %d remaining in queue.\n",
@@ -322,9 +383,11 @@
 			chunk_printf(&msg, " via %s/%s",
 				s->tracked->proxy->id, s->tracked->id);
 
-		chunk_printf(&msg, ", reason: %s", get_check_status_description(s->check_status));
-		if (s->check_status >= HCHK_STATUS_L57DATA)
-			chunk_printf(&msg, ", code: %d", s->check_code);
+		if (!s->tracked && !(s->proxy->options2 & PR_O2_LOGHCHKS)) {
+			chunk_printf(&msg, ", reason: %s", get_check_status_description(s->check_status));
+			if (s->check_status >= HCHK_STATUS_L57DATA)
+				chunk_printf(&msg, ", code: %d", s->check_code);
+		}
 
 		chunk_printf(&msg, ". %d active and %d backup servers online.%s"
 			" %d sessions requeued, %d total in queue.\n",
@@ -371,6 +434,14 @@
 		chunk_printf(&msg, " via %s/%s",
 			s->tracked->proxy->id, s->tracked->id);
 
+	if (!s->tracked && !(s->proxy->options2 & PR_O2_LOGHCHKS)) {
+		chunk_printf(&msg, ", reason: %s", get_check_status_description(s->check_status));
+
+		if (s->check_status >= HCHK_STATUS_L57DATA)
+			chunk_printf(&msg, ", code: %d", s->check_code);
+
+		chunk_printf(&msg, ", check duration: %lums", s->check_duration);
+	}
 
 	chunk_printf(&msg,". %d active and %d backup servers online.%s"
 		" %d sessions requeued, %d total in queue.\n",
@@ -415,6 +486,15 @@
 		chunk_printf(&msg, " via %s/%s",
 			s->tracked->proxy->id, s->tracked->id);
 
+	if (!s->tracked && !(s->proxy->options2 & PR_O2_LOGHCHKS)) {
+		chunk_printf(&msg, ", reason: %s", get_check_status_description(s->check_status));
+
+		if (s->check_status >= HCHK_STATUS_L57DATA)
+			chunk_printf(&msg, ", code: %d", s->check_code);
+
+		chunk_printf(&msg, ", check duration: %lums", s->check_duration);
+	}
+
 	chunk_printf(&msg, ". %d active and %d backup servers online.%s"
 		" %d sessions requeued, %d total in queue.\n",
 		s->proxy->srv_act, s->proxy->srv_bck,
@@ -431,8 +511,9 @@
 
 /*
  * This function is used only for server health-checks. It handles
- * the connection acknowledgement. If the proxy requires HTTP health-checks,
- * it sends the request. In other cases, it fills s->result with SRV_CHK_*.
+ * the connection acknowledgement. If the proxy requires L7 health-checks,
+ * it sends the request. In other cases, it calls set_server_check_status()
+ * to set s->check_status, s->check_duration and s->result.
  * The function itself returns 0 if it needs some polling before being called
  * again, otherwise 1.
  */
@@ -518,7 +599,6 @@
 			}
 
 			/* good TCP connection is enough */
-			s->result |= SRV_CHK_RUNNING;
 			set_server_check_status(s, HCHK_STATUS_L4OK);
 			goto out_wakeup;
 		}
@@ -535,8 +615,6 @@
 	fdtab[fd].ev &= ~FD_POLL_OUT;
 	return 0;
  out_error:
-	s->result |= SRV_CHK_ERROR;
-	/* set_server_check_status() called bofore goto into this label */
 	fdtab[fd].state = FD_STERROR;
 	goto out_wakeup;
 }
@@ -544,13 +622,19 @@
 
 /*
  * This function is used only for server health-checks. It handles the server's
- * reply to an HTTP request or SSL HELLO. It sets s->result to SRV_CHK_RUNNING
- * if an HTTP server replies HTTP 2xx or 3xx (valid responses), if an SMTP
- * server returns 2xx, or if an SSL server returns at least 5 bytes in response
- * to an SSL HELLO (the principle is that this is enough to distinguish between
- * an SSL server and a pure TCP relay). All other cases will set s->result to
- * SRV_CHK_ERROR. The function returns 0 if it needs to be called again after
- * some polling, otherwise non-zero..
+ * reply to an HTTP request or SSL HELLO. It calls set_server_check_status() to
+ * update s->check_status, s->check_duration and s->result.
+
+ * The set_server_check_status function is called with HCHK_STATUS_L7OKD if
+ * an HTTP server replies HTTP 2xx or 3xx (valid responses), if an SMTP server
+ * returns 2xx, HCHK_STATUS_L6OK if an SSL server returns at least 5 bytes in
+ * response to an SSL HELLO (the principle is that this is enough to
+ * distinguish between an SSL server and a pure TCP relay). All other cases will
+ * call it with a proper error status like HCHK_STATUS_L7STS, HCHK_STATUS_L6RSP,
+ * etc.
+ *
+ * The function returns 0 if it needs to be called again after some polling,
+ * otherwise non-zero..
  */
 static int event_srv_chk_r(int fd)
 {
@@ -573,8 +657,6 @@
 		if (!(s->result & SRV_CHK_ERROR))
 			set_server_check_status(s, HCHK_STATUS_SOCKERR);
 
-		s->result |= SRV_CHK_ERROR;
-
 		goto out_wakeup;
 	}
 
@@ -596,7 +678,6 @@
 		    (memcmp(trash, "HTTP/1.", 7) != 0 ||
 		    (trash[12] != ' ' && trash[12] != '\r')) ||
 		    !isdigit(trash[9]) || !isdigit(trash[10]) || !isdigit(trash[11])) {
-			s->result |= SRV_CHK_ERROR;
 			set_server_check_status(s, HCHK_STATUS_L7RSP);
 			goto out_wakeup;
 		}
@@ -604,37 +685,28 @@
 		s->check_code = str2uic(&trash[9]);
 
 		/* check the reply : HTTP/1.X 2xx and 3xx are OK */
-		if (trash[9] == '2' || trash[9] == '3') {
-			s->result |= SRV_CHK_RUNNING;
+		if (trash[9] == '2' || trash[9] == '3')
 			set_server_check_status(s, HCHK_STATUS_L7OKD);
-		} else if ((s->proxy->options & PR_O_DISABLE404) &&
+		else if ((s->proxy->options & PR_O_DISABLE404) &&
 			 (s->state & SRV_RUNNING) &&
-			 (s->check_code == 404)) {
-			/* 404 may be accepted as "stopping" only if the server was up */
-			s->result |= SRV_CHK_RUNNING | SRV_CHK_DISABLE;
+			 (s->check_code == 404))
+		/* 404 may be accepted as "stopping" only if the server was up */
 			set_server_check_status(s, HCHK_STATUS_L7OKCD);
-		}
-		else {
-			s->result |= SRV_CHK_ERROR;
+		else
 			set_server_check_status(s, HCHK_STATUS_L7STS);
-		}
 	}
 	else if (s->proxy->options & PR_O_SSL3_CHK) {
 		/* Check for SSLv3 alert or handshake */
-		if ((len >= 5) && (trash[0] == 0x15 || trash[0] == 0x16)) {
-			s->result |= SRV_CHK_RUNNING;
+		if ((len >= 5) && (trash[0] == 0x15 || trash[0] == 0x16))
 			set_server_check_status(s, HCHK_STATUS_L6OK);
-		} else {
-			s->result |= SRV_CHK_ERROR;
+		else
 			set_server_check_status(s, HCHK_STATUS_L6RSP);
-		}
 	}
 	else if (s->proxy->options & PR_O_SMTP_CHK) {
 		/* Check if the server speaks SMTP */
 		if ((len < strlen("000\r")) ||
 		    (trash[3] != ' ' && trash[3] != '\r') ||
 		    !isdigit(trash[0]) || !isdigit(trash[1]) || !isdigit(trash[2])) {
-			s->result |= SRV_CHK_ERROR;
 			set_server_check_status(s, HCHK_STATUS_L7RSP);
 			goto out_wakeup;
 		}
@@ -642,17 +714,13 @@
 		s->check_code = str2uic(&trash[0]);
 
 		/* Check for SMTP code 2xx (should be 250) */
-		if (trash[0] == '2') {
-			s->result |= SRV_CHK_RUNNING;
+		if (trash[0] == '2')
 			set_server_check_status(s, HCHK_STATUS_L7OKD);
-		} else {
-			s->result |= SRV_CHK_ERROR;
+		else
 			set_server_check_status(s, HCHK_STATUS_L7STS);
-		}
 	}
 	else {
 		/* other checks are valid if the connection succeeded anyway */
-		s->result |= SRV_CHK_RUNNING;
 		set_server_check_status(s, HCHK_STATUS_L4OK);
 	}
 
@@ -703,8 +771,7 @@
 		}
 
 		/* we'll initiate a new check */
-		s->result = SRV_CHK_UNKNOWN; /* no result yet */
-		s->check_start = now;
+		set_server_check_status(s, HCHK_STATUS_START);
 		if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) != -1) {
 			if ((fd < global.maxsock) &&
 			    (fcntl(fd, F_SETFL, O_NONBLOCK) != -1) &&
@@ -779,7 +846,6 @@
 					}
 
 					if (ret) {
-						s->result |= SRV_CHK_ERROR;
 						set_server_check_status(s, HCHK_STATUS_SOCKERR);
 						switch (ret) {
 						case 1:
@@ -811,7 +877,6 @@
 #endif
 					ret = tcpv4_bind_socket(fd, flags, &s->proxy->source_addr, remote);
 					if (ret) {
-						s->result |= SRV_CHK_ERROR;
 						set_server_check_status(s, HCHK_STATUS_SOCKERR);
 						switch (ret) {
 						case 1:
@@ -869,7 +934,7 @@
 						return t;
 					}
 					else if (errno != EALREADY && errno != EISCONN && errno != EAGAIN) {
-						s->result |= SRV_CHK_ERROR;    /* a real error */
+						/* a real error */
 
 						switch (errno) {
 							/* FIXME: is it possible to get ECONNREFUSED/ENETUNREACH with O_NONBLOCK? */