[MEDIUM] stats: report server and backend cumulated downtime

Hello,

This patch implements new statistics for SLA calculation by adding new
field 'Dwntime' with total down time since restart (both HTTP/CSV) and
extending status field (HTTP) or inserting a new one (CSV) with time
showing how long each server/backend is in a current state. Additionaly,
down transations are also calculated and displayed for backends, so it is
possible to know how many times selected backend was down, generating "No
server is available to handle this request." error.

New information are presentetd in two different ways:
   - for HTTP: a "human redable form", one of "100000d 23h", "23h 59m" or
      "59m 59s"
   - for CSV: seconds

I believe that seconds resolution is enough.

As there are more columns in the status page I decided to shrink some
names to make more space:
   - Weight -> Wght
   - Check -> Chk
   - Down -> Dwn

Making described changes I also made some improvements and fixed some
small bugs:
   - don't increment s->health above 's->rise + s->fall - 1'. Previously it
     was incremented an then (re)set to 's->rise + s->fall - 1'.
   - do not set server down if it is down already
   - do not set server up if it is up already
   - fix colspan in multiple places (mostly introduced by my previous patch)
   - add missing "status" header to CSV
   - fix order of retries/redispatches in server (CSV)
   - s/Tthen/Then/
   - s/server/backend/ in DATA_ST_PX_BE (dumpstats.c)

Changes from previous version:
  - deal with negative time intervales
  - don't relay on s->state (SRV_RUNNING)
  - little reworked human_time + compacted format (no spaces). If needed it
    can be used in the future for other purposes by optionally making "cnt"
    as an argument
  - leave set_server_down mostly unchanged
  - only little reworked "process_chk: 9"
  - additional fields in CSV are appended to the rigth
  - fix "SEC" macro
  - named arguments (human_time, be_downtime, srv_downtime)

Hope it is OK. If there are only cosmetic changes needed please fill free
to correct it, however if there are some bigger changes required I would
like to discuss it first or at last to know what exactly was changed
especially since I already put this patch into my production server. :)

Thank you,

Best regards,

 				Krzysztof Oledzki
diff --git a/include/common/time.h b/include/common/time.h
index 001d63e..ced4b6b 100644
--- a/include/common/time.h
+++ b/include/common/time.h
@@ -27,6 +27,11 @@
 #include <common/config.h>
 #include <common/standard.h>
 
+#define SEC 1
+#define MINUTE (60 * SEC)
+#define HOUR (60 * MINUTE)
+#define DAY (24 * HOUR)
+
 /* eternity when exprimed in timeval */
 #ifndef TV_ETERNITY
 #define TV_ETERNITY     (~0UL)
@@ -480,6 +485,7 @@
         tv1;                       \
 })
 
+char *human_time(int t, short hz);
 
 #endif /* _COMMON_TIME_H */
 
diff --git a/include/proto/backend.h b/include/proto/backend.h
index 467e756..d70c68f 100644
--- a/include/proto/backend.h
+++ b/include/proto/backend.h
@@ -39,7 +39,7 @@
 
 void recount_servers(struct proxy *px);
 void recalc_server_map(struct proxy *px);
-
+int be_downtime(struct proxy *px);
 
 /*
  * This function tries to find a running server with free connection slots for
diff --git a/include/proto/server.h b/include/proto/server.h
index 2df0763..27e4f36 100644
--- a/include/proto/server.h
+++ b/include/proto/server.h
@@ -31,7 +31,7 @@
 
 #include <proto/queue.h>
 
-
+int srv_downtime(struct server *s);
 
 #endif /* _PROTO_SERVER_H */
 
diff --git a/include/types/proxy.h b/include/types/proxy.h
index ced7630..04d1162 100644
--- a/include/types/proxy.h
+++ b/include/types/proxy.h
@@ -111,6 +111,11 @@
 	unsigned int maxconn;			/* max # of active sessions on the frontend */
 	unsigned int fullconn;			/* #conns on backend above which servers are used at full load */
 	struct in_addr except_net, except_mask; /* don't x-forward-for for this address. FIXME: should support IPv6 */
+
+	unsigned down_trans;			/* up-down transitions */
+	unsigned down_time;			/* total time the proxy was down */
+	time_t last_change;			/* last time, when the state was changed */
+
 	unsigned failed_conns, failed_resp;	/* failed connect() and responses */
 	unsigned retries, redispatches;		/* retried and redispatched connections */
 	unsigned denied_req, denied_resp;	/* blocked requests/responses because of security concerns */
diff --git a/include/types/server.h b/include/types/server.h
index 6e1e810..62179f3 100644
--- a/include/types/server.h
+++ b/include/types/server.h
@@ -86,10 +86,14 @@
 	unsigned wscore;			/* weight score, used during srv map computation */
 
 	unsigned failed_checks, down_trans;	/* failed checks and up-down transitions */
+	unsigned down_time;			/* total time the server was down */
+	time_t last_change;			/* last time, when the state was changed */
+
 	unsigned failed_conns, failed_resp;	/* failed connect() and responses */
 	unsigned retries;			/* retried connections */
 	unsigned failed_secu;			/* blocked responses because of security concerns */
 	unsigned cum_sess;			/* cumulated number of sessions really sent to this server */
+
 	long long bytes_in;			/* number of bytes transferred from the client to the server */
 	long long bytes_out;			/* number of bytes transferred from the server to the client */
 };
diff --git a/src/backend.c b/src/backend.c
index 6cecb17..596d567 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -717,6 +717,13 @@
 	return 0;
 }
 
+int be_downtime(struct proxy *px) {
+
+	if ((px->srv_act || px->srv_bck) && px->last_change < now.tv_sec)		// ignore negative time
+		return px->down_time;
+
+	return now.tv_sec - px->last_change + px->down_time;
+}
 
 /*
  * Local variables:
diff --git a/src/cfgparse.c b/src/cfgparse.c
index 7483a9c..7b73e0b 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -564,7 +564,8 @@
 		tv_eternity(&curproxy->srvtimeout);
 		tv_eternity(&curproxy->contimeout);
 		tv_eternity(&curproxy->appsession_timeout);
-		
+
+		curproxy->last_change = now.tv_sec;
 		curproxy->id = strdup(args[1]);
 		curproxy->cap = rc;
 
@@ -1380,6 +1381,7 @@
 		LIST_INIT(&newsrv->pendconns);
 		do_check = 0;
 		newsrv->state = SRV_RUNNING; /* early server setup */
+		newsrv->last_change = now.tv_sec;
 		newsrv->id = strdup(args[1]);
 
 		/* several ways to check the port component :
diff --git a/src/checks.c b/src/checks.c
index 0cd1a87..77e9b51 100644
--- a/src/checks.c
+++ b/src/checks.c
@@ -57,9 +57,11 @@
 	struct session *sess;
 	int xferred;
 
-	s->state &= ~SRV_RUNNING;
-
 	if (s->health == s->rise) {
+
+		s->last_change = now.tv_sec;
+		s->state &= ~SRV_RUNNING;
+
 		recount_servers(s->proxy);
 		s->proxy->map_state |= PR_MAP_RECALC;
 
@@ -92,10 +94,13 @@
 
 		Warning("%s", trash);
 		send_log(s->proxy, LOG_ALERT, "%s", trash);
-	
+
 		if (s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) {
-			Alert("%s '%s' has no server available !\n", proxy_type_str(s->proxy), s->proxy->id);
-			send_log(s->proxy, LOG_EMERG, "%s %s has no server available !\n", proxy_type_str(s->proxy), s->proxy->id);
+			s->proxy->last_change = now.tv_sec;
+			s->proxy->down_trans++;
+
+			Alert("%s '%s' has no server available!\n", proxy_type_str(s->proxy), s->proxy->id);
+			send_log(s->proxy, LOG_EMERG, "%s %s has no server available!\n", proxy_type_str(s->proxy), s->proxy->id);
 		}
 		s->down_trans++;
 	}
@@ -455,13 +460,24 @@
 		/* there was a test running */
 		if (s->result > 0) { /* good server detected */
 			//fprintf(stderr, "process_chk: 9\n");
-			s->health++; /* was bad, stays for a while */
-			if (s->health >= s->rise) {
-				s->state |= SRV_RUNNING;
+
+			if (s->health < s->rise + s->fall - 1) {
+				s->health++; /* was bad, stays for a while */
 
 				if (s->health == s->rise) {
 					int xferred;
 
+					if (s->last_change < now.tv_sec)			// ignore negative times
+						s->down_time += now.tv_sec - s->last_change;
+					s->last_change = now.tv_sec;
+					s->state |= SRV_RUNNING;
+
+					if (s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) {
+						if (s->proxy->last_change < now.tv_sec)		// ignore negative times
+							s->proxy->down_time += now.tv_sec - s->proxy->last_change;
+						s->proxy->last_change = now.tv_sec;
+					}
+
 					recount_servers(s->proxy);
 					s->proxy->map_state |= PR_MAP_RECALC;
 
@@ -493,7 +509,8 @@
 					send_log(s->proxy, LOG_NOTICE, "%s", trash);
 				}
 
-				s->health = s->rise + s->fall - 1; /* OK now */
+				if (s->health >= s->rise)
+					s->health = s->rise + s->fall - 1; /* OK now */
 			}
 			s->curfd = -1; /* no check running anymore */
 			fd_delete(fd);
diff --git a/src/dumpstats.c b/src/dumpstats.c
index 02e0518..7ec4b56 100644
--- a/src/dumpstats.c
+++ b/src/dumpstats.c
@@ -16,7 +16,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <time.h>
 #include <pwd.h>
 #include <grp.h>
 
@@ -47,6 +46,7 @@
 #include <proto/proto_uxst.h>
 #include <proto/senddata.h>
 #include <proto/session.h>
+#include <proto/server.h>
 
 /* This function parses a "stats" statement in the "global" section. It returns
  * -1 if there is any error, otherwise zero. If it returns -1, it may write an
@@ -186,8 +186,8 @@
 			     "dreq,dresp,"
 			     "ereq,econ,eresp,"
 			     "wretr,wredis,"
-			     "weight,act,bck,"
-			     "chkfail,chkdown"
+			     "status,weight,act,bck,"
+			     "chkfail,chkdown,lastchg,downtime,"
 			     "\n");
 			
 		if (buffer_write_chunk(rep, &msg) != 0)
@@ -371,8 +371,8 @@
 			     "dreq,dresp,"
 			     "ereq,econ,eresp,"
 			     "wretr,wredis,"
-			     "weight,act,bck,"
-			     "chkfail,chkdown"
+			     "status,weight,act,bck,"
+			     "chkfail,chkdown,lastchg,downtime,"
 			     "\n");
 		}
 		if (buffer_write_chunk(rep, &msg) != 0)
@@ -582,26 +582,26 @@
 		if (flags & STAT_FMT_HTML) {
 			/* print a new table */
 			chunk_printf(&msg, sizeof(trash),
-				     "<table cols=\"22\" class=\"tbl\" width=\"100%%\">\n"
+				     "<table cols=\"23\" class=\"tbl\" width=\"100%%\">\n"
 				     "<tr align=\"center\" class=\"titre\">"
 				     "<th colspan=2 class=\"pxname\">%s</th>"
-				     "<th colspan=18 class=\"empty\"></th>"
+				     "<th colspan=21 class=\"empty\"></th>"
 				     "</tr>\n"
 				     "<tr align=\"center\" class=\"titre\">"
 				     "<th rowspan=2></th>"
 				     "<th colspan=2>Queue</th><th colspan=4>Sessions</th>"
 				     "<th colspan=2>Bytes</th><th colspan=2>Denied</th>"
 				     "<th colspan=3>Errors</th><th colspan=2>Warnings</th>"
-				     "<th colspan=6>Server</th>"
+				     "<th colspan=7>Server</th>"
 				     "</tr>\n"
 				     "<tr align=\"center\" class=\"titre\">"
 				     "<th>Cur</th><th>Max</th><th>Cur</th><th>Max</th>"
 				     "<th>Limit</th><th>Cumul</th><th>In</th><th>Out</th>"
 				     "<th>Req</th><th>Resp</th><th>Req</th><th>Conn</th>"
 				     "<th>Resp</th><th>Retr</th><th>Redis</th>"
-				     "<th>Status</th><th>Weight</th><th>Act</th>"
-				     "<th>Bck</th><th>Check</th><th>Down</th></tr>\n"
-				     "",
+				     "<th>Status</th><th>Wght</th><th>Act</th>"
+				     "<th>Bck</th><th>Chk</th><th>Dwn</th><th>Dwntme</th>\n"
+				     "</tr>",
 				     px->id);
 
 			if (buffer_write_chunk(rep, &msg) != 0)
@@ -632,7 +632,7 @@
 				     /* server status : reflect frontend status */
 				     "<td align=center>%s</td>"
 				     /* rest of server: nothing */
-				     "<td align=center colspan=5></td></tr>"
+				     "<td align=center colspan=6></td></tr>"
 				     "",
 				     px->feconn, px->feconn_max, px->maxconn, px->cum_feconn,
 				     px->bytes_in, px->bytes_out,
@@ -657,7 +657,7 @@
 				     /* server status : reflect frontend status */
 				     "%s,"
 				     /* rest of server: nothing */
-				     ",,,,,"
+				     ",,,,,,,"
 				     "\n",
 				     px->id,
 				     px->feconn, px->feconn_max, px->maxconn, px->cum_feconn,
@@ -734,6 +734,11 @@
 				     
 				/* status */
 				chunk_printf(&msg, sizeof(trash), "<td nowrap>");
+
+				if (sv->state & SRV_CHECKED)
+					chunk_printf(&msg, sizeof(trash), "%s ",
+						human_time(now.tv_sec - sv->last_change, 1));
+
 				chunk_printf(&msg, sizeof(trash),
 				     srv_hlt_st[sv_state],
 				     (sv->state & SRV_RUNNING) ? (sv->health - sv->rise + 1) : (sv->health),
@@ -749,14 +754,17 @@
 				     (sv->state & SRV_BACKUP) ? "-" : "Y",
 				     (sv->state & SRV_BACKUP) ? "Y" : "-");
 
-				/* check failures : unique, fatal */
+				/* check failures: unique, fatal, down time */
 				if (sv->state & SRV_CHECKED)
 					chunk_printf(&msg, sizeof(trash),
-					     "<td align=right>%d</td><td align=right>%d</td></tr>\n",
-					     sv->failed_checks, sv->down_trans);
+					     "<td align=right>%d</td><td align=right>%d</td>"
+					     "<td nowrap align=right>%s</td>"
+					     "</tr>\n",
+					     sv->failed_checks, sv->down_trans,
+					     human_time(srv_downtime(sv), 1));
 				else
 					chunk_printf(&msg, sizeof(trash),
-					     "<td colspan=2></td></tr>\n");
+					     "<td colspan=3></td></tr>\n");
 			} else {
 				static char *srv_hlt_st[5] = { "DOWN,", "DOWN %d/%d,", "UP %d/%d,",
 							       "UP,", "no check," };
@@ -774,7 +782,7 @@
 				     /* errors : request, connect, response */
 				     ",%d,%d,"
 				     /* warnings: retries, redispatches */
-				     ",%d,"
+				     "%d,,"
 				     "",
 				     px->id, sv->id,
 				     sv->nbpend, sv->nbpend_max,
@@ -798,14 +806,15 @@
 				     (sv->state & SRV_BACKUP) ? 0 : 1,
 				     (sv->state & SRV_BACKUP) ? 1 : 0);
 
-				/* check failures : unique, fatal */
+				/* check failures: unique, fatal; last change, total downtime */
 				if (sv->state & SRV_CHECKED)
 					chunk_printf(&msg, sizeof(trash),
-					     "%d,%d,\n",
-					     sv->failed_checks, sv->down_trans);
+					     "%d,%d,%d,%d,\n",
+					     sv->failed_checks, sv->down_trans,
+					     now.tv_sec - sv->last_change, srv_downtime(sv));
 				else
 					chunk_printf(&msg, sizeof(trash),
-					     ",,\n");
+					     ",,,,\n");
 			}
 			if (buffer_write_chunk(rep, &msg) != 0)
 				return 0;
@@ -847,23 +856,29 @@
 				     "<td align=right></td><td align=right>%d</td><td align=right>%d</td>\n"
 				     /* warnings: retries, redispatches */
 				     "<td align=right>%d</td><td align=right>%d</td>"
-				     /* server status : reflect backend status (up/down) : we display UP
+				     /* backend status: reflect backend status (up/down): we display UP
 				      * if the backend has known working servers or if it has no server at
-				      * all (eg: for stats). Tthen we display the total weight, number of
+				      * all (eg: for stats). Then we display the total weight, number of
 				      * active and backups. */
-				     "<td align=center>%s</td><td align=center>%d</td>"
-				     "<td align=center>%d</td><td align=center>%d</td>"
-				     /* rest of server: nothing */
-				     "<td align=center colspan=2></td></tr>"
-				     "",
+				     "<td align=center nowrap>%s %s</td><td align=center>%d</td>"
+				     "<td align=center>%d</td><td align=center>%d</td>",
 				     px->nbpend /* or px->totpend ? */, px->nbpend_max,
 				     px->beconn, px->beconn_max, px->fullconn, px->cum_beconn,
 				     px->bytes_in, px->bytes_out,
 				     px->denied_req, px->denied_resp,
 				     px->failed_conns, px->failed_resp,
 				     px->retries, px->redispatches,
+				     human_time(now.tv_sec - px->last_change, 1),
 				     (px->srv_map_sz > 0 || !px->srv) ? "UP" : "DOWN",
 				     px->srv_map_sz * gcd, px->srv_act, px->srv_bck);
+
+				chunk_printf(&msg, sizeof(trash),
+				     /* rest of backend: nothing, down transformations, total downtime */
+				     "<td align=center>&nbsp;</td><td align=\"right\">%d</td>"
+				     "<td align=\"right\" nowrap>%s</td>"
+				     "</tr>",
+				     px->down_trans,
+				     px->srv?human_time(be_downtime(px), 1):"&nbsp;");
 			} else {
 				chunk_printf(&msg, sizeof(trash),
 				     /* pxid, name */
@@ -880,14 +895,15 @@
 				     ",%d,%d,"
 				     /* warnings: retries, redispatches */
 				     "%d,%d,"
-				     /* server status : reflect backend status (up/down) : we display UP
+				     /* backend status: reflect backend status (up/down): we display UP
 				      * if the backend has known working servers or if it has no server at
-				      * all (eg: for stats). Tthen we display the total weight, number of
+				      * all (eg: for stats). Then we display the total weight, number of
 				      * active and backups. */
 				     "%s,"
 				     "%d,%d,%d,"
-				     /* rest of server: nothing */
-				     ",,"
+				     /* rest of backend: nothing, down transformations,
+				      * last change, total downtime. */
+				     ",%d,%d,%d,"
 				     "\n",
 				     px->id,
 				     px->nbpend /* or px->totpend ? */, px->nbpend_max,
@@ -897,7 +913,9 @@
 				     px->failed_conns, px->failed_resp,
 				     px->retries, px->redispatches,
 				     (px->srv_map_sz > 0 || !px->srv) ? "UP" : "DOWN",
-				     px->srv_map_sz * gcd, px->srv_act, px->srv_bck);
+				     px->srv_map_sz * gcd, px->srv_act, px->srv_bck,
+				     px->down_trans, now.tv_sec - px->last_change,
+				     px->srv?be_downtime(px):0);
 			}
 			if (buffer_write_chunk(rep, &msg) != 0)
 				return 0;
diff --git a/src/server.c b/src/server.c
index 5b33d11..8b0fa14 100644
--- a/src/server.c
+++ b/src/server.c
@@ -11,10 +11,19 @@
  */
 
 #include <common/config.h>
+#include <common/time.h>
+
 #include <types/backend.h>
 #include <types/proxy.h>
 #include <types/server.h>
 
+int srv_downtime(struct server *s) {
+
+	if ((s->state & SRV_RUNNING) && s->last_change < now.tv_sec)		// ignore negative time
+		return s->down_time;
+
+	return now.tv_sec - s->last_change + s->down_time;
+}
 
 /*
  * Local variables:
diff --git a/src/time.c b/src/time.c
index b80dca9..bb98d98 100644
--- a/src/time.c
+++ b/src/time.c
@@ -142,6 +142,40 @@
 	return __tv_isgt(tv1, tv2);
 }
 
+char *human_time(int t, short hz) {
+	static char rv[sizeof("24855d23h")+1];	// longest of "23h59m" and "59m59s"
+	char *p = rv;
+	int cnt=2;				// print two numbers
+
+	if (unlikely(t < 0 || hz <= 0)) {
+		sprintf(p, "?");
+		return rv;
+	}
+
+	if (unlikely(hz > 1))
+		t /= hz;
+
+	if (t >= DAY) {
+		p += sprintf(p, "%dd", t / DAY);
+		cnt--;
+	}
+
+	if (cnt && t % DAY / HOUR) {
+		p += sprintf(p, "%dh", t % DAY / HOUR);
+		cnt--;
+	}
+
+	if (cnt && t % HOUR / MINUTE) {
+		p += sprintf(p, "%dm", t % HOUR / MINUTE);
+		cnt--;
+	}
+
+	if ((cnt && t % MINUTE) || !t)					// also display '0s'
+		p += sprintf(p, "%ds", t % MINUTE / SEC);
+
+	return rv;
+}
+
 /*
  * Local variables:
  *  c-indent-level: 8