[MEDIUM] Collect & show information about last health check, v3
Collect information about last health check result,
including L7 code if possible (for example http or smtp
return code) and time took to finish last check.
Health check info is provided on both stats pages (html & csv)
and logged when a server is marked UP or DOWN. Currently active
check are marked with an asterisk, but only in html mode.
Currently there are 14 status codes:
UNK -> unknown
INI -> initializing
SOCKERR -> socket error
L4OK -> check passed on layer 4, no upper layers testing enabled
L4TOUT -> layer 1-4 timeout
L4CON -> layer 1-4 connection problem, for example "Connection refused"
(tcp rst) or "No route to host" (icmp)
L6OK -> check passed on layer 6
L6TOUT -> layer 6 (SSL) timeout
L6RSP -> layer 6 invalid response - protocol error
L7OK -> check passed on layer 7
L7OKC -> check conditionally passed on layer 7, for example
404 with disable-on-404
L7TOUT -> layer 7 (HTTP/SMTP) timeout
L7RSP -> layer 7 invalid response - protocol error
L7STS -> layer 7 response error, for example HTTP 5xx
diff --git a/src/cfgparse.c b/src/cfgparse.c
index 3828812..b36470b 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -2643,6 +2643,8 @@
err_code |= ERR_ALERT | ERR_FATAL;
goto out;
}
+
+ newsrv->check_status = HCHK_STATUS_INI;
newsrv->state |= SRV_CHECKED;
}
diff --git a/src/checks.c b/src/checks.c
index 99c2122..278f0c1 100644
--- a/src/checks.c
+++ b/src/checks.c
@@ -2,7 +2,7 @@
* Health-checks functions.
*
* Copyright 2000-2009 Willy Tarreau <w@1wt.eu>
- * Copyright 2007-2008 Krzysztof Piotr Oledzki <ole@ans.pl>
+ * Copyright 2007-2009 Krzysztof Piotr Oledzki <ole@ans.pl>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -12,6 +12,7 @@
*/
#include <assert.h>
+#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
@@ -34,6 +35,7 @@
#include <types/global.h>
#include <proto/backend.h>
+#include <proto/checks.h>
#include <proto/buffers.h>
#include <proto/fd.h>
#include <proto/log.h>
@@ -45,6 +47,102 @@
#include <proto/server.h>
#include <proto/task.h>
+const char *check_status_description[HCHK_STATUS_SIZE] = {
+ [HCHK_STATUS_UNKNOWN] = "Unknown",
+
+ [HCHK_STATUS_INI] = "Initializing",
+
+ [HCHK_STATUS_SOCKERR] = "Socket error",
+
+ [HCHK_STATUS_L4OK] = "Layer4 check passed",
+ [HCHK_STATUS_L4TOUT] = "Layer4 timeout",
+ [HCHK_STATUS_L4CON] = "Layer4 connection problem",
+
+ [HCHK_STATUS_L6OK] = "Layer6 check passed",
+ [HCHK_STATUS_L6TOUT] = "Layer6 timeout",
+ [HCHK_STATUS_L6RSP] = "Layer6 invalid response",
+
+ [HCHK_STATUS_L7TOUT] = "Layer7 timeout",
+ [HCHK_STATUS_L7RSP] = "Layer7 invalid response",
+
+ [HCHK_STATUS_L7OKD] = "Layer7 check passed",
+ [HCHK_STATUS_L7OKCD] = "Layer7 check conditionally passed",
+ [HCHK_STATUS_L7STS] = "Layer7 wrong status",
+};
+
+
+const char *check_status_info[HCHK_STATUS_SIZE] = {
+ [HCHK_STATUS_UNKNOWN] = "UNK",
+
+ [HCHK_STATUS_INI] = "INI",
+
+ [HCHK_STATUS_SOCKERR] = "SOCKERR",
+
+ [HCHK_STATUS_L4OK] = "L4OK",
+ [HCHK_STATUS_L4TOUT] = "L4TOUT",
+ [HCHK_STATUS_L4CON] = "L4CON",
+
+ [HCHK_STATUS_L6OK] = "L6OK",
+ [HCHK_STATUS_L6TOUT] = "L6TOUT",
+ [HCHK_STATUS_L6RSP] = "L6RSP",
+
+ [HCHK_STATUS_L7TOUT] = "L7TOUT",
+ [HCHK_STATUS_L7RSP] = "L7RSP",
+
+ [HCHK_STATUS_L7OKD] = "L7OK",
+ [HCHK_STATUS_L7OKCD] = "L7OKC",
+ [HCHK_STATUS_L7STS] = "L7STS",
+};
+
+/*
+ * Convert check_status code to description
+ */
+const char *get_check_status_description(short check_status) {
+
+ const char *desc;
+
+ if (check_status < HCHK_STATUS_SIZE)
+ desc = check_status_description[check_status];
+ else
+ desc = NULL;
+
+ if (desc && *desc)
+ return desc;
+ else
+ return check_status_description[HCHK_STATUS_UNKNOWN];
+}
+
+/*
+ * Convert check_status code to short info
+ */
+const char *get_check_status_info(short check_status) {
+
+ const char *info;
+
+ if (check_status < HCHK_STATUS_SIZE)
+ info = check_status_info[check_status];
+ else
+ info = NULL;
+
+ if (info && *info)
+ return info;
+ else
+ return check_status_info[HCHK_STATUS_UNKNOWN];
+}
+
+/*
+ * Set check_status and update check_duration
+ */
+static void set_server_check_status(struct server *s, short status) {
+
+ if (tv_iszero(&s->check_start))
+ return;
+
+ s->check_status = status;
+ s->check_duration = tv_ms_elapsed(&s->check_start, &now);
+ tv_zero(&s->check_start);
+}
+
/* sends a log message when a backend goes down, and also sets last
* change date.
*/
@@ -145,6 +243,12 @@
chunk_printf(&msg, sizeof(trash), " via %s/%s",
s->tracked->proxy->id, s->tracked->id);
+ chunk_printf(&msg, sizeof(trash), ", reason: %s", get_check_status_description(s->check_status));
+ if (s->check_status >= HCHK_STATUS_L57DATA)
+ chunk_printf(&msg, sizeof(trash), ", code: %d", s->check_code);
+
+ chunk_printf(&msg, sizeof(trash), ", check duration: %lums", s->check_duration);
+
chunk_printf(&msg, sizeof(trash), ". %d active and %d backup servers left.%s"
" %d sessions active, %d requeued, %d remaining in queue.\n",
s->proxy->srv_act, s->proxy->srv_bck,
@@ -220,6 +324,10 @@
chunk_printf(&msg, sizeof(trash), " via %s/%s",
s->tracked->proxy->id, s->tracked->id);
+ chunk_printf(&msg, sizeof(trash), ", reason: %s", get_check_status_description(s->check_status));
+ if (s->check_status >= HCHK_STATUS_L57DATA)
+ chunk_printf(&msg, sizeof(trash), ", code: %d", s->check_code);
+
chunk_printf(&msg, sizeof(trash), ". %d active and %d backup servers online.%s"
" %d sessions requeued, %d total in queue.\n",
s->proxy->srv_act, s->proxy->srv_bck,
@@ -339,8 +447,10 @@
struct server *s = t->context;
//fprintf(stderr, "event_srv_chk_w, state=%ld\n", unlikely(fdtab[fd].state));
- if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR)))
+ if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))) {
+ set_server_check_status(s, HCHK_STATUS_L4CON);
goto out_error;
+ }
/* here, we know that the connection is established */
@@ -370,8 +480,19 @@
}
else if (ret == 0 || errno == EAGAIN)
goto out_poll;
- else
+ else {
+ switch (errno) {
+ case ECONNREFUSED:
+ case ENETUNREACH:
+ set_server_check_status(s, HCHK_STATUS_L4CON);
+ break;
+
+ default:
+ set_server_check_status(s, HCHK_STATUS_SOCKERR);
+ }
+
goto out_error;
+ }
}
else {
/* We have no data to send to check the connection, and
@@ -395,11 +516,14 @@
if (errno == EALREADY || errno == EINPROGRESS)
goto out_poll;
- if (errno && errno != EISCONN)
+ if (errno && errno != EISCONN) {
+ set_server_check_status(s, HCHK_STATUS_L4CON);
goto out_error;
+ }
/* good TCP connection is enough */
s->result |= SRV_CHK_RUNNING;
+ set_server_check_status(s, HCHK_STATUS_L4OK);
goto out_wakeup;
}
}
@@ -416,6 +540,7 @@
return 0;
out_error:
s->result |= SRV_CHK_ERROR;
+ /* set_server_check_status() called bofore goto into this label */
fdtab[fd].state = FD_STERROR;
goto out_wakeup;
}
@@ -448,7 +573,12 @@
(getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr) == -1) ||
(skerr != 0))) {
/* in case of TCP only, this tells us if the connection failed */
+
+ if (!(s->result & SRV_CHK_ERROR))
+ set_server_check_status(s, HCHK_STATUS_SOCKERR);
+
s->result |= SRV_CHK_ERROR;
+
goto out_wakeup;
}
@@ -467,40 +597,67 @@
if (s->proxy->options & PR_O_HTTP_CHK) {
/* Check if the server speaks HTTP 1.X */
if ((len < strlen("HTTP/1.0 000\r")) ||
- (memcmp(trash, "HTTP/1.", 7) != 0)) {
+ (memcmp(trash, "HTTP/1.", 7) != 0 ||
+ (trash[12] != ' ' && trash[12] != '\r')) ||
+ !isdigit(trash[9]) || !isdigit(trash[10]) || !isdigit(trash[11])) {
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, HCHK_STATUS_L7RSP);
goto out_wakeup;
}
+ s->check_code = str2uic(&trash[9]);
+
/* check the reply : HTTP/1.X 2xx and 3xx are OK */
- if (trash[9] == '2' || trash[9] == '3')
+ if (trash[9] == '2' || trash[9] == '3') {
s->result |= SRV_CHK_RUNNING;
- else if ((s->proxy->options & PR_O_DISABLE404) &&
+ set_server_check_status(s, HCHK_STATUS_L7OKD);
+ } else if ((s->proxy->options & PR_O_DISABLE404) &&
(s->state & SRV_RUNNING) &&
- (memcmp(&trash[9], "404", 3) == 0)) {
+ (s->check_code == 404)) {
/* 404 may be accepted as "stopping" only if the server was up */
s->result |= SRV_CHK_RUNNING | SRV_CHK_DISABLE;
+ set_server_check_status(s, HCHK_STATUS_L7OKCD);
}
- else
+ else {
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, HCHK_STATUS_L7STS);
+ }
}
else if (s->proxy->options & PR_O_SSL3_CHK) {
/* Check for SSLv3 alert or handshake */
- if ((len >= 5) && (trash[0] == 0x15 || trash[0] == 0x16))
+ if ((len >= 5) && (trash[0] == 0x15 || trash[0] == 0x16)) {
s->result |= SRV_CHK_RUNNING;
- else
+ set_server_check_status(s, HCHK_STATUS_L6OK);
+ } else {
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, HCHK_STATUS_L6RSP);
+ }
}
else if (s->proxy->options & PR_O_SMTP_CHK) {
+ /* Check if the server speaks SMTP */
+ if ((len < strlen("000\r")) ||
+ (trash[3] != ' ' && trash[3] != '\r') ||
+ !isdigit(trash[0]) || !isdigit(trash[1]) || !isdigit(trash[2])) {
+ s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, HCHK_STATUS_L7RSP);
+ goto out_wakeup;
+ }
+
+ s->check_code = str2uic(&trash[0]);
+
/* Check for SMTP code 2xx (should be 250) */
- if ((len >= 3) && (trash[0] == '2'))
+ if (trash[0] == '2') {
s->result |= SRV_CHK_RUNNING;
- else
+ set_server_check_status(s, HCHK_STATUS_L7OKD);
+ } else {
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, HCHK_STATUS_L7STS);
+ }
}
else {
/* other checks are valid if the connection succeeded anyway */
s->result |= SRV_CHK_RUNNING;
+ set_server_check_status(s, HCHK_STATUS_L4OK);
}
out_wakeup:
@@ -551,6 +708,7 @@
/* we'll initiate a new check */
s->result = SRV_CHK_UNKNOWN; /* no result yet */
+ s->check_start = now;
if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) != -1) {
if ((fd < global.maxsock) &&
(fcntl(fd, F_SETFL, O_NONBLOCK) != -1) &&
@@ -626,6 +784,7 @@
if (ret) {
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, HCHK_STATUS_SOCKERR);
switch (ret) {
case 1:
Alert("Cannot bind to source address before connect() for server %s/%s. Aborting.\n",
@@ -657,6 +816,7 @@
ret = tcpv4_bind_socket(fd, flags, &s->proxy->source_addr, remote);
if (ret) {
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, HCHK_STATUS_SOCKERR);
switch (ret) {
case 1:
Alert("Cannot bind to source address before connect() for %s '%s'. Aborting.\n",
@@ -714,6 +874,17 @@
}
else if (errno != EALREADY && errno != EISCONN && errno != EAGAIN) {
s->result |= SRV_CHK_ERROR; /* a real error */
+
+ switch (errno) {
+ /* FIXME: is it possible to get ECONNREFUSED/ENETUNREACH with O_NONBLOCK? */
+ case ECONNREFUSED:
+ case ENETUNREACH:
+ set_server_check_status(s, HCHK_STATUS_L4CON);
+ break;
+
+ default:
+ set_server_check_status(s, HCHK_STATUS_SOCKERR);
+ }
}
}
}
@@ -808,6 +979,17 @@
goto new_chk;
}
else if ((s->result & SRV_CHK_ERROR) || tick_is_expired(t->expire, now_ms)) {
+ if (!(s->result & SRV_CHK_ERROR)) {
+ if (!EV_FD_ISSET(fd, DIR_RD)) {
+ set_server_check_status(s, HCHK_STATUS_L4TOUT);
+ } else {
+ if (s->proxy->options & PR_O_SSL3_CHK)
+ set_server_check_status(s, HCHK_STATUS_L6TOUT);
+ else /* HTTP, SMTP */
+ set_server_check_status(s, HCHK_STATUS_L7TOUT);
+ }
+ }
+
//fprintf(stderr, "process_chk: 10\n");
/* failure or timeout detected */
if (s->health > s->rise) {
@@ -894,6 +1076,7 @@
t->expire = tick_add(now_ms,
MS_TO_TICKS(((mininter && mininter >= srv_getinter(s)) ?
mininter : srv_getinter(s)) * srvpos / nbchk));
+ s->check_start = now;
task_queue(t);
srvpos++;
diff --git a/src/dumpstats.c b/src/dumpstats.c
index 86e73a9..681ae89 100644
--- a/src/dumpstats.c
+++ b/src/dumpstats.c
@@ -2,6 +2,7 @@
* Functions dedicated to statistics output
*
* Copyright 2000-2009 Willy Tarreau <w@1wt.eu>
+ * Copyright 2007-2009 Krzysztof Piotr Oledzki <ole@ans.pl>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -39,6 +40,7 @@
#include <proto/backend.h>
#include <proto/buffers.h>
+#include <proto/checks.h>
#include <proto/dumpstats.h>
#include <proto/fd.h>
#include <proto/freq_ctr.h>
@@ -222,6 +224,7 @@
"chkfail,chkdown,lastchg,downtime,qlimit,"
"pid,iid,sid,throttle,lbtot,tracked,type,"
"rate,rate_lim,rate_max,"
+ "check_status,check_code,check_duration,"
"\n");
}
@@ -860,7 +863,7 @@
"<th colspan=3>Session rate</th><th colspan=5>Sessions</th>"
"<th colspan=2>Bytes</th><th colspan=2>Denied</th>"
"<th colspan=3>Errors</th><th colspan=2>Warnings</th>"
- "<th colspan=8>Server</th>"
+ "<th colspan=9>Server</th>"
"</tr>\n"
"<tr align=\"center\" class=\"titre\">"
"<th>Cur</th><th>Max</th><th>Limit</th>"
@@ -868,7 +871,7 @@
"<th>Limit</th><th>Total</th><th>LbTot</th><th>In</th><th>Out</th>"
"<th>Req</th><th>Resp</th><th>Req</th><th>Conn</th>"
"<th>Resp</th><th>Retr</th><th>Redis</th>"
- "<th>Status</th><th>Wght</th><th>Act</th>"
+ "<th>Status</th><th>LastChk</th><th>Wght</th><th>Act</th>"
"<th>Bck</th><th>Chk</th><th>Dwn</th><th>Dwntme</th>"
"<th>Thrtle</th>\n"
"</tr>",
@@ -912,7 +915,7 @@
/* server status : reflect frontend status */
"<td align=center>%s</td>"
/* rest of server: nothing */
- "<td align=center colspan=7></td></tr>"
+ "<td align=center colspan=8></td></tr>"
"",
U2H0(px->denied_req), U2H1(px->denied_resp),
U2H2(px->failed_req),
@@ -938,8 +941,10 @@
",,,,,,,,"
/* pid, iid, sid, throttle, lbtot, tracked, type */
"%d,%d,0,,,,%d,"
- /* rate, rate_lim, rate_max, */
+ /* rate, rate_lim, rate_max */
"%u,%u,%u,"
+ /* check_status, check_code, check_duration */
+ ",,,"
"\n",
px->id,
px->feconn, px->feconn_max, px->maxconn, px->cum_feconn,
@@ -1044,17 +1049,31 @@
U2H3(sv->failed_conns), U2H4(sv->failed_resp),
sv->retries, sv->redispatches);
- /* status */
+ /* status, lest check */
chunk_printf(&msg, sizeof(trash), "<td nowrap>");
- if (sv->state & SRV_CHECKED)
+ if (sv->state & SRV_CHECKED) {
chunk_printf(&msg, sizeof(trash), "%s ",
human_time(now.tv_sec - sv->last_change, 1));
- chunk_printf(&msg, sizeof(trash),
- srv_hlt_st[sv_state],
- (svs->state & SRV_RUNNING) ? (svs->health - svs->rise + 1) : (svs->health),
- (svs->state & SRV_RUNNING) ? (svs->fall) : (svs->rise));
+ chunk_printf(&msg, sizeof(trash),
+ srv_hlt_st[sv_state],
+ (svs->state & SRV_RUNNING) ? (svs->health - svs->rise + 1) : (svs->health),
+ (svs->state & SRV_RUNNING) ? (svs->fall) : (svs->rise));
+
+ chunk_printf(&msg, sizeof(trash), "</td><td title=\"%s\" nowrap> %s%s",
+ get_check_status_description(sv->check_status),
+ tv_iszero(&sv->check_start)?"":"* ",
+ get_check_status_info(sv->check_status));
+
+ if (sv->check_status >= HCHK_STATUS_L57DATA)
+ chunk_printf(&msg, sizeof(trash), "/%d", sv->check_code);
+
+ if (sv->check_status >= HCHK_STATUS_CHECKED)
+ chunk_printf(&msg, sizeof(trash), " in %lums", sv->check_duration);
+ } else {
+ chunk_printf(&msg, sizeof(trash), "</td><td>");
+ }
chunk_printf(&msg, sizeof(trash),
/* weight */
@@ -1180,6 +1199,26 @@
read_freq_ctr(&sv->sess_per_sec),
sv->sps_max);
+ if (sv->state & SRV_CHECKED) {
+ /* check_status */
+ chunk_printf(&msg, sizeof(trash), "%s,", get_check_status_info(sv->check_status));
+
+ /* check_code */
+ if (sv->check_status >= HCHK_STATUS_L57DATA)
+ chunk_printf(&msg, sizeof(trash), "%u,", sv->check_code);
+ else
+ chunk_printf(&msg, sizeof(trash), ",");
+
+ /* check_duration */
+ if (sv->check_status >= HCHK_STATUS_CHECKED)
+ chunk_printf(&msg, sizeof(trash), "%lu,", sv->check_duration);
+ else
+ chunk_printf(&msg, sizeof(trash), ",");
+
+ } else {
+ chunk_printf(&msg, sizeof(trash), ",,,");
+ }
+
/* finish with EOL */
chunk_printf(&msg, sizeof(trash), "\n");
}
@@ -1228,7 +1267,7 @@
* if the backend has known working servers or if it has no server at
* all (eg: for stats). Then we display the total weight, number of
* active and backups. */
- "<td align=center nowrap>%s %s</td><td align=center>%d</td>"
+ "<td align=center nowrap>%s %s</td><td align=center> </td><td align=center>%d</td>"
"<td align=center>%d</td><td align=center>%d</td>"
"",
U2H0(px->denied_req), U2H1(px->denied_resp),
@@ -1276,6 +1315,8 @@
"%d,%d,0,,%lld,,%d,"
/* rate, rate_lim, rate_max, */
"%u,,%u,"
+ /* check_status, check_code, check_duration */
+ ",,,"
"\n",
px->id,
px->nbpend /* or px->totpend ? */, px->nbpend_max,