MAJOR: checks: rework completely bogus state machine
The porting of checks to using connections was totally bogus. Some checks
were considered successful as soon as the connection was established,
regardless of any response. Some errors would be triggered upon recv
if polling was enabled for send or if the send channel was shut down.
Now the behaviour is much better. It would be cleaner to perform the
fd_delete() in wake_srv_chk() and to process failures and timeouts
separately, but this is already a good start.
diff --git a/include/types/server.h b/include/types/server.h
index 87ca0be..ac7ab7b 100644
--- a/include/types/server.h
+++ b/include/types/server.h
@@ -60,6 +60,7 @@
#define SRV_TPROXY_MASK 0x0700 /* bind to a non-local address to reach this server */
#define SRV_SEND_PROXY 0x0800 /* this server talks the PROXY protocol */
#define SRV_NON_STICK 0x1000 /* never add connections allocated to this server to a stick table */
+#define SRV_CHK_RUNNING 0x2000 /* a check is currently running on this server */
/* function which act on servers need to return various errors */
#define SRV_STATUS_OK 0 /* everything is OK. */
diff --git a/src/checks.c b/src/checks.c
index f5b7f23..24e6cb0 100644
--- a/src/checks.c
+++ b/src/checks.c
@@ -765,7 +765,7 @@
int fd = conn->t.sock.fd;
struct task *t = s->check.task;
- if (conn->flags & (CO_FL_SOCK_WR_SH | CO_FL_DATA_WR_SH | CO_FL_WAIT_DATA | CO_FL_WAIT_WR))
+ if (conn->flags & (CO_FL_SOCK_WR_SH | CO_FL_DATA_WR_SH))
conn->flags |= CO_FL_ERROR;
if (unlikely(conn->flags & CO_FL_ERROR)) {
@@ -779,7 +779,7 @@
goto out_error;
}
- if (conn->flags & CO_FL_HANDSHAKE)
+ if (conn->flags & (CO_FL_HANDSHAKE | CO_FL_WAIT_WR))
return;
/* here, we know that the connection is established */
@@ -791,25 +791,23 @@
set_server_check_status(s, HCHK_STATUS_L4CON, strerror(errno));
goto out_wakeup;
}
+ if (s->check.bo->o) {
+ goto out_incomplete;
+ }
}
- if (!s->check.bo->o) {
- /* full request sent, we allow up to <timeout.check> if nonzero for a response */
- if (s->proxy->timeout.check) {
- t->expire = tick_add_ifset(now_ms, s->proxy->timeout.check);
- task_queue(t);
- }
- goto out_nowake;
+ /* full request sent, we allow up to <timeout.check> if nonzero for a response */
+ if (s->proxy->timeout.check) {
+ t->expire = tick_add_ifset(now_ms, s->proxy->timeout.check);
+ task_queue(t);
}
- goto out_poll;
+ goto out_nowake;
}
out_wakeup:
task_wakeup(t, TASK_WOKEN_IO);
out_nowake:
__conn_data_stop_send(conn); /* nothing more to write */
- return;
- out_poll:
- __conn_data_poll_send(conn);
+ out_incomplete:
return;
out_error:
conn->flags |= CO_FL_ERROR;
@@ -839,9 +837,6 @@
int done;
unsigned short msglen;
- if (conn->flags & (CO_FL_SOCK_WR_SH | CO_FL_DATA_WR_SH | CO_FL_WAIT_DATA | CO_FL_WAIT_WR))
- conn->flags |= CO_FL_ERROR;
-
if (unlikely((s->result & SRV_CHK_FAILED) || (conn->flags & CO_FL_ERROR))) {
/* in case of TCP only, this tells us if the connection failed */
if (!(s->result & SRV_CHK_FAILED))
@@ -850,7 +845,7 @@
goto out_wakeup;
}
- if (conn->flags & CO_FL_HANDSHAKE)
+ if (conn->flags & (CO_FL_HANDSHAKE | CO_FL_WAIT_RD))
return;
/* Warning! Linux returns EAGAIN on SO_ERROR if data are still available
@@ -1247,38 +1242,26 @@
*/
static struct task *process_chk(struct task *t)
{
- int attempts = 0;
struct server *s = t->context;
struct connection *conn = s->check.conn;
- int fd;
int rv;
int ret;
- new_chk:
- if (attempts++ > 0) {
- /* we always fail to create a server, let's stop insisting... */
- while (tick_is_expired(t->expire, now_ms))
- t->expire = tick_add(t->expire, MS_TO_TICKS(s->inter));
- return t;
- }
-
- fd = conn->t.sock.fd;
- if (fd < 0) { /* no check currently running */
+ if (!(s->state & SRV_CHK_RUNNING)) {
+ /* no check currently running */
if (!tick_is_expired(t->expire, now_ms)) /* woke up too early */
return t;
/* we don't send any health-checks when the proxy is stopped or when
* the server should not be checked.
*/
- if (!(s->state & SRV_CHECKED) || s->proxy->state == PR_STSTOPPED || (s->state & SRV_MAINTAIN)) {
- while (tick_is_expired(t->expire, now_ms))
- t->expire = tick_add(t->expire, MS_TO_TICKS(s->inter));
- return t;
- }
+ if (!(s->state & SRV_CHECKED) || s->proxy->state == PR_STSTOPPED || (s->state & SRV_MAINTAIN))
+ goto reschedule;
/* we'll initiate a new check */
set_server_check_status(s, HCHK_STATUS_START, NULL);
+ s->state |= SRV_CHK_RUNNING;
s->check.bi->p = s->check.bi->data;
s->check.bi->i = 0;
s->check.bo->p = s->check.bo->data;
@@ -1337,21 +1320,6 @@
switch (ret) {
case SN_ERR_NONE:
- break;
- case SN_ERR_SRVTO: /* ETIMEDOUT */
- case SN_ERR_SRVCL: /* ECONNREFUSED, ENETUNREACH, ... */
- set_server_check_status(s, HCHK_STATUS_L4CON, strerror(errno));
- break;
- case SN_ERR_PRXCOND:
- case SN_ERR_RESOURCE:
- case SN_ERR_INTERNAL:
- set_server_check_status(s, HCHK_STATUS_SOCKERR, NULL);
- break;
- }
-
- if (s->result == SRV_CHK_UNKNOWN) {
- /* connection attempt was started */
-
/* we allow up to min(inter, timeout.connect) for a connection
* to establish but only when timeout.check is set
* as it may be to short for a full check otherwise
@@ -1362,12 +1330,22 @@
int t_con = tick_add(now_ms, s->proxy->timeout.connect);
t->expire = tick_first(t->expire, t_con);
}
- return t;
+ goto reschedule;
+
+ case SN_ERR_SRVTO: /* ETIMEDOUT */
+ case SN_ERR_SRVCL: /* ECONNREFUSED, ENETUNREACH, ... */
+ set_server_check_status(s, HCHK_STATUS_L4CON, strerror(errno));
+ break;
+ case SN_ERR_PRXCOND:
+ case SN_ERR_RESOURCE:
+ case SN_ERR_INTERNAL:
+ set_server_check_status(s, HCHK_STATUS_SOCKERR, NULL);
+ break;
}
- /* here, we have seen a failure */
+ /* here, we have seen a synchronous error, no fd was allocated */
- conn->t.sock.fd = -1; /* report that no check is running anymore */
+ s->state &= ~SRV_CHK_RUNNING;
if (s->health > s->rise) {
s->health--; /* still good */
s->counters.failed_checks++;
@@ -1375,7 +1353,6 @@
else
set_server_down(s);
- //fprintf(stderr, "process_chk: 7, %lu\n", __tv_to_ms(&s->proxy->timeout.connect));
/* we allow up to min(inter, timeout.connect) for a connection
* to establish but only when timeout.check is set
* as it may be to short for a full check otherwise
@@ -1389,7 +1366,6 @@
if (s->proxy->timeout.check)
t->expire = tick_first(t->expire, t_con);
}
- goto new_chk;
}
else {
/* there was a test running.
@@ -1397,21 +1373,21 @@
* which can happen on connect timeout or error.
*/
if (s->result == SRV_CHK_UNKNOWN) {
- if (conn->flags & CO_FL_CONNECTED) {
- /* good TCP connection is enough */
+ if ((conn->flags & CO_FL_CONNECTED) && !(s->proxy->options2 & PR_O2_CHK_ANY)) {
+ /* good connection is enough for pure TCP check */
if (s->check.use_ssl)
set_server_check_status(s, HCHK_STATUS_L6OK, NULL);
else
set_server_check_status(s, HCHK_STATUS_L4OK, NULL);
}
- else if (conn->flags & CO_FL_WAIT_L4_CONN) {
+ else if ((conn->flags & (CO_FL_CONNECTED|CO_FL_WAIT_L4_CONN)) == CO_FL_WAIT_L4_CONN) {
/* L4 failed */
if (conn->flags & CO_FL_ERROR)
set_server_check_status(s, HCHK_STATUS_L4CON, NULL);
else
set_server_check_status(s, HCHK_STATUS_L4TOUT, NULL);
}
- else if (conn->flags & CO_FL_WAIT_L6_CONN) {
+ else if ((conn->flags & (CO_FL_CONNECTED|CO_FL_WAIT_L6_CONN)) == CO_FL_WAIT_L6_CONN) {
/* L6 failed */
if (conn->flags & CO_FL_ERROR)
set_server_check_status(s, HCHK_STATUS_L6RSP, NULL);
@@ -1437,10 +1413,11 @@
set_server_up(s);
}
+
+ s->state &= ~SRV_CHK_RUNNING;
conn_xprt_close(conn);
if (conn->ctrl)
- fd_delete(fd);
- conn->t.sock.fd = -1; /* no check running anymore */
+ fd_delete(conn->t.sock.fd);
rv = 0;
if (global.spread_checks > 0) {
@@ -1448,7 +1425,6 @@
rv -= (int) (2 * rv * (rand() / (RAND_MAX + 1.0)));
}
t->expire = tick_add(now_ms, MS_TO_TICKS(srv_getinter(s) + rv));
- goto new_chk;
}
else if ((s->result & SRV_CHK_FAILED) || tick_is_expired(t->expire, now_ms)) {
if (!(s->result & SRV_CHK_FAILED)) {
@@ -1469,10 +1445,11 @@
}
else
set_server_down(s);
+
+ s->state &= ~SRV_CHK_RUNNING;
conn_xprt_close(conn);
if (conn->ctrl)
- fd_delete(fd);
- conn->t.sock.fd = -1;
+ fd_delete(conn->t.sock.fd);
rv = 0;
if (global.spread_checks > 0) {
@@ -1480,11 +1457,16 @@
rv -= (int) (2 * rv * (rand() / (RAND_MAX + 1.0)));
}
t->expire = tick_add(now_ms, MS_TO_TICKS(srv_getinter(s) + rv));
- goto new_chk;
}
- /* if result is unknown and there's no timeout, we have to wait again */
+ else {
+ /* if result is unknown and there's no timeout, we have to wait again */
+ s->result = SRV_CHK_UNKNOWN;
+ }
}
- s->result = SRV_CHK_UNKNOWN;
+
+ reschedule:
+ while (tick_is_expired(t->expire, now_ms))
+ t->expire = tick_add(t->expire, MS_TO_TICKS(s->inter));
return t;
}
diff --git a/src/dumpstats.c b/src/dumpstats.c
index 17bf973..988f94c 100644
--- a/src/dumpstats.c
+++ b/src/dumpstats.c
@@ -2933,7 +2933,7 @@
}
chunk_appendf(&trash, "\"><u> %s%s",
- tv_iszero(&sv->check.start)?"":"* ",
+ (sv->state & SRV_CHK_RUNNING) ? "" : "* ",
get_check_status_info(sv->check.status));
if (sv->check.status >= HCHK_STATUS_L57DATA)