MAJOR: checks: move health checks changes to set_server_check_status() We don't want to manipulate check's statuses anymore in functions which modify the server's state. So since any check is forced to call set_server_check_status() exactly once to report the result of the check, it's the best place to update the check's health.

commit: 7b1d47ce1b28f33a8cae811fdbfec2ac85d7f196 [log] [tgz]
author: Willy Tarreau <w@1wt.eu> Tue May 20 14:55:13 2014 +0200
committer: Willy Tarreau <w@1wt.eu> Fri May 23 14:29:11 2014 +0200
tree: eb6eb2c028812f34997cd5a8b8b549344ffa1dc1
parent: af54958d7276a339b62fff67d1e368af4b958a91 [diff]
diff --git a/src/checks.c b/src/checks.c
index 8d708a3..9875980 100644
--- a/src/checks.c
+++ b/src/checks.c

@@ -206,7 +206,8 @@
 
 /*
  * Set check->status, update check->duration and fill check->result with
- * an adequate CHK_RES_* value.
+ * an adequate CHK_RES_* value. The new check->health is computed based
+ * on the result.
  *
  * Show information in logs about failed health check if server is UP
  * or succeeded health checks if server is DOWN.
@@ -215,6 +216,7 @@
 {
 	struct server *s = check->server;
 	short prev_status = check->status;
+	int report = 0;
 
 	if (status == HCHK_STATUS_START) {
 		check->result = CHK_RES_UNKNOWN;	/* no result yet */
@@ -245,71 +247,59 @@
 	}
 
 	/* Failure to connect to the agent as a secondary check should not
-	 * cause the server to be marked down. So only log status changes
-	 * for HCHK_STATUS_* statuses */
+	 * cause the server to be marked down.
+	 */
 	if ((check->state & CHK_ST_AGENT) && check->status < HCHK_STATUS_L7TOUT)
 		return;
 
-	if (s->proxy->options2 & PR_O2_LOGHCHKS &&
-	    ((status != prev_status) ||
-	    ((check->health != 0) && (check->result == CHK_RES_FAILED)) ||
-	    (((check->health != check->rise + check->fall - 1)) && (check->result >= CHK_RES_PASSED)))) {
-		int health, rise, fall;
-		enum srv_state state;
-
-		chunk_reset(&trash);
-
-		/* FIXME begin: calculate local version of the health/rise/fall/state */
-		health = check->health;
-		rise   = check->rise;
-		fall   = check->fall;
-		state  = s->state;
-
-		switch (check->result) {
-		case CHK_RES_FAILED:
-			if (health > rise) {
-				health--; /* still good */
-			} else {
-				if (health == rise)
-					state = SRV_ST_STOPPED;
+	report = 0;
 
-				health = 0;
-			}
-			break;
+	switch (check->result) {
+	case CHK_RES_FAILED:
+		if (check->health >= check->rise) {
+			s->counters.failed_checks++;
+			report = 1;
+			check->health--;
+			if (check->health < check->rise)
+				check->health = 0;
+		}
+		break;
 
-		case CHK_RES_PASSED:
-		case CHK_RES_CONDPASS:
-			if (health < rise + fall - 1) {
-				health++; /* was bad, stays for a while */
+	case CHK_RES_PASSED:
+	case CHK_RES_CONDPASS:	/* "condpass" cannot make the first step but it OK after a "passed" */
+		if ((check->health < check->rise + check->fall - 1) &&
+		    (check->result == CHK_RES_PASSED || check->health > 0)) {
+			report = 1;
+			check->health++;
 
-				if (health == rise)
-					state = SRV_ST_RUNNING;
+			if (check->health >= check->rise)
+				check->health = check->rise + check->fall - 1; /* OK now */
+		}
 
-				if (health >= rise)
-					health = rise + fall - 1; /* OK now */
-			}
+		/* clear consecutive_errors if observing is enabled */
+		if (s->onerror)
+			s->consecutive_errors = 0;
+		break;
 
-			/* clear consecutive_errors if observing is enabled */
-			if (s->onerror)
-				s->consecutive_errors = 0;
-			break;
-		default:
-			break;
-		}
+	default:
+		break;
+	}
 
-		chunk_appendf(&trash,
+	if (s->proxy->options2 & PR_O2_LOGHCHKS &&
+	    (status != prev_status || report)) {
+		chunk_printf(&trash,
 		             "Health check for %sserver %s/%s %s%s",
 		             s->flags & SRV_F_BACKUP ? "backup " : "",
 		             s->proxy->id, s->id,
 		             (check->result == CHK_RES_CONDPASS) ? "conditionally ":"",
-		             (check->result >= CHK_RES_PASSED)   ? "succeeded":"failed");
+		             (check->result >= CHK_RES_PASSED)   ? "succeeded" : "failed");
 
 		check_report_srv_status(&trash, s, check, -1);
 
 		chunk_appendf(&trash, ", status: %d/%d %s",
-		             (state != SRV_ST_STOPPED) ? (health - rise + 1) : (health),
-		             (state != SRV_ST_STOPPED) ? (fall) : (rise),
-			     (state != SRV_ST_STOPPED) ? (s->uweight?"UP":"DRAIN"):"DOWN");
+		             (check->health >= check->rise) ? check->health - check->rise + 1 : check->health,
+		             (check->health >= check->rise) ? check->fall : check->rise,
+			     (check->health >= check->rise) ? (s->uweight ? "UP" : "DRAIN") : "DOWN");
 
 		Warning("%s.\n", trash.str);
 		send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.str);
@@ -332,6 +322,15 @@
 	if ((s->admin & SRV_ADMF_MAINT) || s->state == SRV_ST_STOPPED)
 		return;
 
+	/* The agent secondary check should only cause a server to be marked
+	 * as down if check->status is HCHK_STATUS_L7STS, which indicates
+	 * that the agent returned "fail", "stopped" or "down".
+	 * The implication here is that failure to connect to the agent
+	 * as a secondary check should not cause the server to be marked
+	 * down. */
+	if ((check->state & CHK_ST_AGENT) && check->status != HCHK_STATUS_L7STS)
+		return;
+
 	check->health = 0; /* failure */
 	s->last_change = now.tv_sec;
 	s->state = SRV_ST_STOPPED;
@@ -500,27 +499,6 @@
 		check->health = check->rise + check->fall - 1; /* OK now */
 }
 
-static void check_failed(struct check *check)
-{
-	struct server *s = check->server;
-
-	/* The agent secondary check should only cause a server to be marked
-	 * as down if check->status is HCHK_STATUS_L7STS, which indicates
-	 * that the agent returned "fail", "stopped" or "down".
-	 * The implication here is that failure to connect to the agent
-	 * as a secondary check should not cause the server to be marked
-	 * down. */
-	if ((check->state & CHK_ST_AGENT) && check->status != HCHK_STATUS_L7STS)
-		return;
-
-	if (check->health > check->rise) {
-		check->health--; /* still good */
-		s->counters.failed_checks++;
-	}
-	else
-		check_set_server_down(check);
-}
-
 /* note: use health_adjust() only, which first checks that the observe mode is
  * enabled.
  */
@@ -577,7 +555,8 @@
 		case HANA_ONERR_FAILCHK:
 		/* simulate a failed health check */
 			set_server_check_status(&s->check, HCHK_STATUS_HANA, trash.str);
-			check_failed(&s->check);
+			if (!s->check.health)
+				check_set_server_down(&s->check);
 
 			break;
 
@@ -585,7 +564,8 @@
 		/* mark server down */
 			s->check.health = s->check.rise;
 			set_server_check_status(&s->check, HCHK_STATUS_HANA, trash.str);
-			check_set_server_down(&s->check);
+			if (!s->check.health)
+				check_set_server_down(&s->check);
 
 			break;
 
@@ -1528,7 +1508,8 @@
 		/* here, we have seen a synchronous error, no fd was allocated */
 
 		check->state &= ~CHK_ST_INPROGRESS;
-		check_failed(check);
+		if (!check->health)
+			check_set_server_down(check);
 
 		/* we allow up to min(inter, timeout.connect) for a connection
 		 * to establish but only when timeout.check is set
@@ -1577,21 +1558,20 @@
 
 		if (check->result == CHK_RES_FAILED) {
 			/* a failure or timeout detected */
-			check_failed(check);
+			if (!check->health && check->server->state != SRV_ST_STOPPED)
+				check_set_server_down(check);
 		}
 		else if (check->result == CHK_RES_CONDPASS && s->state != SRV_ST_STOPPED && !(s->admin & SRV_ADMF_MAINT)) {
 			/* check is OK but asks for drain mode */
-			if (check->health < check->rise + check->fall - 1) {
-				check->health++;
+			if (check->health >= check->rise && check->server->state != SRV_ST_STOPPING)
 				check_set_server_drain(check);
-			}
 		}
 		else if (check->result == CHK_RES_PASSED && !(s->admin & SRV_ADMF_MAINT)) {
 			/* check is OK */
-			if (check->health < check->rise + check->fall - 1) {
-				check->health++;
+			if (check->health >= check->rise &&
+			    (check->server->state == SRV_ST_STOPPING ||
+			     check->server->state == SRV_ST_STOPPED))
 				check_set_server_up(check);
-			}
 		}
 		check->state &= ~CHK_ST_INPROGRESS;
commit	7b1d47ce1b28f33a8cae811fdbfec2ac85d7f196	[log] [tgz]
author	Willy Tarreau <w@1wt.eu>	Tue May 20 14:55:13 2014 +0200
committer	Willy Tarreau <w@1wt.eu>	Fri May 23 14:29:11 2014 +0200
tree	eb6eb2c028812f34997cd5a8b8b549344ffa1dc1
parent	af54958d7276a339b62fff67d1e368af4b958a91 [diff]