BUG/MEDIUM: checks: don't needlessly take the server lock in health_adjust()
The server lock was taken preventively for anything in health_adjust(),
including the static config checks needed to detect that the lock was not
needed, while the function is always called on the response path to update
a server's status. This was responsible for huge contention causing a
performance drop of about 17% on 16 threads. Let's move the lock only
where it should be, i.e. inside the function around the critical sections
only. By doing this, a 16-thread process jumped back from 575 to 675 krps.
This should be backported to 2.3 as the situation degraded there, and
maybe later to 2.2.
diff --git a/src/check.c b/src/check.c
index 06c86af..a09c67a 100644
--- a/src/check.c
+++ b/src/check.c
@@ -404,7 +404,7 @@
}
/* note: use health_adjust() only, which first checks that the observe mode is
- * enabled.
+ * enabled. This will take the server lock if needed.
*/
void __health_adjust(struct server *s, short status)
{
@@ -444,6 +444,13 @@
chunk_printf(&trash, "Detected %d consecutive errors, last one was: %s",
s->consecutive_errors, get_analyze_status(status));
+ if (s->check.fastinter)
+ expire = tick_add(now_ms, MS_TO_TICKS(s->check.fastinter));
+ else
+ expire = TICK_ETERNITY;
+
+ HA_SPIN_LOCK(SERVER_LOCK, &s->lock);
+
switch (s->onerror) {
case HANA_ONERR_FASTINTER:
/* force fastinter - nothing to do here as all modes force it */
@@ -476,16 +483,14 @@
break;
}
+ HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock);
+
s->consecutive_errors = 0;
_HA_ATOMIC_ADD(&s->counters.failed_hana, 1);
- if (s->check.fastinter) {
- expire = tick_add(now_ms, MS_TO_TICKS(s->check.fastinter));
- if (tick_is_lt(expire, s->check.task->expire)) {
- s->check.task->expire = expire;
- /* requeue check task with new expire */
- task_queue(s->check.task);
- }
+ if (tick_is_lt(expire, s->check.task->expire)) {
+ /* requeue check task with new expire */
+ task_schedule(s->check.task, expire);
}
}