MEDIUM: listener: make the accept function more robust against pauses
During some tests in multi-process mode under Linux, it appeared that
issuing "disable frontend foo" on the CLI to pause a listener would
make the shutdown(read) of certain processes disturb another process
listening on the same socket, resulting in a 100% CPU loop. What
happens is that accept() returns EAGAIN without accepting anything.
Fortunately, we see that epoll_wait() reports EPOLLIN+EPOLLRDHUP
(likely because the FD points to the same file in the kernel), so we
can use that to stop the other process from trying to accept connections
for a short time and try again later, hoping for the situation to change.
We must not disable the FD otherwise there's no way to re-enable it.
Additionally, during these tests, a loop was encountered on EINVAL that
was not caught. Now if we catch EINVAL, we proceed the same way, in
case the socket is re-enabled later.
diff --git a/src/listener.c b/src/listener.c
index 7ab1a87..4a55e5a 100644
--- a/src/listener.c
+++ b/src/listener.c
@@ -257,6 +257,7 @@
struct listener *l = fdtab[fd].owner;
struct proxy *p = l->frontend;
int max_accept = l->maxaccept ? l->maxaccept : 1;
+ int expire;
int cfd;
int ret;
#ifdef USE_ACCEPT4
@@ -270,14 +271,11 @@
if (!(l->options & LI_O_UNLIMITED) && global.sps_lim) {
int max = freq_ctr_remain(&global.sess_per_sec, global.sps_lim, 0);
- int expire;
if (unlikely(!max)) {
/* frontend accept rate limit was reached */
- limit_listener(l, &global_listener_queue);
expire = tick_add(now_ms, next_event_delay(&global.sess_per_sec, global.sps_lim, 0));
- task_schedule(global_listener_queue_task, tick_first(expire, global_listener_queue_task->expire));
- return;
+ goto wait_expire;
}
if (max_accept > max)
@@ -286,14 +284,11 @@
if (!(l->options & LI_O_UNLIMITED) && global.cps_lim) {
int max = freq_ctr_remain(&global.conn_per_sec, global.cps_lim, 0);
- int expire;
if (unlikely(!max)) {
/* frontend accept rate limit was reached */
- limit_listener(l, &global_listener_queue);
expire = tick_add(now_ms, next_event_delay(&global.conn_per_sec, global.cps_lim, 0));
- task_schedule(global_listener_queue_task, tick_first(expire, global_listener_queue_task->expire));
- return;
+ goto wait_expire;
}
if (max_accept > max)
@@ -302,14 +297,11 @@
#ifdef USE_OPENSSL
if (!(l->options & LI_O_UNLIMITED) && global.ssl_lim && l->bind_conf && l->bind_conf->is_ssl) {
int max = freq_ctr_remain(&global.ssl_per_sec, global.ssl_lim, 0);
- int expire;
if (unlikely(!max)) {
/* frontend accept rate limit was reached */
- limit_listener(l, &global_listener_queue);
expire = tick_add(now_ms, next_event_delay(&global.ssl_per_sec, global.ssl_lim, 0));
- task_schedule(global_listener_queue_task, tick_first(expire, global_listener_queue_task->expire));
- return;
+ goto wait_expire;
}
if (max_accept > max)
@@ -365,8 +357,20 @@
if (unlikely(cfd == -1)) {
switch (errno) {
case EAGAIN:
+ if (fdtab[fd].ev & FD_POLL_HUP) {
+ /* the listening socket might have been disabled in a shared
+ * process and we're a collateral victim. We'll just pause for
+ * a while in case it comes back. In the mean time, we need to
+ * clear this sticky flag.
+ */
+ fdtab[fd].ev &= ~FD_POLL_HUP;
+ goto transient_error;
+ }
fd_cant_recv(fd);
return; /* nothing more to accept */
+ case EINVAL:
+ /* might be trying to accept on a shut fd (eg: soft stop) */
+ goto transient_error;
case EINTR:
case ECONNABORTED:
continue;
@@ -375,26 +379,20 @@
send_log(p, LOG_EMERG,
"Proxy %s reached system FD limit at %d. Please check system tunables.\n",
p->id, maxfd);
- limit_listener(l, &global_listener_queue);
- task_schedule(global_listener_queue_task, tick_add(now_ms, 100)); /* try again in 100 ms */
- return;
+ goto transient_error;
case EMFILE:
if (p)
send_log(p, LOG_EMERG,
"Proxy %s reached process FD limit at %d. Please check 'ulimit-n' and restart.\n",
p->id, maxfd);
- limit_listener(l, &global_listener_queue);
- task_schedule(global_listener_queue_task, tick_add(now_ms, 100)); /* try again in 100 ms */
- return;
+ goto transient_error;
case ENOBUFS:
case ENOMEM:
if (p)
send_log(p, LOG_EMERG,
"Proxy %s reached system memory limit at %d sockets. Please check system tunables.\n",
p->id, maxfd);
- limit_listener(l, &global_listener_queue);
- task_schedule(global_listener_queue_task, tick_add(now_ms, 100)); /* try again in 100 ms */
- return;
+ goto transient_error;
default:
/* unexpected result, let's give up and let other tasks run */
goto stop;
@@ -442,9 +440,7 @@
if (ret == 0) /* successful termination */
continue;
- limit_listener(l, &global_listener_queue);
- task_schedule(global_listener_queue_task, tick_add(now_ms, 100)); /* try again in 100 ms */
- return;
+ goto transient_error;
}
if (l->nbconn >= l->maxconn) {
@@ -473,6 +469,15 @@
stop:
fd_done_recv(fd);
return;
+
+ transient_error:
+ /* pause the listener and try again in 100 ms */
+ expire = tick_add(now_ms, 100);
+
+ wait_expire:
+ limit_listener(l, &global_listener_queue);
+ task_schedule(global_listener_queue_task, tick_first(expire, global_listener_queue_task->expire));
+ return;
}
/*