MINOR: global: add some global activity counters to help debugging

A number of counters have been added at special places helping better
understanding certain bug reports. These counters are maintained per
thread and are shown using "show activity" on the CLI. The "clear
counters" commands also reset these counters. The output is sent as a
single write(), which currently produces up to about 7 kB of data for
64 threads. If more counters are added, it may be necessary to write
into multiple buffers, or to reset the counters.

To backport to 1.8 to help collect more detailed bug reports.
diff --git a/doc/management.txt b/doc/management.txt
index 3240505..af21652 100644
--- a/doc/management.txt
+++ b/doc/management.txt
@@ -1321,7 +1321,8 @@
 
 clear counters
   Clear the max values of the statistics counters in each proxy (frontend &
-  backend) and in each server. The accumulated counters are not affected. This
+  backend) and in each server. The accumulated counters are not affected. The
+  internal activity counters reported by "show activity" are also reset. This
   can be used to get clean counters after an incident, without having to
   restart nor to clear traffic counters. This command is restricted and can
   only be issued on sockets configured for levels "operator" or "admin".
@@ -1861,6 +1862,18 @@
   that the output format may evolve over time so this output must not be parsed
   by tools designed to be durable.
 
+show activity
+  Reports some counters about internal events that will help developers and
+  more generally people who know haproxy well enough to narrow down the causes
+  of reports of abnormal behaviours. A typical example would be a properly
+  running process never sleeping and eating 100% of the CPU. The output fields
+  will be made of one line per metric, and per-thread counters on the same
+  line. These counters are 32-bit and will wrap during the process' life, which
+  is not a problem since calls to this command will typically be performed
+  twice. The fields are purposely not documented so that their exact meaning is
+  verified in the code where the counters are fed. These values are also reset
+  by the "clear counters" command.
+
 show info [typed|json]
   Dump info about haproxy status on current process. If "typed" is passed as an
   optional argument, field numbers, names and types are emitted as well so that
diff --git a/include/types/global.h b/include/types/global.h
index 5c5cf73..bd7761c 100644
--- a/include/types/global.h
+++ b/include/types/global.h
@@ -173,7 +173,34 @@
 #endif
 };
 
+/* per-thread activity reports. It's important that it's aligned on cache lines
+ * because some elements will be updated very often. Most counters are OK on
+ * 32-bit since this will be used during debugging sessions for troubleshooting
+ * in iterative mode.
+ */
+struct activity {
+	unsigned int loops;        // complete loops in run_poll_loop()
+	unsigned int wake_cache;   // active fd_cache prevented poll() from sleeping
+	unsigned int wake_tasks;   // active tasks prevented poll() from sleeping
+	unsigned int wake_applets; // active applets prevented poll() from sleeping
+	unsigned int wake_signal;  // pending signal prevented poll() from sleeping
+	unsigned int poll_exp;     // number of times poll() sees an expired timeout (includes wake_*)
+	unsigned int poll_drop;    // poller dropped a dead FD from the update list
+	unsigned int poll_dead;    // poller woke up with a dead FD
+	unsigned int poll_skip;    // poller skipped another thread's FD
+	unsigned int fd_skip;      // fd cache skipped another thread's FD
+	unsigned int fd_lock;      // fd cache skipped a locked FD
+	unsigned int fd_del;       // fd cache detected a deleted FD
+	unsigned int conn_dead;    // conn_fd_handler woke up on an FD indicating a dead connection
+	unsigned int stream;       // calls to process_stream()
+	unsigned int empty_rq;     // calls to process_runnable_tasks() with nothing for the thread
+	unsigned int long_rq;      // process_runnable_tasks() left with tasks in the run queue
+	char __pad[0]; // unused except to check remaining room
+	char __end[0] __attribute__((aligned(64))); // align size to 64.
+};
+
 extern struct global global;
+extern struct activity activity[MAX_THREADS];
 extern int  pid;                /* current process id */
 extern int  relative_pid;       /* process id starting at 1 */
 extern unsigned long pid_bit;   /* bit corresponding to the process id */
diff --git a/src/cli.c b/src/cli.c
index 149ecd7..3e62c31 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -855,6 +855,53 @@
 	return 1;
 }
 
+/* This function dumps some activity counters used by developers and support to
+ * rule out some hypothesis during bug reports. It returns 0 if the output
+ * buffer is full and it needs to be called again, otherwise non-zero. It dumps
+ * everything at once in the buffer and is not designed to do it in multiple
+ * passes.
+ */
+static int cli_io_handler_show_activity(struct appctx *appctx)
+{
+	struct stream_interface *si = appctx->owner;
+	int thr;
+
+	if (unlikely(si_ic(si)->flags & (CF_WRITE_ERROR|CF_SHUTW)))
+		return 1;
+
+	chunk_reset(&trash);
+
+	chunk_appendf(&trash, "thread_id: %u", tid);
+	chunk_appendf(&trash, "\ndate_now: %lu.%06lu", (long)now.tv_sec, (long)now.tv_usec);
+	chunk_appendf(&trash, "\nloops:");        for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].loops);
+	chunk_appendf(&trash, "\nwake_cache:");   for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].wake_cache);
+	chunk_appendf(&trash, "\nwake_tasks:");   for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].wake_tasks);
+	chunk_appendf(&trash, "\nwake_applets:"); for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].wake_applets);
+	chunk_appendf(&trash, "\nwake_signal:");  for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].wake_signal);
+	chunk_appendf(&trash, "\npoll_exp:");     for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].poll_exp);
+	chunk_appendf(&trash, "\npoll_drop:");    for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].poll_drop);
+	chunk_appendf(&trash, "\npoll_dead:");    for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].poll_dead);
+	chunk_appendf(&trash, "\npoll_skip:");    for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].poll_skip);
+	chunk_appendf(&trash, "\nfd_skip:");      for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].fd_skip);
+	chunk_appendf(&trash, "\nfd_lock:");      for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].fd_lock);
+	chunk_appendf(&trash, "\nfd_del:");       for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].fd_del);
+	chunk_appendf(&trash, "\nconn_dead:");    for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].conn_dead);
+	chunk_appendf(&trash, "\nstream:");       for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].stream);
+	chunk_appendf(&trash, "\nempty_rq:");     for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].empty_rq);
+	chunk_appendf(&trash, "\nlong_rq:");      for (thr = 0; thr < global.nbthread; thr++) chunk_appendf(&trash, " %u", activity[thr].long_rq);
+
+	chunk_appendf(&trash, "\n");
+
+	if (ci_putchk(si_ic(si), &trash) == -1) {
+		chunk_reset(&trash);
+		chunk_printf(&trash, "[output too large, cannot dump]\n");
+		si_applet_cant_put(si);
+	}
+
+	/* dump complete */
+	return 1;
+}
+
 /*
  * CLI IO handler for `show cli sockets`.
  * Uses ctx.cli.p0 to store the restart pointer.
@@ -1428,6 +1475,7 @@
 	{ { "show", "env",  NULL }, "show env [var] : dump environment variables known to the process", cli_parse_show_env, cli_io_handler_show_env, NULL },
 	{ { "show", "cli", "sockets",  NULL }, "show cli sockets : dump list of cli sockets", cli_parse_default, cli_io_handler_show_cli_sock, NULL },
 	{ { "show", "fd", NULL }, "show fd [num] : dump list of file descriptors in use", cli_parse_show_fd, cli_io_handler_show_fd, NULL },
+	{ { "show", "activity", NULL }, "show activity : show per-thread activity stats (for support/developers)", cli_parse_default, cli_io_handler_show_activity, NULL },
 	{ { "_getsocks", NULL }, NULL,  _getsocks, NULL },
 	{{},}
 }};
diff --git a/src/connection.c b/src/connection.c
index 0f8acb0..48d7a64 100644
--- a/src/connection.c
+++ b/src/connection.c
@@ -63,8 +63,10 @@
 	struct connection *conn = fdtab[fd].owner;
 	unsigned int flags;
 
-	if (unlikely(!conn))
+	if (unlikely(!conn)) {
+		activity[tid].conn_dead++;
 		return;
+	}
 
 	conn_refresh_polling_flags(conn);
 	conn->flags |= CO_FL_WILL_UPDATE;
diff --git a/src/ev_epoll.c b/src/ev_epoll.c
index 602a243..679dfee 100644
--- a/src/ev_epoll.c
+++ b/src/ev_epoll.c
@@ -68,8 +68,10 @@
 	for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) {
 		fd = fd_updt[updt_idx];
 
-		if (!fdtab[fd].owner)
+		if (!fdtab[fd].owner) {
+			activity[tid].poll_drop++;
 			continue;
+		}
 
 		HA_SPIN_LOCK(FD_LOCK, &fdtab[fd].lock);
 		fdtab[fd].updated = 0;
@@ -114,8 +116,10 @@
 	/* compute the epoll_wait() timeout */
 	if (!exp)
 		wait_time = MAX_DELAY_MS;
-	else if (tick_is_expired(exp, now_ms))
+	else if (tick_is_expired(exp, now_ms)) {
+		activity[tid].poll_exp++;
 		wait_time = 0;
+	}
 	else {
 		wait_time = TICKS_TO_MS(tick_remain(now_ms, exp)) + 1;
 		if (wait_time > MAX_DELAY_MS)
@@ -136,8 +140,15 @@
 		unsigned int e = epoll_events[count].events;
 		fd = epoll_events[count].data.fd;
 
+		if (!fdtab[fd].owner) {
+			activity[tid].poll_dead++;
+			continue;
+		}
+
-		if (!fdtab[fd].owner || !(fdtab[fd].thread_mask & tid_bit))
+		if (!(fdtab[fd].thread_mask & tid_bit)) {
+			activity[tid].poll_skip++;
 			continue;
+		}
 
 		/* it looks complicated but gcc can optimize it away when constants
 		 * have same values... In fact it depends on gcc :-(
diff --git a/src/ev_kqueue.c b/src/ev_kqueue.c
index b42ee3d..69d51b6 100644
--- a/src/ev_kqueue.c
+++ b/src/ev_kqueue.c
@@ -47,8 +47,10 @@
 	for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) {
 		fd = fd_updt[updt_idx];
 
-		if (!fdtab[fd].owner)
+		if (!fdtab[fd].owner) {
+			activity[tid].poll_drop++;
 			continue;
+		}
 
 		HA_SPIN_LOCK(FD_LOCK, &fdtab[fd].lock);
 		fdtab[fd].updated = 0;
@@ -106,6 +108,8 @@
 		timeout.tv_sec  = (delta_ms / 1000);
 		timeout.tv_nsec = (delta_ms % 1000) * 1000000;
 	}
+	else
+		activity[tid].poll_exp++;
 
 	fd = MIN(maxfd, global.tune.maxpollevents);
 	gettimeofday(&before_poll, NULL);
@@ -122,8 +126,15 @@
 		unsigned int n = 0;
 		fd = kev[count].ident;
 
-		if (!fdtab[fd].owner || !(fdtab[fd].thread_mask & tid_bit))
+		if (!fdtab[fd].owner) {
+			activity[tid].poll_dead++;
 			continue;
+		}
+
+		if (!(fdtab[fd].thread_mask & tid_bit)) {
+			activity[tid].poll_skip++;
+			continue;
+		}
 
 		if (kev[count].filter ==  EVFILT_READ) {
 			if (kev[count].data)
diff --git a/src/ev_poll.c b/src/ev_poll.c
index 610509b..efd56ee 100644
--- a/src/ev_poll.c
+++ b/src/ev_poll.c
@@ -73,8 +73,10 @@
 	for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) {
 		fd = fd_updt[updt_idx];
 
-		if (!fdtab[fd].owner)
+		if (!fdtab[fd].owner) {
+			activity[tid].poll_drop++;
 			continue;
+		}
 
 		HA_SPIN_LOCK(FD_LOCK, &fdtab[fd].lock);
 		fdtab[fd].updated = 0;
@@ -111,13 +113,21 @@
 			continue;
 
 		for (count = 0, fd = fds * 8*sizeof(**fd_evts); count < 8*sizeof(**fd_evts) && fd < maxfd; count++, fd++) {
-
-			if (!fdtab[fd].owner || !(fdtab[fd].thread_mask & tid_bit))
-				continue;
-
 			sr = (rn >> count) & 1;
 			sw = (wn >> count) & 1;
 			if ((sr|sw)) {
+				if (!fdtab[fd].owner) {
+					/* should normally not happen here except
+					 * due to rare thread concurrency
+					 */
+					continue;
+				}
+
+				if (!(fdtab[fd].thread_mask & tid_bit)) {
+					activity[tid].poll_skip++;
+					continue;
+				}
+
 				poll_events[nbfd].fd = fd;
 				poll_events[nbfd].events = (sr ? (POLLIN | POLLRDHUP) : 0) | (sw ? POLLOUT : 0);
 				nbfd++;
@@ -128,8 +138,10 @@
 	/* now let's wait for events */
 	if (!exp)
 		wait_time = MAX_DELAY_MS;
-	else if (tick_is_expired(exp, now_ms))
+	else if (tick_is_expired(exp, now_ms)) {
+		activity[tid].poll_exp++;
 		wait_time = 0;
+	}
 	else {
 		wait_time = TICKS_TO_MS(tick_remain(now_ms, exp)) + 1;
 		if (wait_time > MAX_DELAY_MS)
@@ -152,8 +164,10 @@
 		/* ok, we found one active fd */
 		status--;
 
-		if (!fdtab[fd].owner)
+		if (!fdtab[fd].owner) {
+			activity[tid].poll_dead++;
 			continue;
+		}
 
 		/* it looks complicated but gcc can optimize it away when constants
 		 * have same values... In fact it depends on gcc :-(
diff --git a/src/ev_select.c b/src/ev_select.c
index b2b4e50..52c4454 100644
--- a/src/ev_select.c
+++ b/src/ev_select.c
@@ -55,8 +55,10 @@
 	for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) {
 		fd = fd_updt[updt_idx];
 
-		if (!fdtab[fd].owner)
+		if (!fdtab[fd].owner) {
+			activity[tid].poll_drop++;
 			continue;
+		}
 
 		HA_SPIN_LOCK(FD_LOCK, &fdtab[fd].lock);
 		fdtab[fd].updated = 0;
@@ -117,6 +119,8 @@
 		delta.tv_sec  = (delta_ms / 1000);
 		delta.tv_usec = (delta_ms % 1000) * 1000;
 	}
+	else
+		activity[tid].poll_exp++;
 
 	gettimeofday(&before_poll, NULL);
 	status = select(maxfd,
@@ -138,11 +142,15 @@
 		for (count = BITS_PER_INT, fd = fds * BITS_PER_INT; count && fd < maxfd; count--, fd++) {
 			unsigned int n = 0;
 
+			if (!fdtab[fd].owner) {
+				activity[tid].poll_dead++;
+				continue;
+			}
+
-			/* if we specify read first, the accepts and zero reads will be
-			 * seen first. Moreover, system buffers will be flushed faster.
-			 */
-			if (!fdtab[fd].owner || !(fdtab[fd].thread_mask & tid_bit))
+			if (!(fdtab[fd].thread_mask & tid_bit)) {
+				activity[tid].poll_skip++;
 				continue;
+			}
 
 			if (FD_ISSET(fd, tmp_evts[DIR_RD]))
 				n |= FD_POLL_IN;
diff --git a/src/fd.c b/src/fd.c
index 9fb09ab..148b4d2 100644
--- a/src/fd.c
+++ b/src/fd.c
@@ -243,10 +243,14 @@
 	for (entry = 0; entry < fd_cache_num; ) {
 		fd = fd_cache[entry];
 
-		if (!(fdtab[fd].thread_mask & tid_bit))
+		if (!(fdtab[fd].thread_mask & tid_bit)) {
+			activity[tid].fd_skip++;
 			goto next;
-		if (HA_SPIN_TRYLOCK(FD_LOCK, &fdtab[fd].lock))
+		}
+		if (HA_SPIN_TRYLOCK(FD_LOCK, &fdtab[fd].lock)) {
+			activity[tid].fd_lock++;
 			goto next;
+		}
 
 		HA_RWLOCK_RDUNLOCK(FDCACHE_LOCK, &fdcache_lock);
 
@@ -272,8 +276,10 @@
 		/* If the fd was removed from the cache, it has been
 		 * replaced by the next one that we don't want to skip !
 		 */
-		if (entry < fd_cache_num && fd_cache[entry] != fd)
+		if (entry < fd_cache_num && fd_cache[entry] != fd) {
+			activity[tid].fd_del++;
 			continue;
+		}
 	  next:
 		entry++;
 	}
diff --git a/src/haproxy.c b/src/haproxy.c
index a8d0fad..952733e 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -165,6 +165,8 @@
 	/* others NULL OK */
 };
 
+struct activity activity[MAX_THREADS] __attribute__((aligned(64))) = { };
+
 /*********************************************************************/
 
 int stopping;	/* non zero means stopping in progress */
@@ -2371,7 +2373,7 @@
 /* Runs the polling loop */
 static void run_poll_loop()
 {
-	int next;
+	int next, exp;
 
 	tv_update_date(0,1);
 	while (1) {
@@ -2389,18 +2391,27 @@
 			break;
 
 		/* expire immediately if events are pending */
-		if (fd_cache_num || (active_tasks_mask & tid_bit) || signal_queue_len || (active_applets_mask & tid_bit))
-			next = now_ms;
+		exp = now_ms;
+		if (fd_cache_num)
+			activity[tid].wake_cache++;
+		else if (active_tasks_mask & tid_bit)
+			activity[tid].wake_tasks++;
+		else if (active_applets_mask & tid_bit)
+			activity[tid].wake_applets++;
+		else if (signal_queue_len)
+			activity[tid].wake_signal++;
+		else
+			exp = next;
 
 		/* The poller will ensure it returns around <next> */
-		cur_poller.poll(&cur_poller, next);
+		cur_poller.poll(&cur_poller, exp);
 		fd_process_cached_events();
 		applet_run_active();
 
 
 		/* Synchronize all polling loops */
 		sync_poll_loop();
-
+		activity[tid].loops++;
 	}
 }
 
diff --git a/src/stats.c b/src/stats.c
index 0125928..61e0549 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -3578,6 +3578,8 @@
 	global.ssl_max = 0;
 	global.ssl_fe_keys_max = 0;
 	global.ssl_be_keys_max = 0;
+
+	memset(activity, 0, sizeof(activity));
 	return 1;
 }
 
diff --git a/src/stream.c b/src/stream.c
index 60d3eff..ebe41be 100644
--- a/src/stream.c
+++ b/src/stream.c
@@ -1627,6 +1627,8 @@
 	struct channel *req, *res;
 	struct stream_interface *si_f, *si_b;
 
+	activity[tid].stream++;
+
 	req = &s->req;
 	res = &s->res;
 
diff --git a/src/task.c b/src/task.c
index 053376c..fd9acf6 100644
--- a/src/task.c
+++ b/src/task.c
@@ -196,8 +196,10 @@
 	max_processed = 200;
 	if (unlikely(global.nbthread <= 1)) {
 		/* when no lock is needed, this loop is much faster */
-		if (!(active_tasks_mask & tid_bit))
+		if (!(active_tasks_mask & tid_bit)) {
+			activity[tid].empty_rq++;
 			return;
+		}
 
 		active_tasks_mask &= ~tid_bit;
 		rq_next = eb32sc_lookup_ge(&rqueue, rqueue_ticks - TIMER_LOOK_BACK, tid_bit);
@@ -245,6 +247,7 @@
 			max_processed--;
 			if (max_processed <= 0) {
 				active_tasks_mask |= tid_bit;
+				activity[tid].long_rq++;
 				break;
 			}
 		}
@@ -254,6 +257,7 @@
 	HA_SPIN_LOCK(TASK_RQ_LOCK, &rq_lock);
 	if (!(active_tasks_mask & tid_bit)) {
 		HA_SPIN_UNLOCK(TASK_RQ_LOCK, &rq_lock);
+		activity[tid].empty_rq++;
 		return;
 	}
 
@@ -335,6 +339,7 @@
 		HA_SPIN_LOCK(TASK_RQ_LOCK, &rq_lock);
 		if (max_processed <= 0) {
 			active_tasks_mask |= tid_bit;
+			activity[tid].long_rq++;
 			break;
 		}
 	}