MEDIUM: debug/threads: implement an advanced thread dump system

The current "show threads" command is too limited, as it cannot dump
other threads' detailed states (e.g. their tasks). This patch goes
further by using thread signals so that each thread can dump its own
state in turn into a shared buffer provided by the caller. Threads are
synchronized using a mechanism very similar to the rendez-vous point.
With this method, each thread can safely dump any of its contents, and
the caller can finally report the aggregated output from the buffer.

It is important to keep in mind that the list of signal-safe functions
is limited, so we take care of only using chunk_printf() to write to a
pre-allocated buffer.

This mechanism is controlled by the USE_THREAD_DUMP build option, which
is enabled by default on Linux 2.6.28+. On other platforms the code falls
back to the previous solution, where the calling thread loops over all
threads itself and produces a less precise dump.
diff --git a/Makefile b/Makefile
index 258e5a1..ddf29f0 100644
--- a/Makefile
+++ b/Makefile
@@ -50,6 +50,7 @@
 #   USE_WURFL            : enable WURFL detection library from Scientiamobile
 #   USE_SYSTEMD          : enable sd_notify() support.
 #   USE_OBSOLETE_LINKER  : use when the linker fails to emit __start_init/__stop_init
+#   USE_THREAD_DUMP      : use the more advanced thread state dump system. Automatic.
 #
 # Options can be forced by specifying "USE_xxx=1" or can be disabled by using
 # "USE_xxx=" (empty string).
@@ -284,7 +285,7 @@
            USE_GETADDRINFO USE_OPENSSL USE_LUA USE_FUTEX USE_ACCEPT4          \
            USE_MY_ACCEPT4 USE_ZLIB USE_SLZ USE_CPU_AFFINITY USE_TFO USE_NS    \
            USE_DL USE_RT USE_DEVICEATLAS USE_51DEGREES USE_WURFL USE_SYSTEMD  \
-           USE_OBSOLETE_LINKER USE_PRCTL
+           USE_OBSOLETE_LINKER USE_PRCTL USE_THREAD_DUMP
 
 #### Target system options
 # Depending on the target platform, some options are set, as well as some
@@ -343,7 +344,7 @@
   set_target_defaults = $(call default_opts, \
     USE_POLL USE_TPROXY USE_LIBCRYPT USE_DL USE_RT USE_CRYPT_H USE_NETFILTER  \
     USE_CPU_AFFINITY USE_THREAD USE_EPOLL USE_FUTEX USE_LINUX_TPROXY          \
-    USE_ACCEPT4 USE_LINUX_SPLICE USE_PRCTL ASSUME_SPLICE_WORKS)
+    USE_ACCEPT4 USE_LINUX_SPLICE USE_PRCTL ASSUME_SPLICE_WORKS USE_THREAD_DUMP)
 endif
 
 # Solaris 8 and above
diff --git a/doc/management.txt b/doc/management.txt
index ecaf729..06c4287 100644
--- a/doc/management.txt
+++ b/doc/management.txt
@@ -2515,9 +2515,13 @@
 show threads
   Dumps some internal states and structures for each thread, that may be useful
   to help developers understand a problem. The output tries to be readable by
-  showing one block per thread, with a bit more info for the current thread.
-  The output format is purposely not documented so that it can easily evolve
-  as new needs are identified, without having to maintain any backwards
+  showing one block per thread. When haproxy is built with USE_THREAD_DUMP=1,
+  an advanced dump mechanism involving thread signals is used so that each
+  thread can dump its own state in turn. Without this option, the thread
+  processing the command shows all its details but the other ones are less
+  detailed. A star ('*') is displayed in front of the thread handling the
+  command. The output format is purposely not documented so that it can easily
+  evolve as new needs are identified, without having to maintain any backwards
   compatibility, and just like with "show activity", the values are only
   meaningful with the code at hand.
 
diff --git a/include/common/debug.h b/include/common/debug.h
index 3fb96c5..4f3baed 100644
--- a/include/common/debug.h
+++ b/include/common/debug.h
@@ -86,6 +86,7 @@
 struct buffer;
 void ha_task_dump(struct buffer *buf, const struct task *task, const char *pfx);
 void ha_thread_dump(struct buffer *buf, int thr, int calling_tid);
+void ha_thread_dump_all_to_trash();
 
 /* This one is useful to automatically apply poisonning on an area returned
  * by malloc(). Only "p_" is required to make it work, and to define a poison
diff --git a/src/debug.c b/src/debug.c
index 2829851..51ded9e 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -125,10 +125,7 @@
 		thr = 0;
 
 	chunk_reset(&trash);
-	while (thr < global.nbthread) {
-		ha_thread_dump(&trash, thr, tid);
-		thr++;
-	}
+	ha_thread_dump_all_to_trash();
 
 	if (ci_putchk(si_ic(si), &trash) == -1) {
 		/* failed, try again */
@@ -139,6 +136,128 @@
 	return 1;
 }
 
+#ifndef USE_THREAD_DUMP
+
+/* This function dumps all threads' state to the trash. This is the most basic
+ * version, built when USE_THREAD_DUMP is not set: the calling thread itself
+ * calls ha_thread_dump() for every thread, without involving the others. */
+void ha_thread_dump_all_to_trash()
+{
+	unsigned int thr;
+
+	for (thr = 0; thr < global.nbthread; thr++)
+		ha_thread_dump(&trash, thr, tid); /* <tid> is passed as the calling thread */
+}
+
+#else /* below USE_THREAD_DUMP is set */
+
+/* The signal to trigger a debug dump on a thread is SIGPWR */
+#define DEBUGSIG SIGPWR
+
+/* mask of threads still having to dump; lower bits dump first (debug_handler) */
+static volatile unsigned long threads_to_dump;
+
+/* ID of the thread requesting the dump, passed as ha_thread_dump()'s calling_tid */
+static unsigned int thread_dump_tid;
+
+/* points to the buffer where the dump functions should write. It must
+ * have already been initialized by the requester. Nothing is done if
+ * it's NULL.
+ */
+struct buffer *thread_dump_buffer = NULL;
+
+void ha_thread_dump_all_to_trash()
+{
+	__maybe_unused unsigned int thr; /* only used in the USE_THREAD loop below */
+	unsigned long old;
+
+	while (1) { /* USE_THREAD_DUMP version: first claim the dump by moving the mask from 0 to all threads */
+		old = 0;
+		if (HA_ATOMIC_CAS(&threads_to_dump, &old, all_threads_mask))
+			break;
+		ha_thread_relax();
+	}
+
+	thread_dump_buffer = &trash; /* debug_handler() reads these two on each thread */
+	thread_dump_tid = tid;
+
+#ifdef USE_THREAD
+	for (thr = 0; thr < global.nbthread; thr++) {
+		if (thr != tid)
+			pthread_kill(threads[thr], DEBUGSIG);
+	}
+#endif
+	/* signal ourselves last; debug_handler() still dumps in thread-bit order */
+	raise(DEBUGSIG);
+}
+
+/* handles DEBUGSIG: the receiving thread dumps its own state; args are unused */
+void debug_handler(int sig, siginfo_t *si, void *arg)
+{
+	/* There are 4 phases in the dump process:
+	 *   1- wait for our turn, i.e. when all lower bits are gone.
+	 *   2- perform the action if our bit is set
+	 *   3- remove our bit to let the next one go, unless we're
+	 *      the last one and have to put them all but ours
+	 *   4- wait for zero and clear our bit if it's set
+	 */
+
+	/* phase 1: wait until every thread with a lower bit than ours has cleared it */
+	while (threads_to_dump & (tid_bit - 1))
+		ha_thread_relax();
+
+	/* phases 2 and 3: dump if our bit is still set, then hand over */
+	if (threads_to_dump & tid_bit) {
+		if (thread_dump_buffer)
+			ha_thread_dump(thread_dump_buffer, tid, thread_dump_tid);
+		if ((threads_to_dump & all_threads_mask) == tid_bit) {
+			/* last one: set all bits but ours again to broadcast completion */
+			HA_ATOMIC_STORE(&threads_to_dump, all_threads_mask & ~tid_bit);
+			thread_dump_buffer = NULL; /* NOTE(review): runs after the store that releases waiters; confirm a new dump cannot set the buffer in between */
+		}
+		else
+			HA_ATOMIC_AND(&threads_to_dump, ~tid_bit);
+	}
+
+	/* phase 4: now wait for all others to finish dumping. The last one will
+	 * set all bits again to broadcast the leaving condition; each thread then
+	 * clears its own bit and leaves once the mask is back to zero. */
+	while (threads_to_dump & all_threads_mask) {
+		if (threads_to_dump & tid_bit)
+			HA_ATOMIC_AND(&threads_to_dump, ~tid_bit);
+		else
+			ha_thread_relax();
+	}
+}
+
+static int init_debug_per_thread()
+{
+	sigset_t set;
+
+	/* unblock the DEBUGSIG signal we intend to use; registered as a per-thread init */
+	sigemptyset(&set);
+	sigaddset(&set, DEBUGSIG);
+	ha_sigmask(SIG_UNBLOCK, &set, NULL);
+	return 1; /* NOTE(review): presumably non-zero means success for per-thread inits; confirm */
+}
+
+static int init_debug()
+{
+	struct sigaction sa;
+
+	sa.sa_handler = NULL;
+	sa.sa_sigaction = debug_handler; /* three-argument handler, selected by SA_SIGINFO below */
+	sigemptyset(&sa.sa_mask); /* no additional signals are added to the handler's mask */
+	sa.sa_flags = SA_SIGINFO;
+	sigaction(DEBUGSIG, &sa, NULL); /* installs debug_handler() for DEBUGSIG; result is not checked */
+	return 0; /* NOTE(review): presumably 0 means no error for post-check callbacks; confirm */
+}
+
+REGISTER_POST_CHECK(init_debug);
+REGISTER_PER_THREAD_INIT(init_debug_per_thread);
+
+#endif /* USE_THREAD_DUMP */
+
 /* register cli keywords */
 static struct cli_kw_list cli_kws = {{ },{
 	{ { "show", "threads", NULL },    "show threads   : show some threads debugging information",   NULL, cli_io_handler_show_threads, NULL },