MEDIUM: debug: add support for dumping backtraces of stuck threads
When a panic occurs due to a stuck thread, we now try to dump a
backtrace of this thread if the build option USE_BACKTRACE is set
(which is the case on linux+glibc). For this we use the backtrace()
call provided by glibc and pass each returned pointer through
resolve_sym_name(). In order to minimize the output (which is limited
to one buffer), we only do this for stuck threads, we start the dump
just above ha_panic()/ha_thread_dump_all_to_trash(), and we stop when
meeting known points such as main/run_tasks_from_list/run_poll_loop.
If enabled without USE_DL, ha_panic()/ha_thread_dump_all_to_trash()
cannot be identified, so the dump starts immediately at the first
frame: it will be complete but without symbol details, except that
pointers will all be given relative to main, which is still better
than nothing.
The new USE_BACKTRACE build option is enabled by default on glibc since
backtrace() has been present there for ages. When it is set, the
--export-dynamic linker option (or $(EXPORT_SYMBOL) when it is defined)
is passed so that all non-static symbols are properly resolved.
diff --git a/Makefile b/Makefile
index 1e6b384..2d7e0e9 100644
--- a/Makefile
+++ b/Makefile
@@ -49,6 +49,7 @@
# USE_NS : enable network namespace support. Supported on Linux >= 2.6.24.
# USE_DL : enable it if your system requires -ldl. Automatic on Linux.
# USE_RT : enable it if your system requires -lrt. Automatic on Linux.
+# USE_BACKTRACE : enable backtrace(). Automatic on Linux with glibc.
# USE_DEVICEATLAS : enable DeviceAtlas api.
# USE_51DEGREES : enable third party device detection library from 51Degrees
# USE_WURFL : enable WURFL detection library from Scientiamobile
@@ -286,7 +287,7 @@
# the reported build options.
use_opts = USE_EPOLL USE_KQUEUE USE_MY_EPOLL USE_MY_SPLICE USE_NETFILTER \
USE_PCRE USE_PCRE_JIT USE_PCRE2 USE_PCRE2_JIT USE_POLL \
- USE_PRIVATE_CACHE USE_THREAD USE_PTHREAD_PSHARED \
+ USE_PRIVATE_CACHE USE_THREAD USE_PTHREAD_PSHARED USE_BACKTRACE \
USE_STATIC_PCRE USE_STATIC_PCRE2 USE_TPROXY USE_LINUX_TPROXY \
USE_LINUX_SPLICE USE_LIBCRYPT USE_CRYPT_H USE_VSYSCALL \
USE_GETADDRINFO USE_OPENSSL USE_LUA USE_FUTEX USE_ACCEPT4 \
@@ -326,7 +327,7 @@
USE_POLL USE_TPROXY USE_LIBCRYPT USE_DL USE_RT USE_CRYPT_H USE_NETFILTER \
USE_CPU_AFFINITY USE_THREAD USE_EPOLL USE_FUTEX USE_LINUX_TPROXY \
USE_ACCEPT4 USE_LINUX_SPLICE USE_PRCTL USE_THREAD_DUMP USE_NS USE_TFO \
- USE_GETADDRINFO)
+ USE_GETADDRINFO USE_BACKTRACE)
endif
# For linux >= 2.6.28, glibc without new features
@@ -515,6 +516,10 @@
OPTIONS_LDFLAGS += -lrt
endif
+ifneq ($(USE_BACKTRACE),)
+OPTIONS_LDFLAGS += -Wl,$(if $(EXPORT_SYMBOL),$(EXPORT_SYMBOL),--export-dynamic)
+endif
+
ifneq ($(USE_OPENSSL),)
# OpenSSL is packaged in various forms and with various dependencies.
# In general -lssl is enough, but on some platforms, -lcrypto may be needed,
diff --git a/src/debug.c b/src/debug.c
index 4b7f65e..6b41fc3 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -10,6 +10,12 @@
*
*/
+
+#ifdef USE_BACKTRACE
+#define _GNU_SOURCE
+#include <execinfo.h>
+#endif
+
#include <fcntl.h>
#include <signal.h>
#include <time.h>
@@ -87,6 +93,69 @@
chunk_appendf(buf, " curr_task=");
ha_task_dump(buf, sched->current, " ");
+
+#ifdef USE_BACKTRACE
+ if (stuck) {
+ /* We only emit the backtrace for stuck threads in order not to
+ * waste precious output buffer space with non-interesting data.
+ */
+ struct buffer bak;
+ void *callers[100];
+ int j, nptrs;
+ void *addr;
+ int dump = 0;
+
+ nptrs = backtrace(callers, sizeof(callers)/sizeof(*callers));
+
+ /* The call backtrace_symbols_fd(callers, nptrs, STDOUT_FILENO)
+ would produce similar output to the following: */
+
+ if (nptrs)
+ chunk_appendf(buf, " call trace:\n");
+
+#ifndef USE_DL
+ /* if we can't rely on dladdr1() we won't figure what level is
+ * in ha_panic() or ha_thread_dump_all_to_trash(), so we want
+ * to immediately start the dump.
+ */
+ dump = 2;
+#endif
+ for (j = 0; j < nptrs; j++) {
+ bak = *buf;
+ dump_addr_and_bytes(buf, " | ", callers[j], 8);
+ addr = resolve_sym_name(buf, ": ", callers[j]);
+ if (dump == 0) {
+ /* dump not started, will start *after*
+ * ha_thread_dump_all_to_trash and ha_panic
+ */
+ if (addr == ha_thread_dump_all_to_trash || addr == ha_panic)
+ dump = 1;
+ *buf = bak;
+ continue;
+ }
+
+ if (dump == 1) {
+ /* starting */
+ if (addr == ha_thread_dump_all_to_trash || addr == ha_panic) {
+ *buf = bak;
+ continue;
+ }
+ dump = 2;
+ }
+
+ if (dump == 2) {
+ /* dumping */
+ if (addr == run_poll_loop || addr == main || addr == run_tasks_from_list) {
+ dump = 3;
+ *buf = bak;
+ break;
+ }
+ }
+ /* OK, line dumped */
+ chunk_appendf(buf, "\n");
+ }
+ }
+#endif
}