MINOR: threads: add a "stuck" flag to the thread_info struct
This flag is constantly cleared by the scheduler and will be set by the
watchdog timer to detect stuck threads. It is also set by the "show
threads" command so that it is easy to spot if the situation has evolved
between two subsequent calls : if the first "show threads" shows no stuck
thread and the second one shows such a stuck thread, it indicates that
this thread didn't manage to make any forward progress since the previous
call, which is extremely suspicious.
diff --git a/doc/management.txt b/doc/management.txt
index 9baea96..09ae26e 100644
--- a/doc/management.txt
+++ b/doc/management.txt
@@ -2525,11 +2525,18 @@
an advanced dump mechanism involving thread signals is used so that each
thread can dump its own state in turn. Without this option, the thread
processing the command shows all its details but the other ones are less
- detailed. A stat ('*') is displayed in front of the thread handling the
- command. The output format is purposely not documented so that it can easily
- evolve as new needs are identified, without having to maintain any backwards
- compatibility, and just like with "show activity", the values are only
- meaningful with the code at hand.
+ detailed. A star ('*') is displayed in front of the thread handling the
+ command. A right angle bracket ('>') may also be displayed in front of
+ threads which didn't make any progress since last invocation of this command,
+ indicating a bug in the code which must absolutely be reported. When this
+ happens between two threads it usually indicates a deadlock. If a thread is
+ alone, it's a different bug like a corrupted list. In all cases the process
+ needs is not fully functional anymore and needs to be restarted.
+
+ The output format is purposely not documented so that it can easily evolve as
+ new needs are identified, without having to maintain any form of backwards
+ compatibility, and just like with "show activity", the values are meaningless
+ without the code at hand.
show tls-keys [id|*]
Dump all loaded TLS ticket keys references. The TLS ticket key reference ID
diff --git a/include/common/hathreads.h b/include/common/hathreads.h
index 77bd617..3209909 100644
--- a/include/common/hathreads.h
+++ b/include/common/hathreads.h
@@ -38,6 +38,10 @@
* only one thread is enabled, it equals 1.
*/
+/* thread info flags, for thread_info[].flags */
+#define TI_FL_STUCK 0x00000001
+
+
#ifndef USE_THREAD
#define MAX_THREADS 1
@@ -57,6 +61,7 @@
uint64_t prev_cpu_time; /* previous per thread CPU time */
uint64_t prev_mono_time; /* previous system wide monotonic time */
unsigned int idle_pct; /* idle to total ratio over last sample (percent) */
+ unsigned int flags; /* thread info flags, TI_FL_* */
/* pad to cache line (64B) */
char __pad[0]; /* unused except to check remaining room */
char __end[0] __attribute__((aligned(64)));
@@ -405,6 +410,7 @@
uint64_t prev_cpu_time; /* previous per thread CPU time */
uint64_t prev_mono_time; /* previous system wide monotonic time */
unsigned int idle_pct; /* idle to total ratio over last sample (percent) */
+ unsigned int flags; /* thread info flags, TI_FL_* */
/* pad to cache line (64B) */
char __pad[0]; /* unused except to check remaining room */
char __end[0] __attribute__((aligned(64)));
diff --git a/src/debug.c b/src/debug.c
index e47b85c..b34df72 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -33,18 +33,20 @@
* optionally extra info for the current thread. The dump will be appended to
* the buffer, so the caller is responsible for preliminary initializing it.
* The calling thread ID needs to be passed in <calling_tid> to display a star
- * in front of the calling thread's line (usually it's tid).
+ * in front of the calling thread's line (usually it's tid). Any stuck thread
+ * is also prefixed with a '>'.
*/
void ha_thread_dump(struct buffer *buf, int thr, int calling_tid)
{
unsigned long thr_bit = 1UL << thr;
unsigned long long p = thread_info[thr].prev_cpu_time;
unsigned long long n = now_cpu_time_thread(&thread_info[thr]);
+ int stuck = !!(thread_info[thr].flags & TI_FL_STUCK);
chunk_appendf(buf,
- "%c Thread %-2u: act=%d glob=%d wq=%d rq=%d tl=%d tlsz=%d rqsz=%d\n"
- " fdcache=%d prof=%d",
- (thr == calling_tid) ? '*' : ' ', thr + 1,
+ "%c%cThread %-2u: act=%d glob=%d wq=%d rq=%d tl=%d tlsz=%d rqsz=%d\n"
+ " stuck=%d fdcache=%d prof=%d",
+ (thr == calling_tid) ? '*' : ' ', stuck ? '>' : ' ', thr + 1,
!!(active_tasks_mask & thr_bit),
!!(global_tasks_mask & thr_bit),
!eb_is_empty(&task_per_thread[thr].timers),
@@ -52,6 +54,7 @@
!LIST_ISEMPTY(&task_per_thread[thr].task_list),
task_per_thread[thr].task_list_size,
task_per_thread[thr].rqueue_size,
+ stuck,
!!(fd_cache_mask & thr_bit),
!!(task_profiling_mask & thr_bit));
@@ -467,6 +470,12 @@
else
ha_thread_relax();
}
+
+ /* mark the current thread as stuck to detect it upon next invocation
+ * if it didn't move.
+ */
+ if (!((threads_harmless_mask|sleeping_thread_mask) & tid_bit))
+ ti->flags |= TI_FL_STUCK;
}
static int init_debug_per_thread()
diff --git a/src/task.c b/src/task.c
index afdd2b8..5cf7d74 100644
--- a/src/task.c
+++ b/src/task.c
@@ -281,6 +281,8 @@
struct task *t;
int max_processed;
+ ti->flags &= ~TI_FL_STUCK; // this thread is still running
+
if (!(active_tasks_mask & tid_bit)) {
activity[tid].empty_rq++;
return;
@@ -372,6 +374,7 @@
__ha_barrier_atomic_store();
__task_remove_from_tasklet_list(t);
+ ti->flags &= ~TI_FL_STUCK; // this thread is still running
activity[tid].ctxsw++;
ctx = t->context;
process = t->process;