blob: 18836ff440607bd5d6d64eb5d237809ea7118038 [file] [log] [blame]
Willy Tarreau2bfefdb2019-05-03 13:52:18 +02001/*
2 * Thread lockup detection
3 *
4 * Copyright 2000-2019 Willy Tarreau <willy@haproxy.org>.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <signal.h>
13#include <time.h>
14
Willy Tarreau4c7e4b72020-05-27 12:58:42 +020015#include <haproxy/api.h>
Willy Tarreau2a83d602020-05-27 16:58:08 +020016#include <haproxy/debug.h>
Willy Tarreau36979d92020-06-05 17:27:29 +020017#include <haproxy/errors.h>
Willy Tarreauf268ee82020-06-04 17:05:57 +020018#include <haproxy/global.h>
Willy Tarreau3f567e42020-05-28 15:29:19 +020019#include <haproxy/thread.h>
Willy Tarreau48fbcae2020-06-03 18:09:46 +020020#include <haproxy/tools.h>
Willy Tarreau2bfefdb2019-05-03 13:52:18 +020021
22
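/*
 * The watchdog relies on timer_create() and timer_settime(), which are only
 * available when the build enables USE_RT and the platform provides POSIX
 * timers together with per-thread CPU-time clocks (see the condition below).
 */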
Willy Tarreaue58114e2020-03-04 10:53:07 +010027#if defined(USE_RT) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
Willy Tarreau2bfefdb2019-05-03 13:52:18 +020028
Willy Tarreau2bfefdb2019-05-03 13:52:18 +020029/* Setup (or ping) the watchdog timer for thread <thr>. Returns non-zero on
30 * success, zero on failure. It interrupts once per second of CPU time. It
31 * happens that timers based on the CPU time are not automatically re-armed
32 * so we only use the value and leave the interval unset.
33 */
34int wdt_ping(int thr)
35{
36 struct itimerspec its;
37
38 its.it_value.tv_sec = 1; its.it_value.tv_nsec = 0;
39 its.it_interval.tv_sec = 0; its.it_interval.tv_nsec = 0;
David Carliera92c5ce2019-09-13 05:03:12 +010040 return timer_settime(ha_thread_info[thr].wd_timer, 0, &its, NULL) == 0;
Willy Tarreau2bfefdb2019-05-03 13:52:18 +020041}
42
43/* This is the WDTSIG signal handler */
44void wdt_handler(int sig, siginfo_t *si, void *arg)
45{
46 unsigned long long n, p;
47 int thr;
48
49 switch (si->si_code) {
50 case SI_TIMER:
51 /* A thread's timer fired, the thread ID is in si_int. We have
52 * no guarantee that the thread handling this signal is in any
53 * way related to the one triggering it, so we need to retrieve
54 * the thread number from there. Note: this thread might
55 * continue to execute in parallel.
56 */
Willy Tarreau02255b22019-05-23 08:36:29 +020057 thr = si->si_value.sival_int;
Willy Tarreau2bfefdb2019-05-03 13:52:18 +020058
59 /* cannot happen unless an unknown timer tries to play with our
60 * nerves. Let's die for now if this happens.
61 */
62 if (thr < 0 || thr >= global.nbthread)
63 break;
64
David Carliera92c5ce2019-09-13 05:03:12 +010065 p = ha_thread_info[thr].prev_cpu_time;
66 n = now_cpu_time_thread(&ha_thread_info[thr]);
Willy Tarreau2bfefdb2019-05-03 13:52:18 +020067
68 /* not yet reached the deadline of 1 sec */
69 if (n - p < 1000000000UL)
70 goto update_and_leave;
71
Willy Tarreaua37cb182019-07-31 19:20:39 +020072 if ((threads_harmless_mask|sleeping_thread_mask|threads_to_dump) & (1UL << thr)) {
Willy Tarreau2bfefdb2019-05-03 13:52:18 +020073 /* This thread is currently doing exactly nothing
74 * waiting in the poll loop (unlikely but possible),
75 * waiting for all other threads to join the rendez-vous
76 * point (common), or waiting for another thread to
77 * finish an isolated operation (unlikely but possible).
78 */
79 goto update_and_leave;
80 }
81
82 /* So the thread indeed appears locked up. In order to be
83 * certain that we're not witnessing an exceptional spike of
84 * CPU usage due to a configuration issue (like running tens
85 * of thousands of tasks in a single loop), we'll check if the
86 * scheduler is still alive by setting the TI_FL_STUCK flag
87 * that the scheduler clears when switching to the next task.
88 * If it's already set, then it's our second call with no
89 * progress and the thread is dead.
90 */
David Carliera92c5ce2019-09-13 05:03:12 +010091 if (!(ha_thread_info[thr].flags & TI_FL_STUCK)) {
92 _HA_ATOMIC_OR(&ha_thread_info[thr].flags, TI_FL_STUCK);
Willy Tarreau2bfefdb2019-05-03 13:52:18 +020093 goto update_and_leave;
94 }
95
96 /* No doubt now, there's no hop to recover, die loudly! */
97 break;
Willy Tarreau06278152020-03-10 09:26:17 +010098#ifdef USE_THREAD
Willy Tarreau2bfefdb2019-05-03 13:52:18 +020099 case SI_TKILL:
100 /* we got a pthread_kill, stop on it */
101 thr = tid;
102 break;
Willy Tarreau06278152020-03-10 09:26:17 +0100103#endif
Willy Tarreau2bfefdb2019-05-03 13:52:18 +0200104 default:
105 /* unhandled other conditions */
106 return;
107 }
108
109 /* By default we terminate. If we're not on the victim thread, better
110 * bounce the signal there so that we produce a cleaner stack trace
111 * with the other thread interrupted exactly where it was running and
112 * the current one not involved in this.
113 */
Willy Tarreaue58114e2020-03-04 10:53:07 +0100114#ifdef USE_THREAD
Willy Tarreau2bfefdb2019-05-03 13:52:18 +0200115 if (thr != tid)
David Carliera92c5ce2019-09-13 05:03:12 +0100116 pthread_kill(ha_thread_info[thr].pthread, sig);
Willy Tarreau2bfefdb2019-05-03 13:52:18 +0200117 else
Willy Tarreaue58114e2020-03-04 10:53:07 +0100118#endif
Willy Tarreau2bfefdb2019-05-03 13:52:18 +0200119 ha_panic();
120 return;
121
122 update_and_leave:
123 wdt_ping(thr);
124}
125
126int init_wdt_per_thread()
127{
Willy Tarreauc1563e52020-02-26 14:03:05 +0100128 struct sigevent sev = { };
Willy Tarreau2bfefdb2019-05-03 13:52:18 +0200129 sigset_t set;
130
131 /* unblock the WDTSIG signal we intend to use */
132 sigemptyset(&set);
133 sigaddset(&set, WDTSIG);
134 ha_sigmask(SIG_UNBLOCK, &set, NULL);
135
136 /* this timer will signal WDTSIG when it fires, with tid in the si_int
137 * field (important since any thread will receive the signal).
138 */
139 sev.sigev_notify = SIGEV_SIGNAL;
140 sev.sigev_signo = WDTSIG;
141 sev.sigev_value.sival_int = tid;
Willy Tarreaud6f19662020-03-04 10:48:18 +0100142 if (timer_create(ti->clock_id, &sev, &ti->wd_timer) == -1 &&
143 timer_create(CLOCK_REALTIME, &sev, &ti->wd_timer) == -1)
Willy Tarreau2bfefdb2019-05-03 13:52:18 +0200144 goto fail1;
145
146 if (!wdt_ping(tid))
147 goto fail2;
148
149 return 1;
150
151 fail2:
152 timer_delete(ti->wd_timer);
153 fail1:
154 ti->wd_timer = TIMER_INVALID;
155 ha_warning("Failed to setup watchdog timer for thread %u, disabling lockup detection.\n", tid);
Willy Tarreau7259fa22020-03-04 10:46:13 +0100156 return 1;
Willy Tarreau2bfefdb2019-05-03 13:52:18 +0200157}
158
159void deinit_wdt_per_thread()
160{
161 if (ti->wd_timer != TIMER_INVALID)
162 timer_delete(ti->wd_timer);
163}
164
165/* registers the watchdog signal handler and returns 0. This sets up the signal
166 * handler for WDTSIG, so it must be called once per process.
167 */
168int init_wdt()
169{
170 struct sigaction sa;
171
172 sa.sa_handler = NULL;
173 sa.sa_sigaction = wdt_handler;
174 sigemptyset(&sa.sa_mask);
175 sa.sa_flags = SA_SIGINFO;
176 sigaction(WDTSIG, &sa, NULL);
Christopher Fauletfc633b62020-11-06 15:24:23 +0100177 return ERR_NONE;
Willy Tarreau2bfefdb2019-05-03 13:52:18 +0200178}
179
180REGISTER_POST_CHECK(init_wdt);
181REGISTER_PER_THREAD_INIT(init_wdt_per_thread);
182REGISTER_PER_THREAD_DEINIT(deinit_wdt_per_thread);
183#endif