blob: ceecf9da57fb6328ca7156727dd268351a6f86ce [file] [log] [blame]
/*
 * FD polling functions for SunOS event ports.
 *
 * Copyright 2018 Joyent, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
11
12#include <unistd.h>
13#include <sys/time.h>
14#include <sys/types.h>
15
16#include <poll.h>
17#include <port.h>
18#include <errno.h>
19#include <syslog.h>
20
Willy Tarreaub2551052020-06-09 09:07:15 +020021#include <haproxy/activity.h>
Willy Tarreau4c7e4b72020-05-27 12:58:42 +020022#include <haproxy/api.h>
Willy Tarreau55542642021-10-08 09:33:24 +020023#include <haproxy/clock.h>
Willy Tarreaub2551052020-06-09 09:07:15 +020024#include <haproxy/fd.h>
25#include <haproxy/global.h>
Willy Tarreau3727a8a2020-06-04 17:37:26 +020026#include <haproxy/signal.h>
Willy Tarreau6dfab112021-09-30 17:53:22 +020027#include <haproxy/task.h>
Willy Tarreauc2f7c582020-06-02 18:15:32 +020028#include <haproxy/ticks.h>
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000029
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000030/*
31 * Private data:
32 */
33static int evports_fd[MAX_THREADS]; // per-thread evports_fd
34static THREAD_LOCAL port_event_t *evports_evlist = NULL;
35static THREAD_LOCAL int evports_evlist_max = 0;
36
37/*
38 * Convert the "state" member of "fdtab" into an event ports event mask.
39 */
40static inline int evports_state_to_events(int state)
41{
42 int events = 0;
43
Willy Tarreau5bee3e22019-09-04 09:52:57 +020044 if (state & FD_EV_ACTIVE_W)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000045 events |= POLLOUT;
Willy Tarreau5bee3e22019-09-04 09:52:57 +020046 if (state & FD_EV_ACTIVE_R)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000047 events |= POLLIN;
48
49 return (events);
50}
51
52/*
53 * Associate or dissociate this file descriptor with the event port, using the
54 * specified event mask.
55 */
56static inline void evports_resync_fd(int fd, int events)
57{
58 if (events == 0)
59 port_dissociate(evports_fd[tid], PORT_SOURCE_FD, fd);
60 else
61 port_associate(evports_fd[tid], PORT_SOURCE_FD, fd, events, NULL);
62}
63
64static void _update_fd(int fd)
65{
66 int en;
67 int events;
Willy Tarreau63022122022-07-06 10:37:31 +020068 ulong pr, ps;
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000069
70 en = fdtab[fd].state;
Willy Tarreau63022122022-07-06 10:37:31 +020071 pr = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_recv);
72 ps = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_send);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000073
Willy Tarreau3638d172022-07-07 08:23:03 +020074 if (!(fdtab[fd].thread_mask & ti->ltid_bit) || !(en & FD_EV_ACTIVE_RW)) {
Willy Tarreau63022122022-07-06 10:37:31 +020075 if (!((pr | ps) & ti->ltid_bit)) {
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000076 /* fd was not watched, it's still not */
77 return;
78 }
79 /* fd totally removed from poll list */
80 events = 0;
Willy Tarreau63022122022-07-06 10:37:31 +020081 if (pr & ti->ltid_bit)
82 _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~ti->ltid_bit);
83 if (ps & ti->ltid_bit)
84 _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~ti->ltid_bit);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000085 }
86 else {
87 /* OK fd has to be monitored, it was either added or changed */
88 events = evports_state_to_events(en);
Willy Tarreau5bee3e22019-09-04 09:52:57 +020089 if (en & FD_EV_ACTIVE_R) {
Willy Tarreau63022122022-07-06 10:37:31 +020090 if (!(pr & ti->ltid_bit))
91 _HA_ATOMIC_OR(&polled_mask[fd].poll_recv, ti->ltid_bit);
Olivier Houchard53055052019-07-25 14:00:18 +000092 } else {
Willy Tarreau63022122022-07-06 10:37:31 +020093 if (pr & ti->ltid_bit)
94 _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~ti->ltid_bit);
Olivier Houchard53055052019-07-25 14:00:18 +000095 }
Willy Tarreau5bee3e22019-09-04 09:52:57 +020096 if (en & FD_EV_ACTIVE_W) {
Willy Tarreau63022122022-07-06 10:37:31 +020097 if (!(ps & ti->ltid_bit))
98 _HA_ATOMIC_OR(&polled_mask[fd].poll_send, ti->ltid_bit);
Olivier Houchard53055052019-07-25 14:00:18 +000099 } else {
Willy Tarreau63022122022-07-06 10:37:31 +0200100 if (ps & ti->ltid_bit)
101 _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~ti->ltid_bit);
Olivier Houchard53055052019-07-25 14:00:18 +0000102 }
103
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000104 }
105 evports_resync_fd(fd, events);
106}
107
108/*
109 * Event Ports poller. This routine interacts with the file descriptor
110 * management data structures and routines; see the large block comment in
111 * "src/fd.c" for more information.
112 */
113
Willy Tarreau03e78532020-02-25 07:38:05 +0100114static void _do_poll(struct poller *p, int exp, int wake)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000115{
116 int i;
117 int wait_time;
118 struct timespec timeout_ts;
119 unsigned int nevlist;
120 int fd, old_fd;
121 int status;
122
123 /*
124 * Scan the list of file descriptors with an updated status:
125 */
126 for (i = 0; i < fd_nbupdt; i++) {
127 fd = fd_updt[i];
128
Willy Tarreau1f947cb2022-07-09 23:55:43 +0200129 if (!fd_grab_tgid(fd, tgid)) {
130 /* was reassigned */
Willy Tarreaue4063862020-06-17 20:35:33 +0200131 activity[tid].poll_drop_fd++;
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000132 continue;
133 }
134
Willy Tarreau1f947cb2022-07-09 23:55:43 +0200135 _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~ti->ltid_bit);
136
137 if (fdtab[fd].owner)
138 _update_fd(fd);
139 else
140 activity[tid].poll_drop_fd++;
141
142 fd_drop_tgid(fd);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000143 }
144 fd_nbupdt = 0;
Willy Tarreau1f947cb2022-07-09 23:55:43 +0200145
146 /* Scan the shared update list */
Willy Tarreau35ee7102022-07-08 11:33:43 +0200147 for (old_fd = fd = update_list[tgid - 1].first; fd != -1; fd = fdtab[fd].update.next) {
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000148 if (fd == -2) {
149 fd = old_fd;
150 continue;
151 }
152 else if (fd <= -3)
153 fd = -fd -4;
154 if (fd == -1)
155 break;
Willy Tarreau1f947cb2022-07-09 23:55:43 +0200156
157 if (!fd_grab_tgid(fd, tgid)) {
158 /* was reassigned */
159 activity[tid].poll_drop_fd++;
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000160 continue;
Willy Tarreau1f947cb2022-07-09 23:55:43 +0200161 }
162
Willy Tarreau69834262022-07-25 15:39:21 +0200163 if (!(fdtab[fd].update_mask & ti->ltid_bit)) {
164 fd_drop_tgid(fd);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000165 continue;
Willy Tarreau69834262022-07-25 15:39:21 +0200166 }
Willy Tarreau1f947cb2022-07-09 23:55:43 +0200167
168 done_update_polling(fd);
169
170 if (fdtab[fd].owner)
171 _update_fd(fd);
172 else
173 activity[tid].poll_drop_fd++;
174
175 fd_drop_tgid(fd);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000176 }
177
Willy Tarreau88d1c5d2021-08-04 11:44:17 +0200178 thread_idle_now();
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000179 thread_harmless_now();
180
Matthias Wirtheea152e2022-09-09 10:21:00 +0200181 /* Now let's wait for polled events. */
182 wait_time = wake ? 0 : compute_poll_timeout(exp);
Willy Tarreauf9d5e102021-10-08 10:43:59 +0200183 clock_entering_poll();
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000184
185 do {
186 int timeout = (global.tune.options & GTUNE_BUSY_POLLING) ? 0 : wait_time;
187 int interrupted = 0;
188 nevlist = 1; /* desired number of events to be retrieved */
189 timeout_ts.tv_sec = (timeout / 1000);
190 timeout_ts.tv_nsec = (timeout % 1000) * 1000000;
191
192 status = port_getn(evports_fd[tid],
193 evports_evlist,
194 evports_evlist_max,
195 &nevlist, /* updated to the number of events retrieved */
196 &timeout_ts);
Willy Tarreauffdceda2024-04-17 16:25:20 +0200197
198 /* Be careful, nevlist here is always updated by the syscall
199 * even on status == -1, so it must always be respected
200 * otherwise events are lost. Awkward API BTW, I wonder how
201 * they thought ENOSYS ought to be handled... -WT
202 */
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000203 if (status != 0) {
204 int e = errno;
205 switch (e) {
206 case ETIME:
207 /*
208 * Though the manual page has not historically made it
209 * clear, port_getn() can return -1 with an errno of
210 * ETIME and still have returned some number of events.
211 */
212 /* nevlist >= 0 */
213 break;
214 default:
Willy Tarreauffdceda2024-04-17 16:25:20 +0200215 /* signal or anything else */
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000216 interrupted = 1;
217 break;
218 }
219 }
Willy Tarreau8e973262024-09-12 17:47:13 +0200220 clock_update_local_date(wait_time, (global.tune.options & GTUNE_BUSY_POLLING) ? 1 : nevlist);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000221
222 if (nevlist || interrupted)
223 break;
224 if (timeout || !wait_time)
225 break;
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000226 if (tick_isset(exp) && tick_is_expired(exp, now_ms))
227 break;
228 } while(1);
229
Willy Tarreau58b73f92022-09-21 08:11:38 +0200230 clock_update_global_date();
Willy Tarreau058b2c12022-06-22 15:21:34 +0200231 fd_leaving_poll(wait_time, nevlist);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000232
Willy Tarreaue5451532020-06-17 20:25:18 +0200233 if (nevlist > 0)
234 activity[tid].poll_io++;
235
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000236 for (i = 0; i < nevlist; i++) {
237 unsigned int n = 0;
238 int events, rebind_events;
Willy Tarreau200bd502021-07-29 16:57:19 +0200239 int ret;
240
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000241 fd = evports_evlist[i].portev_object;
242 events = evports_evlist[i].portev_events;
243
Willy Tarreau38e8a1c2020-06-23 10:04:54 +0200244#ifdef DEBUG_FD
Willy Tarreau4781b152021-04-06 13:53:36 +0200245 _HA_ATOMIC_INC(&fdtab[fd].event_count);
Willy Tarreau38e8a1c2020-06-23 10:04:54 +0200246#endif
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000247 /*
248 * By virtue of receiving an event for this file descriptor, it
249 * is no longer associated with the port in question. Store
250 * the previous event mask so that we may reassociate after
251 * processing is complete.
252 */
253 rebind_events = evports_state_to_events(fdtab[fd].state);
254 /* rebind_events != 0 */
255
256 /*
257 * Set bits based on the events we received from the port:
258 */
Emmanuel Hocdet7ceb96b2019-09-19 11:08:26 +0000259 n = ((events & POLLIN) ? FD_EV_READY_R : 0) |
260 ((events & POLLOUT) ? FD_EV_READY_W : 0) |
261 ((events & POLLHUP) ? FD_EV_SHUT_RW : 0) |
262 ((events & POLLERR) ? FD_EV_ERR_RW : 0);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000263
264 /*
265 * Call connection processing callbacks. Note that it's
266 * possible for this processing to alter the required event
Ilya Shipitsince7b00f2020-03-23 22:28:40 +0500267 * port association; i.e., the "state" member of the "fdtab"
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000268 * entry. If it changes, the fd will be placed on the updated
269 * list for processing the next time we are called.
270 */
Willy Tarreau200bd502021-07-29 16:57:19 +0200271 ret = fd_update_events(fd, n);
272
Willy Tarreaub1093c62022-07-09 18:55:37 +0200273 /* polling will be on this instance if the FD was migrated */
274 if (ret == FD_UPDT_MIGRATED)
Willy Tarreau200bd502021-07-29 16:57:19 +0200275 continue;
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000276
277 /*
278 * This file descriptor was closed during the processing of
279 * polled events. No need to reassociate.
280 */
Willy Tarreau200bd502021-07-29 16:57:19 +0200281 if (ret == FD_UPDT_CLOSED)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000282 continue;
283
284 /*
285 * Reassociate with the port, using the same event mask as
286 * before. This call will not result in a dissociation as we
287 * asserted that _some_ events needed to be rebound above.
288 *
289 * Reassociating with the same mask allows us to mimic the
290 * level-triggered behaviour of poll(2). In the event that we
291 * are interested in the same events on the next turn of the
292 * loop, this represents no extra work.
293 *
294 * If this additional port_associate(3C) call becomes a
295 * performance problem, we would need to verify that we can
296 * correctly interact with the file descriptor cache and update
297 * list (see "src/fd.c") to avoid reassociating here, or to use
298 * a different events mask.
299 */
300 evports_resync_fd(fd, rebind_events);
301 }
302}
303
304static int init_evports_per_thread()
305{
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000306 evports_evlist_max = global.tune.maxpollevents;
Tim Duesterhus16cc16d2021-11-06 15:14:45 +0100307 evports_evlist = calloc(evports_evlist_max, sizeof(*evports_evlist));
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000308 if (evports_evlist == NULL) {
309 goto fail_alloc;
310 }
311
312 if (MAX_THREADS > 1 && tid) {
313 if ((evports_fd[tid] = port_create()) == -1) {
314 goto fail_fd;
315 }
316 }
317
318 /* we may have to unregister some events initially registered on the
319 * original fd when it was alone, and/or to register events on the new
320 * fd for this thread. Let's just mark them as updated, the poller will
321 * do the rest.
322 */
Willy Tarreaud95f18f2022-07-09 23:23:50 +0200323 fd_reregister_all(tgid, ti->ltid_bit);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000324
325 return 1;
326
327 fail_fd:
Willy Tarreau61cfdf42021-02-20 10:46:51 +0100328 ha_free(&evports_evlist);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000329 evports_evlist_max = 0;
330 fail_alloc:
331 return 0;
332}
333
334static void deinit_evports_per_thread()
335{
336 if (MAX_THREADS > 1 && tid)
337 close(evports_fd[tid]);
338
Willy Tarreau61cfdf42021-02-20 10:46:51 +0100339 ha_free(&evports_evlist);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000340 evports_evlist_max = 0;
341}
342
343/*
344 * Initialisation of the event ports poller.
345 * Returns 0 in case of failure, non-zero in case of success.
346 */
Willy Tarreau03e78532020-02-25 07:38:05 +0100347static int _do_init(struct poller *p)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000348{
349 p->private = NULL;
350
351 if ((evports_fd[tid] = port_create()) == -1) {
352 goto fail;
353 }
354
355 hap_register_per_thread_init(init_evports_per_thread);
356 hap_register_per_thread_deinit(deinit_evports_per_thread);
357
358 return 1;
359
360fail:
361 p->pref = 0;
362 return 0;
363}
364
365/*
366 * Termination of the event ports poller.
367 * All resources are released and the poller is marked as inoperative.
368 */
Willy Tarreau03e78532020-02-25 07:38:05 +0100369static void _do_term(struct poller *p)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000370{
371 if (evports_fd[tid] != -1) {
372 close(evports_fd[tid]);
373 evports_fd[tid] = -1;
374 }
375
376 p->private = NULL;
377 p->pref = 0;
378
Willy Tarreau61cfdf42021-02-20 10:46:51 +0100379 ha_free(&evports_evlist);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000380 evports_evlist_max = 0;
381}
382
/*
 * Run-time check that the resources needed by the poller can be obtained:
 * tries to create an event port and closes it right away.
 * Returns 1 on success, otherwise 0.
 */
static int _do_test(struct poller *p)
{
	int probe = port_create();

	if (probe == -1)
		return 0;

	close(probe);
	return 1;
}
399
400/*
401 * Close and recreate the event port after fork(). Returns 1 on success,
402 * otherwise 0. If this function fails, "_do_term()" must be called to
403 * clean up the poller.
404 */
Willy Tarreau03e78532020-02-25 07:38:05 +0100405static int _do_fork(struct poller *p)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000406{
407 if (evports_fd[tid] != -1) {
408 close(evports_fd[tid]);
409 }
410
411 if ((evports_fd[tid] = port_create()) == -1) {
412 return 0;
413 }
414
415 return 1;
416}
417
418/*
Willy Tarreau740d7492022-04-25 19:00:55 +0200419 * Registers the poller.
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000420 */
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000421static void _do_register(void)
422{
423 struct poller *p;
424 int i;
425
426 if (nbpollers >= MAX_POLLERS)
427 return;
428
429 for (i = 0; i < MAX_THREADS; i++)
430 evports_fd[i] = -1;
431
432 p = &pollers[nbpollers++];
433
434 p->name = "evports";
435 p->pref = 300;
Willy Tarreau11ef0832019-11-28 18:17:33 +0100436 p->flags = HAP_POLL_F_ERRHUP;
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000437 p->private = NULL;
438
439 p->clo = NULL;
440 p->test = _do_test;
441 p->init = _do_init;
442 p->term = _do_term;
443 p->poll = _do_poll;
444 p->fork = _do_fork;
445}
Willy Tarreau740d7492022-04-25 19:00:55 +0200446
447INITCALL0(STG_REGISTER, _do_register);