blob: 6465b38758adfa358a44113ceed4e0b028bbb51a [file] [log] [blame]
/*
 * FD polling functions for SunOS event ports.
 *
 * Copyright 2018 Joyent, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
11
12#include <unistd.h>
13#include <sys/time.h>
14#include <sys/types.h>
15
16#include <poll.h>
17#include <port.h>
18#include <errno.h>
19#include <syslog.h>
20
Willy Tarreaub2551052020-06-09 09:07:15 +020021#include <haproxy/activity.h>
Willy Tarreau4c7e4b72020-05-27 12:58:42 +020022#include <haproxy/api.h>
Willy Tarreaub2551052020-06-09 09:07:15 +020023#include <haproxy/fd.h>
24#include <haproxy/global.h>
Willy Tarreau3727a8a2020-06-04 17:37:26 +020025#include <haproxy/signal.h>
Willy Tarreauc2f7c582020-06-02 18:15:32 +020026#include <haproxy/ticks.h>
Willy Tarreau92b4f132020-06-01 11:05:15 +020027#include <haproxy/time.h>
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000028
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000029/*
30 * Private data:
31 */
32static int evports_fd[MAX_THREADS]; // per-thread evports_fd
33static THREAD_LOCAL port_event_t *evports_evlist = NULL;
34static THREAD_LOCAL int evports_evlist_max = 0;
35
36/*
37 * Convert the "state" member of "fdtab" into an event ports event mask.
38 */
39static inline int evports_state_to_events(int state)
40{
41 int events = 0;
42
Willy Tarreau5bee3e22019-09-04 09:52:57 +020043 if (state & FD_EV_ACTIVE_W)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000044 events |= POLLOUT;
Willy Tarreau5bee3e22019-09-04 09:52:57 +020045 if (state & FD_EV_ACTIVE_R)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000046 events |= POLLIN;
47
48 return (events);
49}
50
51/*
52 * Associate or dissociate this file descriptor with the event port, using the
53 * specified event mask.
54 */
55static inline void evports_resync_fd(int fd, int events)
56{
57 if (events == 0)
58 port_dissociate(evports_fd[tid], PORT_SOURCE_FD, fd);
59 else
60 port_associate(evports_fd[tid], PORT_SOURCE_FD, fd, events, NULL);
61}
62
63static void _update_fd(int fd)
64{
65 int en;
66 int events;
67
68 en = fdtab[fd].state;
69
Willy Tarreau5bee3e22019-09-04 09:52:57 +020070 if (!(fdtab[fd].thread_mask & tid_bit) || !(en & FD_EV_ACTIVE_RW)) {
Olivier Houchard53055052019-07-25 14:00:18 +000071 if (!(polled_mask[fd].poll_recv & tid_bit) &&
72 !(polled_mask[fd].poll_send & tid_bit)) {
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000073 /* fd was not watched, it's still not */
74 return;
75 }
76 /* fd totally removed from poll list */
77 events = 0;
Olivier Houchard53055052019-07-25 14:00:18 +000078 if (polled_mask[fd].poll_recv & tid_bit)
79 _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~tid_bit);
80 if (polled_mask[fd].poll_send & tid_bit)
81 _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~tid_bit);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +000082 }
83 else {
84 /* OK fd has to be monitored, it was either added or changed */
85 events = evports_state_to_events(en);
Willy Tarreau5bee3e22019-09-04 09:52:57 +020086 if (en & FD_EV_ACTIVE_R) {
Olivier Houchard53055052019-07-25 14:00:18 +000087 if (!(polled_mask[fd].poll_recv & tid_bit))
88 _HA_ATOMIC_OR(&polled_mask[fd].poll_recv, tid_bit);
89 } else {
90 if (polled_mask[fd].poll_recv & tid_bit)
91 _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~tid_bit);
92 }
Willy Tarreau5bee3e22019-09-04 09:52:57 +020093 if (en & FD_EV_ACTIVE_W) {
Olivier Houchard53055052019-07-25 14:00:18 +000094 if (!(polled_mask[fd].poll_send & tid_bit))
95 _HA_ATOMIC_OR(&polled_mask[fd].poll_send, tid_bit);
96 } else {
97 if (polled_mask[fd].poll_send & tid_bit)
98 _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~tid_bit);
99 }
100
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000101 }
102 evports_resync_fd(fd, events);
103}
104
105/*
106 * Event Ports poller. This routine interacts with the file descriptor
107 * management data structures and routines; see the large block comment in
108 * "src/fd.c" for more information.
109 */
110
Willy Tarreau03e78532020-02-25 07:38:05 +0100111static void _do_poll(struct poller *p, int exp, int wake)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000112{
113 int i;
114 int wait_time;
115 struct timespec timeout_ts;
116 unsigned int nevlist;
117 int fd, old_fd;
118 int status;
119
120 /*
121 * Scan the list of file descriptors with an updated status:
122 */
123 for (i = 0; i < fd_nbupdt; i++) {
124 fd = fd_updt[i];
125
126 _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~tid_bit);
127 if (fdtab[fd].owner == NULL) {
Willy Tarreaue4063862020-06-17 20:35:33 +0200128 activity[tid].poll_drop_fd++;
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000129 continue;
130 }
131
132 _update_fd(fd);
133 }
134 fd_nbupdt = 0;
135 /* Scan the global update list */
136 for (old_fd = fd = update_list.first; fd != -1; fd = fdtab[fd].update.next) {
137 if (fd == -2) {
138 fd = old_fd;
139 continue;
140 }
141 else if (fd <= -3)
142 fd = -fd -4;
143 if (fd == -1)
144 break;
145 if (fdtab[fd].update_mask & tid_bit)
146 done_update_polling(fd);
147 else
148 continue;
149 if (!fdtab[fd].owner)
150 continue;
151 _update_fd(fd);
152 }
153
154 thread_harmless_now();
155
Matthias Wirth6b933fc2022-09-09 10:21:00 +0200156 /* Now let's wait for polled events. */
157 wait_time = wake ? 0 : compute_poll_timeout(exp);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000158 tv_entering_poll();
159 activity_count_runtime();
160
161 do {
162 int timeout = (global.tune.options & GTUNE_BUSY_POLLING) ? 0 : wait_time;
163 int interrupted = 0;
164 nevlist = 1; /* desired number of events to be retrieved */
165 timeout_ts.tv_sec = (timeout / 1000);
166 timeout_ts.tv_nsec = (timeout % 1000) * 1000000;
167
168 status = port_getn(evports_fd[tid],
169 evports_evlist,
170 evports_evlist_max,
171 &nevlist, /* updated to the number of events retrieved */
172 &timeout_ts);
Willy Tarreauc5cb8f02024-04-17 16:25:20 +0200173
174 /* Be careful, nevlist here is always updated by the syscall
175 * even on status == -1, so it must always be respected
176 * otherwise events are lost. Awkward API BTW, I wonder how
177 * they thought ENOSYS ought to be handled... -WT
178 */
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000179 if (status != 0) {
180 int e = errno;
181 switch (e) {
182 case ETIME:
183 /*
184 * Though the manual page has not historically made it
185 * clear, port_getn() can return -1 with an errno of
186 * ETIME and still have returned some number of events.
187 */
188 /* nevlist >= 0 */
189 break;
190 default:
Willy Tarreauc5cb8f02024-04-17 16:25:20 +0200191 /* signal or anything else */
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000192 interrupted = 1;
193 break;
194 }
195 }
196 tv_update_date(timeout, nevlist);
197
198 if (nevlist || interrupted)
199 break;
200 if (timeout || !wait_time)
201 break;
Matthias Wirth6b933fc2022-09-09 10:21:00 +0200202 if (wake)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000203 break;
204 if (tick_isset(exp) && tick_is_expired(exp, now_ms))
205 break;
206 } while(1);
207
208 tv_leaving_poll(wait_time, nevlist);
209
210 thread_harmless_end();
Willy Tarreaue148cb02021-07-30 10:57:09 +0200211 if (sleeping_thread_mask & tid_bit)
212 _HA_ATOMIC_AND(&sleeping_thread_mask, ~tid_bit);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000213
Willy Tarreaue5451532020-06-17 20:25:18 +0200214 if (nevlist > 0)
215 activity[tid].poll_io++;
216
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000217 for (i = 0; i < nevlist; i++) {
218 unsigned int n = 0;
219 int events, rebind_events;
220 fd = evports_evlist[i].portev_object;
221 events = evports_evlist[i].portev_events;
222
Willy Tarreau38e8a1c2020-06-23 10:04:54 +0200223#ifdef DEBUG_FD
Willy Tarreau4781b152021-04-06 13:53:36 +0200224 _HA_ATOMIC_INC(&fdtab[fd].event_count);
Willy Tarreau38e8a1c2020-06-23 10:04:54 +0200225#endif
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000226 if (fdtab[fd].owner == NULL) {
Willy Tarreaue4063862020-06-17 20:35:33 +0200227 activity[tid].poll_dead_fd++;
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000228 continue;
229 }
230
231 if (!(fdtab[fd].thread_mask & tid_bit)) {
Willy Tarreaue4063862020-06-17 20:35:33 +0200232 activity[tid].poll_skip_fd++;
Willy Tarreauea2036d2021-07-30 14:18:49 +0200233 if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, tid))
234 fd_updt[fd_nbupdt++] = fd;
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000235 continue;
236 }
237
238 /*
239 * By virtue of receiving an event for this file descriptor, it
240 * is no longer associated with the port in question. Store
241 * the previous event mask so that we may reassociate after
242 * processing is complete.
243 */
244 rebind_events = evports_state_to_events(fdtab[fd].state);
245 /* rebind_events != 0 */
246
247 /*
248 * Set bits based on the events we received from the port:
249 */
Emmanuel Hocdet7ceb96b2019-09-19 11:08:26 +0000250 n = ((events & POLLIN) ? FD_EV_READY_R : 0) |
251 ((events & POLLOUT) ? FD_EV_READY_W : 0) |
252 ((events & POLLHUP) ? FD_EV_SHUT_RW : 0) |
253 ((events & POLLERR) ? FD_EV_ERR_RW : 0);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000254
255 /*
256 * Call connection processing callbacks. Note that it's
257 * possible for this processing to alter the required event
Ilya Shipitsince7b00f2020-03-23 22:28:40 +0500258 * port association; i.e., the "state" member of the "fdtab"
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000259 * entry. If it changes, the fd will be placed on the updated
260 * list for processing the next time we are called.
261 */
262 fd_update_events(fd, n);
263
264 /*
265 * This file descriptor was closed during the processing of
266 * polled events. No need to reassociate.
267 */
268 if (fdtab[fd].owner == NULL)
269 continue;
270
271 /*
272 * Reassociate with the port, using the same event mask as
273 * before. This call will not result in a dissociation as we
274 * asserted that _some_ events needed to be rebound above.
275 *
276 * Reassociating with the same mask allows us to mimic the
277 * level-triggered behaviour of poll(2). In the event that we
278 * are interested in the same events on the next turn of the
279 * loop, this represents no extra work.
280 *
281 * If this additional port_associate(3C) call becomes a
282 * performance problem, we would need to verify that we can
283 * correctly interact with the file descriptor cache and update
284 * list (see "src/fd.c") to avoid reassociating here, or to use
285 * a different events mask.
286 */
287 evports_resync_fd(fd, rebind_events);
288 }
289}
290
291static int init_evports_per_thread()
292{
293 int fd;
294
295 evports_evlist_max = global.tune.maxpollevents;
296 evports_evlist = calloc(evports_evlist_max, sizeof (port_event_t));
297 if (evports_evlist == NULL) {
298 goto fail_alloc;
299 }
300
301 if (MAX_THREADS > 1 && tid) {
302 if ((evports_fd[tid] = port_create()) == -1) {
303 goto fail_fd;
304 }
305 }
306
307 /* we may have to unregister some events initially registered on the
308 * original fd when it was alone, and/or to register events on the new
309 * fd for this thread. Let's just mark them as updated, the poller will
310 * do the rest.
311 */
312 for (fd = 0; fd < global.maxsock; fd++)
313 updt_fd_polling(fd);
314
315 return 1;
316
317 fail_fd:
Willy Tarreau61cfdf42021-02-20 10:46:51 +0100318 ha_free(&evports_evlist);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000319 evports_evlist_max = 0;
320 fail_alloc:
321 return 0;
322}
323
324static void deinit_evports_per_thread()
325{
326 if (MAX_THREADS > 1 && tid)
327 close(evports_fd[tid]);
328
Willy Tarreau61cfdf42021-02-20 10:46:51 +0100329 ha_free(&evports_evlist);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000330 evports_evlist_max = 0;
331}
332
333/*
334 * Initialisation of the event ports poller.
335 * Returns 0 in case of failure, non-zero in case of success.
336 */
Willy Tarreau03e78532020-02-25 07:38:05 +0100337static int _do_init(struct poller *p)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000338{
339 p->private = NULL;
340
341 if ((evports_fd[tid] = port_create()) == -1) {
342 goto fail;
343 }
344
345 hap_register_per_thread_init(init_evports_per_thread);
346 hap_register_per_thread_deinit(deinit_evports_per_thread);
347
348 return 1;
349
350fail:
351 p->pref = 0;
352 return 0;
353}
354
355/*
356 * Termination of the event ports poller.
357 * All resources are released and the poller is marked as inoperative.
358 */
Willy Tarreau03e78532020-02-25 07:38:05 +0100359static void _do_term(struct poller *p)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000360{
361 if (evports_fd[tid] != -1) {
362 close(evports_fd[tid]);
363 evports_fd[tid] = -1;
364 }
365
366 p->private = NULL;
367 p->pref = 0;
368
Willy Tarreau61cfdf42021-02-20 10:46:51 +0100369 ha_free(&evports_evlist);
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000370 evports_evlist_max = 0;
371}
372
/*
 * Run-time check to make sure we can allocate the resources needed for
 * the poller to function correctly: an event port is created and closed
 * immediately.
 * Returns 1 on success, otherwise 0.
 */
static int _do_test(struct poller *p)
{
	const int port = port_create();

	if (port == -1)
		return 0;

	close(port);
	return 1;
}
389
390/*
391 * Close and recreate the event port after fork(). Returns 1 on success,
392 * otherwise 0. If this function fails, "_do_term()" must be called to
393 * clean up the poller.
394 */
Willy Tarreau03e78532020-02-25 07:38:05 +0100395static int _do_fork(struct poller *p)
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000396{
397 if (evports_fd[tid] != -1) {
398 close(evports_fd[tid]);
399 }
400
401 if ((evports_fd[tid] = port_create()) == -1) {
402 return 0;
403 }
404
405 return 1;
406}
407
408/*
409 * This constructor must be called before main() to register the event ports
410 * poller.
411 */
412__attribute__((constructor))
413static void _do_register(void)
414{
415 struct poller *p;
416 int i;
417
418 if (nbpollers >= MAX_POLLERS)
419 return;
420
421 for (i = 0; i < MAX_THREADS; i++)
422 evports_fd[i] = -1;
423
424 p = &pollers[nbpollers++];
425
426 p->name = "evports";
427 p->pref = 300;
Willy Tarreau11ef0832019-11-28 18:17:33 +0100428 p->flags = HAP_POLL_F_ERRHUP;
Emmanuel Hocdet0ba4f482019-04-08 16:53:32 +0000429 p->private = NULL;
430
431 p->clo = NULL;
432 p->test = _do_test;
433 p->init = _do_init;
434 p->term = _do_term;
435 p->poll = _do_poll;
436 p->fork = _do_fork;
437}