blob: 35ce34c75cd1a9b3a4039ce60f3107bab5936aa3 [file] [log] [blame]
Willy Tarreaude99e992007-04-16 00:53:59 +02001/*
2 * FD polling functions for Speculative I/O combined with Linux epoll()
3 *
Willy Tarreau037d2c12012-11-06 02:34:46 +01004 * Copyright 2000-2012 Willy Tarreau <w@1wt.eu>
Willy Tarreaude99e992007-04-16 00:53:59 +02005 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
Willy Tarreauf2e8ee22008-05-25 10:39:02 +020011 *
12 * This code implements "speculative I/O" under Linux. The principle is to
13 * try to perform expected I/O before registering the events in the poller.
14 * Each time this succeeds, it saves an expensive epoll_ctl(). It generally
15 * succeeds for all reads after an accept(), and for writes after a connect().
16 * It also improves performance for streaming connections because even if only
17 * one side is polled, the other one may react accordingly depending on the
18 * level of the buffer.
19 *
Willy Tarreau037d2c12012-11-06 02:34:46 +010020 * More importantly, it enables I/O operations that are backed by invisible
21 * buffers. For example, SSL is able to read a whole socket buffer and not
22 * deliver it to the application buffer because it's full. Unfortunately, it
23 * won't be reported by epoll() anymore until some new activity happens. The
24 * only way to call it again thus is to perform speculative I/O as soon as
25 * reading on the FD is enabled again.
Willy Tarreauf2e8ee22008-05-25 10:39:02 +020026 *
 * The speculative I/O relies on a double list of expected events and updates.
 * Expected events are events that are expected to come and that we must report
 * to the application until it asks to stop or to poll. Updates are new requests
 * for changing an FD state. Updates are the only way to create new events. This
 * is important because it means that the number of speculative events cannot
 * increase between updates and will only grow one at a time while processing
 * updates. All updates must always be processed, though events might be
 * processed in small batches if required. The result is that there is no need
 * to preallocate room for spec events: updates evicted from the list always
 * release at least as much room as necessary.
Willy Tarreauf2e8ee22008-05-25 10:39:02 +020037 *
Willy Tarreau037d2c12012-11-06 02:34:46 +010038 * In order to limit memory usage, events and updates share the same list (an
39 * array to be exact). The lower part (0..nbevts) is used by events and the
40 * higher part by updates. This way, an fd may be mapped to any entry (evt or
41 * update) using a single index. Updates may be simply turned to events. When
42 * events are deleted, the last event from the list must replace the deleted
43 * event, and if there were updates past this event, one must be moved to take
44 * its place. It still means that any file descriptor might be present in the
45 * event or update list, so the list must be at least as large as the maximum
46 * number of simultaneous file descriptors.
Willy Tarreauf2e8ee22008-05-25 10:39:02 +020047 *
Willy Tarreau037d2c12012-11-06 02:34:46 +010048 * It is important to understand that as long as all expected events are
49 * processed, they might starve the polled events, especially because polled
50 * I/O starvation quickly induces more speculative I/O. One solution to this
51 * consists in only processing a part of the events at once, but one drawback
52 * is that unhandled events will still wake epoll_wait() up. Using EPOLL_ET
53 * will solve this issue though.
Willy Tarreauf2e8ee22008-05-25 10:39:02 +020054 *
Willy Tarreau037d2c12012-11-06 02:34:46 +010055 * A file descriptor has a distinct state for each direction. This state is a
56 * combination of two bits :
57 * bit 0 = active Y/N : is set if the FD is active, which means that its
58 * handler will be called without prior polling ;
59 * bit 1 = polled Y/N : is set if the FD was subscribed to polling
Willy Tarreauf2e8ee22008-05-25 10:39:02 +020060 *
Willy Tarreau037d2c12012-11-06 02:34:46 +010061 * It is perfectly valid to have both bits set at a time, which generally means
62 * that the FD was reported by polling, was marked active and not yet unpolled.
63 * Such a state must not last long to avoid unneeded wakeups.
Willy Tarreauf2e8ee22008-05-25 10:39:02 +020064 *
Willy Tarreau037d2c12012-11-06 02:34:46 +010065 * The state of the FD as of last change is preserved in two other bits. These
66 * ones are useful to save a significant amount of system calls during state
67 * changes, because there is no need to call epoll_ctl() until we're about to
68 * call epoll_wait().
69 *
 * Since we do not want to scan all the FD list to find speculative I/O events,
 * we store them in a list consisting of a linear array holding only the FD
 * indexes right now. Note that a closed FD cannot exist in the spec list,
 * because it is closed by fd_delete() which in turn calls __fd_clo() which
 * always removes it from the list.
75 *
76 * For efficiency reasons, we will store the Read and Write bits interlaced to
77 * form a 4-bit field, so that we can simply shift the value right by 0/1 and
78 * get what we want :
79 * 3 2 1 0
80 * Wp Rp Wa Ra
81 *
 * The FD array has to hold a back reference to the speculative list. This
 * reference is always valid unless the FD is currently being polled and not
 * updated (in which case the reference points to index 0).
85 *
86 * We store the FD state in the 4 lower bits of fdtab[fd].spec_e, and save the
87 * previous state upon changes in the 4 higher bits, so that changes are easy
88 * to spot.
Willy Tarreaude99e992007-04-16 00:53:59 +020089 */
90
91#include <unistd.h>
92#include <sys/time.h>
93#include <sys/types.h>
94
95#include <common/compat.h>
96#include <common/config.h>
Willy Tarreaud6f087e2008-01-18 17:20:13 +010097#include <common/debug.h>
Willy Tarreau43d8fb22011-08-22 17:12:02 +020098#include <common/epoll.h>
Willy Tarreaude99e992007-04-16 00:53:59 +020099#include <common/standard.h>
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200100#include <common/ticks.h>
Willy Tarreaude99e992007-04-16 00:53:59 +0200101#include <common/time.h>
Willy Tarreau1db37712007-06-03 17:16:49 +0200102#include <common/tools.h>
Willy Tarreaude99e992007-04-16 00:53:59 +0200103
Willy Tarreaude99e992007-04-16 00:53:59 +0200104#include <types/global.h>
105
106#include <proto/fd.h>
Willy Tarreau332740d2009-05-10 09:57:21 +0200107#include <proto/signal.h>
Willy Tarreaude99e992007-04-16 00:53:59 +0200108#include <proto/task.h>
109
Willy Tarreaude99e992007-04-16 00:53:59 +0200110
/* Per-direction FD state bits. The read and write states are interlaced in a
 * 4-bit field laid out as "Wp Rp Wa Ra", so every write-side constant is the
 * read-side constant shifted left by one bit.
 */
#define FD_EV_ACTIVE    1U
#define FD_EV_POLLED    4U
#define FD_EV_STATUS    (FD_EV_ACTIVE | FD_EV_POLLED)
#define FD_EV_STATUS_R  (FD_EV_STATUS)
#define FD_EV_STATUS_W  (FD_EV_STATUS << 1)

/* "polled" bit for each direction, and both at once */
#define FD_EV_POLLED_R  (FD_EV_POLLED)
#define FD_EV_POLLED_W  (FD_EV_POLLED << 1)
#define FD_EV_POLLED_RW (FD_EV_POLLED_R | FD_EV_POLLED_W)

/* "active" bit for each direction, and both at once */
#define FD_EV_ACTIVE_R  (FD_EV_ACTIVE)
#define FD_EV_ACTIVE_W  (FD_EV_ACTIVE << 1)
#define FD_EV_ACTIVE_RW (FD_EV_ACTIVE_R | FD_EV_ACTIVE_W)

/* The current state is kept in the low nibble of fdtab[fd].spec_e and the
 * state as of the last committed change in the high nibble.
 */
#define FD_EV_CURR_MASK 0x0FU
#define FD_EV_PREV_MASK 0xF0U

/* This is the minimum number of events successfully processed in speculative
 * mode above which we agree to return without checking epoll() (1/2 times).
 */
#define MIN_RETURN_EVENTS 25
Willy Tarreaude99e992007-04-16 00:53:59 +0200132
/* Bookkeeping for the speculative event list and the update list. */
static int nbspec = 0;          // number of speculative events in the list
static int nbupdt = 0;          // number of updates in the list
static int absmaxevents = 0;    // absolute maximum amounts of polled events
static int in_poll_loop = 0;    // non-null if polled events are being processed

static unsigned int *spec_list = NULL;  // speculative I/O list
static unsigned int *updt_list = NULL;  // FD updates list

/* private data */
static struct epoll_event *epoll_events = NULL;

/* -1 means "no epoll descriptor open". Starting from -1 rather than the
 * implicit 0 guarantees that a valid but unrelated descriptor is never
 * mistaken for ours by the ">= 0" checks done before closing it.
 */
static int epoll_fd = -1;

/* This structure may be used for any purpose. Warning! do not use it in
 * recursive functions !
 */
static struct epoll_event ev;
149
150
Willy Tarreau037d2c12012-11-06 02:34:46 +0100151/* Mark fd <fd> as updated and allocate an entry in the update list for this if
152 * it was not already there. This can be done at any time.
153 */
154REGPRM1 static inline void updt_fd(const int fd)
155{
156 if (fdtab[fd].updated)
157 /* already scheduled for update */
158 return;
159 updt_list[nbupdt++] = fd;
160 fdtab[fd].updated = 1;
161}
162
163
164/* allocate an entry for a speculative event. This can be done at any time. */
Willy Tarreauff9d5ba2009-10-17 21:43:03 +0200165REGPRM1 static inline void alloc_spec_entry(const int fd)
Willy Tarreaude99e992007-04-16 00:53:59 +0200166{
Willy Tarreau45dab732012-09-02 22:19:18 +0200167 if (fdtab[fd].spec_p)
Willy Tarreau037d2c12012-11-06 02:34:46 +0100168 /* FD already in speculative I/O list */
Willy Tarreaude99e992007-04-16 00:53:59 +0200169 return;
Willy Tarreau037d2c12012-11-06 02:34:46 +0100170 spec_list[nbspec++] = fd;
171 fdtab[fd].spec_p = nbspec;
Willy Tarreaude99e992007-04-16 00:53:59 +0200172}
173
Willy Tarreau4eac2092007-08-31 17:01:18 +0200174/* Removes entry used by fd <fd> from the spec list and replaces it with the
Willy Tarreaub48b3232009-10-17 22:54:17 +0200175 * last one. The fdtab.spec is adjusted to match the back reference if needed.
Willy Tarreau4eac2092007-08-31 17:01:18 +0200176 * If the fd has no entry assigned, return immediately.
Willy Tarreaude99e992007-04-16 00:53:59 +0200177 */
Willy Tarreau4eac2092007-08-31 17:01:18 +0200178REGPRM1 static void release_spec_entry(int fd)
Willy Tarreaude99e992007-04-16 00:53:59 +0200179{
Willy Tarreau4eac2092007-08-31 17:01:18 +0200180 unsigned int pos;
181
Willy Tarreau45dab732012-09-02 22:19:18 +0200182 pos = fdtab[fd].spec_p;
Willy Tarreau4eac2092007-08-31 17:01:18 +0200183 if (!pos)
184 return;
Willy Tarreau45dab732012-09-02 22:19:18 +0200185 fdtab[fd].spec_p = 0;
Willy Tarreaude99e992007-04-16 00:53:59 +0200186 nbspec--;
Willy Tarreau037d2c12012-11-06 02:34:46 +0100187 if (pos <= nbspec) {
188 /* was not the last entry */
189 fd = spec_list[nbspec];
190 spec_list[pos - 1] = fd;
191 fdtab[fd].spec_p = pos;
192 }
Willy Tarreaude99e992007-04-16 00:53:59 +0200193}
194
195/*
196 * Returns non-zero if <fd> is already monitored for events in direction <dir>.
197 */
198REGPRM2 static int __fd_is_set(const int fd, int dir)
199{
Willy Tarreau7a52a5c2008-08-16 16:06:02 +0200200#if DEBUG_DEV
Willy Tarreaudb3b3262012-07-05 23:19:22 +0200201 if (!fdtab[fd].owner) {
Willy Tarreau7a52a5c2008-08-16 16:06:02 +0200202 fprintf(stderr, "sepoll.fd_isset called on closed fd #%d.\n", fd);
203 ABORT_NOW();
204 }
205#endif
Willy Tarreau037d2c12012-11-06 02:34:46 +0100206 return ((unsigned)fdtab[fd].spec_e >> dir) & FD_EV_STATUS;
Willy Tarreaude99e992007-04-16 00:53:59 +0200207}
208
209/*
210 * Don't worry about the strange constructs in __fd_set/__fd_clr, they are
211 * designed like this in order to reduce the number of jumps (verified).
212 */
Willy Tarreaubabd05a2012-08-09 12:14:03 +0200213REGPRM2 static void __fd_wai(const int fd, int dir)
214{
215 unsigned int i;
216
217#if DEBUG_DEV
218 if (!fdtab[fd].owner) {
219 fprintf(stderr, "sepoll.fd_wai called on closed fd #%d.\n", fd);
220 ABORT_NOW();
221 }
222#endif
Willy Tarreau037d2c12012-11-06 02:34:46 +0100223 i = ((unsigned)fdtab[fd].spec_e >> dir) & FD_EV_STATUS;
Willy Tarreaubabd05a2012-08-09 12:14:03 +0200224
Willy Tarreau037d2c12012-11-06 02:34:46 +0100225 if (i == FD_EV_POLLED)
226 return; /* already in desired state */
227 updt_fd(fd); /* need an update entry to change the state */
228 fdtab[fd].spec_e ^= (i ^ (unsigned int)FD_EV_POLLED) << dir;
Willy Tarreaubabd05a2012-08-09 12:14:03 +0200229}
230
Willy Tarreau3788e4c2012-07-30 14:29:35 +0200231REGPRM2 static void __fd_set(const int fd, int dir)
Willy Tarreaude99e992007-04-16 00:53:59 +0200232{
Willy Tarreaude99e992007-04-16 00:53:59 +0200233 unsigned int i;
234
Willy Tarreau7a52a5c2008-08-16 16:06:02 +0200235#if DEBUG_DEV
Willy Tarreaudb3b3262012-07-05 23:19:22 +0200236 if (!fdtab[fd].owner) {
Willy Tarreau7a52a5c2008-08-16 16:06:02 +0200237 fprintf(stderr, "sepoll.fd_set called on closed fd #%d.\n", fd);
238 ABORT_NOW();
239 }
240#endif
Willy Tarreau037d2c12012-11-06 02:34:46 +0100241 i = ((unsigned)fdtab[fd].spec_e >> dir) & FD_EV_STATUS;
Willy Tarreaude99e992007-04-16 00:53:59 +0200242
Willy Tarreau037d2c12012-11-06 02:34:46 +0100243 /* note that we don't care about disabling the polled state when
244 * enabling the active state, since it brings no benefit but costs
245 * some syscalls.
246 */
247 if (i & FD_EV_ACTIVE)
248 return; /* already in desired state */
249 updt_fd(fd); /* need an update entry to change the state */
250 fdtab[fd].spec_e |= ((unsigned int)FD_EV_ACTIVE) << dir;
Willy Tarreaude99e992007-04-16 00:53:59 +0200251}
252
Willy Tarreau3788e4c2012-07-30 14:29:35 +0200253REGPRM2 static void __fd_clr(const int fd, int dir)
Willy Tarreaude99e992007-04-16 00:53:59 +0200254{
Willy Tarreaude99e992007-04-16 00:53:59 +0200255 unsigned int i;
256
Willy Tarreau7a52a5c2008-08-16 16:06:02 +0200257#if DEBUG_DEV
Willy Tarreaudb3b3262012-07-05 23:19:22 +0200258 if (!fdtab[fd].owner) {
Willy Tarreau7a52a5c2008-08-16 16:06:02 +0200259 fprintf(stderr, "sepoll.fd_clr called on closed fd #%d.\n", fd);
260 ABORT_NOW();
261 }
262#endif
Willy Tarreau037d2c12012-11-06 02:34:46 +0100263 i = ((unsigned)fdtab[fd].spec_e >> dir) & FD_EV_STATUS;
Willy Tarreaude99e992007-04-16 00:53:59 +0200264
Willy Tarreau037d2c12012-11-06 02:34:46 +0100265 if (i == 0)
266 return /* already disabled */;
267 updt_fd(fd); /* need an update entry to change the state */
268 fdtab[fd].spec_e ^= i << dir;
Willy Tarreaude99e992007-04-16 00:53:59 +0200269}
270
Willy Tarreau6653d172007-05-13 01:52:05 +0200271/* normally unused */
Willy Tarreaude99e992007-04-16 00:53:59 +0200272REGPRM1 static void __fd_rem(int fd)
273{
274 __fd_clr(fd, DIR_RD);
275 __fd_clr(fd, DIR_WR);
276}
277
278/*
279 * On valid epoll() implementations, a call to close() automatically removes
280 * the fds. This means that the FD will appear as previously unset.
281 */
282REGPRM1 static void __fd_clo(int fd)
283{
Willy Tarreau7a52a5c2008-08-16 16:06:02 +0200284 release_spec_entry(fd);
Willy Tarreau037d2c12012-11-06 02:34:46 +0100285 fdtab[fd].spec_e &= ~(FD_EV_CURR_MASK | FD_EV_PREV_MASK);
Willy Tarreaude99e992007-04-16 00:53:59 +0200286}
287
Willy Tarreaudc246a72007-05-09 21:57:51 +0200288/*
Willy Tarreaude99e992007-04-16 00:53:59 +0200289 * speculative epoll() poller
290 */
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200291REGPRM2 static void _do_poll(struct poller *p, int exp)
Willy Tarreaude99e992007-04-16 00:53:59 +0200292{
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200293 int status, eo, en;
Willy Tarreaude99e992007-04-16 00:53:59 +0200294 int fd, opcode;
295 int count;
296 int spec_idx;
Willy Tarreau037d2c12012-11-06 02:34:46 +0100297 int updt_idx;
Willy Tarreaud825eef2007-05-12 22:35:00 +0200298 int wait_time;
Willy Tarreaude99e992007-04-16 00:53:59 +0200299
Willy Tarreau037d2c12012-11-06 02:34:46 +0100300 /* first, scan the update list to find changes */
301 for (updt_idx = 0; updt_idx < nbupdt; updt_idx++) {
302 fd = updt_list[updt_idx];
Willy Tarreau45dab732012-09-02 22:19:18 +0200303 en = fdtab[fd].spec_e & 15; /* new events */
304 eo = fdtab[fd].spec_e >> 4; /* previous events */
Willy Tarreaude99e992007-04-16 00:53:59 +0200305
Willy Tarreau037d2c12012-11-06 02:34:46 +0100306 if (fdtab[fd].owner && (eo ^ en)) {
307 if ((eo ^ en) & FD_EV_POLLED_RW) {
308 /* poll status changed */
309 if ((en & FD_EV_POLLED_RW) == 0) {
310 /* fd removed from poll list */
311 opcode = EPOLL_CTL_DEL;
312 }
313 else if ((eo & FD_EV_POLLED_RW) == 0) {
314 /* new fd in the poll list */
315 opcode = EPOLL_CTL_ADD;
316 }
317 else {
318 /* fd status changed */
319 opcode = EPOLL_CTL_MOD;
320 }
Willy Tarreaude99e992007-04-16 00:53:59 +0200321
Willy Tarreau037d2c12012-11-06 02:34:46 +0100322 /* construct the epoll events based on new state */
323 ev.events = 0;
324 if (en & FD_EV_POLLED_R)
325 ev.events |= EPOLLIN;
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200326
Willy Tarreau037d2c12012-11-06 02:34:46 +0100327 if (en & FD_EV_POLLED_W)
328 ev.events |= EPOLLOUT;
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200329
Willy Tarreau037d2c12012-11-06 02:34:46 +0100330 ev.data.fd = fd;
331 epoll_ctl(epoll_fd, opcode, fd, &ev);
332 }
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200333
Willy Tarreau037d2c12012-11-06 02:34:46 +0100334 fdtab[fd].spec_e = (en << 4) + en; /* save new events */
Willy Tarreaude99e992007-04-16 00:53:59 +0200335
Willy Tarreau037d2c12012-11-06 02:34:46 +0100336 if (!(en & FD_EV_ACTIVE_RW)) {
337 /* This fd doesn't use any active entry anymore, we can
338 * kill its entry.
339 */
340 release_spec_entry(fd);
Willy Tarreau6653d172007-05-13 01:52:05 +0200341 }
Willy Tarreau037d2c12012-11-06 02:34:46 +0100342 else if ((en & ~eo) & FD_EV_ACTIVE_RW) {
343 /* we need a new spec entry now */
344 alloc_spec_entry(fd);
Willy Tarreaude99e992007-04-16 00:53:59 +0200345 }
Willy Tarreau6653d172007-05-13 01:52:05 +0200346
Willy Tarreau6653d172007-05-13 01:52:05 +0200347 }
Willy Tarreau037d2c12012-11-06 02:34:46 +0100348 fdtab[fd].updated = 0;
349 fdtab[fd].new = 0;
Willy Tarreaude99e992007-04-16 00:53:59 +0200350 }
Willy Tarreau037d2c12012-11-06 02:34:46 +0100351 nbupdt = 0;
Willy Tarreaude99e992007-04-16 00:53:59 +0200352
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200353 /* compute the epoll_wait() timeout */
Willy Tarreaucb651252008-08-29 13:57:30 +0200354
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200355 if (nbspec || run_queue || signal_queue_len) {
356 /* Maybe we still have events in the spec list, or there are
Willy Tarreau3a628112008-06-13 21:06:56 +0200357 * some tasks left pending in the run_queue, so we must not
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200358 * wait in epoll() otherwise we would delay their delivery by
Willy Tarreau6653d172007-05-13 01:52:05 +0200359 * the next timeout.
360 */
Willy Tarreaude99e992007-04-16 00:53:59 +0200361 wait_time = 0;
362 }
Willy Tarreaud825eef2007-05-12 22:35:00 +0200363 else {
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200364 if (!exp)
Willy Tarreaub0b37bc2008-06-23 14:00:57 +0200365 wait_time = MAX_DELAY_MS;
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200366 else if (tick_is_expired(exp, now_ms))
Willy Tarreaubdefc512007-05-14 02:02:04 +0200367 wait_time = 0;
Willy Tarreaub0b37bc2008-06-23 14:00:57 +0200368 else {
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200369 wait_time = TICKS_TO_MS(tick_remain(now_ms, exp)) + 1;
Willy Tarreaub0b37bc2008-06-23 14:00:57 +0200370 if (wait_time > MAX_DELAY_MS)
371 wait_time = MAX_DELAY_MS;
372 }
Willy Tarreaud825eef2007-05-12 22:35:00 +0200373 }
Willy Tarreaude99e992007-04-16 00:53:59 +0200374
Willy Tarreau037d2c12012-11-06 02:34:46 +0100375 /* now let's wait for polled events */
376
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200377 fd = MIN(maxfd, global.tune.maxpollevents);
Willy Tarreau45a12512011-09-10 16:56:42 +0200378 gettimeofday(&before_poll, NULL);
Willy Tarreau1db37712007-06-03 17:16:49 +0200379 status = epoll_wait(epoll_fd, epoll_events, fd, wait_time);
Willy Tarreaub0b37bc2008-06-23 14:00:57 +0200380 tv_update_date(wait_time, status);
Willy Tarreau45a12512011-09-10 16:56:42 +0200381 measure_idle();
Willy Tarreaude99e992007-04-16 00:53:59 +0200382
Willy Tarreau037d2c12012-11-06 02:34:46 +0100383 in_poll_loop = 1;
384
385 /* process polled events */
386
Willy Tarreaude99e992007-04-16 00:53:59 +0200387 for (count = 0; count < status; count++) {
388 int e = epoll_events[count].events;
389 fd = epoll_events[count].data.fd;
390
Willy Tarreau076be252012-07-06 16:02:29 +0200391 if (!fdtab[fd].owner)
392 continue;
393
Willy Tarreaude99e992007-04-16 00:53:59 +0200394 /* it looks complicated but gcc can optimize it away when constants
395 * have same values.
396 */
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100397 fdtab[fd].ev &= FD_POLL_STICKY;
Willy Tarreau491c4982012-07-06 11:16:01 +0200398 fdtab[fd].ev |=
Willy Tarreaude99e992007-04-16 00:53:59 +0200399 ((e & EPOLLIN ) ? FD_POLL_IN : 0) |
400 ((e & EPOLLPRI) ? FD_POLL_PRI : 0) |
401 ((e & EPOLLOUT) ? FD_POLL_OUT : 0) |
402 ((e & EPOLLERR) ? FD_POLL_ERR : 0) |
403 ((e & EPOLLHUP) ? FD_POLL_HUP : 0);
Willy Tarreau491c4982012-07-06 11:16:01 +0200404
Willy Tarreau037d2c12012-11-06 02:34:46 +0100405 if (fdtab[fd].iocb && fdtab[fd].owner && fdtab[fd].ev) {
406 int new_updt, old_updt = nbupdt; /* Save number of updates to detect creation of new FDs. */
407
408 /* Mark the events as speculative before processing
409 * them so that if nothing can be done we don't need
410 * to poll again.
411 */
412 if (fdtab[fd].ev & (FD_POLL_IN|FD_POLL_HUP|FD_POLL_ERR))
413 __fd_set(fd, DIR_RD);
414
415 if (fdtab[fd].ev & (FD_POLL_OUT|FD_POLL_ERR))
416 __fd_set(fd, DIR_WR);
417
Willy Tarreau9845e752012-07-06 11:44:28 +0200418 fdtab[fd].iocb(fd);
Willy Tarreau037d2c12012-11-06 02:34:46 +0100419
420 /* One or more fd might have been created during the iocb().
421 * This mainly happens with new incoming connections that have
422 * just been accepted, so we'd like to process them immediately
423 * for better efficiency. Second benefit, if at the end the fds
424 * are disabled again, we can safely destroy their update entry
425 * to reduce the scope of later scans. This is the reason we
426 * scan the new entries backwards.
427 */
428
429 for (new_updt = nbupdt; new_updt > old_updt; new_updt--) {
430 fd = updt_list[new_updt - 1];
431 if (!fdtab[fd].new)
432 continue;
433
434 fdtab[fd].new = 0;
435 fdtab[fd].ev &= FD_POLL_STICKY;
436
437 if ((fdtab[fd].spec_e & FD_EV_STATUS_R) == FD_EV_ACTIVE_R)
438 fdtab[fd].ev |= FD_POLL_IN;
439
440 if ((fdtab[fd].spec_e & FD_EV_STATUS_W) == FD_EV_ACTIVE_W)
441 fdtab[fd].ev |= FD_POLL_OUT;
442
443 if (fdtab[fd].ev && fdtab[fd].iocb && fdtab[fd].owner)
444 fdtab[fd].iocb(fd);
445
446 /* we can remove this update entry if it's the last one and is
447 * unused, otherwise we don't touch anything.
448 */
449 if (new_updt == nbupdt && fdtab[fd].spec_e == 0) {
450 fdtab[fd].updated = 0;
451 nbupdt--;
452 }
453 }
454 }
Willy Tarreaude99e992007-04-16 00:53:59 +0200455 }
Willy Tarreaucb651252008-08-29 13:57:30 +0200456
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200457 /* now process speculative events if any */
458
Willy Tarreau037d2c12012-11-06 02:34:46 +0100459 for (spec_idx = 0; spec_idx < nbspec; ) {
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200460 fd = spec_list[spec_idx];
Willy Tarreau037d2c12012-11-06 02:34:46 +0100461 eo = fdtab[fd].spec_e;
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200462
463 /*
464 * Process the speculative events.
465 *
Willy Tarreau037d2c12012-11-06 02:34:46 +0100466 * Principle: events which are marked FD_EV_ACTIVE are processed
467 * with their usual I/O callback. The callback may remove the
468 * events from the list or tag them for polling. Changes will be
469 * applied on next round.
Willy Tarreaucb651252008-08-29 13:57:30 +0200470 */
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200471
472 fdtab[fd].ev &= FD_POLL_STICKY;
Willy Tarreau037d2c12012-11-06 02:34:46 +0100473
474 if ((eo & FD_EV_STATUS_R) == FD_EV_ACTIVE_R)
Willy Tarreau5d526b72012-07-05 23:33:51 +0200475 fdtab[fd].ev |= FD_POLL_IN;
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200476
Willy Tarreau037d2c12012-11-06 02:34:46 +0100477 if ((eo & FD_EV_STATUS_W) == FD_EV_ACTIVE_W)
Willy Tarreau5d526b72012-07-05 23:33:51 +0200478 fdtab[fd].ev |= FD_POLL_OUT;
Willy Tarreau9845e752012-07-06 11:44:28 +0200479
Willy Tarreau26f44d12012-08-17 23:55:05 +0200480 if (fdtab[fd].iocb && fdtab[fd].owner && fdtab[fd].ev)
481 fdtab[fd].iocb(fd);
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200482
Willy Tarreau037d2c12012-11-06 02:34:46 +0100483 /* if the fd was removed from the spec list, it has been
484 * replaced by the next one that we don't want to skip !
485 */
486 if (spec_idx < nbspec && spec_list[spec_idx] != fd)
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200487 continue;
488
Willy Tarreau037d2c12012-11-06 02:34:46 +0100489 spec_idx++;
Willy Tarreaucb651252008-08-29 13:57:30 +0200490 }
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200491
Willy Tarreau037d2c12012-11-06 02:34:46 +0100492 in_poll_loop = 0;
Willy Tarreaudbcd47e2012-05-13 09:42:26 +0200493 /* in the end, we have processed status + spec_processed FDs */
Willy Tarreaude99e992007-04-16 00:53:59 +0200494}
495
496/*
497 * Initialization of the speculative epoll() poller.
498 * Returns 0 in case of failure, non-zero in case of success. If it fails, it
499 * disables the poller by setting its pref to 0.
500 */
501REGPRM1 static int _do_init(struct poller *p)
502{
Willy Tarreaub48b3232009-10-17 22:54:17 +0200503 __label__ fail_spec, fail_ee, fail_fd;
Willy Tarreaude99e992007-04-16 00:53:59 +0200504
505 p->private = NULL;
506
507 epoll_fd = epoll_create(global.maxsock + 1);
508 if (epoll_fd < 0)
509 goto fail_fd;
510
Willy Tarreauf2e8ee22008-05-25 10:39:02 +0200511 /* See comments at the top of the file about this formula. */
Willy Tarreau037d2c12012-11-06 02:34:46 +0100512 absmaxevents = MAX(global.tune.maxpollevents, global.maxsock);
Willy Tarreaude99e992007-04-16 00:53:59 +0200513 epoll_events = (struct epoll_event*)
Willy Tarreauf2e8ee22008-05-25 10:39:02 +0200514 calloc(1, sizeof(struct epoll_event) * absmaxevents);
Willy Tarreaude99e992007-04-16 00:53:59 +0200515
516 if (epoll_events == NULL)
517 goto fail_ee;
518
519 if ((spec_list = (uint32_t *)calloc(1, sizeof(uint32_t) * global.maxsock)) == NULL)
520 goto fail_spec;
521
Willy Tarreau037d2c12012-11-06 02:34:46 +0100522 if ((updt_list = (uint32_t *)calloc(1, sizeof(uint32_t) * global.maxsock)) == NULL)
523 goto fail_updt;
524
Willy Tarreaude99e992007-04-16 00:53:59 +0200525 return 1;
526
Willy Tarreau037d2c12012-11-06 02:34:46 +0100527 fail_updt:
528 free(spec_list);
Willy Tarreaude99e992007-04-16 00:53:59 +0200529 fail_spec:
530 free(epoll_events);
531 fail_ee:
532 close(epoll_fd);
Willy Tarreaud79e79b2009-05-10 10:18:54 +0200533 epoll_fd = -1;
Willy Tarreaude99e992007-04-16 00:53:59 +0200534 fail_fd:
535 p->pref = 0;
536 return 0;
537}
538
539/*
540 * Termination of the speculative epoll() poller.
541 * Memory is released and the poller is marked as unselectable.
542 */
543REGPRM1 static void _do_term(struct poller *p)
544{
Willy Tarreau037d2c12012-11-06 02:34:46 +0100545 free(updt_list);
Willy Tarreaua534fea2008-08-03 12:19:50 +0200546 free(spec_list);
547 free(epoll_events);
Willy Tarreaude99e992007-04-16 00:53:59 +0200548
Willy Tarreaud79e79b2009-05-10 10:18:54 +0200549 if (epoll_fd >= 0) {
550 close(epoll_fd);
551 epoll_fd = -1;
552 }
Willy Tarreaude99e992007-04-16 00:53:59 +0200553
Willy Tarreau037d2c12012-11-06 02:34:46 +0100554 updt_list = NULL;
Willy Tarreaude99e992007-04-16 00:53:59 +0200555 spec_list = NULL;
556 epoll_events = NULL;
557
558 p->private = NULL;
559 p->pref = 0;
560}
561
562/*
563 * Check that the poller works.
564 * Returns 1 if OK, otherwise 0.
565 */
566REGPRM1 static int _do_test(struct poller *p)
567{
568 int fd;
569
570 fd = epoll_create(global.maxsock + 1);
571 if (fd < 0)
572 return 0;
573 close(fd);
574 return 1;
575}
576
577/*
Willy Tarreaufb8983f2007-06-03 16:40:44 +0200578 * Recreate the epoll file descriptor after a fork(). Returns 1 if OK,
579 * otherwise 0. It will ensure that all processes will not share their
580 * epoll_fd. Some side effects were encountered because of this, such
581 * as epoll_wait() returning an FD which was previously deleted.
582 */
583REGPRM1 static int _do_fork(struct poller *p)
584{
Willy Tarreaud79e79b2009-05-10 10:18:54 +0200585 if (epoll_fd >= 0)
586 close(epoll_fd);
Willy Tarreaufb8983f2007-06-03 16:40:44 +0200587 epoll_fd = epoll_create(global.maxsock + 1);
588 if (epoll_fd < 0)
589 return 0;
590 return 1;
591}
592
593/*
Willy Tarreaude99e992007-04-16 00:53:59 +0200594 * It is a constructor, which means that it will automatically be called before
595 * main(). This is GCC-specific but it works at least since 2.95.
596 * Special care must be taken so that it does not need any uninitialized data.
597 */
598__attribute__((constructor))
599static void _do_register(void)
600{
601 struct poller *p;
602
603 if (nbpollers >= MAX_POLLERS)
604 return;
Willy Tarreaud79e79b2009-05-10 10:18:54 +0200605
606 epoll_fd = -1;
Willy Tarreaude99e992007-04-16 00:53:59 +0200607 p = &pollers[nbpollers++];
608
609 p->name = "sepoll";
610 p->pref = 400;
611 p->private = NULL;
612
613 p->test = _do_test;
614 p->init = _do_init;
615 p->term = _do_term;
616 p->poll = _do_poll;
Willy Tarreaufb8983f2007-06-03 16:40:44 +0200617 p->fork = _do_fork;
Willy Tarreaude99e992007-04-16 00:53:59 +0200618
619 p->is_set = __fd_is_set;
Willy Tarreau3788e4c2012-07-30 14:29:35 +0200620 p->set = __fd_set;
Willy Tarreaubabd05a2012-08-09 12:14:03 +0200621 p->wai = __fd_wai;
Willy Tarreau3788e4c2012-07-30 14:29:35 +0200622 p->clr = __fd_clr;
Willy Tarreaude99e992007-04-16 00:53:59 +0200623 p->rem = __fd_rem;
624 p->clo = __fd_clo;
625}
626
627
628/*
629 * Local variables:
630 * c-indent-level: 8
631 * c-basic-offset: 8
632 * End:
633 */