/*
 * FD polling functions for Speculative I/O combined with Linux epoll()
 *
 * Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */

#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>

#include <common/compat.h>
#include <common/config.h>
#include <common/debug.h>
#include <common/standard.h>
#include <common/time.h>
#include <common/tools.h>

#include <types/fd.h>
#include <types/global.h>

#include <proto/fd.h>
#include <proto/task.h>

#if defined(USE_MY_EPOLL)
#include <common/epoll.h>
#include <errno.h>
#include <sys/syscall.h>
static _syscall1 (int, epoll_create, int, size);
static _syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event);
static _syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);
#else
#include <sys/epoll.h>
#endif
/*
 * We define 4 states for each direction of a file descriptor, which we store
 * as 2 bits :
 *
 *  00 = IDLE : we're not interested in this event
 *  01 = SPEC : perform speculative I/O on this FD
 *  10 = WAIT : really wait for an availability event on this FD (poll)
 *  11 = STOP : was marked WAIT, but disabled. It can switch back to WAIT if
 *              the application changes its mind, otherwise disable FD polling
 *              and switch back to IDLE.
 *
 * Since we do not want to scan the whole FD list to find speculative I/O
 * events, we store them in a separate list, which for now is a linear array
 * holding only the FD indexes.
 *
 * The STOP state requires the event to be present in the spec list so that
 * it can be detected and flushed upon next scan without having to scan the
 * whole FD list.
 *
 * This translates as follows :
 *
 *   EVENT_IN_SPEC_LIST = 01
 *   EVENT_IN_POLL_LIST = 10
 *
 *   IDLE = 0
 *   SPEC = (EVENT_IN_SPEC_LIST)
 *   WAIT = (EVENT_IN_POLL_LIST)
 *   STOP = (EVENT_IN_SPEC_LIST|EVENT_IN_POLL_LIST)
 *
 * fd_is_set() simply checks that the status is 01 or 10.
 *
 * For efficiency reasons, we will store the Read and Write bits interlaced to
 * form a 4-bit field, so that we can simply shift the value right by 0/1 and
 * get what we want :
 *    3  2  1  0
 *   Wp Rp Ws Rs
 *
 * The FD array has to hold a back reference to the speculative list. This
 * reference is only valid if at least one of the directions is marked SPEC.
 *
 */

#define FD_EV_IN_SL	1
#define FD_EV_IN_PL	4

#define FD_EV_IDLE	0
#define FD_EV_SPEC	(FD_EV_IN_SL)
#define FD_EV_WAIT	(FD_EV_IN_PL)
#define FD_EV_STOP	(FD_EV_IN_SL|FD_EV_IN_PL)

/* Those match any of R or W for Spec list or Poll list */
#define FD_EV_RW_SL	(FD_EV_IN_SL | (FD_EV_IN_SL << 1))
#define FD_EV_RW_PL	(FD_EV_IN_PL | (FD_EV_IN_PL << 1))
#define FD_EV_MASK_DIR	(FD_EV_IN_SL|FD_EV_IN_PL)

#define FD_EV_IDLE_R	0
#define FD_EV_SPEC_R	(FD_EV_IN_SL)
#define FD_EV_WAIT_R	(FD_EV_IN_PL)
#define FD_EV_STOP_R	(FD_EV_IN_SL|FD_EV_IN_PL)
#define FD_EV_MASK_R	(FD_EV_IN_SL|FD_EV_IN_PL)

#define FD_EV_IDLE_W	(FD_EV_IDLE_R << 1)
#define FD_EV_SPEC_W	(FD_EV_SPEC_R << 1)
#define FD_EV_WAIT_W	(FD_EV_WAIT_R << 1)
#define FD_EV_STOP_W	(FD_EV_STOP_R << 1)
#define FD_EV_MASK_W	(FD_EV_MASK_R << 1)

#define FD_EV_MASK	(FD_EV_MASK_W | FD_EV_MASK_R)

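/* Worked example (added note, assuming DIR_RD == 0 and DIR_WR == 1, which the
 * shifts by <dir> below imply) : an fd whose read side is SPEC and whose write
 * side is WAIT has e = FD_EV_SPEC_R | FD_EV_WAIT_W = 0x9 = Wp Rp Ws Rs = 1001.
 * Then (e >> DIR_RD) & FD_EV_MASK_DIR == FD_EV_SPEC and
 * (e >> DIR_WR) & FD_EV_MASK_DIR == FD_EV_WAIT.
 */
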
/* This is the minimum number of events successfully processed in speculative
 * mode above which we agree to return without checking epoll() (1/2 times).
 */
#define MIN_RETURN_EVENTS	25

/* descriptor of one FD.
 * FIXME: should be a bit field */
struct fd_status {
	unsigned int e:4;       // read and write events status.
	unsigned int s1:28;     // Position in spec list+1. 0=not in list. Should be last.
};

static int nbspec = 0;                          // current size of the spec list

static struct fd_status *fd_list = NULL;        // list of FDs
static unsigned int *spec_list = NULL;          // speculative I/O list
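/* Back-reference invariant (added note) : for any fd currently in the spec
 * list, spec_list[fd_list[fd].s1 - 1] == fd. The position is stored plus one
 * so that s1 == 0 can mean "not in the list".
 */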

/* private data */
static struct epoll_event *epoll_events;
static int epoll_fd;

/* This structure may be used for any purpose. Warning! do not use it in
 * recursive functions !
 */
static struct epoll_event ev;


REGPRM1 static void alloc_spec_entry(const int fd)
{
	if (fd_list[fd].s1)
		return;
	fd_list[fd].s1 = nbspec + 1;
	spec_list[nbspec] = fd;
	nbspec++;
}

/* Removes the entry used by fd <fd> from the spec list and replaces it with
 * the last one. The fd_list is adjusted to match the back reference if needed.
 * If the fd has no entry assigned, return immediately.
 */
REGPRM1 static void release_spec_entry(int fd)
{
	unsigned int pos;

	pos = fd_list[fd].s1;
	if (!pos)
		return;

	fd_list[fd].s1 = 0;
	pos--;
	/* we have spec_list[pos]==fd */

	nbspec--;
	if (pos == nbspec)
		return;

	/* we replace the current FD by the last one, which may sometimes be the same */
	fd = spec_list[nbspec];
	fd_list[fd].s1 = pos + 1;
	spec_list[pos] = fd;
}

/*
 * Returns non-zero if <fd> is already monitored for events in direction <dir>.
 */
REGPRM2 static int __fd_is_set(const int fd, int dir)
{
	int ret;

	ret = ((unsigned)fd_list[fd].e >> dir) & FD_EV_MASK_DIR;
	return (ret == FD_EV_SPEC || ret == FD_EV_WAIT);
}

/*
 * Don't worry about the strange constructs in __fd_set/__fd_clr, they are
 * designed like this in order to reduce the number of jumps (verified).
 */
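/* Note (added) : the single XOR with (FD_EV_IN_SL << dir) used below performs
 * both transitions at once. In __fd_set it turns IDLE (00) into SPEC (01) and
 * STOP (11) into WAIT (10) ; in __fd_clr it turns SPEC (01) into IDLE (00)
 * and WAIT (10) into STOP (11).
 */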
REGPRM2 static int __fd_set(const int fd, int dir)
{
	__label__ switch_state;
	unsigned int i;

	i = ((unsigned)fd_list[fd].e >> dir) & FD_EV_MASK_DIR;

	if (i == FD_EV_IDLE) {
		// switch to SPEC state and allocate a SPEC entry.
		alloc_spec_entry(fd);
	switch_state:
		fd_list[fd].e ^= (unsigned int)(FD_EV_IN_SL << dir);
		return 1;
	}
	else if (i == FD_EV_STOP) {
		// switch to WAIT state
		goto switch_state;
	}
	else
		return 0;
}

REGPRM2 static int __fd_clr(const int fd, int dir)
{
	__label__ switch_state;
	unsigned int i;

	i = ((unsigned)fd_list[fd].e >> dir) & FD_EV_MASK_DIR;

	if (i == FD_EV_SPEC) {
		// switch to IDLE state
		goto switch_state;
	}
	else if (likely(i == FD_EV_WAIT)) {
		// switch to STOP state
		/* We will create a queue entry for this one because we want to
		 * process it later in order to merge it with other events on
		 * the same FD.
		 */
		alloc_spec_entry(fd);
	switch_state:
		fd_list[fd].e ^= (unsigned int)(FD_EV_IN_SL << dir);
		return 1;
	}
	return 0;
}

/* normally unused */
REGPRM1 static void __fd_rem(int fd)
{
	__fd_clr(fd, DIR_RD);
	__fd_clr(fd, DIR_WR);
}

/*
 * On valid epoll() implementations, a call to close() automatically removes
 * the fds. This means that the FD will appear as previously unset.
 */
REGPRM1 static void __fd_clo(int fd)
{
	if (fd_list[fd].e & FD_EV_RW_SL)
		release_spec_entry(fd);
	fd_list[fd].e &= ~(FD_EV_MASK);
}

/*
 * speculative epoll() poller
 */
REGPRM2 static void _do_poll(struct poller *p, struct timeval *exp)
{
	static unsigned int last_skipped;
	int status, eo;
	int fd, opcode;
	int count;
	int spec_idx;
	int wait_time;


	/* Here we have two options :
	 * - either walk the list forwards and hope to match more events
	 * - or walk it backwards to minimize the number of changes and
	 *   to make better use of the cache.
	 * Tests have shown that walking backwards improves perf by 0.2%.
	 */

	status = 0;
	spec_idx = nbspec;
	while (likely(spec_idx > 0)) {
		spec_idx--;
		fd = spec_list[spec_idx];
		eo = fd_list[fd].e;  /* save old events */

		/*
		 * Process the speculative events.
		 *
		 * Principle: events which are marked FD_EV_SPEC are processed
		 * with their assigned function. If the function returns 0, it
		 * means there is nothing doable without polling first. We then
		 * convert the event to a pollable one by assigning it the WAIT
		 * status.
		 */

		fdtab[fd].ev &= FD_POLL_STICKY;
		if ((eo & FD_EV_MASK_R) == FD_EV_SPEC_R) {
			/* The owner is interested in reading from this FD */
			if (fdtab[fd].state != FD_STCLOSE && fdtab[fd].state != FD_STERROR) {
				/* Pretend there is something to read */
				fdtab[fd].ev |= FD_POLL_IN;
				if (!fdtab[fd].cb[DIR_RD].f(fd))
					fd_list[fd].e ^= (FD_EV_WAIT_R ^ FD_EV_SPEC_R);
				else
					status++;
			}
		}
		else if ((eo & FD_EV_MASK_R) == FD_EV_STOP_R) {
			/* This FD was being polled and is now being removed. */
			fd_list[fd].e &= ~FD_EV_MASK_R;
		}

		if ((eo & FD_EV_MASK_W) == FD_EV_SPEC_W) {
			/* The owner is interested in writing to this FD */
			if (fdtab[fd].state != FD_STCLOSE && fdtab[fd].state != FD_STERROR) {
				/* Pretend there is something to write */
				fdtab[fd].ev |= FD_POLL_OUT;
				if (!fdtab[fd].cb[DIR_WR].f(fd))
					fd_list[fd].e ^= (FD_EV_WAIT_W ^ FD_EV_SPEC_W);
				else
					status++;
			}
		}
		else if ((eo & FD_EV_MASK_W) == FD_EV_STOP_W) {
			/* This FD was being polled and is now being removed. */
			fd_list[fd].e &= ~FD_EV_MASK_W;
		}

		/* Now, we will adjust the event in the poll list. Indeed, it
		 * is possible that an event which was previously in the poll
		 * list now goes out, and the opposite is possible too. We can
		 * have opposite changes for READ and WRITE too.
		 */

		if ((eo ^ fd_list[fd].e) & FD_EV_RW_PL) {
			/* poll status changed */
			if ((fd_list[fd].e & FD_EV_RW_PL) == 0) {
				/* fd removed from poll list */
				opcode = EPOLL_CTL_DEL;
			}
			else if ((eo & FD_EV_RW_PL) == 0) {
				/* new fd in the poll list */
				opcode = EPOLL_CTL_ADD;
			}
			else {
				/* fd status changed */
				opcode = EPOLL_CTL_MOD;
			}

			/* construct the epoll events based on new state */
			ev.events = 0;
			if (fd_list[fd].e & FD_EV_WAIT_R)
				ev.events |= EPOLLIN;

			if (fd_list[fd].e & FD_EV_WAIT_W)
				ev.events |= EPOLLOUT;

			ev.data.fd = fd;
			epoll_ctl(epoll_fd, opcode, fd, &ev);
		}


		if (!(fd_list[fd].e & FD_EV_RW_SL)) {
			/* This fd switched to combinations of either WAIT or
			 * IDLE. It must be removed from the spec list.
			 */
			release_spec_entry(fd);
			continue;
		}
	}

	/* It may make sense to immediately return here if there are enough
	 * processed events, without passing through epoll_wait() because we
	 * have effectively just performed a poll.
	 * Measures have shown a great performance increase if we call
	 * epoll_wait() only every other time after speculative accesses have
	 * succeeded. This reduces the number of unsuccessful calls to
	 * epoll_wait() by a factor of about 3, and the total number of calls
	 * by about 2.
	 */

	if (status >= MIN_RETURN_EVENTS) {
		/* We have processed at least MIN_RETURN_EVENTS, it's worth
		 * returning now without checking epoll_wait().
		 */
		if (++last_skipped <= 1) {
			tv_now(&now);
			return;
		}
	}
	last_skipped = 0;

	if (nbspec || status) {
		/* Maybe we have processed some events that we must report, or
		 * maybe we still have events in the spec list, so we must not
		 * wait in epoll() otherwise we will delay their delivery by
		 * the next timeout.
		 */
		wait_time = 0;
	}
	else {
		if (tv_iseternity(exp))
			wait_time = -1;
		else if (tv_isge(&now, exp))
			wait_time = 0;
		else
			wait_time = __tv_ms_elapsed(&now, exp) + 1;
	}

	/* now let's wait for real events */
	fd = MIN(maxfd, global.tune.maxpollevents);
	status = epoll_wait(epoll_fd, epoll_events, fd, wait_time);

	tv_now(&now);

	for (count = 0; count < status; count++) {
		int e = epoll_events[count].events;
		fd = epoll_events[count].data.fd;

		/* it looks complicated but gcc can optimize it away when constants
		 * have same values.
		 */
		DPRINTF(stderr, "%s:%d: fd=%d, ev=0x%08x, e=0x%08x\n",
			__FUNCTION__, __LINE__,
			fd, fdtab[fd].ev, e);

		fdtab[fd].ev &= FD_POLL_STICKY;
		fdtab[fd].ev |=
			((e & EPOLLIN ) ? FD_POLL_IN  : 0) |
			((e & EPOLLPRI) ? FD_POLL_PRI : 0) |
			((e & EPOLLOUT) ? FD_POLL_OUT : 0) |
			((e & EPOLLERR) ? FD_POLL_ERR : 0) |
			((e & EPOLLHUP) ? FD_POLL_HUP : 0);

		if ((fd_list[fd].e & FD_EV_MASK_R) == FD_EV_WAIT_R) {
			if (fdtab[fd].state == FD_STCLOSE || fdtab[fd].state == FD_STERROR)
				continue;
			if (fdtab[fd].ev & (FD_POLL_IN|FD_POLL_HUP|FD_POLL_ERR))
				fdtab[fd].cb[DIR_RD].f(fd);
		}

		if ((fd_list[fd].e & FD_EV_MASK_W) == FD_EV_WAIT_W) {
			if (fdtab[fd].state == FD_STCLOSE || fdtab[fd].state == FD_STERROR)
				continue;
			if (fdtab[fd].ev & (FD_POLL_OUT|FD_POLL_ERR))
				fdtab[fd].cb[DIR_WR].f(fd);
		}
	}
}

/*
 * Initialization of the speculative epoll() poller.
 * Returns 0 in case of failure, non-zero in case of success. If it fails, it
 * disables the poller by setting its pref to 0.
 */
REGPRM1 static int _do_init(struct poller *p)
{
	__label__ fail_fd_list, fail_spec, fail_ee, fail_fd;

	p->private = NULL;

	epoll_fd = epoll_create(global.maxsock + 1);
	if (epoll_fd < 0)
		goto fail_fd;

	epoll_events = (struct epoll_event*)
		calloc(1, sizeof(struct epoll_event) * global.tune.maxpollevents);

	if (epoll_events == NULL)
		goto fail_ee;

	if ((spec_list = (uint32_t *)calloc(1, sizeof(uint32_t) * global.maxsock)) == NULL)
		goto fail_spec;

	fd_list = (struct fd_status *)calloc(1, sizeof(struct fd_status) * global.maxsock);
	if (fd_list == NULL)
		goto fail_fd_list;

	return 1;

 fail_fd_list:
	free(spec_list);
 fail_spec:
	free(epoll_events);
 fail_ee:
	close(epoll_fd);
	epoll_fd = 0;
 fail_fd:
	p->pref = 0;
	return 0;
}

/*
 * Termination of the speculative epoll() poller.
 * Memory is released and the poller is marked as unselectable.
 */
REGPRM1 static void _do_term(struct poller *p)
{
	if (fd_list)
		free(fd_list);
	if (spec_list)
		free(spec_list);
	if (epoll_events)
		free(epoll_events);

	close(epoll_fd);
	epoll_fd = 0;

	fd_list = NULL;
	spec_list = NULL;
	epoll_events = NULL;

	p->private = NULL;
	p->pref = 0;
}

/*
 * Check that the poller works.
 * Returns 1 if OK, otherwise 0.
 */
REGPRM1 static int _do_test(struct poller *p)
{
	int fd;

	fd = epoll_create(global.maxsock + 1);
	if (fd < 0)
		return 0;
	close(fd);
	return 1;
}

/*
 * Recreate the epoll file descriptor after a fork(). Returns 1 if OK,
 * otherwise 0. It ensures that processes do not share their epoll_fd after
 * a fork. Some side effects were encountered when it was shared, such as
 * epoll_wait() returning an FD which was previously deleted.
 */
REGPRM1 static int _do_fork(struct poller *p)
{
	close(epoll_fd);
	epoll_fd = epoll_create(global.maxsock + 1);
	if (epoll_fd < 0)
		return 0;
	return 1;
}

/*
 * It is a constructor, which means that it will automatically be called before
 * main(). This is GCC-specific but it works at least since 2.95.
 * Special care must be taken so that it does not rely on any data which is
 * not yet initialized.
 */
__attribute__((constructor))
static void _do_register(void)
{
	struct poller *p;

	if (nbpollers >= MAX_POLLERS)
		return;
	p = &pollers[nbpollers++];

	p->name = "sepoll";
	p->pref = 400;
	p->private = NULL;

	p->test = _do_test;
	p->init = _do_init;
	p->term = _do_term;
	p->poll = _do_poll;
	p->fork = _do_fork;

	p->is_set = __fd_is_set;
	p->cond_s = p->set = __fd_set;
	p->cond_c = p->clr = __fd_clr;
	p->rem = __fd_rem;
	p->clo = __fd_clo;
}


/*
 * Local variables:
 *  c-indent-level: 8
 *  c-basic-offset: 8
 * End:
 */