/*
 * FD polling functions for linux epoll()
 *
 * Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */
12
13#include <unistd.h>
14#include <sys/time.h>
15#include <sys/types.h>
16
17#include <common/compat.h>
18#include <common/config.h>
19#include <common/time.h>
20
21#include <types/fd.h>
22#include <types/global.h>
23
24#include <proto/fd.h>
25#include <proto/polling.h>
26#include <proto/task.h>
27
28#if defined(USE_MY_EPOLL)
29#include <errno.h>
30#include <sys/syscall.h>
31_syscall1 (int, epoll_create, int, size);
32_syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event);
33_syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);
34#endif
35
36
/* Events currently wanted on each FD, one set per direction. These are the
 * sets the __fd_*() helpers below read and modify.
 */
static fd_set *StaticReadEvent;
static fd_set *StaticWriteEvent;

/* Copy of the wanted events as they were last pushed to epoll by
 * epoll_poll(); the difference with the sets above tells it which FDs
 * need an epoll_ctl() call.
 */
static fd_set *PrevReadEvent;
static fd_set *PrevWriteEvent;

/* private data */
static struct epoll_event *epoll_events;	/* result buffer given to epoll_wait() */
static int epoll_fd;				/* descriptor returned by epoll_create() */
43
44
/*
 * Benchmarks performed on a Pentium-M notebook show that using functions
 * instead of the usual macros improves the FD_* performance by about 80%,
 * and that marking them regparm(2) adds another 20%.
 */
50REGPRM2 static int __fd_isset(const int fd, const int dir)
51{
52 fd_set *ev;
53 if (dir == DIR_RD)
54 ev = StaticReadEvent;
55 else
56 ev = StaticWriteEvent;
57
58 return FD_ISSET(fd, ev);
59}
60
61REGPRM2 static void __fd_set(const int fd, const int dir)
62{
63 fd_set *ev;
64 if (dir == DIR_RD)
65 ev = StaticReadEvent;
66 else
67 ev = StaticWriteEvent;
68
69 FD_SET(fd, ev);
70}
71
72REGPRM2 static void __fd_clr(const int fd, const int dir)
73{
74 fd_set *ev;
75 if (dir == DIR_RD)
76 ev = StaticReadEvent;
77 else
78 ev = StaticWriteEvent;
79
80 FD_CLR(fd, ev);
81}
82
83REGPRM2 static int __fd_cond_s(const int fd, const int dir)
84{
85 int ret;
86 fd_set *ev;
87 if (dir == DIR_RD)
88 ev = StaticReadEvent;
89 else
90 ev = StaticWriteEvent;
91
92 ret = !FD_ISSET(fd, ev);
93 if (ret)
94 FD_SET(fd, ev);
95 return ret;
96}
97
98REGPRM2 static int __fd_cond_c(const int fd, const int dir)
99{
100 int ret;
101 fd_set *ev;
102 if (dir == DIR_RD)
103 ev = StaticReadEvent;
104 else
105 ev = StaticWriteEvent;
106
107 ret = FD_ISSET(fd, ev);
108 if (ret)
109 FD_CLR(fd, ev);
110 return ret;
111}
112
113REGPRM1 static void __fd_rem(const int fd)
114{
115 FD_CLR(fd, StaticReadEvent);
116 FD_CLR(fd, StaticWriteEvent);
117}
118
119REGPRM1 static void __fd_clo(const int fd)
120{
121 FD_CLR(fd, StaticReadEvent);
122 FD_CLR(fd, StaticWriteEvent);
123 FD_CLR(fd, PrevReadEvent);
124 FD_CLR(fd, PrevWriteEvent);
125}
126
127
128
129/*
130 * Initialization of the epoll() poller.
131 * Returns 0 in case of failure, non-zero in case of success. If it fails, it
132 * disables the poller by setting its pref to 0.
133 */
134REGPRM1 static int epoll_init(struct poller *p)
135{
136 __label__ fail_pwevt, fail_prevt, fail_swevt, fail_srevt, fail_ee, fail_fd;
137 int fd_set_bytes;
138
139 p->private = NULL;
140 fd_set_bytes = sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE;
141
142 epoll_fd = epoll_create(global.maxsock + 1);
143 if (epoll_fd < 0)
144 goto fail_fd;
145
146 epoll_events = (struct epoll_event*)
147 calloc(1, sizeof(struct epoll_event) * global.maxsock);
148
149 if (epoll_events == NULL)
150 goto fail_ee;
151
152 if ((PrevReadEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
153 goto fail_prevt;
154
155 if ((PrevWriteEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
156 goto fail_pwevt;
157
158 if ((StaticReadEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
159 goto fail_srevt;
160
161 if ((StaticWriteEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
162 goto fail_swevt;
163
164 return 1;
165
166 fail_swevt:
167 free(StaticReadEvent);
168 fail_srevt:
169 free(PrevWriteEvent);
170 fail_pwevt:
171 free(PrevReadEvent);
172 fail_prevt:
173 free(epoll_events);
174 fail_ee:
175 close(epoll_fd);
176 epoll_fd = 0;
177 fail_fd:
178 p->pref = 0;
179 return 0;
180}
181
182/*
183 * Termination of the epoll() poller.
184 * Memory is released and the poller is marked as unselectable.
185 */
186REGPRM1 static void epoll_term(struct poller *p)
187{
188 if (StaticWriteEvent)
189 free(StaticWriteEvent);
190
191 if (StaticReadEvent)
192 free(StaticReadEvent);
193
194 if (PrevWriteEvent)
195 free(PrevWriteEvent);
196
197 if (PrevReadEvent)
198 free(PrevReadEvent);
199
200 if (epoll_events)
201 free(epoll_events);
202
203 close(epoll_fd);
204 epoll_fd = 0;
205
206 p->private = NULL;
207 p->pref = 0;
208}
209
210/*
211 * epoll() poller
212 */
213REGPRM2 static void epoll_poll(struct poller *p, int wait_time)
214{
215 int status;
216 int fd;
217
218 int fds, count;
219 int pr, pw, sr, sw;
220 unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
221 struct epoll_event ev;
222
223 for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
224
225 rn = ((int*)StaticReadEvent)[fds]; ro = ((int*)PrevReadEvent)[fds];
226 wn = ((int*)StaticWriteEvent)[fds]; wo = ((int*)PrevWriteEvent)[fds];
227
228 if ((ro^rn) | (wo^wn)) {
229 for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
230#define FDSETS_ARE_INT_ALIGNED
231#ifdef FDSETS_ARE_INT_ALIGNED
232
233#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
234#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
235 pr = (ro >> count) & 1;
236 pw = (wo >> count) & 1;
237 sr = (rn >> count) & 1;
238 sw = (wn >> count) & 1;
239#else
240 pr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&ro);
241 pw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wo);
242 sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
243 sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
244#endif
245#else
246 pr = FD_ISSET(fd, PrevReadEvent);
247 pw = FD_ISSET(fd, PrevWriteEvent);
248 sr = FD_ISSET(fd, StaticReadEvent);
249 sw = FD_ISSET(fd, StaticWriteEvent);
250#endif
251 if (!((sr^pr) | (sw^pw)))
252 continue;
253
254 ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
255 ev.data.fd = fd;
256
257#ifdef EPOLL_CTL_MOD_WORKAROUND
258 /* I encountered a rarely reproducible problem with
259 * EPOLL_CTL_MOD where a modified FD (systematically
260 * the one in epoll_events[0], fd#7) would sometimes
261 * be set EPOLL_OUT while asked for a read ! This is
262 * with the 2.4 epoll patch. The workaround is to
263 * delete then recreate in case of modification.
264 * This is in 2.4 up to epoll-lt-0.21 but not in 2.6
265 * nor RHEL kernels.
266 */
267
268 if ((pr | pw) && fdtab[fd].state != FD_STCLOSE)
269 epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev);
270
271 if ((sr | sw))
272 epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev);
273#else
274 if ((pr | pw)) {
275 /* the file-descriptor already exists... */
276 if ((sr | sw)) {
277 /* ...and it will still exist */
278 if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
279 // perror("epoll_ctl(MOD)");
280 // exit(1);
281 }
282 } else {
283 /* ...and it will be removed */
284 if (fdtab[fd].state != FD_STCLOSE &&
285 epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
286 // perror("epoll_ctl(DEL)");
287 // exit(1);
288 }
289 }
290 } else {
291 /* the file-descriptor did not exist, let's add it */
292 if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
293 // perror("epoll_ctl(ADD)");
294 // exit(1);
295 }
296 }
297#endif // EPOLL_CTL_MOD_WORKAROUND
298 }
299 ((int*)PrevReadEvent)[fds] = rn;
300 ((int*)PrevWriteEvent)[fds] = wn;
301 }
302 }
303
304 /* now let's wait for events */
305 status = epoll_wait(epoll_fd, epoll_events, maxfd, wait_time);
306 tv_now(&now);
307
308 for (count = 0; count < status; count++) {
309 fd = epoll_events[count].data.fd;
310
311 if (FD_ISSET(fd, StaticReadEvent)) {
312 if (fdtab[fd].state == FD_STCLOSE)
313 continue;
314 if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
315 fdtab[fd].cb[DIR_RD].f(fd);
316 }
317
318 if (FD_ISSET(fd, StaticWriteEvent)) {
319 if (fdtab[fd].state == FD_STCLOSE)
320 continue;
321 if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
322 fdtab[fd].cb[DIR_WR].f(fd);
323 }
324 }
325}
326
327/*
328 * The only exported function. Returns 1.
329 */
330int epoll_register(struct poller *p)
331{
332 p->name = "epoll";
333 p->pref = 300;
334 p->private = NULL;
335
336 p->init = epoll_init;
337 p->term = epoll_term;
338 p->poll = epoll_poll;
339 p->isset = __fd_isset;
340 p->set = __fd_set;
341 p->clr = __fd_clr;
342 p->rem = __fd_rem;
343 p->clo = __fd_clo;
344 p->cond_s = __fd_cond_s;
345 p->cond_c = __fd_cond_c;
346 return 1;
347}
348
349
350/*
351 * Local variables:
352 * c-indent-level: 8
353 * c-basic-offset: 8
354 * End:
355 */