blob: 65b3d74055a544d4f40e9646488787841d6d19be [file] [log] [blame]
Willy Tarreau4f60f162007-04-08 16:39:58 +02001/*
2 * FD polling functions for linux epoll()
3 *
4 * Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13#include <unistd.h>
14#include <sys/time.h>
15#include <sys/types.h>
16
17#include <common/compat.h>
18#include <common/config.h>
19#include <common/time.h>
20
21#include <types/fd.h>
22#include <types/global.h>
23
24#include <proto/fd.h>
25#include <proto/polling.h>
26#include <proto/task.h>
27
#ifdef USE_MY_EPOLL
/*
 * Builds that define USE_MY_EPOLL declare the three epoll entry points
 * themselves through the _syscallN() macros (presumably because the C
 * library does not provide them -- confirm against the build docs).
 */
#include <errno.h>
#include <sys/syscall.h>
_syscall1 (int, epoll_create, int, size);
_syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event);
_syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);
#endif /* USE_MY_EPOLL */
35
36
/* private data */

/* events currently requested for each FD, one bitmap per direction */
static fd_set *fd_evts[2];

/* copy of fd_evts[] as it was last synchronized with epoll by epoll_poll() */
static fd_set *old_evts[2];

/* buffer handed to epoll_wait() to receive the reported events */
static struct epoll_event *epoll_events;

/* descriptor returned by epoll_create() */
static int epoll_fd;
43
44
/*
 * Benchmarks performed on a Pentium-M notebook show that using functions
 * instead of the usual macros improves the FD_* performance by about 80%,
 * and that marking them regparm(2) adds another 20%.
 */
50REGPRM2 static int __fd_isset(const int fd, const int dir)
51{
Willy Tarreau28d86862007-04-08 17:42:27 +020052 return FD_ISSET(fd, fd_evts[dir]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020053}
54
55REGPRM2 static void __fd_set(const int fd, const int dir)
56{
Willy Tarreau28d86862007-04-08 17:42:27 +020057 FD_SET(fd, fd_evts[dir]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020058}
59
60REGPRM2 static void __fd_clr(const int fd, const int dir)
61{
Willy Tarreau28d86862007-04-08 17:42:27 +020062 FD_CLR(fd, fd_evts[dir]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020063}
64
65REGPRM2 static int __fd_cond_s(const int fd, const int dir)
66{
67 int ret;
Willy Tarreau28d86862007-04-08 17:42:27 +020068 ret = !FD_ISSET(fd, fd_evts[dir]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020069 if (ret)
Willy Tarreau28d86862007-04-08 17:42:27 +020070 FD_SET(fd, fd_evts[dir]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020071 return ret;
72}
73
74REGPRM2 static int __fd_cond_c(const int fd, const int dir)
75{
76 int ret;
Willy Tarreau28d86862007-04-08 17:42:27 +020077 ret = FD_ISSET(fd, fd_evts[dir]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020078 if (ret)
Willy Tarreau28d86862007-04-08 17:42:27 +020079 FD_CLR(fd, fd_evts[dir]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020080 return ret;
81}
82
83REGPRM1 static void __fd_rem(const int fd)
84{
Willy Tarreau28d86862007-04-08 17:42:27 +020085 FD_CLR(fd, fd_evts[DIR_RD]);
86 FD_CLR(fd, fd_evts[DIR_WR]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020087}
88
89REGPRM1 static void __fd_clo(const int fd)
90{
Willy Tarreau28d86862007-04-08 17:42:27 +020091 FD_CLR(fd, fd_evts[DIR_RD]);
92 FD_CLR(fd, fd_evts[DIR_WR]);
93 FD_CLR(fd, old_evts[DIR_RD]);
94 FD_CLR(fd, old_evts[DIR_WR]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020095}
96
97
98
99/*
100 * Initialization of the epoll() poller.
101 * Returns 0 in case of failure, non-zero in case of success. If it fails, it
102 * disables the poller by setting its pref to 0.
103 */
104REGPRM1 static int epoll_init(struct poller *p)
105{
106 __label__ fail_pwevt, fail_prevt, fail_swevt, fail_srevt, fail_ee, fail_fd;
107 int fd_set_bytes;
108
109 p->private = NULL;
110 fd_set_bytes = sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE;
111
112 epoll_fd = epoll_create(global.maxsock + 1);
113 if (epoll_fd < 0)
114 goto fail_fd;
115
116 epoll_events = (struct epoll_event*)
117 calloc(1, sizeof(struct epoll_event) * global.maxsock);
118
119 if (epoll_events == NULL)
120 goto fail_ee;
121
Willy Tarreau28d86862007-04-08 17:42:27 +0200122 if ((old_evts[DIR_RD] = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
Willy Tarreau4f60f162007-04-08 16:39:58 +0200123 goto fail_prevt;
124
Willy Tarreau28d86862007-04-08 17:42:27 +0200125 if ((old_evts[DIR_WR] = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
Willy Tarreau4f60f162007-04-08 16:39:58 +0200126 goto fail_pwevt;
127
Willy Tarreau28d86862007-04-08 17:42:27 +0200128 if ((fd_evts[DIR_RD] = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
Willy Tarreau4f60f162007-04-08 16:39:58 +0200129 goto fail_srevt;
130
Willy Tarreau28d86862007-04-08 17:42:27 +0200131 if ((fd_evts[DIR_WR] = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
Willy Tarreau4f60f162007-04-08 16:39:58 +0200132 goto fail_swevt;
133
134 return 1;
135
136 fail_swevt:
Willy Tarreau28d86862007-04-08 17:42:27 +0200137 free(fd_evts[DIR_RD]);
Willy Tarreau4f60f162007-04-08 16:39:58 +0200138 fail_srevt:
Willy Tarreau28d86862007-04-08 17:42:27 +0200139 free(old_evts[DIR_WR]);
Willy Tarreau4f60f162007-04-08 16:39:58 +0200140 fail_pwevt:
Willy Tarreau28d86862007-04-08 17:42:27 +0200141 free(old_evts[DIR_RD]);
Willy Tarreau4f60f162007-04-08 16:39:58 +0200142 fail_prevt:
143 free(epoll_events);
144 fail_ee:
145 close(epoll_fd);
146 epoll_fd = 0;
147 fail_fd:
148 p->pref = 0;
149 return 0;
150}
151
152/*
153 * Termination of the epoll() poller.
154 * Memory is released and the poller is marked as unselectable.
155 */
156REGPRM1 static void epoll_term(struct poller *p)
157{
Willy Tarreau28d86862007-04-08 17:42:27 +0200158 if (fd_evts[DIR_WR])
159 free(fd_evts[DIR_WR]);
Willy Tarreau4f60f162007-04-08 16:39:58 +0200160
Willy Tarreau28d86862007-04-08 17:42:27 +0200161 if (fd_evts[DIR_RD])
162 free(fd_evts[DIR_RD]);
Willy Tarreau4f60f162007-04-08 16:39:58 +0200163
Willy Tarreau28d86862007-04-08 17:42:27 +0200164 if (old_evts[DIR_WR])
165 free(old_evts[DIR_WR]);
Willy Tarreau4f60f162007-04-08 16:39:58 +0200166
Willy Tarreau28d86862007-04-08 17:42:27 +0200167 if (old_evts[DIR_RD])
168 free(old_evts[DIR_RD]);
Willy Tarreau4f60f162007-04-08 16:39:58 +0200169
170 if (epoll_events)
171 free(epoll_events);
172
173 close(epoll_fd);
174 epoll_fd = 0;
175
176 p->private = NULL;
177 p->pref = 0;
178}
179
180/*
181 * epoll() poller
182 */
183REGPRM2 static void epoll_poll(struct poller *p, int wait_time)
184{
185 int status;
186 int fd;
187
188 int fds, count;
189 int pr, pw, sr, sw;
190 unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
191 struct epoll_event ev;
192
193 for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
194
Willy Tarreau28d86862007-04-08 17:42:27 +0200195 rn = ((int*)fd_evts[DIR_RD])[fds]; ro = ((int*)old_evts[DIR_RD])[fds];
196 wn = ((int*)fd_evts[DIR_WR])[fds]; wo = ((int*)old_evts[DIR_WR])[fds];
Willy Tarreau4f60f162007-04-08 16:39:58 +0200197
198 if ((ro^rn) | (wo^wn)) {
199 for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
200#define FDSETS_ARE_INT_ALIGNED
201#ifdef FDSETS_ARE_INT_ALIGNED
202
203#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
204#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
205 pr = (ro >> count) & 1;
206 pw = (wo >> count) & 1;
207 sr = (rn >> count) & 1;
208 sw = (wn >> count) & 1;
209#else
210 pr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&ro);
211 pw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wo);
212 sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
213 sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
214#endif
215#else
Willy Tarreau28d86862007-04-08 17:42:27 +0200216 pr = FD_ISSET(fd, old_evts[DIR_RD]);
217 pw = FD_ISSET(fd, old_evts[DIR_WR]);
218 sr = FD_ISSET(fd, fd_evts[DIR_RD]);
219 sw = FD_ISSET(fd, fd_evts[DIR_WR]);
Willy Tarreau4f60f162007-04-08 16:39:58 +0200220#endif
221 if (!((sr^pr) | (sw^pw)))
222 continue;
223
224 ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
225 ev.data.fd = fd;
226
227#ifdef EPOLL_CTL_MOD_WORKAROUND
228 /* I encountered a rarely reproducible problem with
229 * EPOLL_CTL_MOD where a modified FD (systematically
230 * the one in epoll_events[0], fd#7) would sometimes
231 * be set EPOLL_OUT while asked for a read ! This is
232 * with the 2.4 epoll patch. The workaround is to
233 * delete then recreate in case of modification.
234 * This is in 2.4 up to epoll-lt-0.21 but not in 2.6
235 * nor RHEL kernels.
236 */
237
238 if ((pr | pw) && fdtab[fd].state != FD_STCLOSE)
239 epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev);
240
241 if ((sr | sw))
242 epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev);
243#else
244 if ((pr | pw)) {
245 /* the file-descriptor already exists... */
246 if ((sr | sw)) {
247 /* ...and it will still exist */
248 if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
249 // perror("epoll_ctl(MOD)");
250 // exit(1);
251 }
252 } else {
253 /* ...and it will be removed */
254 if (fdtab[fd].state != FD_STCLOSE &&
255 epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
256 // perror("epoll_ctl(DEL)");
257 // exit(1);
258 }
259 }
260 } else {
261 /* the file-descriptor did not exist, let's add it */
262 if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
263 // perror("epoll_ctl(ADD)");
264 // exit(1);
265 }
266 }
267#endif // EPOLL_CTL_MOD_WORKAROUND
268 }
Willy Tarreau28d86862007-04-08 17:42:27 +0200269 ((int*)old_evts[DIR_RD])[fds] = rn;
270 ((int*)old_evts[DIR_WR])[fds] = wn;
Willy Tarreau4f60f162007-04-08 16:39:58 +0200271 }
272 }
273
274 /* now let's wait for events */
275 status = epoll_wait(epoll_fd, epoll_events, maxfd, wait_time);
276 tv_now(&now);
277
278 for (count = 0; count < status; count++) {
279 fd = epoll_events[count].data.fd;
280
Willy Tarreau28d86862007-04-08 17:42:27 +0200281 if (FD_ISSET(fd, fd_evts[DIR_RD])) {
Willy Tarreau4f60f162007-04-08 16:39:58 +0200282 if (fdtab[fd].state == FD_STCLOSE)
283 continue;
284 if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
285 fdtab[fd].cb[DIR_RD].f(fd);
286 }
287
Willy Tarreau28d86862007-04-08 17:42:27 +0200288 if (FD_ISSET(fd, fd_evts[DIR_WR])) {
Willy Tarreau4f60f162007-04-08 16:39:58 +0200289 if (fdtab[fd].state == FD_STCLOSE)
290 continue;
291 if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
292 fdtab[fd].cb[DIR_WR].f(fd);
293 }
294 }
295}
296
297/*
298 * The only exported function. Returns 1.
299 */
300int epoll_register(struct poller *p)
301{
302 p->name = "epoll";
303 p->pref = 300;
304 p->private = NULL;
305
306 p->init = epoll_init;
307 p->term = epoll_term;
308 p->poll = epoll_poll;
309 p->isset = __fd_isset;
310 p->set = __fd_set;
311 p->clr = __fd_clr;
312 p->rem = __fd_rem;
313 p->clo = __fd_clo;
314 p->cond_s = __fd_cond_s;
315 p->cond_c = __fd_cond_c;
316 return 1;
317}
318
319
320/*
321 * Local variables:
322 * c-indent-level: 8
323 * c-basic-offset: 8
324 * End:
325 */