blob: 482ddc06ddd29838b0e42846a28c09e383f108d8 [file] [log] [blame]
/*
 * FD polling functions for linux epoll()
 *
 * Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */
12
13#include <unistd.h>
14#include <sys/time.h>
15#include <sys/types.h>
16
17#include <common/compat.h>
18#include <common/config.h>
19#include <common/time.h>
20
21#include <types/fd.h>
22#include <types/global.h>
23
24#include <proto/fd.h>
Willy Tarreau4f60f162007-04-08 16:39:58 +020025#include <proto/task.h>
26
27#if defined(USE_MY_EPOLL)
Willy Tarreau69801b82007-04-09 15:28:51 +020028#include <common/epoll.h>
Willy Tarreau4f60f162007-04-08 16:39:58 +020029#include <errno.h>
30#include <sys/syscall.h>
31_syscall1 (int, epoll_create, int, size);
32_syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event);
33_syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);
Willy Tarreau69801b82007-04-09 15:28:51 +020034#else
35#include <sys/epoll.h>
Willy Tarreau4f60f162007-04-08 16:39:58 +020036#endif
37
38
/*
 * Per-direction event sets, indexed by DIR_RD / DIR_WR. <fd_evts> holds the
 * events currently requested through the poller callbacks, while <old_evts>
 * keeps the state epoll_poll() last pushed to the kernel with epoll_ctl().
 */
static fd_set *fd_evts[2];
static fd_set *old_evts[2];

/* private data */
static struct epoll_event *epoll_events;	/* buffer filled by epoll_wait() */
static int epoll_fd;				/* descriptor from epoll_create() */
45
46
/*
 * Benchmarks performed on a Pentium-M notebook show that using functions
 * instead of the usual macros improves the FD_* performance by about 80%,
 * and that marking them regparm(2) adds another 20%.
 */
Willy Tarreau97129b52007-04-09 00:54:46 +020052REGPRM2 static int __fd_isset(const int fd, int dir)
Willy Tarreau4f60f162007-04-08 16:39:58 +020053{
Willy Tarreau28d86862007-04-08 17:42:27 +020054 return FD_ISSET(fd, fd_evts[dir]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020055}
56
Willy Tarreau97129b52007-04-09 00:54:46 +020057REGPRM2 static int __fd_set(const int fd, int dir)
Willy Tarreau4f60f162007-04-08 16:39:58 +020058{
Willy Tarreau28d86862007-04-08 17:42:27 +020059 FD_SET(fd, fd_evts[dir]);
Willy Tarreau97129b52007-04-09 00:54:46 +020060 return 0;
Willy Tarreau4f60f162007-04-08 16:39:58 +020061}
62
Willy Tarreau97129b52007-04-09 00:54:46 +020063REGPRM2 static int __fd_clr(const int fd, int dir)
Willy Tarreau4f60f162007-04-08 16:39:58 +020064{
Willy Tarreau28d86862007-04-08 17:42:27 +020065 FD_CLR(fd, fd_evts[dir]);
Willy Tarreau97129b52007-04-09 00:54:46 +020066 return 0;
Willy Tarreau4f60f162007-04-08 16:39:58 +020067}
68
Willy Tarreau97129b52007-04-09 00:54:46 +020069REGPRM2 static int __fd_cond_s(const int fd, int dir)
Willy Tarreau4f60f162007-04-08 16:39:58 +020070{
71 int ret;
Willy Tarreau28d86862007-04-08 17:42:27 +020072 ret = !FD_ISSET(fd, fd_evts[dir]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020073 if (ret)
Willy Tarreau28d86862007-04-08 17:42:27 +020074 FD_SET(fd, fd_evts[dir]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020075 return ret;
76}
77
Willy Tarreau97129b52007-04-09 00:54:46 +020078REGPRM2 static int __fd_cond_c(const int fd, int dir)
Willy Tarreau4f60f162007-04-08 16:39:58 +020079{
80 int ret;
Willy Tarreau28d86862007-04-08 17:42:27 +020081 ret = FD_ISSET(fd, fd_evts[dir]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020082 if (ret)
Willy Tarreau28d86862007-04-08 17:42:27 +020083 FD_CLR(fd, fd_evts[dir]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020084 return ret;
85}
86
87REGPRM1 static void __fd_rem(const int fd)
88{
Willy Tarreau28d86862007-04-08 17:42:27 +020089 FD_CLR(fd, fd_evts[DIR_RD]);
90 FD_CLR(fd, fd_evts[DIR_WR]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020091}
92
93REGPRM1 static void __fd_clo(const int fd)
94{
Willy Tarreau28d86862007-04-08 17:42:27 +020095 FD_CLR(fd, fd_evts[DIR_RD]);
96 FD_CLR(fd, fd_evts[DIR_WR]);
97 FD_CLR(fd, old_evts[DIR_RD]);
98 FD_CLR(fd, old_evts[DIR_WR]);
Willy Tarreau4f60f162007-04-08 16:39:58 +020099}
100
Willy Tarreau4f60f162007-04-08 16:39:58 +0200101/*
102 * epoll() poller
103 */
104REGPRM2 static void epoll_poll(struct poller *p, int wait_time)
105{
106 int status;
107 int fd;
108
109 int fds, count;
110 int pr, pw, sr, sw;
111 unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
112 struct epoll_event ev;
113
114 for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
115
Willy Tarreau28d86862007-04-08 17:42:27 +0200116 rn = ((int*)fd_evts[DIR_RD])[fds]; ro = ((int*)old_evts[DIR_RD])[fds];
117 wn = ((int*)fd_evts[DIR_WR])[fds]; wo = ((int*)old_evts[DIR_WR])[fds];
Willy Tarreau4f60f162007-04-08 16:39:58 +0200118
119 if ((ro^rn) | (wo^wn)) {
120 for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
121#define FDSETS_ARE_INT_ALIGNED
122#ifdef FDSETS_ARE_INT_ALIGNED
123
124#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
125#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
126 pr = (ro >> count) & 1;
127 pw = (wo >> count) & 1;
128 sr = (rn >> count) & 1;
129 sw = (wn >> count) & 1;
130#else
131 pr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&ro);
132 pw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wo);
133 sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
134 sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
135#endif
136#else
Willy Tarreau28d86862007-04-08 17:42:27 +0200137 pr = FD_ISSET(fd, old_evts[DIR_RD]);
138 pw = FD_ISSET(fd, old_evts[DIR_WR]);
139 sr = FD_ISSET(fd, fd_evts[DIR_RD]);
140 sw = FD_ISSET(fd, fd_evts[DIR_WR]);
Willy Tarreau4f60f162007-04-08 16:39:58 +0200141#endif
142 if (!((sr^pr) | (sw^pw)))
143 continue;
144
145 ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
146 ev.data.fd = fd;
147
148#ifdef EPOLL_CTL_MOD_WORKAROUND
149 /* I encountered a rarely reproducible problem with
150 * EPOLL_CTL_MOD where a modified FD (systematically
151 * the one in epoll_events[0], fd#7) would sometimes
152 * be set EPOLL_OUT while asked for a read ! This is
153 * with the 2.4 epoll patch. The workaround is to
154 * delete then recreate in case of modification.
155 * This is in 2.4 up to epoll-lt-0.21 but not in 2.6
156 * nor RHEL kernels.
157 */
158
159 if ((pr | pw) && fdtab[fd].state != FD_STCLOSE)
160 epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev);
161
162 if ((sr | sw))
163 epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev);
164#else
165 if ((pr | pw)) {
166 /* the file-descriptor already exists... */
167 if ((sr | sw)) {
168 /* ...and it will still exist */
169 if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
170 // perror("epoll_ctl(MOD)");
171 // exit(1);
172 }
173 } else {
174 /* ...and it will be removed */
175 if (fdtab[fd].state != FD_STCLOSE &&
176 epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
177 // perror("epoll_ctl(DEL)");
178 // exit(1);
179 }
180 }
181 } else {
182 /* the file-descriptor did not exist, let's add it */
183 if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
184 // perror("epoll_ctl(ADD)");
185 // exit(1);
186 }
187 }
188#endif // EPOLL_CTL_MOD_WORKAROUND
189 }
Willy Tarreau28d86862007-04-08 17:42:27 +0200190 ((int*)old_evts[DIR_RD])[fds] = rn;
191 ((int*)old_evts[DIR_WR])[fds] = wn;
Willy Tarreau4f60f162007-04-08 16:39:58 +0200192 }
193 }
194
195 /* now let's wait for events */
196 status = epoll_wait(epoll_fd, epoll_events, maxfd, wait_time);
197 tv_now(&now);
198
199 for (count = 0; count < status; count++) {
200 fd = epoll_events[count].data.fd;
201
Willy Tarreau28d86862007-04-08 17:42:27 +0200202 if (FD_ISSET(fd, fd_evts[DIR_RD])) {
Willy Tarreau4f60f162007-04-08 16:39:58 +0200203 if (fdtab[fd].state == FD_STCLOSE)
204 continue;
205 if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
206 fdtab[fd].cb[DIR_RD].f(fd);
207 }
208
Willy Tarreau28d86862007-04-08 17:42:27 +0200209 if (FD_ISSET(fd, fd_evts[DIR_WR])) {
Willy Tarreau4f60f162007-04-08 16:39:58 +0200210 if (fdtab[fd].state == FD_STCLOSE)
211 continue;
212 if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
213 fdtab[fd].cb[DIR_WR].f(fd);
214 }
215 }
216}
217
218/*
Willy Tarreaue54e9172007-04-09 09:23:31 +0200219 * Initialization of the epoll() poller.
220 * Returns 0 in case of failure, non-zero in case of success. If it fails, it
221 * disables the poller by setting its pref to 0.
222 */
223REGPRM1 static int epoll_init(struct poller *p)
224{
225 __label__ fail_pwevt, fail_prevt, fail_swevt, fail_srevt, fail_ee, fail_fd;
226 int fd_set_bytes;
227
228 p->private = NULL;
229 fd_set_bytes = sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE;
230
231 epoll_fd = epoll_create(global.maxsock + 1);
232 if (epoll_fd < 0)
233 goto fail_fd;
234
235 epoll_events = (struct epoll_event*)
236 calloc(1, sizeof(struct epoll_event) * global.maxsock);
237
238 if (epoll_events == NULL)
239 goto fail_ee;
240
241 if ((old_evts[DIR_RD] = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
242 goto fail_prevt;
243
244 if ((old_evts[DIR_WR] = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
245 goto fail_pwevt;
246
247 if ((fd_evts[DIR_RD] = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
248 goto fail_srevt;
249
250 if ((fd_evts[DIR_WR] = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
251 goto fail_swevt;
252
253 return 1;
254
255 fail_swevt:
256 free(fd_evts[DIR_RD]);
257 fail_srevt:
258 free(old_evts[DIR_WR]);
259 fail_pwevt:
260 free(old_evts[DIR_RD]);
261 fail_prevt:
262 free(epoll_events);
263 fail_ee:
264 close(epoll_fd);
265 epoll_fd = 0;
266 fail_fd:
267 p->pref = 0;
268 return 0;
269}
270
271/*
272 * Termination of the epoll() poller.
273 * Memory is released and the poller is marked as unselectable.
274 */
275REGPRM1 static void epoll_term(struct poller *p)
276{
277 if (fd_evts[DIR_WR])
278 free(fd_evts[DIR_WR]);
279
280 if (fd_evts[DIR_RD])
281 free(fd_evts[DIR_RD]);
282
283 if (old_evts[DIR_WR])
284 free(old_evts[DIR_WR]);
285
286 if (old_evts[DIR_RD])
287 free(old_evts[DIR_RD]);
288
289 if (epoll_events)
290 free(epoll_events);
291
292 close(epoll_fd);
293 epoll_fd = 0;
294
295 p->private = NULL;
296 p->pref = 0;
297}
298
299/*
Willy Tarreau4f60f162007-04-08 16:39:58 +0200300 * The only exported function. Returns 1.
301 */
302int epoll_register(struct poller *p)
303{
304 p->name = "epoll";
305 p->pref = 300;
306 p->private = NULL;
307
308 p->init = epoll_init;
309 p->term = epoll_term;
310 p->poll = epoll_poll;
311 p->isset = __fd_isset;
312 p->set = __fd_set;
313 p->clr = __fd_clr;
314 p->rem = __fd_rem;
315 p->clo = __fd_clo;
316 p->cond_s = __fd_cond_s;
317 p->cond_c = __fd_cond_c;
318 return 1;
319}
320
321
322/*
323 * Local variables:
324 * c-indent-level: 8
325 * c-basic-offset: 8
326 * End:
327 */