blob: 2f686519645a05abd679fb77f94cd4cdf507ab83 [file] [log] [blame]
Willy Tarreaubaaee002006-06-26 02:48:02 +02001/*
2 * File descriptors management functions.
3 *
4 * Copyright 2000-2006 Willy Tarreau <w@1wt.eu>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13/*
14 * FIXME:
15 * - we still use 'listeners' to check whether we want to stop or not.
16 * - the various pollers should be moved to other external files, possibly
17 * dynamic libs.
Willy Tarreaubaaee002006-06-26 02:48:02 +020018 */
19
20#include <unistd.h>
21#include <sys/time.h>
22#include <sys/types.h>
23
Willy Tarreau2dd0d472006-06-29 17:53:05 +020024#include <common/compat.h>
25#include <common/config.h>
26#include <common/time.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020027
28#include <types/fd.h>
29#include <types/global.h>
30
Willy Tarreau2a429502006-10-15 14:52:29 +020031#include <proto/fd.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020032#include <proto/polling.h>
33#include <proto/task.h>
34
struct fdtab *fdtab = NULL;     /* array of all the file descriptors */
int maxfd;                      /* # of the highest fd + 1 */
int totalconn;                  /* total # of terminated sessions */
int actconn;                    /* # of active sessions */

/* fd sets describing which descriptors the pollers must watch for reading
 * and for writing; each poller derives its own event list from them.
 */
fd_set *StaticReadEvent, *StaticWriteEvent;
int cfg_polling_mechanism = 0;  /* POLL_USE_{SELECT|POLL|EPOLL} */
42
43
44/******************************
45 * pollers
46 ******************************/
47
48
Willy Tarreau2a429502006-10-15 14:52:29 +020049#if !defined(CONFIG_HAP_INLINE_FD_SET)
50/*
 * Benchmarks performed on a Pentium-M notebook show that using functions
 * instead of the usual macros improves the FD_* performance by about 80%,
 * and that marking them regparm(2) adds another 20%.
54 */
/* Function form of FD_SET(): marks descriptor <fd> as present in set <ev>. */
void __attribute__((regparm(2))) my_fd_set(const int fd, fd_set *ev)
{
	fd_set *const set = ev;

	FD_SET(fd, set);
}
59
/* Function form of FD_CLR(): removes descriptor <fd> from set <ev>. */
void __attribute__((regparm(2))) my_fd_clr(const int fd, fd_set *ev)
{
	fd_set *const set = ev;

	FD_CLR(fd, set);
}
64
/* Function form of FD_ISSET(): returns non-zero when descriptor <fd> is
 * present in set <ev>, zero otherwise.
 */
int __attribute__((regparm(2))) my_fd_isset(const int fd, const fd_set *ev)
{
	const int present = FD_ISSET(fd, ev);

	return present;
}
69#endif
70
71
Willy Tarreaubaaee002006-06-26 02:48:02 +020072/*
73 * FIXME: this is dirty, but at the moment, there's no other solution to remove
74 * the old FDs from outside the loop. Perhaps we should export a global 'poll'
75 * structure with pointers to functions such as init_fd() and close_fd(), plus
76 * a private structure with several pointers to places such as below.
77 */
78
79#if defined(ENABLE_EPOLL)
/* Copies of StaticReadEvent / StaticWriteEvent as they were last handed to
 * epoll. epoll_loop() compares them with the current sets to find which
 * descriptors must be added, modified or removed, and fd_delete() clears a
 * closed descriptor from them. NULL until the epoll poller is initialized.
 */
fd_set *PrevReadEvent = NULL, *PrevWriteEvent = NULL;
81
82#if defined(USE_MY_EPOLL)
Willy Tarreau2dd0d472006-06-29 17:53:05 +020083#include <errno.h>
84#include <sys/syscall.h>
/* NOTE(review): presumably these expand to local stubs for the three epoll
 * system calls, for builds (USE_MY_EPOLL) whose C library does not provide
 * them -- confirm against the _syscallN macro definitions in use.
 */
_syscall1 (int, epoll_create, int, size);
_syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event);
_syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);
88#endif
89
90/*
91 * Main epoll() loop.
92 * does 3 actions :
93 * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
94 * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
95 * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
96 *
97 * returns 0 if initialization failed, !0 otherwise.
98 */
99
100int epoll_loop(int action)
101{
102 int next_time;
103 int status;
104 int fd;
105
106 int fds, count;
107 int pr, pw, sr, sw;
108 unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
109 struct epoll_event ev;
110
111 /* private data */
112 static struct epoll_event *epoll_events = NULL;
113 static int epoll_fd;
114
115 if (action == POLL_LOOP_ACTION_INIT) {
116 epoll_fd = epoll_create(global.maxsock + 1);
117 if (epoll_fd < 0)
118 return 0;
119 else {
120 epoll_events = (struct epoll_event*)
121 calloc(1, sizeof(struct epoll_event) * global.maxsock);
122 PrevReadEvent = (fd_set *)
123 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
124 PrevWriteEvent = (fd_set *)
125 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
126 }
127 return 1;
128 }
129 else if (action == POLL_LOOP_ACTION_CLEAN) {
130 if (PrevWriteEvent) free(PrevWriteEvent);
131 if (PrevReadEvent) free(PrevReadEvent);
132 if (epoll_events) free(epoll_events);
133 close(epoll_fd);
134 epoll_fd = 0;
135 return 1;
136 }
137
138 /* OK, it's POLL_LOOP_ACTION_RUN */
139
140 tv_now(&now);
141
142 while (1) {
143 next_time = process_runnable_tasks();
144
145 /* stop when there's no connection left and we don't allow them anymore */
146 if (!actconn && listeners == 0)
147 break;
148
149 for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
150
151 rn = ((int*)StaticReadEvent)[fds]; ro = ((int*)PrevReadEvent)[fds];
152 wn = ((int*)StaticWriteEvent)[fds]; wo = ((int*)PrevWriteEvent)[fds];
153
154 if ((ro^rn) | (wo^wn)) {
155 for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
156#define FDSETS_ARE_INT_ALIGNED
157#ifdef FDSETS_ARE_INT_ALIGNED
158
159#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
160#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
161 pr = (ro >> count) & 1;
162 pw = (wo >> count) & 1;
163 sr = (rn >> count) & 1;
164 sw = (wn >> count) & 1;
165#else
Willy Tarreau2a429502006-10-15 14:52:29 +0200166 pr = MY_FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&ro);
167 pw = MY_FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wo);
168 sr = MY_FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
169 sw = MY_FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200170#endif
171#else
Willy Tarreau2a429502006-10-15 14:52:29 +0200172 pr = MY_FD_ISSET(fd, PrevReadEvent);
173 pw = MY_FD_ISSET(fd, PrevWriteEvent);
174 sr = MY_FD_ISSET(fd, StaticReadEvent);
175 sw = MY_FD_ISSET(fd, StaticWriteEvent);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200176#endif
177 if (!((sr^pr) | (sw^pw)))
178 continue;
179
180 ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
181 ev.data.fd = fd;
182
183#ifdef EPOLL_CTL_MOD_WORKAROUND
184 /* I encountered a rarely reproducible problem with
185 * EPOLL_CTL_MOD where a modified FD (systematically
186 * the one in epoll_events[0], fd#7) would sometimes
187 * be set EPOLL_OUT while asked for a read ! This is
188 * with the 2.4 epoll patch. The workaround is to
189 * delete then recreate in case of modification.
190 * This is in 2.4 up to epoll-lt-0.21 but not in 2.6
191 * nor RHEL kernels.
192 */
193
194 if ((pr | pw) && fdtab[fd].state != FD_STCLOSE)
195 epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev);
196
197 if ((sr | sw))
198 epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev);
199#else
200 if ((pr | pw)) {
201 /* the file-descriptor already exists... */
202 if ((sr | sw)) {
203 /* ...and it will still exist */
204 if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
205 // perror("epoll_ctl(MOD)");
206 // exit(1);
207 }
208 } else {
209 /* ...and it will be removed */
210 if (fdtab[fd].state != FD_STCLOSE &&
211 epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
212 // perror("epoll_ctl(DEL)");
213 // exit(1);
214 }
215 }
216 } else {
217 /* the file-descriptor did not exist, let's add it */
218 if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
219 // perror("epoll_ctl(ADD)");
220 // exit(1);
221 }
222 }
223#endif // EPOLL_CTL_MOD_WORKAROUND
224 }
225 ((int*)PrevReadEvent)[fds] = rn;
226 ((int*)PrevWriteEvent)[fds] = wn;
227 }
228 }
229
230 /* now let's wait for events */
231 status = epoll_wait(epoll_fd, epoll_events, maxfd, next_time);
232 tv_now(&now);
233
234 for (count = 0; count < status; count++) {
235 fd = epoll_events[count].data.fd;
236
Willy Tarreau2a429502006-10-15 14:52:29 +0200237 if (MY_FD_ISSET(fd, StaticReadEvent)) {
Willy Tarreaubaaee002006-06-26 02:48:02 +0200238 if (fdtab[fd].state == FD_STCLOSE)
239 continue;
240 if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
Willy Tarreau54469402006-07-29 16:59:06 +0200241 fdtab[fd].cb[DIR_RD].f(fd);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200242 }
243
Willy Tarreau2a429502006-10-15 14:52:29 +0200244 if (MY_FD_ISSET(fd, StaticWriteEvent)) {
Willy Tarreaubaaee002006-06-26 02:48:02 +0200245 if (fdtab[fd].state == FD_STCLOSE)
246 continue;
247 if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
Willy Tarreau54469402006-07-29 16:59:06 +0200248 fdtab[fd].cb[DIR_WR].f(fd);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200249 }
250 }
251 }
252 return 1;
253}
254#endif
255
256
257
258#if defined(ENABLE_POLL)
259/*
260 * Main poll() loop.
261 * does 3 actions :
262 * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
263 * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
264 * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
265 *
266 * returns 0 if initialization failed, !0 otherwise.
267 */
268
269int poll_loop(int action)
270{
271 int next_time;
272 int status;
273 int fd, nbfd;
274
275 int fds, count;
276 int sr, sw;
277 unsigned rn, wn; /* read new, write new */
278
279 /* private data */
280 static struct pollfd *poll_events = NULL;
281
282 if (action == POLL_LOOP_ACTION_INIT) {
283 poll_events = (struct pollfd*)
284 calloc(1, sizeof(struct pollfd) * global.maxsock);
285 return 1;
286 }
287 else if (action == POLL_LOOP_ACTION_CLEAN) {
288 if (poll_events)
289 free(poll_events);
290 return 1;
291 }
292
293 /* OK, it's POLL_LOOP_ACTION_RUN */
294
295 tv_now(&now);
296
297 while (1) {
298 next_time = process_runnable_tasks();
299
300 /* stop when there's no connection left and we don't allow them anymore */
301 if (!actconn && listeners == 0)
302 break;
303
304 nbfd = 0;
305 for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
306
307 rn = ((int*)StaticReadEvent)[fds];
308 wn = ((int*)StaticWriteEvent)[fds];
309
310 if ((rn|wn)) {
311 for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
312#define FDSETS_ARE_INT_ALIGNED
313#ifdef FDSETS_ARE_INT_ALIGNED
314
315#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
316#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
317 sr = (rn >> count) & 1;
318 sw = (wn >> count) & 1;
319#else
Willy Tarreau2a429502006-10-15 14:52:29 +0200320 sr = MY_FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
321 sw = MY_FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200322#endif
323#else
Willy Tarreau2a429502006-10-15 14:52:29 +0200324 sr = MY_FD_ISSET(fd, StaticReadEvent);
325 sw = MY_FD_ISSET(fd, StaticWriteEvent);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200326#endif
327 if ((sr|sw)) {
328 poll_events[nbfd].fd = fd;
329 poll_events[nbfd].events = (sr ? POLLIN : 0) | (sw ? POLLOUT : 0);
330 nbfd++;
331 }
332 }
333 }
334 }
335
336 /* now let's wait for events */
337 status = poll(poll_events, nbfd, next_time);
338 tv_now(&now);
339
340 for (count = 0; status > 0 && count < nbfd; count++) {
341 fd = poll_events[count].fd;
342
343 if (!(poll_events[count].revents & ( POLLOUT | POLLIN | POLLERR | POLLHUP )))
344 continue;
345
346 /* ok, we found one active fd */
347 status--;
348
Willy Tarreau2a429502006-10-15 14:52:29 +0200349 if (MY_FD_ISSET(fd, StaticReadEvent)) {
Willy Tarreaubaaee002006-06-26 02:48:02 +0200350 if (fdtab[fd].state == FD_STCLOSE)
351 continue;
352 if (poll_events[count].revents & ( POLLIN | POLLERR | POLLHUP ))
Willy Tarreau54469402006-07-29 16:59:06 +0200353 fdtab[fd].cb[DIR_RD].f(fd);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200354 }
355
Willy Tarreau2a429502006-10-15 14:52:29 +0200356 if (MY_FD_ISSET(fd, StaticWriteEvent)) {
Willy Tarreaubaaee002006-06-26 02:48:02 +0200357 if (fdtab[fd].state == FD_STCLOSE)
358 continue;
359 if (poll_events[count].revents & ( POLLOUT | POLLERR | POLLHUP ))
Willy Tarreau54469402006-07-29 16:59:06 +0200360 fdtab[fd].cb[DIR_WR].f(fd);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200361 }
362 }
363 }
364 return 1;
365}
366#endif
367
368
369
370/*
371 * Main select() loop.
372 * does 3 actions :
373 * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
374 * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
375 * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
376 *
377 * returns 0 if initialization failed, !0 otherwise.
378 */
379
380
381int select_loop(int action)
382{
383 int next_time;
384 int status;
385 int fd,i;
386 struct timeval delta;
387 int readnotnull, writenotnull;
388 static fd_set *ReadEvent = NULL, *WriteEvent = NULL;
389
390 if (action == POLL_LOOP_ACTION_INIT) {
391 ReadEvent = (fd_set *)
392 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
393 WriteEvent = (fd_set *)
394 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
395 return 1;
396 }
397 else if (action == POLL_LOOP_ACTION_CLEAN) {
398 if (WriteEvent) free(WriteEvent);
399 if (ReadEvent) free(ReadEvent);
400 return 1;
401 }
402
403 /* OK, it's POLL_LOOP_ACTION_RUN */
404
405 tv_now(&now);
406
407 while (1) {
408 next_time = process_runnable_tasks();
409
410 /* stop when there's no connection left and we don't allow them anymore */
411 if (!actconn && listeners == 0)
412 break;
413
414 if (next_time > 0) { /* FIXME */
415 /* Convert to timeval */
416 /* to avoid eventual select loops due to timer precision */
417 next_time += SCHEDULER_RESOLUTION;
418 delta.tv_sec = next_time / 1000;
419 delta.tv_usec = (next_time % 1000) * 1000;
420 }
421 else if (next_time == 0) { /* allow select to return immediately when needed */
422 delta.tv_sec = delta.tv_usec = 0;
423 }
424
425
426 /* let's restore fdset state */
427
428 readnotnull = 0; writenotnull = 0;
429 for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) {
430 readnotnull |= (*(((int*)ReadEvent)+i) = *(((int*)StaticReadEvent)+i)) != 0;
431 writenotnull |= (*(((int*)WriteEvent)+i) = *(((int*)StaticWriteEvent)+i)) != 0;
432 }
433
434 // /* just a verification code, needs to be removed for performance */
435 // for (i=0; i<maxfd; i++) {
Willy Tarreau2a429502006-10-15 14:52:29 +0200436 // if (MY_FD_ISSET(i, ReadEvent) != MY_FD_ISSET(i, StaticReadEvent))
Willy Tarreaubaaee002006-06-26 02:48:02 +0200437 // abort();
Willy Tarreau2a429502006-10-15 14:52:29 +0200438 // if (MY_FD_ISSET(i, WriteEvent) != MY_FD_ISSET(i, StaticWriteEvent))
Willy Tarreaubaaee002006-06-26 02:48:02 +0200439 // abort();
440 //
441 // }
442
443 status = select(maxfd,
444 readnotnull ? ReadEvent : NULL,
445 writenotnull ? WriteEvent : NULL,
446 NULL,
447 (next_time >= 0) ? &delta : NULL);
448
449 /* this is an experiment on the separation of the select work */
450 // status = (readnotnull ? select(maxfd, ReadEvent, NULL, NULL, (next_time >= 0) ? &delta : NULL) : 0);
451 // status |= (writenotnull ? select(maxfd, NULL, WriteEvent, NULL, (next_time >= 0) ? &delta : NULL) : 0);
452
453 tv_now(&now);
454
455 if (status > 0) { /* must proceed with events */
456
457 int fds;
458 char count;
459
460 for (fds = 0; (fds << INTBITS) < maxfd; fds++)
461 if ((((int *)(ReadEvent))[fds] | ((int *)(WriteEvent))[fds]) != 0)
462 for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
463
464 /* if we specify read first, the accepts and zero reads will be
465 * seen first. Moreover, system buffers will be flushed faster.
466 */
Willy Tarreau2a429502006-10-15 14:52:29 +0200467 if (MY_FD_ISSET(fd, ReadEvent)) {
Willy Tarreaubaaee002006-06-26 02:48:02 +0200468 if (fdtab[fd].state == FD_STCLOSE)
469 continue;
Willy Tarreau54469402006-07-29 16:59:06 +0200470 fdtab[fd].cb[DIR_RD].f(fd);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200471 }
472
Willy Tarreau2a429502006-10-15 14:52:29 +0200473 if (MY_FD_ISSET(fd, WriteEvent)) {
Willy Tarreaubaaee002006-06-26 02:48:02 +0200474 if (fdtab[fd].state == FD_STCLOSE)
475 continue;
Willy Tarreau54469402006-07-29 16:59:06 +0200476 fdtab[fd].cb[DIR_WR].f(fd);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200477 }
478 }
479 }
480 else {
481 // fprintf(stderr,"select returned %d, maxfd=%d\n", status, maxfd);
482 }
483 }
484 return 1;
485}
486
487
488
489/*********************
490 * generic functions
491 *********************/
492
493
494/* Deletes an FD from the fdsets, and recomputes the maxfd limit.
495 * The file descriptor is also closed.
496 */
497void fd_delete(int fd)
498{
Willy Tarreau2a429502006-10-15 14:52:29 +0200499 MY_FD_CLR(fd, StaticReadEvent);
500 MY_FD_CLR(fd, StaticWriteEvent);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200501#if defined(ENABLE_EPOLL)
502 if (PrevReadEvent) {
Willy Tarreau2a429502006-10-15 14:52:29 +0200503 MY_FD_CLR(fd, PrevReadEvent);
504 MY_FD_CLR(fd, PrevWriteEvent);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200505 }
506#endif
507
508 close(fd);
509 fdtab[fd].state = FD_STCLOSE;
510
511 while ((maxfd-1 >= 0) && (fdtab[maxfd-1].state == FD_STCLOSE))
512 maxfd--;
513}
514
515
516/*
517 * Local variables:
518 * c-indent-level: 8
519 * c-basic-offset: 8
520 * End:
521 */