blob: 66d963a4cb170cb5e8c5b8e44418786a21ba4178 [file] [log] [blame]
Willy Tarreaubaaee002006-06-26 02:48:02 +02001/*
2 * File descriptors management functions.
3 *
4 * Copyright 2000-2006 Willy Tarreau <w@1wt.eu>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13/*
14 * FIXME:
15 * - we still use 'listeners' to check whether we want to stop or not.
16 * - the various pollers should be moved to other external files, possibly
17 * dynamic libs.
Willy Tarreaubaaee002006-06-26 02:48:02 +020018 */
19
20#include <unistd.h>
21#include <sys/time.h>
22#include <sys/types.h>
23
Willy Tarreau2dd0d472006-06-29 17:53:05 +020024#include <common/compat.h>
25#include <common/config.h>
26#include <common/time.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020027
28#include <types/fd.h>
29#include <types/global.h>
30
31#include <proto/polling.h>
32#include <proto/task.h>
33
/* Global file-descriptor state shared by all the pollers below. */
struct fdtab *fdtab = NULL;     /* array of all the file descriptors, indexed by fd */
int maxfd;                      /* # of the highest fd + 1; lowered by fd_delete() */
int totalconn;                  /* total # of terminated sessions */
int actconn;                    /* # of active sessions */

/* fd sets holding, for each fd, whether read (resp. write) events are
 * currently wanted. The pollers read them on every loop iteration and
 * fd_delete() clears the bits of the fd it closes.
 */
fd_set *StaticReadEvent, *StaticWriteEvent;
int cfg_polling_mechanism = 0;  /* POLL_USE_{SELECT|POLL|EPOLL} */
41
42
43/******************************
44 * pollers
45 ******************************/
46
47
48/*
49 * FIXME: this is dirty, but at the moment, there's no other solution to remove
50 * the old FDs from outside the loop. Perhaps we should export a global 'poll'
51 * structure with pointers to functions such as init_fd() and close_fd(), plus
52 * a private structure with several pointers to places such as below.
53 */
54
55#if defined(ENABLE_EPOLL)
/* fd sets describing what was last registered with epoll for each fd.
 * epoll_loop() compares them against StaticReadEvent/StaticWriteEvent to
 * find the fds whose registration must change, and fd_delete() clears the
 * bits of the fd it closes.
 */
fd_set *PrevReadEvent = NULL, *PrevWriteEvent = NULL;
57
58#if defined(USE_MY_EPOLL)
Willy Tarreau2dd0d472006-06-29 17:53:05 +020059#include <errno.h>
60#include <sys/syscall.h>
/* Private syscall stubs for the three epoll entry points, only built when
 * USE_MY_EPOLL is defined (presumably for C libraries which do not provide
 * their own epoll wrappers -- TODO confirm).
 */
_syscall1 (int, epoll_create, int, size);
_syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event);
_syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);
64#endif
65
66/*
67 * Main epoll() loop.
68 * does 3 actions :
69 * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
70 * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
71 * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
72 *
73 * returns 0 if initialization failed, !0 otherwise.
74 */
75
76int epoll_loop(int action)
77{
78 int next_time;
79 int status;
80 int fd;
81
82 int fds, count;
83 int pr, pw, sr, sw;
84 unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
85 struct epoll_event ev;
86
87 /* private data */
88 static struct epoll_event *epoll_events = NULL;
89 static int epoll_fd;
90
91 if (action == POLL_LOOP_ACTION_INIT) {
92 epoll_fd = epoll_create(global.maxsock + 1);
93 if (epoll_fd < 0)
94 return 0;
95 else {
96 epoll_events = (struct epoll_event*)
97 calloc(1, sizeof(struct epoll_event) * global.maxsock);
98 PrevReadEvent = (fd_set *)
99 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
100 PrevWriteEvent = (fd_set *)
101 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
102 }
103 return 1;
104 }
105 else if (action == POLL_LOOP_ACTION_CLEAN) {
106 if (PrevWriteEvent) free(PrevWriteEvent);
107 if (PrevReadEvent) free(PrevReadEvent);
108 if (epoll_events) free(epoll_events);
109 close(epoll_fd);
110 epoll_fd = 0;
111 return 1;
112 }
113
114 /* OK, it's POLL_LOOP_ACTION_RUN */
115
116 tv_now(&now);
117
118 while (1) {
119 next_time = process_runnable_tasks();
120
121 /* stop when there's no connection left and we don't allow them anymore */
122 if (!actconn && listeners == 0)
123 break;
124
125 for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
126
127 rn = ((int*)StaticReadEvent)[fds]; ro = ((int*)PrevReadEvent)[fds];
128 wn = ((int*)StaticWriteEvent)[fds]; wo = ((int*)PrevWriteEvent)[fds];
129
130 if ((ro^rn) | (wo^wn)) {
131 for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
132#define FDSETS_ARE_INT_ALIGNED
133#ifdef FDSETS_ARE_INT_ALIGNED
134
135#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
136#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
137 pr = (ro >> count) & 1;
138 pw = (wo >> count) & 1;
139 sr = (rn >> count) & 1;
140 sw = (wn >> count) & 1;
141#else
142 pr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&ro);
143 pw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wo);
144 sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
145 sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
146#endif
147#else
148 pr = FD_ISSET(fd, PrevReadEvent);
149 pw = FD_ISSET(fd, PrevWriteEvent);
150 sr = FD_ISSET(fd, StaticReadEvent);
151 sw = FD_ISSET(fd, StaticWriteEvent);
152#endif
153 if (!((sr^pr) | (sw^pw)))
154 continue;
155
156 ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
157 ev.data.fd = fd;
158
159#ifdef EPOLL_CTL_MOD_WORKAROUND
160 /* I encountered a rarely reproducible problem with
161 * EPOLL_CTL_MOD where a modified FD (systematically
162 * the one in epoll_events[0], fd#7) would sometimes
163 * be set EPOLL_OUT while asked for a read ! This is
164 * with the 2.4 epoll patch. The workaround is to
165 * delete then recreate in case of modification.
166 * This is in 2.4 up to epoll-lt-0.21 but not in 2.6
167 * nor RHEL kernels.
168 */
169
170 if ((pr | pw) && fdtab[fd].state != FD_STCLOSE)
171 epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev);
172
173 if ((sr | sw))
174 epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev);
175#else
176 if ((pr | pw)) {
177 /* the file-descriptor already exists... */
178 if ((sr | sw)) {
179 /* ...and it will still exist */
180 if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
181 // perror("epoll_ctl(MOD)");
182 // exit(1);
183 }
184 } else {
185 /* ...and it will be removed */
186 if (fdtab[fd].state != FD_STCLOSE &&
187 epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
188 // perror("epoll_ctl(DEL)");
189 // exit(1);
190 }
191 }
192 } else {
193 /* the file-descriptor did not exist, let's add it */
194 if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
195 // perror("epoll_ctl(ADD)");
196 // exit(1);
197 }
198 }
199#endif // EPOLL_CTL_MOD_WORKAROUND
200 }
201 ((int*)PrevReadEvent)[fds] = rn;
202 ((int*)PrevWriteEvent)[fds] = wn;
203 }
204 }
205
206 /* now let's wait for events */
207 status = epoll_wait(epoll_fd, epoll_events, maxfd, next_time);
208 tv_now(&now);
209
210 for (count = 0; count < status; count++) {
211 fd = epoll_events[count].data.fd;
212
213 if (FD_ISSET(fd, StaticReadEvent)) {
214 if (fdtab[fd].state == FD_STCLOSE)
215 continue;
216 if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
Willy Tarreau54469402006-07-29 16:59:06 +0200217 fdtab[fd].cb[DIR_RD].f(fd);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200218 }
219
220 if (FD_ISSET(fd, StaticWriteEvent)) {
221 if (fdtab[fd].state == FD_STCLOSE)
222 continue;
223 if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
Willy Tarreau54469402006-07-29 16:59:06 +0200224 fdtab[fd].cb[DIR_WR].f(fd);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200225 }
226 }
227 }
228 return 1;
229}
230#endif
231
232
233
234#if defined(ENABLE_POLL)
235/*
236 * Main poll() loop.
237 * does 3 actions :
238 * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
239 * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
240 * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
241 *
242 * returns 0 if initialization failed, !0 otherwise.
243 */
244
245int poll_loop(int action)
246{
247 int next_time;
248 int status;
249 int fd, nbfd;
250
251 int fds, count;
252 int sr, sw;
253 unsigned rn, wn; /* read new, write new */
254
255 /* private data */
256 static struct pollfd *poll_events = NULL;
257
258 if (action == POLL_LOOP_ACTION_INIT) {
259 poll_events = (struct pollfd*)
260 calloc(1, sizeof(struct pollfd) * global.maxsock);
261 return 1;
262 }
263 else if (action == POLL_LOOP_ACTION_CLEAN) {
264 if (poll_events)
265 free(poll_events);
266 return 1;
267 }
268
269 /* OK, it's POLL_LOOP_ACTION_RUN */
270
271 tv_now(&now);
272
273 while (1) {
274 next_time = process_runnable_tasks();
275
276 /* stop when there's no connection left and we don't allow them anymore */
277 if (!actconn && listeners == 0)
278 break;
279
280 nbfd = 0;
281 for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
282
283 rn = ((int*)StaticReadEvent)[fds];
284 wn = ((int*)StaticWriteEvent)[fds];
285
286 if ((rn|wn)) {
287 for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
288#define FDSETS_ARE_INT_ALIGNED
289#ifdef FDSETS_ARE_INT_ALIGNED
290
291#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
292#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
293 sr = (rn >> count) & 1;
294 sw = (wn >> count) & 1;
295#else
296 sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
297 sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
298#endif
299#else
300 sr = FD_ISSET(fd, StaticReadEvent);
301 sw = FD_ISSET(fd, StaticWriteEvent);
302#endif
303 if ((sr|sw)) {
304 poll_events[nbfd].fd = fd;
305 poll_events[nbfd].events = (sr ? POLLIN : 0) | (sw ? POLLOUT : 0);
306 nbfd++;
307 }
308 }
309 }
310 }
311
312 /* now let's wait for events */
313 status = poll(poll_events, nbfd, next_time);
314 tv_now(&now);
315
316 for (count = 0; status > 0 && count < nbfd; count++) {
317 fd = poll_events[count].fd;
318
319 if (!(poll_events[count].revents & ( POLLOUT | POLLIN | POLLERR | POLLHUP )))
320 continue;
321
322 /* ok, we found one active fd */
323 status--;
324
325 if (FD_ISSET(fd, StaticReadEvent)) {
326 if (fdtab[fd].state == FD_STCLOSE)
327 continue;
328 if (poll_events[count].revents & ( POLLIN | POLLERR | POLLHUP ))
Willy Tarreau54469402006-07-29 16:59:06 +0200329 fdtab[fd].cb[DIR_RD].f(fd);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200330 }
331
332 if (FD_ISSET(fd, StaticWriteEvent)) {
333 if (fdtab[fd].state == FD_STCLOSE)
334 continue;
335 if (poll_events[count].revents & ( POLLOUT | POLLERR | POLLHUP ))
Willy Tarreau54469402006-07-29 16:59:06 +0200336 fdtab[fd].cb[DIR_WR].f(fd);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200337 }
338 }
339 }
340 return 1;
341}
342#endif
343
344
345
346/*
347 * Main select() loop.
348 * does 3 actions :
349 * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
350 * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
351 * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
352 *
353 * returns 0 if initialization failed, !0 otherwise.
354 */
355
356
357int select_loop(int action)
358{
359 int next_time;
360 int status;
361 int fd,i;
362 struct timeval delta;
363 int readnotnull, writenotnull;
364 static fd_set *ReadEvent = NULL, *WriteEvent = NULL;
365
366 if (action == POLL_LOOP_ACTION_INIT) {
367 ReadEvent = (fd_set *)
368 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
369 WriteEvent = (fd_set *)
370 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
371 return 1;
372 }
373 else if (action == POLL_LOOP_ACTION_CLEAN) {
374 if (WriteEvent) free(WriteEvent);
375 if (ReadEvent) free(ReadEvent);
376 return 1;
377 }
378
379 /* OK, it's POLL_LOOP_ACTION_RUN */
380
381 tv_now(&now);
382
383 while (1) {
384 next_time = process_runnable_tasks();
385
386 /* stop when there's no connection left and we don't allow them anymore */
387 if (!actconn && listeners == 0)
388 break;
389
390 if (next_time > 0) { /* FIXME */
391 /* Convert to timeval */
392 /* to avoid eventual select loops due to timer precision */
393 next_time += SCHEDULER_RESOLUTION;
394 delta.tv_sec = next_time / 1000;
395 delta.tv_usec = (next_time % 1000) * 1000;
396 }
397 else if (next_time == 0) { /* allow select to return immediately when needed */
398 delta.tv_sec = delta.tv_usec = 0;
399 }
400
401
402 /* let's restore fdset state */
403
404 readnotnull = 0; writenotnull = 0;
405 for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) {
406 readnotnull |= (*(((int*)ReadEvent)+i) = *(((int*)StaticReadEvent)+i)) != 0;
407 writenotnull |= (*(((int*)WriteEvent)+i) = *(((int*)StaticWriteEvent)+i)) != 0;
408 }
409
410 // /* just a verification code, needs to be removed for performance */
411 // for (i=0; i<maxfd; i++) {
412 // if (FD_ISSET(i, ReadEvent) != FD_ISSET(i, StaticReadEvent))
413 // abort();
414 // if (FD_ISSET(i, WriteEvent) != FD_ISSET(i, StaticWriteEvent))
415 // abort();
416 //
417 // }
418
419 status = select(maxfd,
420 readnotnull ? ReadEvent : NULL,
421 writenotnull ? WriteEvent : NULL,
422 NULL,
423 (next_time >= 0) ? &delta : NULL);
424
425 /* this is an experiment on the separation of the select work */
426 // status = (readnotnull ? select(maxfd, ReadEvent, NULL, NULL, (next_time >= 0) ? &delta : NULL) : 0);
427 // status |= (writenotnull ? select(maxfd, NULL, WriteEvent, NULL, (next_time >= 0) ? &delta : NULL) : 0);
428
429 tv_now(&now);
430
431 if (status > 0) { /* must proceed with events */
432
433 int fds;
434 char count;
435
436 for (fds = 0; (fds << INTBITS) < maxfd; fds++)
437 if ((((int *)(ReadEvent))[fds] | ((int *)(WriteEvent))[fds]) != 0)
438 for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
439
440 /* if we specify read first, the accepts and zero reads will be
441 * seen first. Moreover, system buffers will be flushed faster.
442 */
443 if (FD_ISSET(fd, ReadEvent)) {
444 if (fdtab[fd].state == FD_STCLOSE)
445 continue;
Willy Tarreau54469402006-07-29 16:59:06 +0200446 fdtab[fd].cb[DIR_RD].f(fd);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200447 }
448
449 if (FD_ISSET(fd, WriteEvent)) {
450 if (fdtab[fd].state == FD_STCLOSE)
451 continue;
Willy Tarreau54469402006-07-29 16:59:06 +0200452 fdtab[fd].cb[DIR_WR].f(fd);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200453 }
454 }
455 }
456 else {
457 // fprintf(stderr,"select returned %d, maxfd=%d\n", status, maxfd);
458 }
459 }
460 return 1;
461}
462
463
464
465/*********************
466 * generic functions
467 *********************/
468
469
470/* Deletes an FD from the fdsets, and recomputes the maxfd limit.
471 * The file descriptor is also closed.
472 */
473void fd_delete(int fd)
474{
475 FD_CLR(fd, StaticReadEvent);
476 FD_CLR(fd, StaticWriteEvent);
477#if defined(ENABLE_EPOLL)
478 if (PrevReadEvent) {
479 FD_CLR(fd, PrevReadEvent);
480 FD_CLR(fd, PrevWriteEvent);
481 }
482#endif
483
484 close(fd);
485 fdtab[fd].state = FD_STCLOSE;
486
487 while ((maxfd-1 >= 0) && (fdtab[maxfd-1].state == FD_STCLOSE))
488 maxfd--;
489}
490
491
492/*
493 * Local variables:
494 * c-indent-level: 8
495 * c-basic-offset: 8
496 * End:
497 */