blob: ec0607ebafa308acd9938f569883a9a9ba44a6d3 [file] [log] [blame]
Willy Tarreaubaaee002006-06-26 02:48:02 +02001/*
2 * File descriptors management functions.
3 *
4 * Copyright 2000-2006 Willy Tarreau <w@1wt.eu>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13/*
14 * FIXME:
15 * - we still use 'listeners' to check whether we want to stop or not.
16 * - the various pollers should be moved to other external files, possibly
17 * dynamic libs.
18 * - merge event_cli_read() and event_srv_read(). The difference is res_*,
19 * buffer (at the beginning) and timeouts (at the end).
20 * => event_tcp_read(). It may be called from event_accept().
21 * - extract the connect code from event_srv_write()
22 * => event_tcp_connect(). It must then call event_write().
23 * - merge the remaining event_cli_write() and event_srv_write()
24 * => single event_tcp_write(). Check buffer, fd_state, res*, and timeouts.
25 *
26 */
27
28#include <unistd.h>
29#include <sys/time.h>
30#include <sys/types.h>
31
Willy Tarreau2dd0d472006-06-29 17:53:05 +020032#include <common/compat.h>
33#include <common/config.h>
34#include <common/time.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020035
36#include <types/fd.h>
37#include <types/global.h>
38
39#include <proto/polling.h>
40#include <proto/task.h>
41
/* Table of per-descriptor entries, indexed by file descriptor number. */
struct fdtab *fdtab = NULL;

/* Highest file descriptor in use, plus one. */
int maxfd;

/* Total number of terminated sessions. */
int totalconn;

/* Number of currently active sessions. */
int actconn;

/* Persistent sets of descriptors the pollers must watch for reading and
 * for writing. The pollers only read them; fd_delete() clears bits in them.
 */
fd_set *StaticReadEvent;
fd_set *StaticWriteEvent;

/* Selected polling mechanism: POLL_USE_{SELECT|POLL|EPOLL}. */
int cfg_polling_mechanism = 0;
49
50
51/******************************
52 * pollers
53 ******************************/
54
55
56/*
57 * FIXME: this is dirty, but at the moment, there's no other solution to remove
58 * the old FDs from outside the loop. Perhaps we should export a global 'poll'
59 * structure with pointers to functions such as init_fd() and close_fd(), plus
60 * a private structure with several pointers to places such as below.
61 */
62
63#if defined(ENABLE_EPOLL)
/* Read/write interest sets as they were last pushed to epoll. epoll_loop()
 * compares them with StaticReadEvent/StaticWriteEvent to find out which
 * descriptors need an epoll_ctl() call.
 */
fd_set *PrevReadEvent = NULL;
fd_set *PrevWriteEvent = NULL;
65
66#if defined(USE_MY_EPOLL)
Willy Tarreau2dd0d472006-06-29 17:53:05 +020067#include <errno.h>
68#include <sys/syscall.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020069_syscall1 (int, epoll_create, int, size);
70_syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event);
71_syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);
72#endif
73
74/*
75 * Main epoll() loop.
76 * does 3 actions :
77 * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
78 * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
79 * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
80 *
81 * returns 0 if initialization failed, !0 otherwise.
82 */
83
84int epoll_loop(int action)
85{
86 int next_time;
87 int status;
88 int fd;
89
90 int fds, count;
91 int pr, pw, sr, sw;
92 unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
93 struct epoll_event ev;
94
95 /* private data */
96 static struct epoll_event *epoll_events = NULL;
97 static int epoll_fd;
98
99 if (action == POLL_LOOP_ACTION_INIT) {
100 epoll_fd = epoll_create(global.maxsock + 1);
101 if (epoll_fd < 0)
102 return 0;
103 else {
104 epoll_events = (struct epoll_event*)
105 calloc(1, sizeof(struct epoll_event) * global.maxsock);
106 PrevReadEvent = (fd_set *)
107 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
108 PrevWriteEvent = (fd_set *)
109 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
110 }
111 return 1;
112 }
113 else if (action == POLL_LOOP_ACTION_CLEAN) {
114 if (PrevWriteEvent) free(PrevWriteEvent);
115 if (PrevReadEvent) free(PrevReadEvent);
116 if (epoll_events) free(epoll_events);
117 close(epoll_fd);
118 epoll_fd = 0;
119 return 1;
120 }
121
122 /* OK, it's POLL_LOOP_ACTION_RUN */
123
124 tv_now(&now);
125
126 while (1) {
127 next_time = process_runnable_tasks();
128
129 /* stop when there's no connection left and we don't allow them anymore */
130 if (!actconn && listeners == 0)
131 break;
132
133 for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
134
135 rn = ((int*)StaticReadEvent)[fds]; ro = ((int*)PrevReadEvent)[fds];
136 wn = ((int*)StaticWriteEvent)[fds]; wo = ((int*)PrevWriteEvent)[fds];
137
138 if ((ro^rn) | (wo^wn)) {
139 for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
140#define FDSETS_ARE_INT_ALIGNED
141#ifdef FDSETS_ARE_INT_ALIGNED
142
143#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
144#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
145 pr = (ro >> count) & 1;
146 pw = (wo >> count) & 1;
147 sr = (rn >> count) & 1;
148 sw = (wn >> count) & 1;
149#else
150 pr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&ro);
151 pw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wo);
152 sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
153 sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
154#endif
155#else
156 pr = FD_ISSET(fd, PrevReadEvent);
157 pw = FD_ISSET(fd, PrevWriteEvent);
158 sr = FD_ISSET(fd, StaticReadEvent);
159 sw = FD_ISSET(fd, StaticWriteEvent);
160#endif
161 if (!((sr^pr) | (sw^pw)))
162 continue;
163
164 ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
165 ev.data.fd = fd;
166
167#ifdef EPOLL_CTL_MOD_WORKAROUND
168 /* I encountered a rarely reproducible problem with
169 * EPOLL_CTL_MOD where a modified FD (systematically
170 * the one in epoll_events[0], fd#7) would sometimes
171 * be set EPOLL_OUT while asked for a read ! This is
172 * with the 2.4 epoll patch. The workaround is to
173 * delete then recreate in case of modification.
174 * This is in 2.4 up to epoll-lt-0.21 but not in 2.6
175 * nor RHEL kernels.
176 */
177
178 if ((pr | pw) && fdtab[fd].state != FD_STCLOSE)
179 epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev);
180
181 if ((sr | sw))
182 epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev);
183#else
184 if ((pr | pw)) {
185 /* the file-descriptor already exists... */
186 if ((sr | sw)) {
187 /* ...and it will still exist */
188 if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
189 // perror("epoll_ctl(MOD)");
190 // exit(1);
191 }
192 } else {
193 /* ...and it will be removed */
194 if (fdtab[fd].state != FD_STCLOSE &&
195 epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
196 // perror("epoll_ctl(DEL)");
197 // exit(1);
198 }
199 }
200 } else {
201 /* the file-descriptor did not exist, let's add it */
202 if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
203 // perror("epoll_ctl(ADD)");
204 // exit(1);
205 }
206 }
207#endif // EPOLL_CTL_MOD_WORKAROUND
208 }
209 ((int*)PrevReadEvent)[fds] = rn;
210 ((int*)PrevWriteEvent)[fds] = wn;
211 }
212 }
213
214 /* now let's wait for events */
215 status = epoll_wait(epoll_fd, epoll_events, maxfd, next_time);
216 tv_now(&now);
217
218 for (count = 0; count < status; count++) {
219 fd = epoll_events[count].data.fd;
220
221 if (FD_ISSET(fd, StaticReadEvent)) {
222 if (fdtab[fd].state == FD_STCLOSE)
223 continue;
224 if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
225 fdtab[fd].read(fd);
226 }
227
228 if (FD_ISSET(fd, StaticWriteEvent)) {
229 if (fdtab[fd].state == FD_STCLOSE)
230 continue;
231 if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
232 fdtab[fd].write(fd);
233 }
234 }
235 }
236 return 1;
237}
238#endif
239
240
241
242#if defined(ENABLE_POLL)
243/*
244 * Main poll() loop.
245 * does 3 actions :
246 * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
247 * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
248 * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
249 *
250 * returns 0 if initialization failed, !0 otherwise.
251 */
252
253int poll_loop(int action)
254{
255 int next_time;
256 int status;
257 int fd, nbfd;
258
259 int fds, count;
260 int sr, sw;
261 unsigned rn, wn; /* read new, write new */
262
263 /* private data */
264 static struct pollfd *poll_events = NULL;
265
266 if (action == POLL_LOOP_ACTION_INIT) {
267 poll_events = (struct pollfd*)
268 calloc(1, sizeof(struct pollfd) * global.maxsock);
269 return 1;
270 }
271 else if (action == POLL_LOOP_ACTION_CLEAN) {
272 if (poll_events)
273 free(poll_events);
274 return 1;
275 }
276
277 /* OK, it's POLL_LOOP_ACTION_RUN */
278
279 tv_now(&now);
280
281 while (1) {
282 next_time = process_runnable_tasks();
283
284 /* stop when there's no connection left and we don't allow them anymore */
285 if (!actconn && listeners == 0)
286 break;
287
288 nbfd = 0;
289 for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
290
291 rn = ((int*)StaticReadEvent)[fds];
292 wn = ((int*)StaticWriteEvent)[fds];
293
294 if ((rn|wn)) {
295 for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
296#define FDSETS_ARE_INT_ALIGNED
297#ifdef FDSETS_ARE_INT_ALIGNED
298
299#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
300#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
301 sr = (rn >> count) & 1;
302 sw = (wn >> count) & 1;
303#else
304 sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
305 sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
306#endif
307#else
308 sr = FD_ISSET(fd, StaticReadEvent);
309 sw = FD_ISSET(fd, StaticWriteEvent);
310#endif
311 if ((sr|sw)) {
312 poll_events[nbfd].fd = fd;
313 poll_events[nbfd].events = (sr ? POLLIN : 0) | (sw ? POLLOUT : 0);
314 nbfd++;
315 }
316 }
317 }
318 }
319
320 /* now let's wait for events */
321 status = poll(poll_events, nbfd, next_time);
322 tv_now(&now);
323
324 for (count = 0; status > 0 && count < nbfd; count++) {
325 fd = poll_events[count].fd;
326
327 if (!(poll_events[count].revents & ( POLLOUT | POLLIN | POLLERR | POLLHUP )))
328 continue;
329
330 /* ok, we found one active fd */
331 status--;
332
333 if (FD_ISSET(fd, StaticReadEvent)) {
334 if (fdtab[fd].state == FD_STCLOSE)
335 continue;
336 if (poll_events[count].revents & ( POLLIN | POLLERR | POLLHUP ))
337 fdtab[fd].read(fd);
338 }
339
340 if (FD_ISSET(fd, StaticWriteEvent)) {
341 if (fdtab[fd].state == FD_STCLOSE)
342 continue;
343 if (poll_events[count].revents & ( POLLOUT | POLLERR | POLLHUP ))
344 fdtab[fd].write(fd);
345 }
346 }
347 }
348 return 1;
349}
350#endif
351
352
353
354/*
355 * Main select() loop.
356 * does 3 actions :
357 * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
358 * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
359 * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
360 *
361 * returns 0 if initialization failed, !0 otherwise.
362 */
363
364
365int select_loop(int action)
366{
367 int next_time;
368 int status;
369 int fd,i;
370 struct timeval delta;
371 int readnotnull, writenotnull;
372 static fd_set *ReadEvent = NULL, *WriteEvent = NULL;
373
374 if (action == POLL_LOOP_ACTION_INIT) {
375 ReadEvent = (fd_set *)
376 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
377 WriteEvent = (fd_set *)
378 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
379 return 1;
380 }
381 else if (action == POLL_LOOP_ACTION_CLEAN) {
382 if (WriteEvent) free(WriteEvent);
383 if (ReadEvent) free(ReadEvent);
384 return 1;
385 }
386
387 /* OK, it's POLL_LOOP_ACTION_RUN */
388
389 tv_now(&now);
390
391 while (1) {
392 next_time = process_runnable_tasks();
393
394 /* stop when there's no connection left and we don't allow them anymore */
395 if (!actconn && listeners == 0)
396 break;
397
398 if (next_time > 0) { /* FIXME */
399 /* Convert to timeval */
400 /* to avoid eventual select loops due to timer precision */
401 next_time += SCHEDULER_RESOLUTION;
402 delta.tv_sec = next_time / 1000;
403 delta.tv_usec = (next_time % 1000) * 1000;
404 }
405 else if (next_time == 0) { /* allow select to return immediately when needed */
406 delta.tv_sec = delta.tv_usec = 0;
407 }
408
409
410 /* let's restore fdset state */
411
412 readnotnull = 0; writenotnull = 0;
413 for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) {
414 readnotnull |= (*(((int*)ReadEvent)+i) = *(((int*)StaticReadEvent)+i)) != 0;
415 writenotnull |= (*(((int*)WriteEvent)+i) = *(((int*)StaticWriteEvent)+i)) != 0;
416 }
417
418 // /* just a verification code, needs to be removed for performance */
419 // for (i=0; i<maxfd; i++) {
420 // if (FD_ISSET(i, ReadEvent) != FD_ISSET(i, StaticReadEvent))
421 // abort();
422 // if (FD_ISSET(i, WriteEvent) != FD_ISSET(i, StaticWriteEvent))
423 // abort();
424 //
425 // }
426
427 status = select(maxfd,
428 readnotnull ? ReadEvent : NULL,
429 writenotnull ? WriteEvent : NULL,
430 NULL,
431 (next_time >= 0) ? &delta : NULL);
432
433 /* this is an experiment on the separation of the select work */
434 // status = (readnotnull ? select(maxfd, ReadEvent, NULL, NULL, (next_time >= 0) ? &delta : NULL) : 0);
435 // status |= (writenotnull ? select(maxfd, NULL, WriteEvent, NULL, (next_time >= 0) ? &delta : NULL) : 0);
436
437 tv_now(&now);
438
439 if (status > 0) { /* must proceed with events */
440
441 int fds;
442 char count;
443
444 for (fds = 0; (fds << INTBITS) < maxfd; fds++)
445 if ((((int *)(ReadEvent))[fds] | ((int *)(WriteEvent))[fds]) != 0)
446 for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
447
448 /* if we specify read first, the accepts and zero reads will be
449 * seen first. Moreover, system buffers will be flushed faster.
450 */
451 if (FD_ISSET(fd, ReadEvent)) {
452 if (fdtab[fd].state == FD_STCLOSE)
453 continue;
454 fdtab[fd].read(fd);
455 }
456
457 if (FD_ISSET(fd, WriteEvent)) {
458 if (fdtab[fd].state == FD_STCLOSE)
459 continue;
460 fdtab[fd].write(fd);
461 }
462 }
463 }
464 else {
465 // fprintf(stderr,"select returned %d, maxfd=%d\n", status, maxfd);
466 }
467 }
468 return 1;
469}
470
471
472
473/*********************
474 * generic functions
475 *********************/
476
477
478/* Deletes an FD from the fdsets, and recomputes the maxfd limit.
479 * The file descriptor is also closed.
480 */
481void fd_delete(int fd)
482{
483 FD_CLR(fd, StaticReadEvent);
484 FD_CLR(fd, StaticWriteEvent);
485#if defined(ENABLE_EPOLL)
486 if (PrevReadEvent) {
487 FD_CLR(fd, PrevReadEvent);
488 FD_CLR(fd, PrevWriteEvent);
489 }
490#endif
491
492 close(fd);
493 fdtab[fd].state = FD_STCLOSE;
494
495 while ((maxfd-1 >= 0) && (fdtab[maxfd-1].state == FD_STCLOSE))
496 maxfd--;
497}
498
499
500/*
501 * Local variables:
502 * c-indent-level: 8
503 * c-basic-offset: 8
504 * End:
505 */