/*
 * File descriptors management functions.
 *
 * Copyright 2000-2006 Willy Tarreau <w@1wt.eu>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */
12
/*
 * FIXME:
 *   - we still use 'listeners' to check whether we want to stop or not.
 *   - the various pollers should be moved to other external files, possibly
 *     dynamic libs.
 *   - merge event_cli_read() and event_srv_read(). The difference is res_*,
 *     buffer (at the beginning) and timeouts (at the end).
 *     => event_tcp_read(). It may be called from event_accept().
 *   - extract the connect code from event_srv_write()
 *     => event_tcp_connect(). It must then call event_write().
 *   - merge the remaining event_cli_write() and event_srv_write()
 *     => single event_tcp_write(). Check buffer, fd_state, res*, and timeouts.
 *
 */
27
28#include <unistd.h>
29#include <sys/time.h>
30#include <sys/types.h>
31
32#include <haproxy/compat.h>
33#include <haproxy/config.h>
34#include <haproxy/time.h>
35
36#include <types/fd.h>
37#include <types/global.h>
38
39#include <proto/polling.h>
40#include <proto/task.h>
41
/* Table describing every file descriptor, indexed by fd number. */
struct fdtab *fdtab = NULL;

/* Number of the highest fd in use, plus one. */
int maxfd;

/* Total number of terminated sessions. */
int totalconn;

/* Number of currently active sessions. */
int actconn;

/* Sets of fds the pollers must watch for reading, respectively writing. */
fd_set *StaticReadEvent;
fd_set *StaticWriteEvent;

/* Configured polling mechanism: POLL_USE_{SELECT|POLL|EPOLL}. */
int cfg_polling_mechanism = 0;
49
50
51/******************************
52 * pollers
53 ******************************/
54
55
56/*
57 * FIXME: this is dirty, but at the moment, there's no other solution to remove
58 * the old FDs from outside the loop. Perhaps we should export a global 'poll'
59 * structure with pointers to functions such as init_fd() and close_fd(), plus
60 * a private structure with several pointers to places such as below.
61 */
62
#if defined(ENABLE_EPOLL)
/*
 * Copy of the read/write fd sets as they were last pushed to the kernel with
 * epoll_ctl(). They are global because fd_delete() must be able to clear the
 * bits of an fd which gets closed outside of the polling loop.
 */
fd_set *PrevReadEvent = NULL, *PrevWriteEvent = NULL;

#if defined(USE_MY_EPOLL)
_syscall1 (int, epoll_create, int, size);
_syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event);
_syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);
#endif

/*
 * Main epoll() loop.
 * does 3 actions :
 * 0 (POLL_LOOP_ACTION_INIT)  : initializes necessary private structures
 * 1 (POLL_LOOP_ACTION_RUN)   : runs the loop
 * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
 *
 * returns 0 if initialization failed, !0 otherwise. On a failed
 * initialization, everything which was allocated is released again and the
 * Prev*Event pointers are left NULL.
 */

int epoll_loop(int action)
{
	int next_time;
	int status;
	int fd;

	int fds, count;
	int pr, pw, sr, sw;
	unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
	struct epoll_event ev;

	/* private data */
	static struct epoll_event *epoll_events = NULL;
	static int epoll_fd = -1; /* -1 means "no epoll instance is open" */

	if (action == POLL_LOOP_ACTION_INIT) {
		epoll_fd = epoll_create(global.maxsock + 1);
		if (epoll_fd < 0)
			return 0;

		epoll_events = (struct epoll_event*)
			calloc(1, sizeof(struct epoll_event) * global.maxsock);
		PrevReadEvent = (fd_set *)
			calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
		PrevWriteEvent = (fd_set *)
			calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);

		if (!epoll_events || !PrevReadEvent || !PrevWriteEvent) {
			/* partial allocation: undo everything and report the failure */
			free(PrevWriteEvent);
			free(PrevReadEvent);
			free(epoll_events);
			PrevWriteEvent = NULL;
			PrevReadEvent = NULL;
			epoll_events = NULL;
			close(epoll_fd);
			epoll_fd = -1;
			return 0;
		}
		return 1;
	}
	else if (action == POLL_LOOP_ACTION_CLEAN) {
		/* The pointers are reset after being freed because fd_delete()
		 * tests the Prev*Event pointers before touching the sets, and
		 * because this action might be requested more than once.
		 */
		free(PrevWriteEvent);
		free(PrevReadEvent);
		free(epoll_events);
		PrevWriteEvent = NULL;
		PrevReadEvent = NULL;
		epoll_events = NULL;

		/* never close an fd which was not obtained from epoll_create() */
		if (epoll_fd >= 0)
			close(epoll_fd);
		epoll_fd = -1;
		return 1;
	}

	/* OK, it's POLL_LOOP_ACTION_RUN */

	tv_now(&now);

	while (1) {
		/* the returned value is used below as the epoll_wait() timeout */
		next_time = process_runnable_tasks();

		/* stop when there's no connection left and we don't allow them anymore */
		if (!actconn && listeners == 0)
			break;

		/* Propagate to the kernel every change which happened in the
		 * Static*Event sets since the previous pass. The sets are
		 * compared one int-sized word at a time so that unchanged
		 * words are skipped without looking at individual bits.
		 */
		for (fds = 0; (fds << INTBITS) < maxfd; fds++) {

			rn = ((int*)StaticReadEvent)[fds];  ro = ((int*)PrevReadEvent)[fds];
			wn = ((int*)StaticWriteEvent)[fds]; wo = ((int*)PrevWriteEvent)[fds];

			if (!((ro^rn) | (wo^wn)))
				continue;

			for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
				/* previous (p*) and wanted (s*) state of this fd */
				pr = (ro >> count) & 1;
				pw = (wo >> count) & 1;
				sr = (rn >> count) & 1;
				sw = (wn >> count) & 1;

				if (!((sr^pr) | (sw^pw)))
					continue;

				ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
				ev.data.fd = fd;

#ifdef EPOLL_CTL_MOD_WORKAROUND
				/* I encountered a rarely reproducible problem with
				 * EPOLL_CTL_MOD where a modified FD (systematically
				 * the one in epoll_events[0], fd#7) would sometimes
				 * be set EPOLL_OUT while asked for a read ! This is
				 * with the 2.4 epoll patch. The workaround is to
				 * delete then recreate in case of modification.
				 * This is in 2.4 up to epoll-lt-0.21 but not in 2.6
				 * nor RHEL kernels.
				 */

				if ((pr | pw) && fdtab[fd].state != FD_STCLOSE)
					epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev);

				if ((sr | sw))
					epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev);
#else
				/* Failures of epoll_ctl() are deliberately ignored
				 * below, exactly as they always were.
				 */
				if ((pr | pw)) {
					/* the file-descriptor already exists... */
					if ((sr | sw)) {
						/* ...and it will still exist */
						epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev);
					}
					else if (fdtab[fd].state != FD_STCLOSE) {
						/* ...and it will be removed. Nothing is
						 * done for an fd which is already closed.
						 */
						epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev);
					}
				}
				else {
					/* the file-descriptor did not exist, let's add it */
					epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev);
				}
#endif // EPOLL_CTL_MOD_WORKAROUND
			}

			/* remember what the kernel knows for this word */
			((int*)PrevReadEvent)[fds] = rn;
			((int*)PrevWriteEvent)[fds] = wn;
		}

		/* now let's wait for events */
		status = epoll_wait(epoll_fd, epoll_events, maxfd, next_time);
		tv_now(&now);

		/* a negative status (error) simply skips the loop below */
		for (count = 0; count < status; count++) {
			fd = epoll_events[count].data.fd;

			if (FD_ISSET(fd, StaticReadEvent)) {
				if (fdtab[fd].state == FD_STCLOSE)
					continue;
				if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
					fdtab[fd].read(fd);
			}

			/* the state is checked again here since it is tested
			 * before each of the two callbacks
			 */
			if (FD_ISSET(fd, StaticWriteEvent)) {
				if (fdtab[fd].state == FD_STCLOSE)
					continue;
				if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
					fdtab[fd].write(fd);
			}
		}
	}
	return 1;
}
#endif
237
238
239
#if defined(ENABLE_POLL)
/*
 * Main poll() loop.
 * does 3 actions :
 * 0 (POLL_LOOP_ACTION_INIT)  : initializes necessary private structures
 * 1 (POLL_LOOP_ACTION_RUN)   : runs the loop
 * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
 *
 * returns 0 if initialization failed, !0 otherwise.
 */

int poll_loop(int action)
{
	int next_time;
	int status;
	int fd, nbfd;

	int fds, count;
	int sr, sw;
	unsigned rn, wn; /* read new, write new */

	/* private data */
	static struct pollfd *poll_events = NULL;

	if (action == POLL_LOOP_ACTION_INIT) {
		poll_events = (struct pollfd*)
			calloc(1, sizeof(struct pollfd) * global.maxsock);
		/* report the allocation failure instead of crashing later */
		return poll_events != NULL;
	}
	else if (action == POLL_LOOP_ACTION_CLEAN) {
		/* reset the pointer so that a second CLEAN is harmless */
		free(poll_events);
		poll_events = NULL;
		return 1;
	}

	/* OK, it's POLL_LOOP_ACTION_RUN */

	tv_now(&now);

	while (1) {
		/* the returned value is used below as the poll() timeout */
		next_time = process_runnable_tasks();

		/* stop when there's no connection left and we don't allow them anymore */
		if (!actconn && listeners == 0)
			break;

		/* Rebuild the whole pollfd array from the Static*Event sets.
		 * The sets are scanned one int-sized word at a time so that
		 * empty words are skipped without looking at individual bits.
		 */
		nbfd = 0;
		for (fds = 0; (fds << INTBITS) < maxfd; fds++) {

			rn = ((int*)StaticReadEvent)[fds];
			wn = ((int*)StaticWriteEvent)[fds];

			if (!(rn|wn))
				continue;

			for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
				sr = (rn >> count) & 1;
				sw = (wn >> count) & 1;

				if ((sr|sw)) {
					poll_events[nbfd].fd = fd;
					poll_events[nbfd].events = (sr ? POLLIN : 0) | (sw ? POLLOUT : 0);
					nbfd++;
				}
			}
		}

		/* now let's wait for events */
		status = poll(poll_events, nbfd, next_time);
		tv_now(&now);

		/* <status> counts the entries which still have to be found, so
		 * the scan stops as soon as all active fds have been seen. A
		 * null or negative status skips the loop entirely.
		 */
		for (count = 0; status > 0 && count < nbfd; count++) {
			fd = poll_events[count].fd;

			if (!(poll_events[count].revents & ( POLLOUT | POLLIN | POLLERR | POLLHUP )))
				continue;

			/* ok, we found one active fd */
			status--;

			if (FD_ISSET(fd, StaticReadEvent)) {
				if (fdtab[fd].state == FD_STCLOSE)
					continue;
				if (poll_events[count].revents & ( POLLIN | POLLERR | POLLHUP ))
					fdtab[fd].read(fd);
			}

			/* the state is checked again here since it is tested
			 * before each of the two callbacks
			 */
			if (FD_ISSET(fd, StaticWriteEvent)) {
				if (fdtab[fd].state == FD_STCLOSE)
					continue;
				if (poll_events[count].revents & ( POLLOUT | POLLERR | POLLHUP ))
					fdtab[fd].write(fd);
			}
		}
	}
	return 1;
}
#endif
349
350
351
352/*
353 * Main select() loop.
354 * does 3 actions :
355 * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
356 * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
357 * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
358 *
359 * returns 0 if initialization failed, !0 otherwise.
360 */
361
362
363int select_loop(int action)
364{
365 int next_time;
366 int status;
367 int fd,i;
368 struct timeval delta;
369 int readnotnull, writenotnull;
370 static fd_set *ReadEvent = NULL, *WriteEvent = NULL;
371
372 if (action == POLL_LOOP_ACTION_INIT) {
373 ReadEvent = (fd_set *)
374 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
375 WriteEvent = (fd_set *)
376 calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
377 return 1;
378 }
379 else if (action == POLL_LOOP_ACTION_CLEAN) {
380 if (WriteEvent) free(WriteEvent);
381 if (ReadEvent) free(ReadEvent);
382 return 1;
383 }
384
385 /* OK, it's POLL_LOOP_ACTION_RUN */
386
387 tv_now(&now);
388
389 while (1) {
390 next_time = process_runnable_tasks();
391
392 /* stop when there's no connection left and we don't allow them anymore */
393 if (!actconn && listeners == 0)
394 break;
395
396 if (next_time > 0) { /* FIXME */
397 /* Convert to timeval */
398 /* to avoid eventual select loops due to timer precision */
399 next_time += SCHEDULER_RESOLUTION;
400 delta.tv_sec = next_time / 1000;
401 delta.tv_usec = (next_time % 1000) * 1000;
402 }
403 else if (next_time == 0) { /* allow select to return immediately when needed */
404 delta.tv_sec = delta.tv_usec = 0;
405 }
406
407
408 /* let's restore fdset state */
409
410 readnotnull = 0; writenotnull = 0;
411 for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) {
412 readnotnull |= (*(((int*)ReadEvent)+i) = *(((int*)StaticReadEvent)+i)) != 0;
413 writenotnull |= (*(((int*)WriteEvent)+i) = *(((int*)StaticWriteEvent)+i)) != 0;
414 }
415
416 // /* just a verification code, needs to be removed for performance */
417 // for (i=0; i<maxfd; i++) {
418 // if (FD_ISSET(i, ReadEvent) != FD_ISSET(i, StaticReadEvent))
419 // abort();
420 // if (FD_ISSET(i, WriteEvent) != FD_ISSET(i, StaticWriteEvent))
421 // abort();
422 //
423 // }
424
425 status = select(maxfd,
426 readnotnull ? ReadEvent : NULL,
427 writenotnull ? WriteEvent : NULL,
428 NULL,
429 (next_time >= 0) ? &delta : NULL);
430
431 /* this is an experiment on the separation of the select work */
432 // status = (readnotnull ? select(maxfd, ReadEvent, NULL, NULL, (next_time >= 0) ? &delta : NULL) : 0);
433 // status |= (writenotnull ? select(maxfd, NULL, WriteEvent, NULL, (next_time >= 0) ? &delta : NULL) : 0);
434
435 tv_now(&now);
436
437 if (status > 0) { /* must proceed with events */
438
439 int fds;
440 char count;
441
442 for (fds = 0; (fds << INTBITS) < maxfd; fds++)
443 if ((((int *)(ReadEvent))[fds] | ((int *)(WriteEvent))[fds]) != 0)
444 for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
445
446 /* if we specify read first, the accepts and zero reads will be
447 * seen first. Moreover, system buffers will be flushed faster.
448 */
449 if (FD_ISSET(fd, ReadEvent)) {
450 if (fdtab[fd].state == FD_STCLOSE)
451 continue;
452 fdtab[fd].read(fd);
453 }
454
455 if (FD_ISSET(fd, WriteEvent)) {
456 if (fdtab[fd].state == FD_STCLOSE)
457 continue;
458 fdtab[fd].write(fd);
459 }
460 }
461 }
462 else {
463 // fprintf(stderr,"select returned %d, maxfd=%d\n", status, maxfd);
464 }
465 }
466 return 1;
467}
468
469
470
471/*********************
472 * generic functions
473 *********************/
474
475
476/* Deletes an FD from the fdsets, and recomputes the maxfd limit.
477 * The file descriptor is also closed.
478 */
479void fd_delete(int fd)
480{
481 FD_CLR(fd, StaticReadEvent);
482 FD_CLR(fd, StaticWriteEvent);
483#if defined(ENABLE_EPOLL)
484 if (PrevReadEvent) {
485 FD_CLR(fd, PrevReadEvent);
486 FD_CLR(fd, PrevWriteEvent);
487 }
488#endif
489
490 close(fd);
491 fdtab[fd].state = FD_STCLOSE;
492
493 while ((maxfd-1 >= 0) && (fdtab[maxfd-1].state == FD_STCLOSE))
494 maxfd--;
495}
496
497
498/*
499 * Local variables:
500 * c-indent-level: 8
501 * c-basic-offset: 8
502 * End:
503 */