blob: 56049bc046777117c03a71f0d85a4447a7e26535 [file] [log] [blame]
/*
* FD polling functions for linux epoll()
*
* Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>
#include <common/compat.h>
#include <common/config.h>
#include <common/time.h>
#include <types/fd.h>
#include <types/global.h>
#include <proto/fd.h>
#include <proto/polling.h>
#include <proto/task.h>
/*
 * When USE_MY_EPOLL is defined, the three epoll entry points used by this
 * file (epoll_create, epoll_ctl, epoll_wait) are declared locally through
 * the _syscallN() macros instead of relying on library-provided wrappers.
 * NOTE(review): presumably meant for C libraries lacking epoll support;
 * confirm where struct epoll_event is declared in that configuration.
 */
#if defined(USE_MY_EPOLL)
#include <errno.h>
#include <sys/syscall.h>
_syscall1 (int, epoll_create, int, size);
_syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event);
_syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);
#endif
/*
 * Currently requested events, one fd_set per direction (indexed by DIR_RD
 * and DIR_WR). These sets are modified by the __fd_*() helpers below.
 */
static fd_set *fd_evts[2];
/*
 * Copy of the requested events as they were the last time epoll_poll()
 * synchronized them with epoll_ctl(); used to detect what changed.
 */
static fd_set *old_evts[2];
/* private data */
/* array handed to epoll_wait() to receive the reported events */
static struct epoll_event *epoll_events;
/* descriptor returned by epoll_create() in epoll_init() */
static int epoll_fd;
/*
* Benchmarks performed on a Pentium-M notebook show that using functions
* instead of the usual macros improves the FD_* performance by about 80%,
* and that marking them regparm(2) adds another 20%.
*/
REGPRM2 static int __fd_isset(const int fd, int dir)
{
return FD_ISSET(fd, fd_evts[dir]);
}
REGPRM2 static int __fd_set(const int fd, int dir)
{
FD_SET(fd, fd_evts[dir]);
return 0;
}
REGPRM2 static int __fd_clr(const int fd, int dir)
{
FD_CLR(fd, fd_evts[dir]);
return 0;
}
REGPRM2 static int __fd_cond_s(const int fd, int dir)
{
int ret;
ret = !FD_ISSET(fd, fd_evts[dir]);
if (ret)
FD_SET(fd, fd_evts[dir]);
return ret;
}
REGPRM2 static int __fd_cond_c(const int fd, int dir)
{
int ret;
ret = FD_ISSET(fd, fd_evts[dir]);
if (ret)
FD_CLR(fd, fd_evts[dir]);
return ret;
}
/* Stops requesting events on <fd> in both directions. */
REGPRM1 static void __fd_rem(const int fd)
{
	fd_set *const rd_set = fd_evts[DIR_RD];
	fd_set *const wr_set = fd_evts[DIR_WR];

	FD_CLR(fd, rd_set);
	FD_CLR(fd, wr_set);
}
/*
 * Forgets <fd> entirely: clears it from the requested sets and from the
 * previously-synchronized sets, in both directions.
 */
REGPRM1 static void __fd_clo(const int fd)
{
	fd_set *const sets[] = {
		fd_evts[DIR_RD],
		fd_evts[DIR_WR],
		old_evts[DIR_RD],
		old_evts[DIR_WR],
	};
	unsigned int i;

	for (i = 0; i < sizeof(sets) / sizeof(sets[0]); i++)
		FD_CLR(fd, sets[i]);
}
/*
 * Initialization of the epoll() poller.
 *
 * Creates the epoll descriptor, then allocates the event array passed to
 * epoll_wait() (global.maxsock entries) and the four zero-filled fd sets
 * (requested and previously-synchronized, for each direction), each large
 * enough to hold global.maxsock descriptors.
 *
 * Returns 0 in case of failure, non-zero in case of success. If it fails, it
 * disables the poller by setting its pref to 0, releases everything that was
 * allocated so far, and leaves the file-scope pointers set to NULL and
 * epoll_fd negative so that no stale pointer or descriptor remains.
 */
REGPRM1 static int epoll_init(struct poller *p)
{
	int fd_set_bytes;

	p->private = NULL;

	/* size of an fd_set rounded up to cover at least global.maxsock FDs */
	fd_set_bytes = sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE;

	epoll_fd = epoll_create(global.maxsock + 1);
	if (epoll_fd < 0)
		goto fail_fd;

	/* calloc(n, size) lets the allocator detect n * size overflows */
	epoll_events = calloc(global.maxsock, sizeof(*epoll_events));
	if (epoll_events == NULL)
		goto fail_ee;

	if ((old_evts[DIR_RD] = calloc(1, fd_set_bytes)) == NULL)
		goto fail_prevt;

	if ((old_evts[DIR_WR] = calloc(1, fd_set_bytes)) == NULL)
		goto fail_pwevt;

	if ((fd_evts[DIR_RD] = calloc(1, fd_set_bytes)) == NULL)
		goto fail_srevt;

	if ((fd_evts[DIR_WR] = calloc(1, fd_set_bytes)) == NULL)
		goto fail_swevt;

	return 1;

	/*
	 * Unwind in reverse allocation order. Each pointer is reset after
	 * being freed so that the globals never keep a dangling value.
	 */
 fail_swevt:
	free(fd_evts[DIR_RD]);
	fd_evts[DIR_RD] = NULL;
 fail_srevt:
	free(old_evts[DIR_WR]);
	old_evts[DIR_WR] = NULL;
 fail_pwevt:
	free(old_evts[DIR_RD]);
	old_evts[DIR_RD] = NULL;
 fail_prevt:
	free(epoll_events);
	epoll_events = NULL;
 fail_ee:
	close(epoll_fd);
	/* -1 rather than 0: 0 is a valid descriptor and must not be closed later */
	epoll_fd = -1;
 fail_fd:
	p->pref = 0;
	return 0;
}
/*
 * Termination of the epoll() poller.
 * Memory is released and the poller is marked as unselectable (pref = 0).
 *
 * Every released pointer is reset to NULL and the epoll descriptor is set to
 * -1 once closed, so calling this function again is harmless: nothing is
 * freed or closed twice.
 */
REGPRM1 static void epoll_term(struct poller *p)
{
	/* free(NULL) is a no-op, so no NULL checks are needed here */
	free(fd_evts[DIR_WR]);
	fd_evts[DIR_WR] = NULL;

	free(fd_evts[DIR_RD]);
	fd_evts[DIR_RD] = NULL;

	free(old_evts[DIR_WR]);
	old_evts[DIR_WR] = NULL;

	free(old_evts[DIR_RD]);
	old_evts[DIR_RD] = NULL;

	free(epoll_events);
	epoll_events = NULL;

	/* only close a descriptor we may own; -1 marks "no descriptor" */
	if (epoll_fd >= 0) {
		close(epoll_fd);
		epoll_fd = -1;
	}

	p->private = NULL;
	p->pref = 0;
}
/*
* epoll() poller
*
* Runs one polling iteration:
*   1. brings the kernel's epoll set in line with the events requested in
*      fd_evts by issuing epoll_ctl() for every FD whose state differs from
*      old_evts, then records the new state in old_evts;
*   2. calls epoll_wait(), passing <wait_time> unchanged as its timeout;
*   3. refreshes <now> with tv_now() and invokes the read and/or write
*      callbacks from fdtab[] for every reported FD.
*
* <p> is not used by this function.
*/
REGPRM2 static void epoll_poll(struct poller *p, int wait_time)
{
int status;
int fd;
int fds, count;
int pr, pw, sr, sw;
unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
struct epoll_event ev;
/* Walk the fd sets one int-sized word at a time: <fds> is the word index
 * and (fds << INTBITS) is the first FD covered by that word.
 */
for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
rn = ((int*)fd_evts[DIR_RD])[fds]; ro = ((int*)old_evts[DIR_RD])[fds];
wn = ((int*)fd_evts[DIR_WR])[fds]; wo = ((int*)old_evts[DIR_WR])[fds];
/* only look at individual FDs when something changed within this word */
if ((ro^rn) | (wo^wn)) {
for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
/* Both macros below are defined unconditionally right before being
 * tested, so the bit-shift variant is the one always compiled; the
 * FD_ISSET() variants are kept as alternatives.
 * NOTE(review): the shift variant assumes bit <count> of the int word
 * <fds> is FD (fds << INTBITS) + count — confirm for the target ABI.
 */
#define FDSETS_ARE_INT_ALIGNED
#ifdef FDSETS_ARE_INT_ALIGNED
#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
/* pr/pw: previous read/write state, sr/sw: newly requested read/write state */
pr = (ro >> count) & 1;
pw = (wo >> count) & 1;
sr = (rn >> count) & 1;
sw = (wn >> count) & 1;
#else
pr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&ro);
pw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wo);
sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
#endif
#else
pr = FD_ISSET(fd, old_evts[DIR_RD]);
pw = FD_ISSET(fd, old_evts[DIR_WR]);
sr = FD_ISSET(fd, fd_evts[DIR_RD]);
sw = FD_ISSET(fd, fd_evts[DIR_WR]);
#endif
/* nothing changed for this particular FD */
if (!((sr^pr) | (sw^pw)))
continue;
/* only ev.events and ev.data.fd are filled in before epoll_ctl() */
ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
ev.data.fd = fd;
#ifdef EPOLL_CTL_MOD_WORKAROUND
/* I encountered a rarely reproducible problem with
* EPOLL_CTL_MOD where a modified FD (systematically
* the one in epoll_events[0], fd#7) would sometimes
* be set EPOLL_OUT while asked for a read ! This is
* with the 2.4 epoll patch. The workaround is to
* delete then recreate in case of modification.
* This is in 2.4 up to epoll-lt-0.21 but not in 2.6
* nor RHEL kernels.
*/
if ((pr | pw) && fdtab[fd].state != FD_STCLOSE)
epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev);
if ((sr | sw))
epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev);
#else
/* In every branch below a failing epoll_ctl() is ignored: the
 * error-reporting code is commented out.
 */
if ((pr | pw)) {
/* the file-descriptor already exists... */
if ((sr | sw)) {
/* ...and it will still exist */
if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
// perror("epoll_ctl(MOD)");
// exit(1);
}
} else {
/* ...and it will be removed */
/* no EPOLL_CTL_DEL is issued for FDs in the FD_STCLOSE state */
if (fdtab[fd].state != FD_STCLOSE &&
epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
// perror("epoll_ctl(DEL)");
// exit(1);
}
}
} else {
/* the file-descriptor did not exist, let's add it */
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
// perror("epoll_ctl(ADD)");
// exit(1);
}
}
#endif // EPOLL_CTL_MOD_WORKAROUND
}
/* remember the state just pushed to epoll for this whole word */
((int*)old_evts[DIR_RD])[fds] = rn;
((int*)old_evts[DIR_WR])[fds] = wn;
}
}
/* now let's wait for events */
/* NOTE(review): up to <maxfd> events may be written to epoll_events,
 * which epoll_init() allocates with global.maxsock entries; this relies
 * on maxfd never exceeding global.maxsock — confirm.
 */
status = epoll_wait(epoll_fd, epoll_events, maxfd, wait_time);
tv_now(&now);
/* a negative <status> (epoll_wait() error) simply skips this loop */
for (count = 0; count < status; count++) {
fd = epoll_events[count].data.fd;
/* The read callback runs only if reads are still requested on this FD.
 * Note that the "continue" on a closed FD also skips the write check.
 */
if (FD_ISSET(fd, fd_evts[DIR_RD])) {
if (fdtab[fd].state == FD_STCLOSE)
continue;
if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
fdtab[fd].cb[DIR_RD].f(fd);
}
/* fd_evts and the FD state are tested again here, after the read
 * callback may have run.
 */
if (FD_ISSET(fd, fd_evts[DIR_WR])) {
if (fdtab[fd].state == FD_STCLOSE)
continue;
if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
fdtab[fd].cb[DIR_WR].f(fd);
}
}
}
/*
 * The only exported function: fills <p> with the description of the epoll
 * poller (name, preference, and all of its callbacks). Returns 1.
 */
int epoll_register(struct poller *p)
{
	/* identification and selection weight */
	p->name    = "epoll";
	p->pref    = 300;
	p->private = NULL;

	/* FD set manipulation helpers */
	p->isset  = __fd_isset;
	p->set    = __fd_set;
	p->clr    = __fd_clr;
	p->cond_s = __fd_cond_s;
	p->cond_c = __fd_cond_c;
	p->rem    = __fd_rem;
	p->clo    = __fd_clo;

	/* life cycle and polling entry points */
	p->init = epoll_init;
	p->term = epoll_term;
	p->poll = epoll_poll;

	return 1;
}
/*
* Local variables:
* c-indent-level: 8
* c-basic-offset: 8
* End:
*/